iguvium 0.9.0 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/iguvium.gemspec +3 -3
- data/lib/iguvium.rb +0 -7
- data/lib/iguvium/page.rb +4 -1
- data/lib/iguvium/table.rb +18 -23
- data/lib/iguvium/version.rb +1 -1
- metadata +14 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 59643f281def94149de22d6b858f1f90f4b4b05289ef4f3ef13f146e4621f6c9
|
4
|
+
data.tar.gz: 7f10445415eadcc746b8d21050443595ecf127c7f6856a8b2962f8d072825447
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7d3899e42a5bcaa6359554ff96d803f4a09acdacbb53855887bdedfda9245604cf24054a3588508d58222cf4c7783d031b36bff9c6e3e0cac2476c6552a54dc4
|
7
|
+
data.tar.gz: 8b6011b52aa2232597d1dea4ef087d36d4713879a0a6c76aba3849337f348329357499e8a5d8c3ec9ace4346e64a058d5b6fd415626b6553dfa2f29f6d516278
|
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
## [Unreleased]
|
4
4
|
|
5
|
+
## [0.9.1] - 2019-05-30
|
6
|
+
### Fixed
|
7
|
+
- Rare `undefined method 'begin' for nil:NilClass` error fixed
|
8
|
+
- Remove Iguvium::Table#to_a result caching
|
9
|
+
|
5
10
|
## [0.9.0] - 2018-12-07
|
6
11
|
### Added
|
7
12
|
- Open cells rendering added. Tables like this are now processed correctly:
|
data/iguvium.gemspec
CHANGED
@@ -11,8 +11,8 @@ Gem::Specification.new do |spec|
|
|
11
11
|
spec.email = ['dima@scriptangle.com']
|
12
12
|
|
13
13
|
spec.summary = 'Extract tables from PDF as a structured info'
|
14
|
-
spec.description =
|
15
|
-
then recognizes table separators optically. No OpenCV or other heavy dependencies
|
14
|
+
spec.description = "Extract tables from PDF as a structured info. Uses ghostscript to print pdf \
|
15
|
+
to image, then recognizes table separators optically. No OpenCV or other heavy dependencies"
|
16
16
|
spec.homepage = 'https://github.com/adworse/iguvium'
|
17
17
|
spec.license = 'MIT'
|
18
18
|
|
@@ -29,9 +29,9 @@ then recognizes table separators optically. No OpenCV or other heavy dependencie
|
|
29
29
|
|
30
30
|
spec.require_paths = ['lib']
|
31
31
|
|
32
|
-
spec.add_dependency 'pdf-reader', '~> 2.1'
|
33
32
|
spec.add_dependency 'convolver-light', '~> 0.3.1'
|
34
33
|
spec.add_dependency 'oily_png', '~> 1.2'
|
34
|
+
spec.add_dependency 'pdf-reader', '~> 2.1'
|
35
35
|
spec.add_dependency 'slop', '~> 4.2'
|
36
36
|
|
37
37
|
spec.add_development_dependency 'bundler', '~> 1.16'
|
data/lib/iguvium.rb
CHANGED
@@ -116,11 +116,4 @@ module Iguvium
|
|
116
116
|
end
|
117
117
|
end
|
118
118
|
|
119
|
-
# TODO: 4) Add options like maybe image thresholding
|
120
|
-
#
|
121
|
-
# TODO: 6) 0.9 - version capable of reading tables with open outer cells, like this:
|
122
|
-
# __|____|_______|_____|
|
123
|
-
# __|____|_______|_____|
|
124
|
-
# __|____|_______|_____|
|
125
|
-
#
|
126
119
|
# TODO: 7) 1.0 - in addition it should deal with merged cells (move result to the upper left cell).
|
data/lib/iguvium/page.rb
CHANGED
@@ -75,7 +75,10 @@ module Iguvium
|
|
75
75
|
recognized = CV.new(image).recognize
|
76
76
|
@lines = recognized[:lines]
|
77
77
|
@boxes = recognized[:boxes].reject { |box| box_empty?(box) }
|
78
|
-
@tables = @boxes
|
78
|
+
@tables = @boxes
|
79
|
+
.map { |box| Table.new(box, self) }
|
80
|
+
.reject { |table| table.grid[:rows].empty? || table.grid[:columns].empty? }
|
81
|
+
.reverse
|
79
82
|
self
|
80
83
|
end
|
81
84
|
|
data/lib/iguvium/table.rb
CHANGED
@@ -32,17 +32,16 @@ module Iguvium
|
|
32
32
|
# @return [Array] 2D array of strings (content of table's cells)
|
33
33
|
#
|
34
34
|
def to_a(newlines: false, phrases: true)
|
35
|
-
|
36
|
-
grid[:rows]
|
35
|
+
grid[:rows]
|
37
36
|
.reverse
|
38
37
|
.map { |row|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
38
|
+
grid[:columns].map do |column|
|
39
|
+
render(
|
40
|
+
phrases ? words_inside(column, row) : chars_inside(column, row),
|
41
|
+
newlines: newlines
|
42
|
+
)
|
43
|
+
end
|
44
|
+
}
|
46
45
|
end
|
47
46
|
|
48
47
|
# def width
|
@@ -61,6 +60,14 @@ module Iguvium
|
|
61
60
|
# @floorless
|
62
61
|
# end
|
63
62
|
|
63
|
+
def grid
|
64
|
+
@grid ||=
|
65
|
+
{
|
66
|
+
rows: lines_to_ranges(lines[:horizontal]),
|
67
|
+
columns: lines_to_ranges(lines[:vertical])
|
68
|
+
}
|
69
|
+
end
|
70
|
+
|
64
71
|
private
|
65
72
|
|
66
73
|
attr_reader :page, :lines, :box
|
@@ -69,8 +76,8 @@ module Iguvium
|
|
69
76
|
# and adds rows and/or columns if necessary.
|
70
77
|
# @return [Iguvium::Table] with added open-cell rows and columns
|
71
78
|
def heal
|
72
|
-
heal_rows
|
73
|
-
heal_cols
|
79
|
+
heal_rows unless grid[:rows].empty?
|
80
|
+
heal_cols unless grid[:columns].empty?
|
74
81
|
self
|
75
82
|
end
|
76
83
|
|
@@ -89,7 +96,6 @@ module Iguvium
|
|
89
96
|
end
|
90
97
|
|
91
98
|
def heal_rows
|
92
|
-
# TODO: shrink box (like `box.last.end - 2`)
|
93
99
|
roofrow = box.last.begin..grid[:rows].first.begin
|
94
100
|
floorrow = grid[:rows].last.end..box.last.end
|
95
101
|
if chars_inside(box.first, roofrow).any?
|
@@ -125,18 +131,7 @@ module Iguvium
|
|
125
131
|
}
|
126
132
|
end
|
127
133
|
|
128
|
-
def grid
|
129
|
-
return @grid if @grid
|
130
|
-
|
131
|
-
@grid =
|
132
|
-
{
|
133
|
-
rows: lines_to_ranges(lines[:horizontal]),
|
134
|
-
columns: lines_to_ranges(lines[:vertical])
|
135
|
-
}
|
136
|
-
end
|
137
|
-
|
138
134
|
def lines_to_ranges(lines)
|
139
|
-
# TODO: extend box for the sake of lines select
|
140
135
|
lines.select { |line| line_in_box?(line, wide_box) }
|
141
136
|
.map { |line| line.first.is_a?(Numeric) ? line.first : line.last }
|
142
137
|
.sort
|
data/lib/iguvium/version.rb
CHANGED
metadata
CHANGED
@@ -1,57 +1,57 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iguvium
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.0
|
4
|
+
version: 0.9.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dima Ermilov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-05-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: convolver-light
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 0.3.1
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 0.3.1
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: oily_png
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: '1.2'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: '1.2'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: pdf-reader
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '1.2'
|
47
|
+
version: '2.1'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '1.2'
|
54
|
+
version: '2.1'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: slop
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -108,9 +108,9 @@ dependencies:
|
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '3.0'
|
111
|
-
description:
|
112
|
-
|
113
|
-
|
111
|
+
description: Extract tables from PDF as a structured info. Uses ghostscript to print
|
112
|
+
pdf to image, then recognizes table separators optically. No OpenCV or other heavy
|
113
|
+
dependencies
|
114
114
|
email:
|
115
115
|
- dima@scriptangle.com
|
116
116
|
executables:
|