iguvium 0.8.4 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +12 -11
- data/exe/iguvium +1 -0
- data/lib/iguvium/cv.rb +7 -10
- data/lib/iguvium/row.rb +32 -0
- data/lib/iguvium/table.rb +67 -12
- data/lib/iguvium/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cf00782b6d5ec4d4e06d12ce386ff8ece314f8ba896bb70117232ec100ec5cdb
|
4
|
+
data.tar.gz: 544dbce4b3cfacd303f711e774284cd497b9b3dcdc8de0590b4825f84ff0e399
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b6340302943733d007a37e16206337c712646199ab9133699d328cb44c4a1ea6c1e4f65fb61c559467b018d1232ccd35c03e48b760f67645292b66dbc2640ed1
|
7
|
+
data.tar.gz: 0b9dcce49fc880a1fa7a7d0b512e3037512800dd4531caad230d788cb5297647190591fd27057e0216d92ed32465b9b60bbbc38eb34d68c6a324f5f06edb6d43
|
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,16 @@
|
|
2
2
|
|
3
3
|
## [Unreleased]
|
4
4
|
|
5
|
+
## [0.9.0] - 2018-12-07
|
6
|
+
### Added
|
7
|
+
- Open cells rendering added. Tables like this are now processed correctly:
|
8
|
+
```
|
9
|
+
__|____|_______|_____|
|
10
|
+
__|____|_______|_____|
|
11
|
+
__|____|_______|_____|
|
12
|
+
```
|
13
|
+
|
14
|
+
|
5
15
|
## [0.8.4] - 2018-11-24
|
6
16
|
### Changed
|
7
17
|
- The `phrases` option of `Iguvium::Table#to_a` (render phrases before cell assembly) is now true by default.
|
data/README.md
CHANGED
@@ -24,7 +24,8 @@ Get this table:
|
|
24
24
|
|
25
25
|
* Character extraction is done by the [PDF::Reader gem](https://github.com/yob/pdf-reader). Some PDFs are so messed up that it can't extract meaningful text from them. If so, neither can Iguvium.
|
26
26
|
|
27
|
-
* Current version extracts regular (with constant number of rows per column and vice versa)
|
27
|
+
* Current version extracts regular (with constant number of rows per column and vice versa)
|
28
|
+
tables with explicit lines formatting, like this:
|
28
29
|
|
29
30
|
```
|
30
31
|
.__________________.
|
@@ -32,7 +33,15 @@ Get this table:
|
|
32
33
|
|____|_______|_____|
|
33
34
|
|____|_______|_____|
|
34
35
|
```
|
35
|
-
|
36
|
+
And, after version 0.9.0, like this:
|
37
|
+
```
|
38
|
+
__|____|_______|_____|
|
39
|
+
__|____|_______|_____|
|
40
|
+
__|____|_______|_____|
|
41
|
+
```
|
42
|
+
|
43
|
+
|
44
|
+
Merged cells' content is split as if the cells were not merged unless you use the `:phrases` option.
|
36
45
|
|
37
46
|
* Performance: considering the fact it has computer vision under the hood, the gem is reasonably fast. Full page extraction takes up to 1 second on modern CPUs and up to 2 seconds on the older ones.
|
38
47
|
|
@@ -107,15 +116,7 @@ Initially inspired by [camelot](https://github.com/socialcopsdev/camelot/) idea
|
|
107
116
|
|
108
117
|
## Roadmap
|
109
118
|
|
110
|
-
The next version will
|
111
|
-
|
112
|
-
```
|
113
|
-
__|____|_______|_____|
|
114
|
-
__|____|_______|_____|
|
115
|
-
__|____|_______|_____|
|
116
|
-
```
|
117
|
-
|
118
|
-
It also will keep open-edged rows metadata ('floorless' and 'roofless') for the needs of multipage tables merger.
|
119
|
+
The next version will keep open-edged rows metadata ('floorless' and 'roofless') for the needs of multipage tables merger.
|
119
120
|
|
120
121
|
The final one will recognize tables with merged cells.
|
121
122
|
|
data/exe/iguvium
CHANGED
data/lib/iguvium/cv.rb
CHANGED
@@ -39,11 +39,11 @@ module Iguvium
|
|
39
39
|
# Prepares image for recognition: initial blur
|
40
40
|
# @param image [ChunkyPNG::Image] from {Iguvium::Image.read}
|
41
41
|
def initialize(image)
|
42
|
-
@
|
42
|
+
@blurred = blur(image)
|
43
|
+
@image = to_narray(image).to_a
|
43
44
|
end
|
44
45
|
|
45
|
-
|
46
|
-
attr_reader :image
|
46
|
+
attr_reader :image, :blurred
|
47
47
|
|
48
48
|
# @return [Recognized]
|
49
49
|
# lines most probably forming table cells and tables' outer borders as boxes
|
@@ -62,9 +62,8 @@ module Iguvium
|
|
62
62
|
{
|
63
63
|
vertical: Labeler.new(verticals)
|
64
64
|
.lines
|
65
|
-
.map { |line| flip_line line }
|
66
|
-
|
67
|
-
horizontal: Labeler.new(horizontals).lines.map { |line| flip_line line }.sort_by { |_, y| [y] }
|
65
|
+
.map { |line| flip_line line },
|
66
|
+
horizontal: Labeler.new(horizontals).lines.map { |line| flip_line line }
|
68
67
|
}
|
69
68
|
end
|
70
69
|
|
@@ -80,14 +79,14 @@ module Iguvium
|
|
80
79
|
|
81
80
|
def verticals(threshold = 3)
|
82
81
|
Matrix
|
83
|
-
.rows(convolve(NArray[*horizontal_scan(
|
82
|
+
.rows(convolve(NArray[*horizontal_scan(blurred)], VERTICAL, 0).to_a)
|
84
83
|
.map { |pix| pix < threshold ? nil : pix }
|
85
84
|
.to_a
|
86
85
|
end
|
87
86
|
|
88
87
|
def horizontals(threshold = 3)
|
89
88
|
Matrix
|
90
|
-
.rows(convolve(NArray[*vertical_scan(
|
89
|
+
.rows(convolve(NArray[*vertical_scan(blurred)], HORIZONTAL, 0).to_a)
|
91
90
|
.map { |pix| pix < threshold ? nil : pix }
|
92
91
|
.to_a
|
93
92
|
end
|
@@ -186,8 +185,6 @@ module Iguvium
|
|
186
185
|
def box(coord_array)
|
187
186
|
ax, bx = coord_array.map(&:last).minmax
|
188
187
|
ay, by = coord_array.map(&:first).minmax
|
189
|
-
# additional pixels removed from the box definition
|
190
|
-
# [ax - 1..bx + 1, ay - 1..by + 1]
|
191
188
|
[ax..bx, flip_range(ay..by)]
|
192
189
|
end
|
193
190
|
end
|
data/lib/iguvium/row.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Iguvium
|
4
|
+
|
5
|
+
class Row
|
6
|
+
# gets characters limited by yrange and set of column ranges
|
7
|
+
def initialize(columns, characters, phrases: true)
|
8
|
+
@columns = columns
|
9
|
+
if phrases
|
10
|
+
characters =
|
11
|
+
characters
|
12
|
+
.sort
|
13
|
+
.chunk_while { |a, b| a.mergable?(b) }
|
14
|
+
.map { |chunk| chunk.inject(:+) }
|
15
|
+
end
|
16
|
+
@characters = characters
|
17
|
+
end
|
18
|
+
|
19
|
+
def cells
|
20
|
+
@columns.map { |range|
|
21
|
+
@characters.select { |character| range.cover?(character.x) }
|
22
|
+
}
|
23
|
+
end
|
24
|
+
|
25
|
+
# @return rendered row array
|
26
|
+
def render(newlines: false)
|
27
|
+
end
|
28
|
+
|
29
|
+
def merge(other)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
data/lib/iguvium/table.rb
CHANGED
@@ -16,6 +16,8 @@ module Iguvium
|
|
16
16
|
@box = box
|
17
17
|
@lines = page.lines
|
18
18
|
@page = page
|
19
|
+
grid
|
20
|
+
heal
|
19
21
|
end
|
20
22
|
|
21
23
|
# Renders the table into an array of strings.
|
@@ -30,24 +32,74 @@ module Iguvium
|
|
30
32
|
# @return [Array] 2D array of strings (content of table's cells)
|
31
33
|
#
|
32
34
|
def to_a(newlines: false, phrases: true)
|
33
|
-
|
35
|
+
@to_a ||=
|
36
|
+
grid[:rows]
|
34
37
|
.reverse
|
35
38
|
.map { |row|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
39
|
+
grid[:columns].map do |column|
|
40
|
+
render(
|
41
|
+
phrases ? words_inside(column, row) : chars_inside(column, row),
|
42
|
+
newlines: newlines
|
43
|
+
)
|
44
|
+
end
|
45
|
+
}
|
43
46
|
end
|
44
47
|
|
48
|
+
# def width
|
49
|
+
# grid[:columns].count
|
50
|
+
# end
|
51
|
+
|
52
|
+
# def mergeable?(other)
|
53
|
+
# width == other.width
|
54
|
+
# end
|
55
|
+
|
56
|
+
# def roofless?
|
57
|
+
# @roofless
|
58
|
+
# end
|
59
|
+
|
60
|
+
# def floorless?
|
61
|
+
# @floorless
|
62
|
+
# end
|
63
|
+
|
45
64
|
private
|
46
65
|
|
47
66
|
attr_reader :page, :lines, :box
|
48
67
|
|
49
|
-
|
50
|
-
|
68
|
+
# Looks if there are characters inside the box but outside of already detected cells
|
69
|
+
# and adds rows and/or columns if necessary.
|
70
|
+
# @return [Iguvium::Table] with added open-cell rows and columns
|
71
|
+
def heal
|
72
|
+
heal_rows
|
73
|
+
heal_cols
|
74
|
+
self
|
75
|
+
end
|
76
|
+
|
77
|
+
def wide_box
|
78
|
+
@wide_box ||= [
|
79
|
+
box.first.begin - 2..box.first.end + 2,
|
80
|
+
box.last.begin - 2..box.last.end + 2
|
81
|
+
]
|
82
|
+
end
|
83
|
+
|
84
|
+
def heal_cols
|
85
|
+
leftcol = box.first.begin..grid[:columns].first.begin
|
86
|
+
rightcol = grid[:columns].last.end..box.first.end
|
87
|
+
@grid[:columns].unshift(leftcol) if chars_inside(leftcol, box.last).any?
|
88
|
+
@grid[:columns].append(rightcol) if chars_inside(rightcol, box.last).any?
|
89
|
+
end
|
90
|
+
|
91
|
+
def heal_rows
|
92
|
+
# TODO: shrink box (like `box.last.end - 2`)
|
93
|
+
roofrow = box.last.begin..grid[:rows].first.begin
|
94
|
+
floorrow = grid[:rows].last.end..box.last.end
|
95
|
+
if chars_inside(box.first, roofrow).any?
|
96
|
+
@grid[:rows].unshift(roofrow)
|
97
|
+
@roofless = true
|
98
|
+
end
|
99
|
+
if chars_inside(box.first, floorrow).any?
|
100
|
+
@grid[:rows].append(floorrow)
|
101
|
+
@floorless = true
|
102
|
+
end
|
51
103
|
end
|
52
104
|
|
53
105
|
def characters
|
@@ -74,7 +126,9 @@ module Iguvium
|
|
74
126
|
end
|
75
127
|
|
76
128
|
def grid
|
77
|
-
@grid
|
129
|
+
return @grid if @grid
|
130
|
+
|
131
|
+
@grid =
|
78
132
|
{
|
79
133
|
rows: lines_to_ranges(lines[:horizontal]),
|
80
134
|
columns: lines_to_ranges(lines[:vertical])
|
@@ -82,7 +136,8 @@ module Iguvium
|
|
82
136
|
end
|
83
137
|
|
84
138
|
def lines_to_ranges(lines)
|
85
|
-
|
139
|
+
# TODO: extend box for the sake of lines select
|
140
|
+
lines.select { |line| line_in_box?(line, wide_box) }
|
86
141
|
.map { |line| line.first.is_a?(Numeric) ? line.first : line.last }
|
87
142
|
.sort
|
88
143
|
.uniq
|
data/lib/iguvium/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iguvium
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.4
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dima Ermilov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-12-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: pdf-reader
|
@@ -135,6 +135,7 @@ files:
|
|
135
135
|
- lib/iguvium/image.rb
|
136
136
|
- lib/iguvium/labeler.rb
|
137
137
|
- lib/iguvium/page.rb
|
138
|
+
- lib/iguvium/row.rb
|
138
139
|
- lib/iguvium/table.rb
|
139
140
|
- lib/iguvium/version.rb
|
140
141
|
homepage: https://github.com/adworse/iguvium
|