iguvium 0.8.4 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +12 -11
- data/exe/iguvium +1 -0
- data/lib/iguvium/cv.rb +7 -10
- data/lib/iguvium/row.rb +32 -0
- data/lib/iguvium/table.rb +67 -12
- data/lib/iguvium/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cf00782b6d5ec4d4e06d12ce386ff8ece314f8ba896bb70117232ec100ec5cdb
|
4
|
+
data.tar.gz: 544dbce4b3cfacd303f711e774284cd497b9b3dcdc8de0590b4825f84ff0e399
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b6340302943733d007a37e16206337c712646199ab9133699d328cb44c4a1ea6c1e4f65fb61c559467b018d1232ccd35c03e48b760f67645292b66dbc2640ed1
|
7
|
+
data.tar.gz: 0b9dcce49fc880a1fa7a7d0b512e3037512800dd4531caad230d788cb5297647190591fd27057e0216d92ed32465b9b60bbbc38eb34d68c6a324f5f06edb6d43
|
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,16 @@
|
|
2
2
|
|
3
3
|
## [Unreleased]
|
4
4
|
|
5
|
+
## [0.9.0] - 2018-12-07
|
6
|
+
### Added
|
7
|
+
- Open cells rendering added. Tables like this are now processed correctly:
|
8
|
+
```
|
9
|
+
__|____|_______|_____|
|
10
|
+
__|____|_______|_____|
|
11
|
+
__|____|_______|_____|
|
12
|
+
```
|
13
|
+
|
14
|
+
|
5
15
|
## [0.8.4] - 2018-11-24
|
6
16
|
### Changed
|
7
17
|
- Render phrases before cell assembly option of Iguvium::Table#to_a method is now true by default.
|
data/README.md
CHANGED
@@ -24,7 +24,8 @@ Get this table:
|
|
24
24
|
|
25
25
|
* Characters extraction is done by [PDF::Reader gem](https://github.com/yob/pdf-reader). Some PDFs are so messed up it can't extract meaningful text from them. If so, so does Iguvium.
|
26
26
|
|
27
|
-
* Current version extracts regular (with constant number of rows per column and vice versa)
|
27
|
+
* Current version extracts regular (with constant number of rows per column and vice versa)
|
28
|
+
tables with explicit lines formatting, like this:
|
28
29
|
|
29
30
|
```
|
30
31
|
.__________________.
|
@@ -32,7 +33,15 @@ Get this table:
|
|
32
33
|
|____|_______|_____|
|
33
34
|
|____|_______|_____|
|
34
35
|
```
|
35
|
-
|
36
|
+
And, after version 0.9.0, like this:
|
37
|
+
```
|
38
|
+
__|____|_______|_____|
|
39
|
+
__|____|_______|_____|
|
40
|
+
__|____|_______|_____|
|
41
|
+
```
|
42
|
+
|
43
|
+
|
44
|
+
Merged cells content is split as if cells were not merged unless you use `:phrases` option.
|
36
45
|
|
37
46
|
* Performance: considering the fact it has computer vision under the hood, the gem is reasonably fast. Full page extraction takes up to 1 second on modern CPUs and up to 2 seconds on the older ones.
|
38
47
|
|
@@ -107,15 +116,7 @@ Initially inspired by [camelot](https://github.com/socialcopsdev/camelot/) idea
|
|
107
116
|
|
108
117
|
## Roadmap
|
109
118
|
|
110
|
-
The next version will
|
111
|
-
|
112
|
-
```
|
113
|
-
__|____|_______|_____|
|
114
|
-
__|____|_______|_____|
|
115
|
-
__|____|_______|_____|
|
116
|
-
```
|
117
|
-
|
118
|
-
It also will keep open-edged rows metadata ('floorless' and 'roofless') for the needs of multipage tables merger.
|
119
|
+
The next version will keep open-edged rows metadata ('floorless' and 'roofless') for the needs of multipage tables merger.
|
119
120
|
|
120
121
|
The final one will recognize tables with merged cells.
|
121
122
|
|
data/exe/iguvium
CHANGED
data/lib/iguvium/cv.rb
CHANGED
@@ -39,11 +39,11 @@ module Iguvium
|
|
39
39
|
# Prepares image for recognition: initial blur
|
40
40
|
# @param image [ChunkyPNG::Image] from {Iguvium::Image.read}
|
41
41
|
def initialize(image)
|
42
|
-
@
|
42
|
+
@blurred = blur(image)
|
43
|
+
@image = to_narray(image).to_a
|
43
44
|
end
|
44
45
|
|
45
|
-
|
46
|
-
attr_reader :image
|
46
|
+
attr_reader :image, :blurred
|
47
47
|
|
48
48
|
# @return [Recognized]
|
49
49
|
# lines most probably forming table cells and tables' outer borders as boxes
|
@@ -62,9 +62,8 @@ module Iguvium
|
|
62
62
|
{
|
63
63
|
vertical: Labeler.new(verticals)
|
64
64
|
.lines
|
65
|
-
.map { |line| flip_line line }
|
66
|
-
|
67
|
-
horizontal: Labeler.new(horizontals).lines.map { |line| flip_line line }.sort_by { |_, y| [y] }
|
65
|
+
.map { |line| flip_line line },
|
66
|
+
horizontal: Labeler.new(horizontals).lines.map { |line| flip_line line }
|
68
67
|
}
|
69
68
|
end
|
70
69
|
|
@@ -80,14 +79,14 @@ module Iguvium
|
|
80
79
|
|
81
80
|
def verticals(threshold = 3)
|
82
81
|
Matrix
|
83
|
-
.rows(convolve(NArray[*horizontal_scan(
|
82
|
+
.rows(convolve(NArray[*horizontal_scan(blurred)], VERTICAL, 0).to_a)
|
84
83
|
.map { |pix| pix < threshold ? nil : pix }
|
85
84
|
.to_a
|
86
85
|
end
|
87
86
|
|
88
87
|
def horizontals(threshold = 3)
|
89
88
|
Matrix
|
90
|
-
.rows(convolve(NArray[*vertical_scan(
|
89
|
+
.rows(convolve(NArray[*vertical_scan(blurred)], HORIZONTAL, 0).to_a)
|
91
90
|
.map { |pix| pix < threshold ? nil : pix }
|
92
91
|
.to_a
|
93
92
|
end
|
@@ -186,8 +185,6 @@ module Iguvium
|
|
186
185
|
def box(coord_array)
|
187
186
|
ax, bx = coord_array.map(&:last).minmax
|
188
187
|
ay, by = coord_array.map(&:first).minmax
|
189
|
-
# additional pixels removed from the box definition
|
190
|
-
# [ax - 1..bx + 1, ay - 1..by + 1]
|
191
188
|
[ax..bx, flip_range(ay..by)]
|
192
189
|
end
|
193
190
|
end
|
data/lib/iguvium/row.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Iguvium
|
4
|
+
|
5
|
+
class Row
|
6
|
+
# gets characters limited by yrange and set of column ranges
|
7
|
+
def initialize(columns, characters, phrases: true)
|
8
|
+
@columns = columns
|
9
|
+
if phrases
|
10
|
+
characters =
|
11
|
+
characters
|
12
|
+
.sort
|
13
|
+
.chunk_while { |a, b| a.mergable?(b) }
|
14
|
+
.map { |chunk| chunk.inject(:+) }
|
15
|
+
end
|
16
|
+
@characters = characters
|
17
|
+
end
|
18
|
+
|
19
|
+
def cells
|
20
|
+
@columns.map { |range|
|
21
|
+
@characters.select { |character| range.cover?(character.x) }
|
22
|
+
}
|
23
|
+
end
|
24
|
+
|
25
|
+
# @return rendered row array
|
26
|
+
def render(newlines: false)
|
27
|
+
end
|
28
|
+
|
29
|
+
def merge(other)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
data/lib/iguvium/table.rb
CHANGED
@@ -16,6 +16,8 @@ module Iguvium
|
|
16
16
|
@box = box
|
17
17
|
@lines = page.lines
|
18
18
|
@page = page
|
19
|
+
grid
|
20
|
+
heal
|
19
21
|
end
|
20
22
|
|
21
23
|
# Renders the table into an array of strings.
|
@@ -30,24 +32,74 @@ module Iguvium
|
|
30
32
|
# @return [Array] 2D array of strings (content of table's cells)
|
31
33
|
#
|
32
34
|
def to_a(newlines: false, phrases: true)
|
33
|
-
|
35
|
+
@to_a ||=
|
36
|
+
grid[:rows]
|
34
37
|
.reverse
|
35
38
|
.map { |row|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
39
|
+
grid[:columns].map do |column|
|
40
|
+
render(
|
41
|
+
phrases ? words_inside(column, row) : chars_inside(column, row),
|
42
|
+
newlines: newlines
|
43
|
+
)
|
44
|
+
end
|
45
|
+
}
|
43
46
|
end
|
44
47
|
|
48
|
+
# def width
|
49
|
+
# grid[:columns].count
|
50
|
+
# end
|
51
|
+
|
52
|
+
# def mergeable?(other)
|
53
|
+
# width == other.width
|
54
|
+
# end
|
55
|
+
|
56
|
+
# def roofless?
|
57
|
+
# @roofless
|
58
|
+
# end
|
59
|
+
|
60
|
+
# def floorless?
|
61
|
+
# @floorless
|
62
|
+
# end
|
63
|
+
|
45
64
|
private
|
46
65
|
|
47
66
|
attr_reader :page, :lines, :box
|
48
67
|
|
49
|
-
|
50
|
-
|
68
|
+
# Looks if there are characters inside the box but outside of already detected cells
|
69
|
+
# and adds rows and/or columns if necessary.
|
70
|
+
# @return [Iguvium::Table] with added open-cell rows and columns
|
71
|
+
def heal
|
72
|
+
heal_rows
|
73
|
+
heal_cols
|
74
|
+
self
|
75
|
+
end
|
76
|
+
|
77
|
+
def wide_box
|
78
|
+
@wide_box ||= [
|
79
|
+
box.first.begin - 2..box.first.end + 2,
|
80
|
+
box.last.begin - 2..box.last.end + 2
|
81
|
+
]
|
82
|
+
end
|
83
|
+
|
84
|
+
def heal_cols
|
85
|
+
leftcol = box.first.begin..grid[:columns].first.begin
|
86
|
+
rightcol = grid[:columns].last.end..box.first.end
|
87
|
+
@grid[:columns].unshift(leftcol) if chars_inside(leftcol, box.last).any?
|
88
|
+
@grid[:columns].append(rightcol) if chars_inside(rightcol, box.last).any?
|
89
|
+
end
|
90
|
+
|
91
|
+
def heal_rows
|
92
|
+
# TODO: shrink box (like `box.last.end - 2`)
|
93
|
+
roofrow = box.last.begin..grid[:rows].first.begin
|
94
|
+
floorrow = grid[:rows].last.end..box.last.end
|
95
|
+
if chars_inside(box.first, roofrow).any?
|
96
|
+
@grid[:rows].unshift(roofrow)
|
97
|
+
@roofless = true
|
98
|
+
end
|
99
|
+
if chars_inside(box.first, floorrow).any?
|
100
|
+
@grid[:rows].append(floorrow)
|
101
|
+
@floorless = true
|
102
|
+
end
|
51
103
|
end
|
52
104
|
|
53
105
|
def characters
|
@@ -74,7 +126,9 @@ module Iguvium
|
|
74
126
|
end
|
75
127
|
|
76
128
|
def grid
|
77
|
-
@grid
|
129
|
+
return @grid if @grid
|
130
|
+
|
131
|
+
@grid =
|
78
132
|
{
|
79
133
|
rows: lines_to_ranges(lines[:horizontal]),
|
80
134
|
columns: lines_to_ranges(lines[:vertical])
|
@@ -82,7 +136,8 @@ module Iguvium
|
|
82
136
|
end
|
83
137
|
|
84
138
|
def lines_to_ranges(lines)
|
85
|
-
|
139
|
+
# TODO: extend box for the sake of lines select
|
140
|
+
lines.select { |line| line_in_box?(line, wide_box) }
|
86
141
|
.map { |line| line.first.is_a?(Numeric) ? line.first : line.last }
|
87
142
|
.sort
|
88
143
|
.uniq
|
data/lib/iguvium/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iguvium
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.4
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dima Ermilov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-12-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: pdf-reader
|
@@ -135,6 +135,7 @@ files:
|
|
135
135
|
- lib/iguvium/image.rb
|
136
136
|
- lib/iguvium/labeler.rb
|
137
137
|
- lib/iguvium/page.rb
|
138
|
+
- lib/iguvium/row.rb
|
138
139
|
- lib/iguvium/table.rb
|
139
140
|
- lib/iguvium/version.rb
|
140
141
|
homepage: https://github.com/adworse/iguvium
|