iguvium 0.9.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/iguvium.gemspec +3 -3
- data/lib/iguvium.rb +0 -7
- data/lib/iguvium/page.rb +4 -1
- data/lib/iguvium/table.rb +18 -23
- data/lib/iguvium/version.rb +1 -1
- metadata +14 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 59643f281def94149de22d6b858f1f90f4b4b05289ef4f3ef13f146e4621f6c9
|
4
|
+
data.tar.gz: 7f10445415eadcc746b8d21050443595ecf127c7f6856a8b2962f8d072825447
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7d3899e42a5bcaa6359554ff96d803f4a09acdacbb53855887bdedfda9245604cf24054a3588508d58222cf4c7783d031b36bff9c6e3e0cac2476c6552a54dc4
|
7
|
+
data.tar.gz: 8b6011b52aa2232597d1dea4ef087d36d4713879a0a6c76aba3849337f348329357499e8a5d8c3ec9ace4346e64a058d5b6fd415626b6553dfa2f29f6d516278
|
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
## [Unreleased]
|
4
4
|
|
5
|
+
## [0.9.1] - 2019-05-30
|
6
|
+
### Fixed
|
7
|
+
- Rare `undefined method 'begin' for nil:NilClass` error fixed
|
8
|
+
- Remove Iguvium::Table#to_a result caching
|
9
|
+
|
5
10
|
## [0.9.0] - 2018-12-07
|
6
11
|
### Added
|
7
12
|
- Open cells rendering added. Tables like this are now processed correctly:
|
data/iguvium.gemspec
CHANGED
@@ -11,8 +11,8 @@ Gem::Specification.new do |spec|
|
|
11
11
|
spec.email = ['dima@scriptangle.com']
|
12
12
|
|
13
13
|
spec.summary = 'Extract tables from PDF as a structured info'
|
14
|
-
spec.description =
|
15
|
-
then recognizes table separators optically. No OpenCV or other heavy dependencies
|
14
|
+
spec.description = "Extract tables from PDF as a structured info. Uses ghostscript to print pdf \
|
15
|
+
to image, then recognizes table separators optically. No OpenCV or other heavy dependencies"
|
16
16
|
spec.homepage = 'https://github.com/adworse/iguvium'
|
17
17
|
spec.license = 'MIT'
|
18
18
|
|
@@ -29,9 +29,9 @@ then recognizes table separators optically. No OpenCV or other heavy dependencie
|
|
29
29
|
|
30
30
|
spec.require_paths = ['lib']
|
31
31
|
|
32
|
-
spec.add_dependency 'pdf-reader', '~> 2.1'
|
33
32
|
spec.add_dependency 'convolver-light', '~> 0.3.1'
|
34
33
|
spec.add_dependency 'oily_png', '~> 1.2'
|
34
|
+
spec.add_dependency 'pdf-reader', '~> 2.1'
|
35
35
|
spec.add_dependency 'slop', '~> 4.2'
|
36
36
|
|
37
37
|
spec.add_development_dependency 'bundler', '~> 1.16'
|
data/lib/iguvium.rb
CHANGED
@@ -116,11 +116,4 @@ module Iguvium
|
|
116
116
|
end
|
117
117
|
end
|
118
118
|
|
119
|
-
# TODO: 4) Add options like maybe image thresholding
|
120
|
-
#
|
121
|
-
# TODO: 6) 0.9 - version capable of reading tables with open outer cells, like this:
|
122
|
-
# __|____|_______|_____|
|
123
|
-
# __|____|_______|_____|
|
124
|
-
# __|____|_______|_____|
|
125
|
-
#
|
126
119
|
# TODO: 7) 1.0 - in addition it should deal with merged cells (move result to the upper left cell).
|
data/lib/iguvium/page.rb
CHANGED
@@ -75,7 +75,10 @@ module Iguvium
|
|
75
75
|
recognized = CV.new(image).recognize
|
76
76
|
@lines = recognized[:lines]
|
77
77
|
@boxes = recognized[:boxes].reject { |box| box_empty?(box) }
|
78
|
-
@tables = @boxes
|
78
|
+
@tables = @boxes
|
79
|
+
.map { |box| Table.new(box, self) }
|
80
|
+
.reject { |table| table.grid[:rows].empty? || table.grid[:columns].empty? }
|
81
|
+
.reverse
|
79
82
|
self
|
80
83
|
end
|
81
84
|
|
data/lib/iguvium/table.rb
CHANGED
@@ -32,17 +32,16 @@ module Iguvium
|
|
32
32
|
# @return [Array] 2D array of strings (content of table's cells)
|
33
33
|
#
|
34
34
|
def to_a(newlines: false, phrases: true)
|
35
|
-
|
36
|
-
grid[:rows]
|
35
|
+
grid[:rows]
|
37
36
|
.reverse
|
38
37
|
.map { |row|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
38
|
+
grid[:columns].map do |column|
|
39
|
+
render(
|
40
|
+
phrases ? words_inside(column, row) : chars_inside(column, row),
|
41
|
+
newlines: newlines
|
42
|
+
)
|
43
|
+
end
|
44
|
+
}
|
46
45
|
end
|
47
46
|
|
48
47
|
# def width
|
@@ -61,6 +60,14 @@ module Iguvium
|
|
61
60
|
# @floorless
|
62
61
|
# end
|
63
62
|
|
63
|
+
def grid
|
64
|
+
@grid ||=
|
65
|
+
{
|
66
|
+
rows: lines_to_ranges(lines[:horizontal]),
|
67
|
+
columns: lines_to_ranges(lines[:vertical])
|
68
|
+
}
|
69
|
+
end
|
70
|
+
|
64
71
|
private
|
65
72
|
|
66
73
|
attr_reader :page, :lines, :box
|
@@ -69,8 +76,8 @@ module Iguvium
|
|
69
76
|
# and adds rows and/or columns if necessary.
|
70
77
|
# @return [Iguvium::Table] with added open-cell rows and columns
|
71
78
|
def heal
|
72
|
-
heal_rows
|
73
|
-
heal_cols
|
79
|
+
heal_rows unless grid[:rows].empty?
|
80
|
+
heal_cols unless grid[:columns].empty?
|
74
81
|
self
|
75
82
|
end
|
76
83
|
|
@@ -89,7 +96,6 @@ module Iguvium
|
|
89
96
|
end
|
90
97
|
|
91
98
|
def heal_rows
|
92
|
-
# TODO: shrink box (like `box.last.end - 2`)
|
93
99
|
roofrow = box.last.begin..grid[:rows].first.begin
|
94
100
|
floorrow = grid[:rows].last.end..box.last.end
|
95
101
|
if chars_inside(box.first, roofrow).any?
|
@@ -125,18 +131,7 @@ module Iguvium
|
|
125
131
|
}
|
126
132
|
end
|
127
133
|
|
128
|
-
def grid
|
129
|
-
return @grid if @grid
|
130
|
-
|
131
|
-
@grid =
|
132
|
-
{
|
133
|
-
rows: lines_to_ranges(lines[:horizontal]),
|
134
|
-
columns: lines_to_ranges(lines[:vertical])
|
135
|
-
}
|
136
|
-
end
|
137
|
-
|
138
134
|
def lines_to_ranges(lines)
|
139
|
-
# TODO: extend box for the sake of lines select
|
140
135
|
lines.select { |line| line_in_box?(line, wide_box) }
|
141
136
|
.map { |line| line.first.is_a?(Numeric) ? line.first : line.last }
|
142
137
|
.sort
|
data/lib/iguvium/version.rb
CHANGED
metadata
CHANGED
@@ -1,57 +1,57 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iguvium
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.0
|
4
|
+
version: 0.9.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dima Ermilov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-05-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: convolver-light
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 0.3.1
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 0.3.1
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: oily_png
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: '1.2'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: '1.2'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: pdf-reader
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '1.2'
|
47
|
+
version: '2.1'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '1.2'
|
54
|
+
version: '2.1'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: slop
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -108,9 +108,9 @@ dependencies:
|
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '3.0'
|
111
|
-
description:
|
112
|
-
|
113
|
-
|
111
|
+
description: Extract tables from PDF as a structured info. Uses ghostscript to print
|
112
|
+
pdf to image, then recognizes table separators optically. No OpenCV or other heavy
|
113
|
+
dependencies
|
114
114
|
email:
|
115
115
|
- dima@scriptangle.com
|
116
116
|
executables:
|