iguvium 0.9.0 → 0.9.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/CHANGELOG.md +10 -0
- data/README.md +2 -0
- data/iguvium.gemspec +6 -5
- data/lib/iguvium/image.rb +0 -1
- data/lib/iguvium/page.rb +5 -2
- data/lib/iguvium/table.rb +18 -23
- data/lib/iguvium/version.rb +1 -1
- data/lib/iguvium.rb +1 -8
- metadata +37 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8d6b546079935086775c26461dd4ea774d9f0474a4dcae211c71d01544536479
|
4
|
+
data.tar.gz: 8f96f1e0ae92923a268c3eee0431264f6d4324ed7e067e620ec376c1b7325917
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cdca6bd0a04ab1ba897cd9888acebecc5ee689ffb5c814b0ecce674e96fcbefbb8035c84b6d9eabf3192411153b2aaa43425771e37cd6736ac9dbe52fe4dc364
|
7
|
+
data.tar.gz: 82637dd89b1d3015503f7fff7ae198525a8a0fd0b0050b70285a57e27865e374620a4cc78c93027d650d20840eb9377171271b34bb4b1fc9b44a893ba738822e
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,16 @@
|
|
2
2
|
|
3
3
|
## [Unreleased]
|
4
4
|
|
5
|
+
## [0.9.2] - 2019-05-30
|
6
|
+
### Fixed
|
7
|
+
- Resolved deprecated use of keyword arguments in method calls under Ruby 3
|
8
|
+
- Matrix gem needs to be an explicit dependency as of Ruby 3.1.
|
9
|
+
|
10
|
+
## [0.9.1] - 2019-05-30
|
11
|
+
### Fixed
|
12
|
+
- Rare `undefined method 'begin' for nil:NilClass` error fixed
|
13
|
+
- Remove Iguvium::Table#to_a result caching
|
14
|
+
|
5
15
|
## [0.9.0] - 2018-12-07
|
6
16
|
### Added
|
7
17
|
- Open cells rendering added. Tables like this are now processed correctly:
|
data/README.md
CHANGED
@@ -110,6 +110,8 @@ just page text. The latter is useful in case of whitespace-separated fixed-width
|
|
110
110
|
There are usually no actual tables in PDFs, only characters with coordinates,
|
111
111
|
and some fancy lines. The human eye interprets this as a table. Iguvium behaves quite similarly: it prints the PDF to an image file with GhostScript, then analyses the image.
|
112
112
|
|
113
|
+
(Later clarification, as requested: to analyze table structure, it prints everything except text and images (the -dFILTERTEXT and -dFILTERIMAGE params of GhostScript, which leave lines, curves, etc.). Text fields are extracted from PDF codepoints, if there are any. Doing otherwise would require a full-blown OCR solution, something like FineReader. So scanned image-only PDFs are a perfect mismatch: nothing is actually printed and there is no text to extract.)
|
114
|
+
|
113
115
|
Long enough continuous edges are interpreted as possible cell borders. Gaussian blur is applied beforehand to get rid of possible inconsistencies and style features.
|
114
116
|
|
115
117
|
Initially inspired by the [camelot](https://github.com/socialcopsdev/camelot/) idea of using image analysis to detect table structure. Apart from this idea, it is an independent work. Image recognition is written in Ruby; no OpenCV or other heavy computer vision libraries are used. The line detection algorithms are different too. The functionality of Camelot is significantly broader.
|
data/iguvium.gemspec
CHANGED
@@ -8,11 +8,11 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.name = 'iguvium'
|
9
9
|
spec.version = Iguvium::VERSION
|
10
10
|
spec.authors = ['Dima Ermilov']
|
11
|
-
spec.email = ['
|
11
|
+
spec.email = ['adworse@erlef.org']
|
12
12
|
|
13
13
|
spec.summary = 'Extract tables from PDF as a structured info'
|
14
|
-
spec.description =
|
15
|
-
then recognizes table separators optically. No OpenCV or other heavy dependencies
|
14
|
+
spec.description = "Extract tables from PDF as a structured info. Uses ghostscript to print pdf \
|
15
|
+
to image, then recognizes table separators optically. No OpenCV or other heavy dependencies"
|
16
16
|
spec.homepage = 'https://github.com/adworse/iguvium'
|
17
17
|
spec.license = 'MIT'
|
18
18
|
|
@@ -29,12 +29,13 @@ then recognizes table separators optically. No OpenCV or other heavy dependencie
|
|
29
29
|
|
30
30
|
spec.require_paths = ['lib']
|
31
31
|
|
32
|
-
spec.add_dependency 'pdf-reader', '~> 2.1'
|
33
32
|
spec.add_dependency 'convolver-light', '~> 0.3.1'
|
34
33
|
spec.add_dependency 'oily_png', '~> 1.2'
|
34
|
+
spec.add_dependency 'pdf-reader', '~> 2.1'
|
35
35
|
spec.add_dependency 'slop', '~> 4.2'
|
36
|
+
spec.add_dependency 'matrix'
|
36
37
|
|
37
|
-
spec.add_development_dependency 'bundler'
|
38
|
+
spec.add_development_dependency 'bundler'
|
38
39
|
spec.add_development_dependency 'rake', '~> 10.0'
|
39
40
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
40
41
|
end
|
data/lib/iguvium/image.rb
CHANGED
@@ -15,7 +15,6 @@ module Iguvium
|
|
15
15
|
#
|
16
16
|
# @return [ChunkyPNG::Image]
|
17
17
|
def self.read(path, pagenumber = 1, **opts)
|
18
|
-
puts path.shellescape
|
19
18
|
rgb = path.gsub(/\.pdf$/, '.rgb')
|
20
19
|
Iguvium.logger.info `#{opts[:gspath]} -dSAFER -dBATCH -dNOPAUSE -sDEVICE=pnggray -dGraphicsAlphaBits=4 \
|
21
20
|
-r72 -dFirstPage=#{pagenumber} -dLastPage=#{pagenumber} \
|
data/lib/iguvium/page.rb
CHANGED
@@ -71,11 +71,14 @@ module Iguvium
|
|
71
71
|
private
|
72
72
|
|
73
73
|
def recognize!
|
74
|
-
image = Image.read(@path, @reader_page.number,
|
74
|
+
image = Image.read(@path, @reader_page.number, **@opts)
|
75
75
|
recognized = CV.new(image).recognize
|
76
76
|
@lines = recognized[:lines]
|
77
77
|
@boxes = recognized[:boxes].reject { |box| box_empty?(box) }
|
78
|
-
@tables = @boxes
|
78
|
+
@tables = @boxes
|
79
|
+
.map { |box| Table.new(box, self) }
|
80
|
+
.reject { |table| table.grid[:rows].empty? || table.grid[:columns].empty? }
|
81
|
+
.reverse
|
79
82
|
self
|
80
83
|
end
|
81
84
|
|
data/lib/iguvium/table.rb
CHANGED
@@ -32,17 +32,16 @@ module Iguvium
|
|
32
32
|
# @return [Array] 2D array of strings (content of table's cells)
|
33
33
|
#
|
34
34
|
def to_a(newlines: false, phrases: true)
|
35
|
-
|
36
|
-
grid[:rows]
|
35
|
+
grid[:rows]
|
37
36
|
.reverse
|
38
37
|
.map { |row|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
38
|
+
grid[:columns].map do |column|
|
39
|
+
render(
|
40
|
+
phrases ? words_inside(column, row) : chars_inside(column, row),
|
41
|
+
newlines: newlines
|
42
|
+
)
|
43
|
+
end
|
44
|
+
}
|
46
45
|
end
|
47
46
|
|
48
47
|
# def width
|
@@ -61,6 +60,14 @@ module Iguvium
|
|
61
60
|
# @floorless
|
62
61
|
# end
|
63
62
|
|
63
|
+
def grid
|
64
|
+
@grid ||=
|
65
|
+
{
|
66
|
+
rows: lines_to_ranges(lines[:horizontal]),
|
67
|
+
columns: lines_to_ranges(lines[:vertical])
|
68
|
+
}
|
69
|
+
end
|
70
|
+
|
64
71
|
private
|
65
72
|
|
66
73
|
attr_reader :page, :lines, :box
|
@@ -69,8 +76,8 @@ module Iguvium
|
|
69
76
|
# and adds rows and/or columns if necessary.
|
70
77
|
# @return [Iguvium::Table] with added open-cell rows and columns
|
71
78
|
def heal
|
72
|
-
heal_rows
|
73
|
-
heal_cols
|
79
|
+
heal_rows unless grid[:rows].empty?
|
80
|
+
heal_cols unless grid[:columns].empty?
|
74
81
|
self
|
75
82
|
end
|
76
83
|
|
@@ -89,7 +96,6 @@ module Iguvium
|
|
89
96
|
end
|
90
97
|
|
91
98
|
def heal_rows
|
92
|
-
# TODO: shrink box (like `box.last.end - 2`)
|
93
99
|
roofrow = box.last.begin..grid[:rows].first.begin
|
94
100
|
floorrow = grid[:rows].last.end..box.last.end
|
95
101
|
if chars_inside(box.first, roofrow).any?
|
@@ -125,18 +131,7 @@ module Iguvium
|
|
125
131
|
}
|
126
132
|
end
|
127
133
|
|
128
|
-
def grid
|
129
|
-
return @grid if @grid
|
130
|
-
|
131
|
-
@grid =
|
132
|
-
{
|
133
|
-
rows: lines_to_ranges(lines[:horizontal]),
|
134
|
-
columns: lines_to_ranges(lines[:vertical])
|
135
|
-
}
|
136
|
-
end
|
137
|
-
|
138
134
|
def lines_to_ranges(lines)
|
139
|
-
# TODO: extend box for the sake of lines select
|
140
135
|
lines.select { |line| line_in_box?(line, wide_box) }
|
141
136
|
.map { |line| line.first.is_a?(Numeric) ? line.first : line.last }
|
142
137
|
.sort
|
data/lib/iguvium/version.rb
CHANGED
data/lib/iguvium.rb
CHANGED
@@ -73,7 +73,7 @@ module Iguvium
|
|
73
73
|
opts[:gspath] ||= gs_nix?
|
74
74
|
end
|
75
75
|
|
76
|
-
PDF::Reader.new(path, opts).pages.map { |page| Page.new(page, path, opts) }
|
76
|
+
PDF::Reader.new(path, opts).pages.map { |page| Page.new(page, path, **opts) }
|
77
77
|
end
|
78
78
|
|
79
79
|
# Creates and gives access to Ruby Logger. Default [Logger::Level] is Logger::ERROR.
|
@@ -116,11 +116,4 @@ module Iguvium
|
|
116
116
|
end
|
117
117
|
end
|
118
118
|
|
119
|
-
# TODO: 4) Add options like maybe image thresholding
|
120
|
-
#
|
121
|
-
# TODO: 6) 0.9 - version capable of reading tables with open outer cells, like this:
|
122
|
-
# __|____|_______|_____|
|
123
|
-
# __|____|_______|_____|
|
124
|
-
# __|____|_______|_____|
|
125
|
-
#
|
126
119
|
# TODO: 7) 1.0 - in addition it should deal with merged cells (move result to the upper left cell).
|
metadata
CHANGED
@@ -1,57 +1,57 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iguvium
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.
|
4
|
+
version: 0.9.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dima Ermilov
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-12-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: convolver-light
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 0.3.1
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 0.3.1
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: oily_png
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: '1.2'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: '1.2'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: pdf-reader
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '1
|
47
|
+
version: '2.1'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '1
|
54
|
+
version: '2.1'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: slop
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -66,20 +66,34 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '4.2'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: matrix
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: bundler
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
72
86
|
requirements:
|
73
|
-
- - "
|
87
|
+
- - ">="
|
74
88
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
89
|
+
version: '0'
|
76
90
|
type: :development
|
77
91
|
prerelease: false
|
78
92
|
version_requirements: !ruby/object:Gem::Requirement
|
79
93
|
requirements:
|
80
|
-
- - "
|
94
|
+
- - ">="
|
81
95
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
96
|
+
version: '0'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: rake
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -108,11 +122,11 @@ dependencies:
|
|
108
122
|
- - "~>"
|
109
123
|
- !ruby/object:Gem::Version
|
110
124
|
version: '3.0'
|
111
|
-
description:
|
112
|
-
|
113
|
-
|
125
|
+
description: Extract tables from PDF as a structured info. Uses ghostscript to print
|
126
|
+
pdf to image, then recognizes table separators optically. No OpenCV or other heavy
|
127
|
+
dependencies
|
114
128
|
email:
|
115
|
-
-
|
129
|
+
- adworse@erlef.org
|
116
130
|
executables:
|
117
131
|
- iguvium
|
118
132
|
extensions: []
|
@@ -142,7 +156,7 @@ homepage: https://github.com/adworse/iguvium
|
|
142
156
|
licenses:
|
143
157
|
- MIT
|
144
158
|
metadata: {}
|
145
|
-
post_install_message:
|
159
|
+
post_install_message:
|
146
160
|
rdoc_options: []
|
147
161
|
require_paths:
|
148
162
|
- lib
|
@@ -157,9 +171,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
157
171
|
- !ruby/object:Gem::Version
|
158
172
|
version: '0'
|
159
173
|
requirements: []
|
160
|
-
|
161
|
-
|
162
|
-
signing_key:
|
174
|
+
rubygems_version: 3.0.6
|
175
|
+
signing_key:
|
163
176
|
specification_version: 4
|
164
177
|
summary: Extract tables from PDF as a structured info
|
165
178
|
test_files: []
|