iguvium 0.9.1 → 0.9.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/CHANGELOG.md +5 -0
- data/README.md +2 -0
- data/iguvium.gemspec +3 -2
- data/lib/iguvium/image.rb +0 -1
- data/lib/iguvium/page.rb +1 -1
- data/lib/iguvium/version.rb +1 -1
- data/lib/iguvium.rb +1 -1
- metadata +25 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8d6b546079935086775c26461dd4ea774d9f0474a4dcae211c71d01544536479
|
4
|
+
data.tar.gz: 8f96f1e0ae92923a268c3eee0431264f6d4324ed7e067e620ec376c1b7325917
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cdca6bd0a04ab1ba897cd9888acebecc5ee689ffb5c814b0ecce674e96fcbefbb8035c84b6d9eabf3192411153b2aaa43425771e37cd6736ac9dbe52fe4dc364
|
7
|
+
data.tar.gz: 82637dd89b1d3015503f7fff7ae198525a8a0fd0b0050b70285a57e27865e374620a4cc78c93027d650d20840eb9377171271b34bb4b1fc9b44a893ba738822e
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
## [Unreleased]
|
4
4
|
|
5
|
+
## [0.9.2] - 2022-12-03
|
6
|
+
### Fixed
|
7
|
+
- Fixed deprecated use of keyword arguments in method calls for Ruby 3 compatibility
|
8
|
+
- Matrix gem needs to be an explicit dependency as of Ruby 3.1.
|
9
|
+
|
5
10
|
## [0.9.1] - 2019-05-30
|
6
11
|
### Fixed
|
7
12
|
- Rare `undefined method 'begin' for nil:NilClass` error fixed
|
data/README.md
CHANGED
@@ -110,6 +110,8 @@ just page text. The latter is useful in case of whitespace-separated fixed-width
|
|
110
110
|
There are usually no actual tables in PDFs, only characters with coordinates,
|
111
111
|
and some fancy lines. The human eye interprets this as a table. Iguvium behaves quite similarly: it prints the PDF to an image file with GhostScript, then analyses the image.
|
112
112
|
|
113
|
+
(Later clarification, as requested: Iguvium prints everything except text and images (the -dFILTERTEXT and -dFILTERIMAGE GhostScript parameters, which leave only lines, curves, etc.) in order to analyze the table structure. Text fields are extracted from the PDF's codepoints, if there are any. Doing otherwise would require a full-blown OCR solution, something like FineReader. So scanned, image-only PDFs are the worst possible match: nothing is actually printed, and there is no text to extract.)
|
114
|
+
|
113
115
|
Long enough continuous edges are interpreted as possible cell borders. Gaussian blur is applied beforehand to get rid of possible inconsistencies and style features.
|
114
116
|
|
115
117
|
Initially inspired by the [camelot](https://github.com/socialcopsdev/camelot/) idea of using image analysis to detect table structure. Apart from this idea, it is an independent work. Image recognition is written in Ruby; no OpenCV or other heavy computer vision libraries are used. The line detection algorithms are different too. The functionality of Camelot is significantly broader.
|
data/iguvium.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.name = 'iguvium'
|
9
9
|
spec.version = Iguvium::VERSION
|
10
10
|
spec.authors = ['Dima Ermilov']
|
11
|
-
spec.email = ['
|
11
|
+
spec.email = ['adworse@erlef.org']
|
12
12
|
|
13
13
|
spec.summary = 'Extract tables from PDF as a structured info'
|
14
14
|
spec.description = "Extract tables from PDF as a structured info. Uses ghostscript to print pdf \
|
@@ -33,8 +33,9 @@ to image, then recognizes table separators optically. No OpenCV or other heavy d
|
|
33
33
|
spec.add_dependency 'oily_png', '~> 1.2'
|
34
34
|
spec.add_dependency 'pdf-reader', '~> 2.1'
|
35
35
|
spec.add_dependency 'slop', '~> 4.2'
|
36
|
+
spec.add_dependency 'matrix'
|
36
37
|
|
37
|
-
spec.add_development_dependency 'bundler'
|
38
|
+
spec.add_development_dependency 'bundler'
|
38
39
|
spec.add_development_dependency 'rake', '~> 10.0'
|
39
40
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
40
41
|
end
|
data/lib/iguvium/image.rb
CHANGED
@@ -15,7 +15,6 @@ module Iguvium
|
|
15
15
|
#
|
16
16
|
# @return [ChunkyPNG::Image]
|
17
17
|
def self.read(path, pagenumber = 1, **opts)
|
18
|
-
puts path.shellescape
|
19
18
|
rgb = path.gsub(/\.pdf$/, '.rgb')
|
20
19
|
Iguvium.logger.info `#{opts[:gspath]} -dSAFER -dBATCH -dNOPAUSE -sDEVICE=pnggray -dGraphicsAlphaBits=4 \
|
21
20
|
-r72 -dFirstPage=#{pagenumber} -dLastPage=#{pagenumber} \
|
data/lib/iguvium/page.rb
CHANGED
@@ -71,7 +71,7 @@ module Iguvium
|
|
71
71
|
private
|
72
72
|
|
73
73
|
def recognize!
|
74
|
-
image = Image.read(@path, @reader_page.number,
|
74
|
+
image = Image.read(@path, @reader_page.number, **@opts)
|
75
75
|
recognized = CV.new(image).recognize
|
76
76
|
@lines = recognized[:lines]
|
77
77
|
@boxes = recognized[:boxes].reject { |box| box_empty?(box) }
|
data/lib/iguvium/version.rb
CHANGED
data/lib/iguvium.rb
CHANGED
@@ -73,7 +73,7 @@ module Iguvium
|
|
73
73
|
opts[:gspath] ||= gs_nix?
|
74
74
|
end
|
75
75
|
|
76
|
-
PDF::Reader.new(path, opts).pages.map { |page| Page.new(page, path, opts) }
|
76
|
+
PDF::Reader.new(path, opts).pages.map { |page| Page.new(page, path, **opts) }
|
77
77
|
end
|
78
78
|
|
79
79
|
# Creates and gives access to Ruby Logger. Default [Logger::Level] is Logger::ERROR.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iguvium
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.
|
4
|
+
version: 0.9.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dima Ermilov
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-12-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: convolver-light
|
@@ -66,20 +66,34 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '4.2'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: matrix
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: bundler
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
72
86
|
requirements:
|
73
|
-
- - "
|
87
|
+
- - ">="
|
74
88
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
89
|
+
version: '0'
|
76
90
|
type: :development
|
77
91
|
prerelease: false
|
78
92
|
version_requirements: !ruby/object:Gem::Requirement
|
79
93
|
requirements:
|
80
|
-
- - "
|
94
|
+
- - ">="
|
81
95
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
96
|
+
version: '0'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: rake
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -112,7 +126,7 @@ description: Extract tables from PDF as a structured info. Uses ghostscript to p
|
|
112
126
|
pdf to image, then recognizes table separators optically. No OpenCV or other heavy
|
113
127
|
dependencies
|
114
128
|
email:
|
115
|
-
-
|
129
|
+
- adworse@erlef.org
|
116
130
|
executables:
|
117
131
|
- iguvium
|
118
132
|
extensions: []
|
@@ -142,7 +156,7 @@ homepage: https://github.com/adworse/iguvium
|
|
142
156
|
licenses:
|
143
157
|
- MIT
|
144
158
|
metadata: {}
|
145
|
-
post_install_message:
|
159
|
+
post_install_message:
|
146
160
|
rdoc_options: []
|
147
161
|
require_paths:
|
148
162
|
- lib
|
@@ -157,9 +171,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
157
171
|
- !ruby/object:Gem::Version
|
158
172
|
version: '0'
|
159
173
|
requirements: []
|
160
|
-
|
161
|
-
|
162
|
-
signing_key:
|
174
|
+
rubygems_version: 3.0.6
|
175
|
+
signing_key:
|
163
176
|
specification_version: 4
|
164
177
|
summary: Extract tables from PDF as a structured info
|
165
178
|
test_files: []
|