iguvium 0.9.1 → 0.9.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 59643f281def94149de22d6b858f1f90f4b4b05289ef4f3ef13f146e4621f6c9
4
- data.tar.gz: 7f10445415eadcc746b8d21050443595ecf127c7f6856a8b2962f8d072825447
3
+ metadata.gz: 8d6b546079935086775c26461dd4ea774d9f0474a4dcae211c71d01544536479
4
+ data.tar.gz: 8f96f1e0ae92923a268c3eee0431264f6d4324ed7e067e620ec376c1b7325917
5
5
  SHA512:
6
- metadata.gz: 7d3899e42a5bcaa6359554ff96d803f4a09acdacbb53855887bdedfda9245604cf24054a3588508d58222cf4c7783d031b36bff9c6e3e0cac2476c6552a54dc4
7
- data.tar.gz: 8b6011b52aa2232597d1dea4ef087d36d4713879a0a6c76aba3849337f348329357499e8a5d8c3ec9ace4346e64a058d5b6fd415626b6553dfa2f29f6d516278
6
+ metadata.gz: cdca6bd0a04ab1ba897cd9888acebecc5ee689ffb5c814b0ecce674e96fcbefbb8035c84b6d9eabf3192411153b2aaa43425771e37cd6736ac9dbe52fe4dc364
7
+ data.tar.gz: 82637dd89b1d3015503f7fff7ae198525a8a0fd0b0050b70285a57e27865e374620a4cc78c93027d650d20840eb9377171271b34bb4b1fc9b44a893ba738822e
data/.travis.yml CHANGED
@@ -4,4 +4,5 @@ rvm:
4
4
  - 2.5.1
5
5
  - 2.4.5
6
6
  - 2.3.8
7
+ - 3.1.3
7
8
  before_install: gem install bundler && sudo apt-get install ghostscript
data/CHANGELOG.md CHANGED
@@ -2,6 +2,11 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [0.9.2] - 2022-12-03
6
+ ### Fixed
7
+ - Deprecated passing of an options hash as keyword arguments in method calls, which breaks under Ruby 3
8
+ - Matrix gem needs to be an explicit dependency as of Ruby 3.1.
9
+
5
10
  ## [0.9.1] - 2019-05-30
6
11
  ### Fixed
7
12
  - Rare `undefined method 'begin' for nil:NilClass` error fixed
data/README.md CHANGED
@@ -110,6 +110,8 @@ just page text. The latter is useful in case of whitespace-separated fixed-width
110
110
  There are usually no actual tables in PDFs, only characters with coordinates,
111
111
  and some fancy lines. The human eye interprets this as a table. Iguvium behaves quite similarly: it prints the PDF to an image file with GhostScript, then analyses the image.
112
112
 
113
+ (Clarification added by request: Iguvium prints everything except text and images (using the -dFILTERTEXT and -dFILTERIMAGE parameters of GhostScript, which leave only lines, curves, etc.) in order to analyze the table structure. Text fields are extracted from the PDF's codepoints, if there are any. Doing otherwise would require a full-blown OCR solution, something like FineReader. Scanned, image-only PDFs are therefore a perfect mismatch: nothing is actually printed and there is no text to extract.)
114
+
113
115
  Long enough continuous edges are interpreted as possible cell borders. Gaussian blur is applied beforehand to get rid of possible inconsistencies and style features.
114
116
 
115
117
  Initially inspired by the [camelot](https://github.com/socialcopsdev/camelot/) idea of using image analysis to detect table structure. Apart from this idea, Iguvium is an independent work. Image recognition is written in Ruby; no OpenCV or other heavy computer vision libraries are used. The line detection algorithms are different too. The functionality of Camelot is significantly broader.
data/iguvium.gemspec CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |spec|
8
8
  spec.name = 'iguvium'
9
9
  spec.version = Iguvium::VERSION
10
10
  spec.authors = ['Dima Ermilov']
11
- spec.email = ['dima@scriptangle.com']
11
+ spec.email = ['adworse@erlef.org']
12
12
 
13
13
  spec.summary = 'Extract tables from PDF as a structured info'
14
14
  spec.description = "Extract tables from PDF as a structured info. Uses ghostscript to print pdf \
@@ -33,8 +33,9 @@ to image, then recognizes table separators optically. No OpenCV or other heavy d
33
33
  spec.add_dependency 'oily_png', '~> 1.2'
34
34
  spec.add_dependency 'pdf-reader', '~> 2.1'
35
35
  spec.add_dependency 'slop', '~> 4.2'
36
+ spec.add_dependency 'matrix'
36
37
 
37
- spec.add_development_dependency 'bundler', '~> 1.16'
38
+ spec.add_development_dependency 'bundler'
38
39
  spec.add_development_dependency 'rake', '~> 10.0'
39
40
  spec.add_development_dependency 'rspec', '~> 3.0'
40
41
  end
data/lib/iguvium/image.rb CHANGED
@@ -15,7 +15,6 @@ module Iguvium
15
15
  #
16
16
  # @return [ChunkyPNG::Image]
17
17
  def self.read(path, pagenumber = 1, **opts)
18
- puts path.shellescape
19
18
  rgb = path.gsub(/\.pdf$/, '.rgb')
20
19
  Iguvium.logger.info `#{opts[:gspath]} -dSAFER -dBATCH -dNOPAUSE -sDEVICE=pnggray -dGraphicsAlphaBits=4 \
21
20
  -r72 -dFirstPage=#{pagenumber} -dLastPage=#{pagenumber} \
data/lib/iguvium/page.rb CHANGED
@@ -71,7 +71,7 @@ module Iguvium
71
71
  private
72
72
 
73
73
  def recognize!
74
- image = Image.read(@path, @reader_page.number, @opts)
74
+ image = Image.read(@path, @reader_page.number, **@opts)
75
75
  recognized = CV.new(image).recognize
76
76
  @lines = recognized[:lines]
77
77
  @boxes = recognized[:boxes].reject { |box| box_empty?(box) }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Iguvium
4
- VERSION = '0.9.1'
4
+ VERSION = '0.9.2'
5
5
  end
data/lib/iguvium.rb CHANGED
@@ -73,7 +73,7 @@ module Iguvium
73
73
  opts[:gspath] ||= gs_nix?
74
74
  end
75
75
 
76
- PDF::Reader.new(path, opts).pages.map { |page| Page.new(page, path, opts) }
76
+ PDF::Reader.new(path, opts).pages.map { |page| Page.new(page, path, **opts) }
77
77
  end
78
78
 
79
79
  # Creates and gives access to Ruby Logger. Default [Logger::Level] is Logger::ERROR.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iguvium
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.1
4
+ version: 0.9.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dima Ermilov
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-05-30 00:00:00.000000000 Z
11
+ date: 2022-12-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: convolver-light
@@ -66,20 +66,34 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '4.2'
69
+ - !ruby/object:Gem::Dependency
70
+ name: matrix
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: bundler
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
- - - "~>"
87
+ - - ">="
74
88
  - !ruby/object:Gem::Version
75
- version: '1.16'
89
+ version: '0'
76
90
  type: :development
77
91
  prerelease: false
78
92
  version_requirements: !ruby/object:Gem::Requirement
79
93
  requirements:
80
- - - "~>"
94
+ - - ">="
81
95
  - !ruby/object:Gem::Version
82
- version: '1.16'
96
+ version: '0'
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: rake
85
99
  requirement: !ruby/object:Gem::Requirement
@@ -112,7 +126,7 @@ description: Extract tables from PDF as a structured info. Uses ghostscript to p
112
126
  pdf to image, then recognizes table separators optically. No OpenCV or other heavy
113
127
  dependencies
114
128
  email:
115
- - dima@scriptangle.com
129
+ - adworse@erlef.org
116
130
  executables:
117
131
  - iguvium
118
132
  extensions: []
@@ -142,7 +156,7 @@ homepage: https://github.com/adworse/iguvium
142
156
  licenses:
143
157
  - MIT
144
158
  metadata: {}
145
- post_install_message:
159
+ post_install_message:
146
160
  rdoc_options: []
147
161
  require_paths:
148
162
  - lib
@@ -157,9 +171,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
157
171
  - !ruby/object:Gem::Version
158
172
  version: '0'
159
173
  requirements: []
160
- rubyforge_project:
161
- rubygems_version: 2.7.6
162
- signing_key:
174
+ rubygems_version: 3.0.6
175
+ signing_key:
163
176
  specification_version: 4
164
177
  summary: Extract tables from PDF as a structured info
165
178
  test_files: []