iguvium 0.9.0 → 0.9.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cf00782b6d5ec4d4e06d12ce386ff8ece314f8ba896bb70117232ec100ec5cdb
4
- data.tar.gz: 544dbce4b3cfacd303f711e774284cd497b9b3dcdc8de0590b4825f84ff0e399
3
+ metadata.gz: 8d6b546079935086775c26461dd4ea774d9f0474a4dcae211c71d01544536479
4
+ data.tar.gz: 8f96f1e0ae92923a268c3eee0431264f6d4324ed7e067e620ec376c1b7325917
5
5
  SHA512:
6
- metadata.gz: b6340302943733d007a37e16206337c712646199ab9133699d328cb44c4a1ea6c1e4f65fb61c559467b018d1232ccd35c03e48b760f67645292b66dbc2640ed1
7
- data.tar.gz: 0b9dcce49fc880a1fa7a7d0b512e3037512800dd4531caad230d788cb5297647190591fd27057e0216d92ed32465b9b60bbbc38eb34d68c6a324f5f06edb6d43
6
+ metadata.gz: cdca6bd0a04ab1ba897cd9888acebecc5ee689ffb5c814b0ecce674e96fcbefbb8035c84b6d9eabf3192411153b2aaa43425771e37cd6736ac9dbe52fe4dc364
7
+ data.tar.gz: 82637dd89b1d3015503f7fff7ae198525a8a0fd0b0050b70285a57e27865e374620a4cc78c93027d650d20840eb9377171271b34bb4b1fc9b44a893ba738822e
data/.travis.yml CHANGED
@@ -4,4 +4,5 @@ rvm:
4
4
  - 2.5.1
5
5
  - 2.4.5
6
6
  - 2.3.8
7
+ - 3.1.3
7
8
  before_install: gem install bundler && sudo apt-get install ghostscript
data/CHANGELOG.md CHANGED
@@ -2,6 +2,16 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [0.9.2] - 2019-05-30
6
+ ### Fixed
7
+ - Resolved deprecated use of keyword arguments in method calls in Ruby 3
8
+ - Matrix gem needs to be an explicit dependency as of Ruby 3.1.
9
+
10
+ ## [0.9.1] - 2019-05-30
11
+ ### Fixed
12
+ - Rare `undefined method 'begin' for nil:NilClass` error fixed
13
+ - Remove Iguvium::Table#to_a result caching
14
+
5
15
  ## [0.9.0] - 2018-12-07
6
16
  ### Added
7
17
  - Open cells rendering added. Tables like this are now processed correctly:
data/README.md CHANGED
@@ -110,6 +110,8 @@ just page text. The latter is useful in case of whitespace-separated fixed-width
110
110
  There are usually no actual tables in PDFs, only characters with coordinates,
111
111
  and some fancy lines. The human eye interprets this as a table. Iguvium behaves quite similarly. It prints the PDF to an image file with GhostScript, then analyses the image.
112
112
 
113
+ (Later clarification as per request. It prints everything except text and images (the -dFILTERTEXT -dFILTERIMAGE params of GhostScript, which leave only lines, curves, etc.) to analyze table structure. Text fields are extracted from pdf codepoints, if there are any. Trying to do otherwise would imply a full-blown OCR solution, something like FineReader. So scanned image-only pdfs are a perfect mismatch: nothing is actually printed and there's no text to extract.)
114
+
113
115
  Long enough continuous edges are interpreted as possible cell borders. Gaussian blur is applied beforehand to get rid of possible inconsistencies and style features.
114
116
 
115
117
  Initially inspired by the [camelot](https://github.com/socialcopsdev/camelot/) idea of using image analysis to detect table structure. Apart from this idea, it is an independent work. Image recognition is written in Ruby, no OpenCV or other heavy computer vision libraries are used. Line detection algorithms are different too. The functionality of Camelot is significantly broader.
data/iguvium.gemspec CHANGED
@@ -8,11 +8,11 @@ Gem::Specification.new do |spec|
8
8
  spec.name = 'iguvium'
9
9
  spec.version = Iguvium::VERSION
10
10
  spec.authors = ['Dima Ermilov']
11
- spec.email = ['dima@scriptangle.com']
11
+ spec.email = ['adworse@erlef.org']
12
12
 
13
13
  spec.summary = 'Extract tables from PDF as a structured info'
14
- spec.description = 'Extract tables from PDF as a structured info. Uses ghostscript to print pdf to image, \
15
- then recognizes table separators optically. No OpenCV or other heavy dependencies'
14
+ spec.description = "Extract tables from PDF as a structured info. Uses ghostscript to print pdf \
15
+ to image, then recognizes table separators optically. No OpenCV or other heavy dependencies"
16
16
  spec.homepage = 'https://github.com/adworse/iguvium'
17
17
  spec.license = 'MIT'
18
18
 
@@ -29,12 +29,13 @@ then recognizes table separators optically. No OpenCV or other heavy dependencie
29
29
 
30
30
  spec.require_paths = ['lib']
31
31
 
32
- spec.add_dependency 'pdf-reader', '~> 2.1'
33
32
  spec.add_dependency 'convolver-light', '~> 0.3.1'
34
33
  spec.add_dependency 'oily_png', '~> 1.2'
34
+ spec.add_dependency 'pdf-reader', '~> 2.1'
35
35
  spec.add_dependency 'slop', '~> 4.2'
36
+ spec.add_dependency 'matrix'
36
37
 
37
- spec.add_development_dependency 'bundler', '~> 1.16'
38
+ spec.add_development_dependency 'bundler'
38
39
  spec.add_development_dependency 'rake', '~> 10.0'
39
40
  spec.add_development_dependency 'rspec', '~> 3.0'
40
41
  end
data/lib/iguvium/image.rb CHANGED
@@ -15,7 +15,6 @@ module Iguvium
15
15
  #
16
16
  # @return [ChunkyPNG::Image]
17
17
  def self.read(path, pagenumber = 1, **opts)
18
- puts path.shellescape
19
18
  rgb = path.gsub(/\.pdf$/, '.rgb')
20
19
  Iguvium.logger.info `#{opts[:gspath]} -dSAFER -dBATCH -dNOPAUSE -sDEVICE=pnggray -dGraphicsAlphaBits=4 \
21
20
  -r72 -dFirstPage=#{pagenumber} -dLastPage=#{pagenumber} \
data/lib/iguvium/page.rb CHANGED
@@ -71,11 +71,14 @@ module Iguvium
71
71
  private
72
72
 
73
73
  def recognize!
74
- image = Image.read(@path, @reader_page.number, @opts)
74
+ image = Image.read(@path, @reader_page.number, **@opts)
75
75
  recognized = CV.new(image).recognize
76
76
  @lines = recognized[:lines]
77
77
  @boxes = recognized[:boxes].reject { |box| box_empty?(box) }
78
- @tables = @boxes.map { |box| Table.new(box, self) }.reverse
78
+ @tables = @boxes
79
+ .map { |box| Table.new(box, self) }
80
+ .reject { |table| table.grid[:rows].empty? || table.grid[:columns].empty? }
81
+ .reverse
79
82
  self
80
83
  end
81
84
 
data/lib/iguvium/table.rb CHANGED
@@ -32,17 +32,16 @@ module Iguvium
32
32
  # @return [Array] 2D array of strings (content of table's cells)
33
33
  #
34
34
  def to_a(newlines: false, phrases: true)
35
- @to_a ||=
36
- grid[:rows]
35
+ grid[:rows]
37
36
  .reverse
38
37
  .map { |row|
39
- grid[:columns].map do |column|
40
- render(
41
- phrases ? words_inside(column, row) : chars_inside(column, row),
42
- newlines: newlines
43
- )
44
- end
45
- }
38
+ grid[:columns].map do |column|
39
+ render(
40
+ phrases ? words_inside(column, row) : chars_inside(column, row),
41
+ newlines: newlines
42
+ )
43
+ end
44
+ }
46
45
  end
47
46
 
48
47
  # def width
@@ -61,6 +60,14 @@ module Iguvium
61
60
  # @floorless
62
61
  # end
63
62
 
63
+ def grid
64
+ @grid ||=
65
+ {
66
+ rows: lines_to_ranges(lines[:horizontal]),
67
+ columns: lines_to_ranges(lines[:vertical])
68
+ }
69
+ end
70
+
64
71
  private
65
72
 
66
73
  attr_reader :page, :lines, :box
@@ -69,8 +76,8 @@ module Iguvium
69
76
  # and adds rows and/or columns if necessary.
70
77
  # @return [Iguvium::Table] with added open-cell rows and columns
71
78
  def heal
72
- heal_rows
73
- heal_cols
79
+ heal_rows unless grid[:rows].empty?
80
+ heal_cols unless grid[:columns].empty?
74
81
  self
75
82
  end
76
83
 
@@ -89,7 +96,6 @@ module Iguvium
89
96
  end
90
97
 
91
98
  def heal_rows
92
- # TODO: shrink box (like `box.last.end - 2`)
93
99
  roofrow = box.last.begin..grid[:rows].first.begin
94
100
  floorrow = grid[:rows].last.end..box.last.end
95
101
  if chars_inside(box.first, roofrow).any?
@@ -125,18 +131,7 @@ module Iguvium
125
131
  }
126
132
  end
127
133
 
128
- def grid
129
- return @grid if @grid
130
-
131
- @grid =
132
- {
133
- rows: lines_to_ranges(lines[:horizontal]),
134
- columns: lines_to_ranges(lines[:vertical])
135
- }
136
- end
137
-
138
134
  def lines_to_ranges(lines)
139
- # TODO: extend box for the sake of lines select
140
135
  lines.select { |line| line_in_box?(line, wide_box) }
141
136
  .map { |line| line.first.is_a?(Numeric) ? line.first : line.last }
142
137
  .sort
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Iguvium
4
- VERSION = '0.9.0'
4
+ VERSION = '0.9.2'
5
5
  end
data/lib/iguvium.rb CHANGED
@@ -73,7 +73,7 @@ module Iguvium
73
73
  opts[:gspath] ||= gs_nix?
74
74
  end
75
75
 
76
- PDF::Reader.new(path, opts).pages.map { |page| Page.new(page, path, opts) }
76
+ PDF::Reader.new(path, opts).pages.map { |page| Page.new(page, path, **opts) }
77
77
  end
78
78
 
79
79
  # Creates and gives access to Ruby Logger. Default [Logger::Level] is Logger::ERROR.
@@ -116,11 +116,4 @@ module Iguvium
116
116
  end
117
117
  end
118
118
 
119
- # TODO: 4) Add options like maybe image thresholding
120
- #
121
- # TODO: 6) 0.9 - version capable of reading tables with open outer cells, like this:
122
- # __|____|_______|_____|
123
- # __|____|_______|_____|
124
- # __|____|_______|_____|
125
- #
126
119
  # TODO: 7) 1.0 - in addition it should deal with merged cells (move result to the upper left cell).
metadata CHANGED
@@ -1,57 +1,57 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iguvium
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.9.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dima Ermilov
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-12-07 00:00:00.000000000 Z
11
+ date: 2022-12-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: pdf-reader
14
+ name: convolver-light
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '2.1'
19
+ version: 0.3.1
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '2.1'
26
+ version: 0.3.1
27
27
  - !ruby/object:Gem::Dependency
28
- name: convolver-light
28
+ name: oily_png
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 0.3.1
33
+ version: '1.2'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 0.3.1
40
+ version: '1.2'
41
41
  - !ruby/object:Gem::Dependency
42
- name: oily_png
42
+ name: pdf-reader
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '1.2'
47
+ version: '2.1'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '1.2'
54
+ version: '2.1'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: slop
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -66,20 +66,34 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '4.2'
69
+ - !ruby/object:Gem::Dependency
70
+ name: matrix
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: bundler
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
- - - "~>"
87
+ - - ">="
74
88
  - !ruby/object:Gem::Version
75
- version: '1.16'
89
+ version: '0'
76
90
  type: :development
77
91
  prerelease: false
78
92
  version_requirements: !ruby/object:Gem::Requirement
79
93
  requirements:
80
- - - "~>"
94
+ - - ">="
81
95
  - !ruby/object:Gem::Version
82
- version: '1.16'
96
+ version: '0'
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: rake
85
99
  requirement: !ruby/object:Gem::Requirement
@@ -108,11 +122,11 @@ dependencies:
108
122
  - - "~>"
109
123
  - !ruby/object:Gem::Version
110
124
  version: '3.0'
111
- description: |-
112
- Extract tables from PDF as a structured info. Uses ghostscript to print pdf to image, \
113
- then recognizes table separators optically. No OpenCV or other heavy dependencies
125
+ description: Extract tables from PDF as a structured info. Uses ghostscript to print
126
+ pdf to image, then recognizes table separators optically. No OpenCV or other heavy
127
+ dependencies
114
128
  email:
115
- - dima@scriptangle.com
129
+ - adworse@erlef.org
116
130
  executables:
117
131
  - iguvium
118
132
  extensions: []
@@ -142,7 +156,7 @@ homepage: https://github.com/adworse/iguvium
142
156
  licenses:
143
157
  - MIT
144
158
  metadata: {}
145
- post_install_message:
159
+ post_install_message:
146
160
  rdoc_options: []
147
161
  require_paths:
148
162
  - lib
@@ -157,9 +171,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
157
171
  - !ruby/object:Gem::Version
158
172
  version: '0'
159
173
  requirements: []
160
- rubyforge_project:
161
- rubygems_version: 2.7.6
162
- signing_key:
174
+ rubygems_version: 3.0.6
175
+ signing_key:
163
176
  specification_version: 4
164
177
  summary: Extract tables from PDF as a structured info
165
178
  test_files: []