iguvium 0.9.0 → 0.9.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cf00782b6d5ec4d4e06d12ce386ff8ece314f8ba896bb70117232ec100ec5cdb
4
- data.tar.gz: 544dbce4b3cfacd303f711e774284cd497b9b3dcdc8de0590b4825f84ff0e399
3
+ metadata.gz: 59643f281def94149de22d6b858f1f90f4b4b05289ef4f3ef13f146e4621f6c9
4
+ data.tar.gz: 7f10445415eadcc746b8d21050443595ecf127c7f6856a8b2962f8d072825447
5
5
  SHA512:
6
- metadata.gz: b6340302943733d007a37e16206337c712646199ab9133699d328cb44c4a1ea6c1e4f65fb61c559467b018d1232ccd35c03e48b760f67645292b66dbc2640ed1
7
- data.tar.gz: 0b9dcce49fc880a1fa7a7d0b512e3037512800dd4531caad230d788cb5297647190591fd27057e0216d92ed32465b9b60bbbc38eb34d68c6a324f5f06edb6d43
6
+ metadata.gz: 7d3899e42a5bcaa6359554ff96d803f4a09acdacbb53855887bdedfda9245604cf24054a3588508d58222cf4c7783d031b36bff9c6e3e0cac2476c6552a54dc4
7
+ data.tar.gz: 8b6011b52aa2232597d1dea4ef087d36d4713879a0a6c76aba3849337f348329357499e8a5d8c3ec9ace4346e64a058d5b6fd415626b6553dfa2f29f6d516278
@@ -2,6 +2,11 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [0.9.1] - 2019-05-30
6
+ ### Fixed
7
+ - Rare `undefined method 'begin' for nil:NilClass` error fixed
8
+ - Remove Iguvium::Table#to_a result caching
9
+
5
10
  ## [0.9.0] - 2018-12-07
6
11
  ### Added
7
12
  - Open cells rendering added. Tables like this are now processed correctly:
@@ -11,8 +11,8 @@ Gem::Specification.new do |spec|
11
11
  spec.email = ['dima@scriptangle.com']
12
12
 
13
13
  spec.summary = 'Extract tables from PDF as a structured info'
14
- spec.description = 'Extract tables from PDF as a structured info. Uses ghostscript to print pdf to image, \
15
- then recognizes table separators optically. No OpenCV or other heavy dependencies'
14
+ spec.description = "Extract tables from PDF as a structured info. Uses ghostscript to print pdf \
15
+ to image, then recognizes table separators optically. No OpenCV or other heavy dependencies"
16
16
  spec.homepage = 'https://github.com/adworse/iguvium'
17
17
  spec.license = 'MIT'
18
18
 
@@ -29,9 +29,9 @@ then recognizes table separators optically. No OpenCV or other heavy dependencie
29
29
 
30
30
  spec.require_paths = ['lib']
31
31
 
32
- spec.add_dependency 'pdf-reader', '~> 2.1'
33
32
  spec.add_dependency 'convolver-light', '~> 0.3.1'
34
33
  spec.add_dependency 'oily_png', '~> 1.2'
34
+ spec.add_dependency 'pdf-reader', '~> 2.1'
35
35
  spec.add_dependency 'slop', '~> 4.2'
36
36
 
37
37
  spec.add_development_dependency 'bundler', '~> 1.16'
@@ -116,11 +116,4 @@ module Iguvium
116
116
  end
117
117
  end
118
118
 
119
- # TODO: 4) Add options like maybe image thresholding
120
- #
121
- # TODO: 6) 0.9 - version capable of reading tables with open outer cells, like this:
122
- # __|____|_______|_____|
123
- # __|____|_______|_____|
124
- # __|____|_______|_____|
125
- #
126
119
  # TODO: 7) 1.0 - in addition it should deal with merged cells (move result to the upper left cell).
@@ -75,7 +75,10 @@ module Iguvium
75
75
  recognized = CV.new(image).recognize
76
76
  @lines = recognized[:lines]
77
77
  @boxes = recognized[:boxes].reject { |box| box_empty?(box) }
78
- @tables = @boxes.map { |box| Table.new(box, self) }.reverse
78
+ @tables = @boxes
79
+ .map { |box| Table.new(box, self) }
80
+ .reject { |table| table.grid[:rows].empty? || table.grid[:columns].empty? }
81
+ .reverse
79
82
  self
80
83
  end
81
84
 
@@ -32,17 +32,16 @@ module Iguvium
32
32
  # @return [Array] 2D array of strings (content of table's cells)
33
33
  #
34
34
  def to_a(newlines: false, phrases: true)
35
- @to_a ||=
36
- grid[:rows]
35
+ grid[:rows]
37
36
  .reverse
38
37
  .map { |row|
39
- grid[:columns].map do |column|
40
- render(
41
- phrases ? words_inside(column, row) : chars_inside(column, row),
42
- newlines: newlines
43
- )
44
- end
45
- }
38
+ grid[:columns].map do |column|
39
+ render(
40
+ phrases ? words_inside(column, row) : chars_inside(column, row),
41
+ newlines: newlines
42
+ )
43
+ end
44
+ }
46
45
  end
47
46
 
48
47
  # def width
@@ -61,6 +60,14 @@ module Iguvium
61
60
  # @floorless
62
61
  # end
63
62
 
63
+ def grid
64
+ @grid ||=
65
+ {
66
+ rows: lines_to_ranges(lines[:horizontal]),
67
+ columns: lines_to_ranges(lines[:vertical])
68
+ }
69
+ end
70
+
64
71
  private
65
72
 
66
73
  attr_reader :page, :lines, :box
@@ -69,8 +76,8 @@ module Iguvium
69
76
  # and adds rows and/or columns if necessary.
70
77
  # @return [Iguvium::Table] with added open-cell rows and columns
71
78
  def heal
72
- heal_rows
73
- heal_cols
79
+ heal_rows unless grid[:rows].empty?
80
+ heal_cols unless grid[:columns].empty?
74
81
  self
75
82
  end
76
83
 
@@ -89,7 +96,6 @@ module Iguvium
89
96
  end
90
97
 
91
98
  def heal_rows
92
- # TODO: shrink box (like `box.last.end - 2`)
93
99
  roofrow = box.last.begin..grid[:rows].first.begin
94
100
  floorrow = grid[:rows].last.end..box.last.end
95
101
  if chars_inside(box.first, roofrow).any?
@@ -125,18 +131,7 @@ module Iguvium
125
131
  }
126
132
  end
127
133
 
128
- def grid
129
- return @grid if @grid
130
-
131
- @grid =
132
- {
133
- rows: lines_to_ranges(lines[:horizontal]),
134
- columns: lines_to_ranges(lines[:vertical])
135
- }
136
- end
137
-
138
134
  def lines_to_ranges(lines)
139
- # TODO: extend box for the sake of lines select
140
135
  lines.select { |line| line_in_box?(line, wide_box) }
141
136
  .map { |line| line.first.is_a?(Numeric) ? line.first : line.last }
142
137
  .sort
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Iguvium
4
- VERSION = '0.9.0'
4
+ VERSION = '0.9.1'
5
5
  end
metadata CHANGED
@@ -1,57 +1,57 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iguvium
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.9.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dima Ermilov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-12-07 00:00:00.000000000 Z
11
+ date: 2019-05-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: pdf-reader
14
+ name: convolver-light
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '2.1'
19
+ version: 0.3.1
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '2.1'
26
+ version: 0.3.1
27
27
  - !ruby/object:Gem::Dependency
28
- name: convolver-light
28
+ name: oily_png
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 0.3.1
33
+ version: '1.2'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 0.3.1
40
+ version: '1.2'
41
41
  - !ruby/object:Gem::Dependency
42
- name: oily_png
42
+ name: pdf-reader
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '1.2'
47
+ version: '2.1'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '1.2'
54
+ version: '2.1'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: slop
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -108,9 +108,9 @@ dependencies:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
110
  version: '3.0'
111
- description: |-
112
- Extract tables from PDF as a structured info. Uses ghostscript to print pdf to image, \
113
- then recognizes table separators optically. No OpenCV or other heavy dependencies
111
+ description: Extract tables from PDF as a structured info. Uses ghostscript to print
112
+ pdf to image, then recognizes table separators optically. No OpenCV or other heavy
113
+ dependencies
114
114
  email:
115
115
  - dima@scriptangle.com
116
116
  executables: