iguvium 0.8.4 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 622824e3031b9d70a8385ea50d8283c375c6e15924dee4ce5b03cc282415de48
4
- data.tar.gz: daa366102b6815f9df955d10396f665347529d036531829dbd63ec7d61307f83
3
+ metadata.gz: cf00782b6d5ec4d4e06d12ce386ff8ece314f8ba896bb70117232ec100ec5cdb
4
+ data.tar.gz: 544dbce4b3cfacd303f711e774284cd497b9b3dcdc8de0590b4825f84ff0e399
5
5
  SHA512:
6
- metadata.gz: da44bc6dd6806fd66f74d388b8ab3334cde54a0efda764bf28564f97377f0aac0da82c69473e1ce73429b3642f547f426581b4bbd39c94f9fa6865147911a021
7
- data.tar.gz: 9685fbe9866ac87931d978e3f588a38ea9bf47c7ffaf5f58e823d3deae0608ed6a0e5db2596da52e46e511644b6aa7df3be12a441a82b64451c118d33c7ffb7d
6
+ metadata.gz: b6340302943733d007a37e16206337c712646199ab9133699d328cb44c4a1ea6c1e4f65fb61c559467b018d1232ccd35c03e48b760f67645292b66dbc2640ed1
7
+ data.tar.gz: 0b9dcce49fc880a1fa7a7d0b512e3037512800dd4531caad230d788cb5297647190591fd27057e0216d92ed32465b9b60bbbc38eb34d68c6a324f5f06edb6d43
data/CHANGELOG.md CHANGED
@@ -2,6 +2,16 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [0.9.0] - 2018-12-07
6
+ ### Added
7
+ - Open cells rendering added. Tables like this are now processed correctly:
8
+ ```
9
+ __|____|_______|_____|
10
+ __|____|_______|_____|
11
+ __|____|_______|_____|
12
+ ```
13
+
14
+
5
15
  ## [0.8.4] - 2018-11-24
6
16
  ### Changed
7
17
  - The `phrases` option of the `Iguvium::Table#to_a` method (render phrases before cell assembly) is now true by default.
data/README.md CHANGED
@@ -24,7 +24,8 @@ Get this table:
24
24
 
25
25
  * Character extraction is done by the [PDF::Reader gem](https://github.com/yob/pdf-reader). Some PDFs are so messed up that it can't extract meaningful text from them. If so, neither can Iguvium.
26
26
 
27
- * Current version extracts regular (with constant number of rows per column and vice versa) tables with explicit lines formatting, like this:
27
+ * Current version extracts regular (with constant number of rows per column and vice versa)
28
+ tables with explicit lines formatting, like this:
28
29
 
29
30
  ```
30
31
  .__________________.
@@ -32,7 +33,15 @@ Get this table:
32
33
  |____|_______|_____|
33
34
  |____|_______|_____|
34
35
  ```
35
- Merged cells content is split as if cells were not merged.
36
+ And, starting with version 0.9.0, like this:
37
+ ```
38
+ __|____|_______|_____|
39
+ __|____|_______|_____|
40
+ __|____|_______|_____|
41
+ ```
42
+
43
+
44
+ Merged cells' content is split as if the cells were not merged, unless you use the `:phrases` option.
36
45
 
37
46
  * Performance: considering the fact it has computer vision under the hood, the gem is reasonably fast. Full page extraction takes up to 1 second on modern CPUs and up to 2 seconds on the older ones.
38
47
 
@@ -107,15 +116,7 @@ Initially inspired by [camelot](https://github.com/socialcopsdev/camelot/) idea
107
116
 
108
117
  ## Roadmap
109
118
 
110
- The next version will deal with open-edged tables like
111
-
112
- ```
113
- __|____|_______|_____|
114
- __|____|_______|_____|
115
- __|____|_______|_____|
116
- ```
117
-
118
- It also will keep open-edged rows metadata ('floorless' and 'roofless') for the needs of multipage tables merger.
119
+ The next version will keep open-edged rows metadata ('floorless' and 'roofless'), which is needed for merging multipage tables.
119
120
 
120
121
  The final one will recognize tables with merged cells.
121
122
 
data/exe/iguvium CHANGED
@@ -18,6 +18,7 @@ opts = Slop.parse { |o|
18
18
  end
19
19
  o.on '-h', '--help', 'show help' do
20
20
  puts o.to_s.gsub(/(usage:).+(iguvium)/, '\1 \2 filename.pdf')
21
+ puts Iguvium::VERSION
21
22
  exit
22
23
  end
23
24
  }
data/lib/iguvium/cv.rb CHANGED
@@ -39,11 +39,11 @@ module Iguvium
39
39
  # Prepares image for recognition: initial blur
40
40
  # @param image [ChunkyPNG::Image] from {Iguvium::Image.read}
41
41
  def initialize(image)
42
- @image = blur image
42
+ @blurred = blur(image)
43
+ @image = to_narray(image).to_a
43
44
  end
44
45
 
45
- # @return [Array] 8-bit representation of an image
46
- attr_reader :image
46
+ attr_reader :image, :blurred
47
47
 
48
48
  # @return [Recognized]
49
49
  # lines most probably forming table cells and tables' outer borders as boxes
@@ -62,9 +62,8 @@ module Iguvium
62
62
  {
63
63
  vertical: Labeler.new(verticals)
64
64
  .lines
65
- .map { |line| flip_line line }
66
- .sort_by { |x, yrange| [yrange.begin, x] },
67
- horizontal: Labeler.new(horizontals).lines.map { |line| flip_line line }.sort_by { |_, y| [y] }
65
+ .map { |line| flip_line line },
66
+ horizontal: Labeler.new(horizontals).lines.map { |line| flip_line line }
68
67
  }
69
68
  end
70
69
 
@@ -80,14 +79,14 @@ module Iguvium
80
79
 
81
80
  def verticals(threshold = 3)
82
81
  Matrix
83
- .rows(convolve(NArray[*horizontal_scan(image)], VERTICAL, 0).to_a)
82
+ .rows(convolve(NArray[*horizontal_scan(blurred)], VERTICAL, 0).to_a)
84
83
  .map { |pix| pix < threshold ? nil : pix }
85
84
  .to_a
86
85
  end
87
86
 
88
87
  def horizontals(threshold = 3)
89
88
  Matrix
90
- .rows(convolve(NArray[*vertical_scan(image)], HORIZONTAL, 0).to_a)
89
+ .rows(convolve(NArray[*vertical_scan(blurred)], HORIZONTAL, 0).to_a)
91
90
  .map { |pix| pix < threshold ? nil : pix }
92
91
  .to_a
93
92
  end
@@ -186,8 +185,6 @@ module Iguvium
186
185
  def box(coord_array)
187
186
  ax, bx = coord_array.map(&:last).minmax
188
187
  ay, by = coord_array.map(&:first).minmax
189
- # additional pixels removed from the box definition
190
- # [ax - 1..bx + 1, ay - 1..by + 1]
191
188
  [ax..bx, flip_range(ay..by)]
192
189
  end
193
190
  end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Iguvium
4
+
5
+ class Row
6
+ # gets characters limited by yrange and set of column ranges
7
+ def initialize(columns, characters, phrases: true)
8
+ @columns = columns
9
+ if phrases
10
+ characters =
11
+ characters
12
+ .sort
13
+ .chunk_while { |a, b| a.mergable?(b) }
14
+ .map { |chunk| chunk.inject(:+) }
15
+ end
16
+ @characters = characters
17
+ end
18
+
19
+ def cells
20
+ @columns.map { |range|
21
+ @characters.select { |character| range.cover?(character.x) }
22
+ }
23
+ end
24
+
25
+ # @return rendered row array
26
+ def render(newlines: false)
27
+ end
28
+
29
+ def merge(other)
30
+ end
31
+ end
32
+ end
data/lib/iguvium/table.rb CHANGED
@@ -16,6 +16,8 @@ module Iguvium
16
16
  @box = box
17
17
  @lines = page.lines
18
18
  @page = page
19
+ grid
20
+ heal
19
21
  end
20
22
 
21
23
  # Renders the table into an array of strings.
@@ -30,24 +32,74 @@ module Iguvium
30
32
  # @return [Array] 2D array of strings (content of table's cells)
31
33
  #
32
34
  def to_a(newlines: false, phrases: true)
33
- grid[:rows]
35
+ @to_a ||=
36
+ grid[:rows]
34
37
  .reverse
35
38
  .map { |row|
36
- grid[:columns].map do |column|
37
- render(
38
- phrases ? words_inside(column, row) : chars_inside(column, row),
39
- newlines: newlines
40
- )
41
- end
42
- }
39
+ grid[:columns].map do |column|
40
+ render(
41
+ phrases ? words_inside(column, row) : chars_inside(column, row),
42
+ newlines: newlines
43
+ )
44
+ end
45
+ }
43
46
  end
44
47
 
48
+ # def width
49
+ # grid[:columns].count
50
+ # end
51
+
52
+ # def mergeable?(other)
53
+ # width == other.width
54
+ # end
55
+
56
+ # def roofless?
57
+ # @roofless
58
+ # end
59
+
60
+ # def floorless?
61
+ # @floorless
62
+ # end
63
+
45
64
  private
46
65
 
47
66
  attr_reader :page, :lines, :box
48
67
 
49
- def enhancer(grid)
50
- # @todo write grid enhancer to detect cells between outer grid lines and box borders
68
+ # Looks if there are characters inside the box but outside of already detected cells
69
+ # and adds rows and/or columns if necessary.
70
+ # @return [Iguvium::Table] with added open-cell rows and columns
71
+ def heal
72
+ heal_rows
73
+ heal_cols
74
+ self
75
+ end
76
+
77
+ def wide_box
78
+ @wide_box ||= [
79
+ box.first.begin - 2..box.first.end + 2,
80
+ box.last.begin - 2..box.last.end + 2
81
+ ]
82
+ end
83
+
84
+ def heal_cols
85
+ leftcol = box.first.begin..grid[:columns].first.begin
86
+ rightcol = grid[:columns].last.end..box.first.end
87
+ @grid[:columns].unshift(leftcol) if chars_inside(leftcol, box.last).any?
88
+ @grid[:columns].append(rightcol) if chars_inside(rightcol, box.last).any?
89
+ end
90
+
91
+ def heal_rows
92
+ # TODO: shrink box (like `box.last.end - 2`)
93
+ roofrow = box.last.begin..grid[:rows].first.begin
94
+ floorrow = grid[:rows].last.end..box.last.end
95
+ if chars_inside(box.first, roofrow).any?
96
+ @grid[:rows].unshift(roofrow)
97
+ @roofless = true
98
+ end
99
+ if chars_inside(box.first, floorrow).any?
100
+ @grid[:rows].append(floorrow)
101
+ @floorless = true
102
+ end
51
103
  end
52
104
 
53
105
  def characters
@@ -74,7 +126,9 @@ module Iguvium
74
126
  end
75
127
 
76
128
  def grid
77
- @grid ||=
129
+ return @grid if @grid
130
+
131
+ @grid =
78
132
  {
79
133
  rows: lines_to_ranges(lines[:horizontal]),
80
134
  columns: lines_to_ranges(lines[:vertical])
@@ -82,7 +136,8 @@ module Iguvium
82
136
  end
83
137
 
84
138
  def lines_to_ranges(lines)
85
- lines.select { |line| line_in_box?(line, box) }
139
+ # TODO: extend box for the sake of lines select
140
+ lines.select { |line| line_in_box?(line, wide_box) }
86
141
  .map { |line| line.first.is_a?(Numeric) ? line.first : line.last }
87
142
  .sort
88
143
  .uniq
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Iguvium
4
- VERSION = '0.8.4'
4
+ VERSION = '0.9.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iguvium
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.4
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dima Ermilov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-11-24 00:00:00.000000000 Z
11
+ date: 2018-12-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: pdf-reader
@@ -135,6 +135,7 @@ files:
135
135
  - lib/iguvium/image.rb
136
136
  - lib/iguvium/labeler.rb
137
137
  - lib/iguvium/page.rb
138
+ - lib/iguvium/row.rb
138
139
  - lib/iguvium/table.rb
139
140
  - lib/iguvium/version.rb
140
141
  homepage: https://github.com/adworse/iguvium