iguvium 0.8.4 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 622824e3031b9d70a8385ea50d8283c375c6e15924dee4ce5b03cc282415de48
4
- data.tar.gz: daa366102b6815f9df955d10396f665347529d036531829dbd63ec7d61307f83
3
+ metadata.gz: cf00782b6d5ec4d4e06d12ce386ff8ece314f8ba896bb70117232ec100ec5cdb
4
+ data.tar.gz: 544dbce4b3cfacd303f711e774284cd497b9b3dcdc8de0590b4825f84ff0e399
5
5
  SHA512:
6
- metadata.gz: da44bc6dd6806fd66f74d388b8ab3334cde54a0efda764bf28564f97377f0aac0da82c69473e1ce73429b3642f547f426581b4bbd39c94f9fa6865147911a021
7
- data.tar.gz: 9685fbe9866ac87931d978e3f588a38ea9bf47c7ffaf5f58e823d3deae0608ed6a0e5db2596da52e46e511644b6aa7df3be12a441a82b64451c118d33c7ffb7d
6
+ metadata.gz: b6340302943733d007a37e16206337c712646199ab9133699d328cb44c4a1ea6c1e4f65fb61c559467b018d1232ccd35c03e48b760f67645292b66dbc2640ed1
7
+ data.tar.gz: 0b9dcce49fc880a1fa7a7d0b512e3037512800dd4531caad230d788cb5297647190591fd27057e0216d92ed32465b9b60bbbc38eb34d68c6a324f5f06edb6d43
data/CHANGELOG.md CHANGED
@@ -2,6 +2,16 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [0.9.0] - 2018-12-07
6
+ ### Added
7
+ - Open cells rendering added. Tables like this are now processed correctly:
8
+ ```
9
+ __|____|_______|_____|
10
+ __|____|_______|_____|
11
+ __|____|_______|_____|
12
+ ```
13
+
14
+
5
15
  ## [0.8.4] - 2018-11-24
6
16
  ### Changed
7
17
  - Render phrases before cell assembly option of Iguvium::Table#to_a method is now true by default.
data/README.md CHANGED
@@ -24,7 +24,8 @@ Get this table:
24
24
 
25
25
  * Character extraction is done by the [PDF::Reader gem](https://github.com/yob/pdf-reader). Some PDFs are so messed up that it can't extract meaningful text from them. If so, neither can Iguvium.
26
26
 
27
- * Current version extracts regular (with constant number of rows per column and vice versa) tables with explicit lines formatting, like this:
27
+ * Current version extracts regular (with constant number of rows per column and vice versa)
28
+ tables with explicit lines formatting, like this:
28
29
 
29
30
  ```
30
31
  .__________________.
@@ -32,7 +33,15 @@ Get this table:
32
33
  |____|_______|_____|
33
34
  |____|_______|_____|
34
35
  ```
35
- Merged cells content is split as if cells were not merged.
36
+ And, after version 0.9.0, like this:
37
+ ```
38
+ __|____|_______|_____|
39
+ __|____|_______|_____|
40
+ __|____|_______|_____|
41
+ ```
42
+
43
+
44
+ Merged cells' content is split as if the cells were not merged, unless you use the `:phrases` option.
36
45
 
37
46
  * Performance: considering the fact it has computer vision under the hood, the gem is reasonably fast. Full page extraction takes up to 1 second on modern CPUs and up to 2 seconds on the older ones.
38
47
 
@@ -107,15 +116,7 @@ Initially inspired by [camelot](https://github.com/socialcopsdev/camelot/) idea
107
116
 
108
117
  ## Roadmap
109
118
 
110
- The next version will deal with open-edged tables like
111
-
112
- ```
113
- __|____|_______|_____|
114
- __|____|_______|_____|
115
- __|____|_______|_____|
116
- ```
117
-
118
- It also will keep open-edged rows metadata ('floorless' and 'roofless') for the needs of multipage tables merger.
119
+ The next version will keep open-edged rows metadata ('floorless' and 'roofless') for the needs of multipage tables merger.
119
120
 
120
121
  The final one will recognize tables with merged cells.
121
122
 
data/exe/iguvium CHANGED
@@ -18,6 +18,7 @@ opts = Slop.parse { |o|
18
18
  end
19
19
  o.on '-h', '--help', 'show help' do
20
20
  puts o.to_s.gsub(/(usage:).+(iguvium)/, '\1 \2 filename.pdf')
21
+ puts Iguvium::VERSION
21
22
  exit
22
23
  end
23
24
  }
data/lib/iguvium/cv.rb CHANGED
@@ -39,11 +39,11 @@ module Iguvium
39
39
  # Prepares image for recognition: initial blur
40
40
  # @param image [ChunkyPNG::Image] from {Iguvium::Image.read}
41
41
  def initialize(image)
42
- @image = blur image
42
+ @blurred = blur(image)
43
+ @image = to_narray(image).to_a
43
44
  end
44
45
 
45
- # @return [Array] 8-bit representation of an image
46
- attr_reader :image
46
+ attr_reader :image, :blurred
47
47
 
48
48
  # @return [Recognized]
49
49
  # lines most probably forming table cells and tables' outer borders as boxes
@@ -62,9 +62,8 @@ module Iguvium
62
62
  {
63
63
  vertical: Labeler.new(verticals)
64
64
  .lines
65
- .map { |line| flip_line line }
66
- .sort_by { |x, yrange| [yrange.begin, x] },
67
- horizontal: Labeler.new(horizontals).lines.map { |line| flip_line line }.sort_by { |_, y| [y] }
65
+ .map { |line| flip_line line },
66
+ horizontal: Labeler.new(horizontals).lines.map { |line| flip_line line }
68
67
  }
69
68
  end
70
69
 
@@ -80,14 +79,14 @@ module Iguvium
80
79
 
81
80
  def verticals(threshold = 3)
82
81
  Matrix
83
- .rows(convolve(NArray[*horizontal_scan(image)], VERTICAL, 0).to_a)
82
+ .rows(convolve(NArray[*horizontal_scan(blurred)], VERTICAL, 0).to_a)
84
83
  .map { |pix| pix < threshold ? nil : pix }
85
84
  .to_a
86
85
  end
87
86
 
88
87
  def horizontals(threshold = 3)
89
88
  Matrix
90
- .rows(convolve(NArray[*vertical_scan(image)], HORIZONTAL, 0).to_a)
89
+ .rows(convolve(NArray[*vertical_scan(blurred)], HORIZONTAL, 0).to_a)
91
90
  .map { |pix| pix < threshold ? nil : pix }
92
91
  .to_a
93
92
  end
@@ -186,8 +185,6 @@ module Iguvium
186
185
  def box(coord_array)
187
186
  ax, bx = coord_array.map(&:last).minmax
188
187
  ay, by = coord_array.map(&:first).minmax
189
- # additional pixels removed from the box definition
190
- # [ax - 1..bx + 1, ay - 1..by + 1]
191
188
  [ax..bx, flip_range(ay..by)]
192
189
  end
193
190
  end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Iguvium
4
+
5
+ class Row
6
+ # gets characters limited by yrange and set of column ranges
7
+ def initialize(columns, characters, phrases: true)
8
+ @columns = columns
9
+ if phrases
10
+ characters =
11
+ characters
12
+ .sort
13
+ .chunk_while { |a, b| a.mergable?(b) }
14
+ .map { |chunk| chunk.inject(:+) }
15
+ end
16
+ @characters = characters
17
+ end
18
+
19
+ def cells
20
+ @columns.map { |range|
21
+ @characters.select { |character| range.cover?(character.x) }
22
+ }
23
+ end
24
+
25
+ # @return rendered row array
26
+ def render(newlines: false)
27
+ end
28
+
29
+ def merge(other)
30
+ end
31
+ end
32
+ end
data/lib/iguvium/table.rb CHANGED
@@ -16,6 +16,8 @@ module Iguvium
16
16
  @box = box
17
17
  @lines = page.lines
18
18
  @page = page
19
+ grid
20
+ heal
19
21
  end
20
22
 
21
23
  # Renders the table into an array of strings.
@@ -30,24 +32,74 @@ module Iguvium
30
32
  # @return [Array] 2D array of strings (content of table's cells)
31
33
  #
32
34
  def to_a(newlines: false, phrases: true)
33
- grid[:rows]
35
+ @to_a ||=
36
+ grid[:rows]
34
37
  .reverse
35
38
  .map { |row|
36
- grid[:columns].map do |column|
37
- render(
38
- phrases ? words_inside(column, row) : chars_inside(column, row),
39
- newlines: newlines
40
- )
41
- end
42
- }
39
+ grid[:columns].map do |column|
40
+ render(
41
+ phrases ? words_inside(column, row) : chars_inside(column, row),
42
+ newlines: newlines
43
+ )
44
+ end
45
+ }
43
46
  end
44
47
 
48
+ # def width
49
+ # grid[:columns].count
50
+ # end
51
+
52
+ # def mergeable?(other)
53
+ # width == other.width
54
+ # end
55
+
56
+ # def roofless?
57
+ # @roofless
58
+ # end
59
+
60
+ # def floorless?
61
+ # @floorless
62
+ # end
63
+
45
64
  private
46
65
 
47
66
  attr_reader :page, :lines, :box
48
67
 
49
- def enhancer(grid)
50
- # @todo write grid enhancer to detect cells between outer grid lines and box borders
68
+ # Looks if there are characters inside the box but outside of already detected cells
69
+ # and adds rows and/or columns if necessary.
70
+ # @return [Iguvium::Table] with added open-cell rows and columns
71
+ def heal
72
+ heal_rows
73
+ heal_cols
74
+ self
75
+ end
76
+
77
+ def wide_box
78
+ @wide_box ||= [
79
+ box.first.begin - 2..box.first.end + 2,
80
+ box.last.begin - 2..box.last.end + 2
81
+ ]
82
+ end
83
+
84
+ def heal_cols
85
+ leftcol = box.first.begin..grid[:columns].first.begin
86
+ rightcol = grid[:columns].last.end..box.first.end
87
+ @grid[:columns].unshift(leftcol) if chars_inside(leftcol, box.last).any?
88
+ @grid[:columns].append(rightcol) if chars_inside(rightcol, box.last).any?
89
+ end
90
+
91
+ def heal_rows
92
+ # TODO: shrink box (like `box.last.end - 2`)
93
+ roofrow = box.last.begin..grid[:rows].first.begin
94
+ floorrow = grid[:rows].last.end..box.last.end
95
+ if chars_inside(box.first, roofrow).any?
96
+ @grid[:rows].unshift(roofrow)
97
+ @roofless = true
98
+ end
99
+ if chars_inside(box.first, floorrow).any?
100
+ @grid[:rows].append(floorrow)
101
+ @floorless = true
102
+ end
51
103
  end
52
104
 
53
105
  def characters
@@ -74,7 +126,9 @@ module Iguvium
74
126
  end
75
127
 
76
128
  def grid
77
- @grid ||=
129
+ return @grid if @grid
130
+
131
+ @grid =
78
132
  {
79
133
  rows: lines_to_ranges(lines[:horizontal]),
80
134
  columns: lines_to_ranges(lines[:vertical])
@@ -82,7 +136,8 @@ module Iguvium
82
136
  end
83
137
 
84
138
  def lines_to_ranges(lines)
85
- lines.select { |line| line_in_box?(line, box) }
139
+ # TODO: extend box for the sake of lines select
140
+ lines.select { |line| line_in_box?(line, wide_box) }
86
141
  .map { |line| line.first.is_a?(Numeric) ? line.first : line.last }
87
142
  .sort
88
143
  .uniq
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Iguvium
4
- VERSION = '0.8.4'
4
+ VERSION = '0.9.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iguvium
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.4
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dima Ermilov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-11-24 00:00:00.000000000 Z
11
+ date: 2018-12-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: pdf-reader
@@ -135,6 +135,7 @@ files:
135
135
  - lib/iguvium/image.rb
136
136
  - lib/iguvium/labeler.rb
137
137
  - lib/iguvium/page.rb
138
+ - lib/iguvium/row.rb
138
139
  - lib/iguvium/table.rb
139
140
  - lib/iguvium/version.rb
140
141
  homepage: https://github.com/adworse/iguvium