tabula-extractor 0.6.6-java → 0.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +7 -0
  2. data/AUTHORS.md +1 -0
  3. data/README.md +27 -11
  4. data/bin/tabula +61 -19
  5. data/ext/liblsd-linux32.so +0 -0
  6. data/ext/liblsd-linux64.so +0 -0
  7. data/ext/liblsd.dll +0 -0
  8. data/ext/liblsd.dylib +0 -0
  9. data/ext/liblsd64.dll +0 -0
  10. data/ext/lsd.c +137 -137
  11. data/ext/lsd.h +9 -9
  12. data/lib/tabula.rb +20 -3
  13. data/lib/tabula/core_ext.rb +261 -0
  14. data/lib/tabula/entities.rb +11 -456
  15. data/lib/tabula/entities/cell.rb +42 -0
  16. data/lib/tabula/entities/has_cells.rb +244 -0
  17. data/lib/tabula/entities/line.rb +39 -0
  18. data/lib/tabula/entities/page.rb +269 -0
  19. data/lib/tabula/entities/page_area.rb +7 -0
  20. data/lib/tabula/entities/ruling.rb +300 -0
  21. data/lib/tabula/entities/spreadsheet.rb +92 -0
  22. data/lib/tabula/entities/table.rb +81 -0
  23. data/lib/tabula/entities/text_chunk.rb +114 -0
  24. data/lib/tabula/entities/text_element.rb +112 -0
  25. data/lib/tabula/entities/zone_entity.rb +57 -0
  26. data/lib/tabula/extraction.rb +327 -0
  27. data/lib/tabula/line_segment_detector.rb +9 -7
  28. data/lib/tabula/pdf_line_extractor.rb +319 -0
  29. data/lib/tabula/pdf_render.rb +1 -5
  30. data/lib/tabula/spreadsheet_extractor.rb +52 -0
  31. data/lib/tabula/table_extractor.rb +50 -348
  32. data/lib/tabula/table_guesser.rb +21 -23
  33. data/lib/tabula/version.rb +1 -1
  34. data/lib/tabula/writers.rb +5 -6
  35. data/tabula-extractor.gemspec +1 -0
  36. data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
  37. data/test/data/47008204D_USA.page4.pdf +0 -0
  38. data/test/data/560015757GV_China.page1.pdf +0 -0
  39. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  40. data/test/data/S2MNCEbirdisland.pdf +0 -0
  41. data/test/data/campaign_donors.pdf +0 -0
  42. data/test/data/frx_2012_disclosure.tsv +88 -0
  43. data/test/data/no_tables.pdf +0 -0
  44. data/test/data/puertos1.pdf +0 -0
  45. data/test/data/spanning_cells.csv +21 -0
  46. data/test/data/spanning_cells.pdf +0 -0
  47. data/test/data/strongschools.pdf +0 -0
  48. data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
  49. data/test/data/vietnam3.pdf +0 -0
  50. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  51. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  52. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  53. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  54. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  55. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  56. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  57. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  58. data/test/heuristic.rb +50 -0
  59. data/test/test_bin_tabula.sh +7 -0
  60. data/test/tests.rb +476 -63
  61. metadata +79 -28
  62. data/lib/geom/point.rb +0 -21
  63. data/lib/geom/rectangle.rb +0 -101
  64. data/lib/geom/segment.rb +0 -82
  65. data/lib/tabula/pdf_dump.rb +0 -132
  66. data/lib/tabula/whitespace.rb +0 -50
  67. data/vertical_rulings_bug.rb +0 -29
data/ext/lsd.h CHANGED
@@ -135,10 +135,10 @@
135
135
  line segment number 'n+1' are obtained with
136
136
  'out[7*n+0]' to 'out[7*n+6]'.
137
137
  */
138
- double * LineSegmentDetection( int * n_out,
139
- double * img, int X, int Y,
140
- double scale, double sigma_scale, double quant,
141
- double ang_th, double log_eps, double density_th,
138
+ float * LineSegmentDetection( int * n_out,
139
+ float * img, int X, int Y,
140
+ float scale, float sigma_scale, float quant,
141
+ float ang_th, float log_eps, float density_th,
142
142
  int n_bins,
143
143
  int ** reg_img, int * reg_x, int * reg_y );
144
144
 
@@ -203,8 +203,8 @@ double * LineSegmentDetection( int * n_out,
203
203
  line segment number 'n+1' are obtained with
204
204
  'out[7*n+0]' to 'out[7*n+6]'.
205
205
  */
206
- double * lsd_scale_region( int * n_out,
207
- double * img, int X, int Y, double scale,
206
+ float * lsd_scale_region( int * n_out,
207
+ float * img, int X, int Y, float scale,
208
208
  int ** reg_img, int * reg_x, int * reg_y );
209
209
 
210
210
  /*----------------------------------------------------------------------------*/
@@ -244,7 +244,7 @@ double * lsd_scale_region( int * n_out,
244
244
  line segment number 'n+1' are obtained with
245
245
  'out[7*n+0]' to 'out[7*n+6]'.
246
246
  */
247
- double * lsd_scale(int * n_out, double * img, int X, int Y, double scale);
247
+ float * lsd_scale(int * n_out, float * img, int X, int Y, float scale);
248
248
 
249
249
  /*----------------------------------------------------------------------------*/
250
250
  /** LSD Simple Interface
@@ -275,9 +275,9 @@ double * lsd_scale(int * n_out, double * img, int X, int Y, double scale);
275
275
  line segment number 'n+1' are obtained with
276
276
  'out[7*n+0]' to 'out[7*n+6]'.
277
277
  */
278
- double * lsd(int * n_out, double * img, int X, int Y);
278
+ float * lsd(int * n_out, float * img, int X, int Y);
279
279
 
280
- void free_values(double * p);
280
+ void free_values(float * p);
281
281
 
282
282
  #endif /* !LSD_HEADER */
283
283
  /*----------------------------------------------------------------------------*/
data/lib/tabula.rb CHANGED
@@ -1,13 +1,30 @@
1
1
  module Tabula
2
2
  PDFBOX = 'pdfbox-app-2.0.0-SNAPSHOT.jar'
3
+ ONLY_SPACES_RE = Regexp.new('^\s+$')
3
4
  end
4
5
 
6
+ require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
7
+
8
+ import 'java.util.logging.LogManager'
9
+ import 'java.util.logging.Level'
10
+
11
+ lm = LogManager.log_manager
12
+ lm.logger_names.each do |name|
13
+ if name == "" #rootlogger is apparently the logger PDFBox is talking to.
14
+ l = lm.get_logger(name)
15
+ l.level = Level::OFF
16
+ l.handlers.each do |h|
17
+ h.level = Level::OFF
18
+ end
19
+ end
20
+ end
21
+
22
+
5
23
  require_relative './tabula/version'
24
+ require_relative './tabula/core_ext'
6
25
  require_relative './tabula/entities'
7
- require_relative './tabula/pdf_dump'
26
+ require_relative './tabula/extraction'
8
27
  require_relative './tabula/table_extractor'
9
28
  require_relative './tabula/writers'
10
- require_relative './tabula/table_guesser'
11
29
  require_relative './tabula/line_segment_detector'
12
30
  require_relative './tabula/pdf_render'
13
- #require_relative './tabula/whitespace'
data/lib/tabula/core_ext.rb CHANGED
@@ -1,3 +1,19 @@
1
+ java_import java.awt.geom.Point2D
2
+ java_import java.awt.geom.Line2D
3
+ java_import java.awt.geom.Rectangle2D
4
+ java_import java.awt.Rectangle
5
+
6
+ class Array
7
+ def rpad(padding, target_size)
8
+ if self.size < target_size
9
+ self + [padding] * (target_size - self.size)
10
+ else
11
+ self
12
+ end
13
+ end
14
+ end
15
+
16
+
1
17
  module Enumerable
2
18
 
3
19
  def sum
@@ -23,3 +39,248 @@ module Enumerable
23
39
  end
24
40
 
25
41
  end
42
+
43
+ class Point2D::Float
44
+ def inspect
45
+ toString
46
+ end
47
+
48
+ def to_json(*args)
49
+ [self.getX, self.getY].to_json(*args)
50
+ end
51
+
52
+ def hash
53
+ "#{self.getX},#{self.getY}".hash
54
+ end
55
+
56
+ def <=>(other)
57
+ return 1 if self.y > other.y
58
+ return -1 if self.y < other.y
59
+ return 1 if self.x > other.x
60
+ return -1 if self.x < other.x
61
+ return 0
62
+ end
63
+
64
+ def x_first_cmp(other)
65
+ return 1 if self.x > other.x
66
+ return -1 if self.x < other.x
67
+ return 1 if self.y > other.y
68
+ return -1 if self.y < other.y
69
+ return 0
70
+ end
71
+
72
+ def ==(other)
73
+ return self.x == other.x && self.y == other.y
74
+ end
75
+
76
+ end
77
+
78
+ class Line2D::Float
79
+ def to_json(*args)
80
+ [self.getX1, self.getY1, self.getX2, self.getY2].to_json(*args)
81
+ end
82
+
83
+ def inspect
84
+ "<Line2D::Float[(#{self.getX1},#{self.getY1}),(#{self.getX2},#{self.getY2})]>"
85
+ end
86
+
87
+ def rotate!(pointX, pointY, amount)
88
+ px1 = self.getX1 - pointX; px2 = self.getX2 - pointX
89
+ py1 = self.getY1 - pointY; py2 = self.getY2 - pointY
90
+
91
+ if amount == 90 || amount == -270
92
+ self.java_send :setLine, [Java::float, Java::float, Java::float, Java::float,], pointX - py2, pointY + px1, pointX - py1, pointY + px2
93
+ elsif amount == 270 || amount == -90
94
+ self.java_send :setLine, [Java::float, Java::float, Java::float, Java::float,], pointX + py1, pointY - px2, pointX + py2, pointY - px1
95
+ end
96
+
97
+ end
98
+
99
+ def transform!(affine_transform)
100
+ newP1, newP2 = Point2D::Float.new, Point2D::Float.new
101
+ affine_transform.transform(self.getP1, newP1)
102
+ affine_transform.transform(self.getP2, newP2)
103
+ setLine(newP1, newP2)
104
+ self
105
+ end
106
+
107
+ def snap!(cell_size)
108
+ newP1, newP2 = Point2D::Float.new, Point2D::Float.new
109
+ newP1.java_send :setLocation, [Java::float, Java::float], (self.getX1 / cell_size).round * cell_size, (self.getY1 / cell_size).round * cell_size
110
+ newP2.java_send :setLocation, [Java::float, Java::float], (self.getX2 / cell_size).round * cell_size, (self.getY2 / cell_size).round * cell_size
111
+ setLine(newP1, newP2)
112
+ end
113
+
114
+ def horizontal?(threshold=0.00001)
115
+ (self.getY2 - self.getY1).abs < threshold
116
+ end
117
+
118
+ def vertical?(threshold=0.00001)
119
+ (self.getX2 - self.getX1).abs < threshold
120
+ end
121
+
122
+ end
123
+
124
+ class Rectangle2D::Float
125
+ SIMILARITY_DIVISOR = 20
126
+
127
+ alias_method :top, :minY
128
+ alias_method :right, :maxX
129
+ alias_method :left, :minX
130
+ alias_method :bottom, :maxY
131
+
132
+
133
+ # Implement geometry stuff
134
+ #-------------------------
135
+
136
+ def dims(*format)
137
+ if format
138
+ format.map{|method| self.send(method)}
139
+ else
140
+ [self.x, self.y, self.width, self.height]
141
+ end
142
+ end
143
+
144
+ def top=(new_y)
145
+ delta_height = new_y - self.y
146
+ self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], self.x, new_y, self.width, (self.height - delta_height)
147
+
148
+ #used to be: (fixes test_vertical_rulings_splitting_words)
149
+ # self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], self.x, new_y, self.width, self.height
150
+ end
151
+
152
+ def bottom=(new_y2)
153
+ self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], self.x, self.y, self.width, new_y2 - self.y
154
+ end
155
+
156
+ def left=(new_x)
157
+ delta_width = new_x - self.x
158
+ self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], new_x, self.y, (self.width - delta_width), self.height
159
+ #used to be: (fixes test_vertical_rulings_splitting_words)
160
+ # self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], new_x, self.y, self.width, self.height
161
+ end
162
+
163
+ def right=(new_x2)
164
+ self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], self.x, self.y, new_x2 - self.x, self.height
165
+ end
166
+
167
+ def area
168
+ self.width * self.height
169
+ end
170
+
171
+ # [x, y]
172
+ def midpoint
173
+ [horizontal_midpoint, vertical_midpoint]
174
+ end
175
+
176
+ def horizontal_midpoint
177
+ self.left + (self.width / 2)
178
+ end
179
+
180
+ def vertical_midpoint
181
+ self.top + (self.height / 2)
182
+ end
183
+
184
+ def horizontal_distance(other)
185
+ (other.left - self.right).abs
186
+ end
187
+
188
+ def vertical_distance(other)
189
+ (other.bottom - self.bottom).abs
190
+ end
191
+
192
+
193
+ # Various ways that rectangles can overlap one another
194
+ #------------------------------
195
+
196
+ # Roughly, detects if self and other belong to the same line
197
+ def vertically_overlaps?(other)
198
+ vertical_overlap = [0, [self.bottom, other.bottom].min - [self.top, other.top].max].max
199
+ vertical_overlap > 0
200
+ end
201
+
202
+ # detects if self and other belong to the same column
203
+ def horizontally_overlaps?(other)
204
+ horizontal_overlap = [0, [self.right, other.right].min - [self.left, other.left].max].max
205
+ horizontal_overlap > 0
206
+ end
207
+
208
+ def overlaps?(other)
209
+ self.intersects(*other.dims(:x, :y, :width, :height))
210
+ end
211
+
212
+ def overlaps_with_ratio?(other, ratio_tolerance=0.00001)
213
+ self.overlap_ratio(other) > ratio_tolerance
214
+ end
215
+
216
+ def overlap_ratio(other)
217
+ intersection_width = [0, [self.right, other.right].min - [self.left, other.left].max].max
218
+ intersection_height = [0, [self.bottom, other.bottom].min - [self.top, other.top].max].max
219
+ intersection_area = [0, intersection_height * intersection_width].max
220
+
221
+ union_area = self.area + other.area - intersection_area
222
+ intersection_area / union_area
223
+ end
224
+
225
+ # as defined by PDF-TREX paper
226
+ def horizontal_overlap_ratio(other)
227
+ delta = [self.bottom - self.top, other.bottom - other.top].min
228
+ if [other.top, self.top, other.bottom, self.bottom].sorted?
229
+ (other.bottom - self.top) / delta
230
+ elsif [self.top, other.top, self.bottom, other.bottom].sorted?
231
+ (self.bottom - other.top) / delta
232
+ elsif [self.top, other.top, other.bottom, self.bottom].sorted?
233
+ (other.bottom - other.top) / delta
234
+ elsif [other.top, self.top, self.bottom, other.bottom].sorted?
235
+ (self.bottom - self.top) / delta
236
+ else
237
+ 0
238
+ end
239
+ end
240
+
241
+
242
+ # Funky custom methods (i.e. not just geometry)
243
+ #----------------------------------------------
244
+
245
+ #used for "deduping" similar rectangles detected via CV.
246
+ def similarity_hash
247
+ [self.x.to_i / SIMILARITY_DIVISOR, self.y.to_i / SIMILARITY_DIVISOR, self.width.to_i / SIMILARITY_DIVISOR, self.height.to_i / SIMILARITY_DIVISOR].to_s
248
+ end
249
+
250
+ def self.unionize(non_overlapping_rectangles, next_rect)
251
+ #if next_rect doesn't overlap any of non_overlapping_rectangles
252
+ if !(overlapping = non_overlapping_rectangles.compact.select{|r| next_rect.overlaps? r}).empty? &&
253
+ !non_overlapping_rectangles.empty?
254
+ #remove all of those that it overlaps from non_overlapping_rectangles and
255
+ non_overlapping_rectangles -= overlapping
256
+ #add to non_overlapping_rectangles the bounding box of the overlapping rectangles.
257
+ non_overlapping_rectangles << overlapping.inject(next_rect) do |memo, overlap|
258
+ #all we're doing is unioning `overlap` and `memo` and setting that result to `memo`
259
+ union(overlap, memo, memo) #I </3 Java.
260
+ memo
261
+ end
262
+ else
263
+ non_overlapping_rectangles << next_rect
264
+ end
265
+ end
266
+
267
+ def to_h
268
+ hash = {}
269
+ [:top, :left, :width, :height].each do |m|
270
+ hash[m] = self.send(m)
271
+ end
272
+ hash
273
+ end
274
+
275
+ def inspect
276
+ "#<Rectangle2D dims:[#{top}, #{left}, #{bottom}, #{right}]>"
277
+ end
278
+
279
+ end
280
+
281
+ # used only in GetBounds2D in an intermediate step in HasCells#find_spreadsheets_from_cells
282
+ class Rectangle #java.awt.Rectangle
283
+ def inspect
284
+ "#<Rectangle dims:[x:#{x}, y:#{y}, w:#{width}, h:#{height}]>"
285
+ end
286
+ end
data/lib/tabula/entities.rb CHANGED
@@ -1,456 +1,11 @@
1
- module Tabula
2
-
3
- class ZoneEntity
4
- attr_accessor :top, :left, :width, :height
5
-
6
- attr_accessor :texts
7
-
8
- def initialize(top, left, width, height)
9
- self.top = top
10
- self.left = left
11
- self.width = width
12
- self.height = height
13
- self.texts = []
14
- end
15
-
16
- def bottom
17
- self.top + self.height
18
- end
19
-
20
- def right
21
- self.left + self.width
22
- end
23
-
24
- # [x, y]
25
- def midpoint
26
- [self.left + (self.width / 2), self.top + (self.height / 2)]
27
- end
28
-
29
- def area
30
- self.width * self.height
31
- end
32
-
33
- def merge!(other)
34
- self.top = [self.top, other.top].min
35
- self.left = [self.left, other.left].min
36
- self.width = [self.right, other.right].max - left
37
- self.height = [self.bottom, other.bottom].max - top
38
- end
39
-
40
- def horizontal_distance(other)
41
- (other.left - self.right).abs
42
- end
43
-
44
- def vertical_distance(other)
45
- (other.bottom - self.bottom).abs
46
- end
47
-
48
- # Roughly, detects if self and other belong to the same line
49
- def vertically_overlaps?(other)
50
- vertical_overlap = [0, [self.bottom, other.bottom].min - [self.top, other.top].max].max
51
- vertical_overlap > 0
52
- end
53
-
54
- # detects if self and other belong to the same column
55
- def horizontally_overlaps?(other)
56
- horizontal_overlap = [0, [self.right, other.right].min - [self.left, other.left].max].max
57
- horizontal_overlap > 0
58
- end
59
-
60
- def overlaps?(other, ratio_tolerance=0.00001)
61
- self.overlap_ratio(other) > ratio_tolerance
62
- end
63
-
64
- def overlap_ratio(other)
65
- intersection_width = [0, [self.right, other.right].min - [self.left, other.left].max].max
66
- intersection_height = [0, [self.bottom, other.bottom].min - [self.top, other.top].max].max
67
- intersection_area = [0, intersection_height * intersection_width].max
68
-
69
- union_area = self.area + other.area - intersection_area
70
- intersection_area / union_area
71
- end
72
-
73
- # as defined by PDF-TREX paper
74
- def horizontal_overlap_ratio(other)
75
- delta = [self.bottom - self.top, other.bottom - other.top].min
76
- if [other.top, self.top, other.bottom, self.bottom].sorted?
77
- (other.bottom - self.top) / delta
78
- elsif [self.top, other.top, self.bottom, other.bottom].sorted?
79
- (self.bottom - other.top) / delta
80
- elsif [self.top, other.top, other.bottom, self.bottom].sorted?
81
- (other.bottom - other.top) / delta
82
- elsif [other.top, self.top, self.bottom, other.bottom].sorted?
83
- (self.bottom - self.top) / delta
84
- else
85
- 0
86
- end
87
- end
88
-
89
- def to_h
90
- hash = {}
91
- [:top, :left, :width, :height].each do |m|
92
- hash[m] = self.send(m)
93
- end
94
- hash
95
- end
96
-
97
- def to_json(options={})
98
- self.to_h.to_json
99
- end
100
- end
101
-
102
- class Page < ZoneEntity
103
- attr_reader :rotation, :number
104
-
105
- def initialize(width, height, rotation, number, texts=[])
106
- super(0, 0, width, height)
107
- @rotation = rotation
108
- @number = number
109
- self.texts = texts
110
- end
111
-
112
- # get text, optionally from a provided area in the page [top, left, bottom, right]
113
- def get_text(area=nil)
114
- area = [0, 0, width, height] if area.nil?
115
-
116
- # spaces are not detected, b/c they have height == 0
117
- # ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
118
- # self.texts.select { |t| t.overlaps? ze }
119
- self.texts.select do |t|
120
- t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
121
- end
122
- end
123
-
124
- def to_json(options={})
125
- { :width => self.width,
126
- :height => self.height,
127
- :number => self.number,
128
- :rotation => self.rotation,
129
- :texts => self.texts
130
- }.to_json(options)
131
- end
132
-
133
- end
134
-
135
- class TextElement < ZoneEntity
136
- attr_accessor :font, :font_size, :text, :width_of_space
137
-
138
- CHARACTER_DISTANCE_THRESHOLD = 1.5
139
- TOLERANCE_FACTOR = 0.25 #25
140
-
141
- def initialize(top, left, width, height, font, font_size, text, width_of_space)
142
- super(top, left, width, height)
143
- self.font = font
144
- self.font_size = font_size
145
- self.text = text
146
- self.width_of_space = width_of_space
147
- end
148
-
149
- # more or less returns True if distance < tolerance
150
- def should_merge?(other)
151
- raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
152
- overlaps = self.vertically_overlaps?(other)
153
-
154
- tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
155
-
156
- overlaps or
157
- (self.height == 0 and other.height != 0) or
158
- (other.height == 0 and self.height != 0) and
159
- self.horizontal_distance(other) < tolerance
160
- end
161
-
162
- # more or less returns True if (tolerance <= distance < CHARACTER_DISTANCE_THRESHOLD*tolerance)
163
- def should_add_space?(other)
164
- raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
165
- overlaps = self.vertically_overlaps?(other)
166
-
167
- up_tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
168
- down_tolerance = 0.90 #90?
169
-
170
- dist = self.horizontal_distance(other).abs
171
-
172
- rv = overlaps && (dist.between?(self.width_of_space * down_tolerance, self.width_of_space + up_tolerance))
173
- rv
174
- end
175
-
176
- def merge!(other)
177
- raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
178
- # unless self.horizontally_overlaps?(other) or self.vertically_overlaps?(other)
179
- # raise ArgumentError, "won't merge TextElements that don't overlap"
180
- # end
181
- if self.horizontally_overlaps?(other) and other.top < self.top
182
- self.text = other.text + self.text
183
- else
184
- self.text << other.text
185
- end
186
- super(other)
187
- end
188
-
189
- def to_h
190
- hash = super
191
- [:font, :text].each do |m|
192
- hash[m] = self.send(m)
193
- end
194
- hash
195
- end
196
- end
197
-
198
- class Table
199
- attr_reader :lines
200
- def initialize(line_count, separators)
201
- @separators = separators
202
- @lines = (0...line_count).inject([]) { |m| m << Line.new }
203
- end
204
-
205
- def add_text_element(text_element, i, j)
206
- if @lines.size <= i
207
- @lines[i] = Line.new
208
- end
209
- if @lines[i].text_elements[j]
210
- @lines[i].text_elements[j].merge!(text_element)
211
- else
212
- @lines[i].text_elements[j] = text_element
213
- end
214
- end
215
- end
216
-
217
- class Line < ZoneEntity
218
- attr_accessor :text_elements
219
- attr_reader :index
220
-
221
- def initialize(index=nil)
222
- self.text_elements = []
223
- @index = index
224
- end
225
-
226
- def <<(t)
227
- if self.text_elements.size == 0
228
- self.text_elements << t
229
- self.top = t.top
230
- self.left = t.left
231
- self.width = t.width
232
- self.height = t.height
233
- else
234
- if in_same_column = self.text_elements.find { |te| te.horizontally_overlaps?(t) }
235
- #sometimes a space needs to be added here
236
- unless in_same_column.vertically_overlaps?(t)
237
- t.text = " " + t.text
238
- end
239
- in_same_column.merge!(t)
240
- else
241
- self.text_elements << t
242
- self.merge!(t)
243
- end
244
- end
245
- end
246
-
247
-
248
- end
249
-
250
- class Column < ZoneEntity
251
- attr_accessor :text_elements
252
-
253
- def initialize(left, width, text_elements=[])
254
- super(0, left, width, 0)
255
- @text_elements = text_elements
256
- end
257
-
258
- def <<(te)
259
- self.text_elements << te
260
- self.update_boundaries!(te)
261
- self.text_elements.sort_by! { |t| t.top }
262
- end
263
-
264
- def update_boundaries!(text_element)
265
- self.merge!(text_element)
266
- end
267
-
268
- # this column can be merged with other_column?
269
- def contains?(other_column)
270
- self.horizontally_overlaps?(other_column)
271
- end
272
-
273
- def average_line_distance
274
- # avg distance between lines
275
- # this might help to MERGE lines that are shouldn't be split
276
- # e.g. cells with > 1 lines of text
277
- 1.upto(self.text_elements.size - 1).map { |i|
278
- self.text_elements[i].top - self.text_elements[i - 1].top
279
- }.inject{ |sum, el| sum + el }.to_f / self.text_elements.size
280
- end
281
-
282
- def inspect
283
- vars = (self.instance_variables - [:@text_elements]).map{ |v| "#{v}=#{instance_variable_get(v).inspect}" }
284
- texts = self.text_elements.sort_by { |te| te.top }.map { |te| te.text }
285
- "<#{self.class}: #{vars.join(', ')}, @text_elements=[#{texts.join('], [')}]>"
286
- end
287
-
288
- end
289
-
290
- require_relative './core_ext'
291
-
292
- class Ruling < ZoneEntity
293
- # 2D line intersection test taken from comp.graphics.algorithms FAQ
294
- def intersects?(other)
295
- r = ((self.top-other.top)*(other.right-other.left) - (self.left-other.left)*(other.bottom-other.top)) \
296
- / ((self.right-self.left)*(other.bottom-other.top)-(self.bottom-self.top)*(other.right-other.left))
297
-
298
- s = ((self.top-other.top)*(self.right-self.left) - (self.left-other.left)*(self.bottom-self.top)) \
299
- / ((self.right-self.left)*(other.bottom-other.top) - (self.bottom-self.top)*(other.right-other.left))
300
-
301
- r >= 0 and r < 1 and s >= 0 and s < 1
302
- end
303
-
304
- def length
305
- Math.sqrt( (self.right - self.left).abs ** 2 + (self.bottom - self.top).abs ** 2 )
306
- end
307
-
308
- def vertical?
309
- left == right
310
- end
311
-
312
- def horizontal?
313
- top == bottom
314
- end
315
-
316
- def right
317
- left + width
318
- end
319
- def bottom
320
- top + height
321
- end
322
-
323
- def to_json(arg)
324
- [left, top, right, bottom].to_json
325
- end
326
-
327
- def to_xml
328
- "<ruling x1=\"%.2f\" y1=\"%.2f\" x2=\"%.2f\" y2=\"%.2f\" />" \
329
- % [left, top, right, bottom]
330
- end
331
-
332
- def self.clean_rulings(rulings, max_distance=4)
333
-
334
- # merge horizontal and vertical lines
335
- # TODO this should be iterative
336
-
337
- skip = false
338
-
339
- horiz = rulings.select { |r| r.horizontal? && r.width > max_distance }
340
- .group_by(&:top)
341
- .values.reduce([]) do |memo, rs|
342
-
343
- rs = rs.sort_by(&:left)
344
- if rs.size > 1
345
- memo +=
346
- rs.each_cons(2)
347
- .chunk { |p| p[1].left - p[0].right < 7 }
348
- .select { |c| c[0] }
349
- .map { |group|
350
- group = group.last.flatten.uniq
351
- Tabula::Ruling.new(group[0].top,
352
- group[0].left,
353
- group[-1].right - group[0].left,
354
- 0)
355
- }
356
- Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
357
- else
358
- memo << rs.first
359
- end
360
- memo
361
- end
362
- .sort_by(&:top)
363
-
364
- h = []
365
- horiz.size.times do |i|
366
-
367
- if i == horiz.size - 1
368
- h << horiz[-1]
369
- break
370
- end
371
-
372
- if skip
373
- skip = false;
374
- next
375
- end
376
- d = (horiz[i+1].top - horiz[i].top).abs
377
-
378
- h << if d < 4 # THRESHOLD DISTANCE between horizontal lines
379
- skip = true
380
- Tabula::Ruling.new(horiz[i].top + d / 2, [horiz[i].left, horiz[i+1].left].min, [horiz[i+1].width.abs, horiz[i].width.abs].max, 0)
381
- else
382
- horiz[i]
383
- end
384
- end
385
- horiz = h
386
-
387
- vert = rulings.select { |r| r.vertical? && r.height > max_distance }
388
- .group_by(&:left)
389
- .values
390
- .reduce([]) do |memo, rs|
391
-
392
- rs = rs.sort_by(&:top)
393
-
394
- if rs.size > 1
395
- # Here be dragons:
396
- # merge consecutive segments of lines that are close enough
397
- memo +=
398
- rs.each_cons(2)
399
- .chunk { |p| p[1].top - p[0].bottom < 7 }
400
- .select { |c| c[0] }
401
- .map { |group|
402
- group = group.last.flatten.uniq
403
- Tabula::Ruling.new(group[0].top,
404
- group[0].left,
405
- 0,
406
- group[-1].bottom - group[0].top)
407
- }
408
- else
409
- memo << rs.first
410
- end
411
- memo
412
- end.sort_by(&:left)
413
-
414
- # v = []
415
-
416
- # vert.size.times do |i|
417
- # if i == vert.size - 1
418
- # v << vert[-1]
419
- # break
420
- # end
421
-
422
- # if skip
423
- # skip = false;
424
- # next
425
- # end
426
- # d = (vert[i+1].left - vert[i].left).abs
427
-
428
- # v << if d < 4 # THRESHOLD DISTANCE between vertical lines
429
- # skip = true
430
- # Tabula::Ruling.new([vert[i+1].top, vert[i].top].min, vert[i].left + d / 2, 0, [vert[i+1].height.abs, vert[i].height.abs].max)
431
- # else
432
- # vert[i]
433
- # end
434
- # end
435
- # vert = v
436
-
437
-
438
- # - only keep horizontal rulings that intersect with at least one vertical ruling
439
- # - only keep vertical rulings that intersect with at least one horizontal ruling
440
- # yeah, it's a naive heuristic. but hey, it works.
441
-
442
- # h_mean = horiz.reduce(0) { |accum, i| accum + i.width } / horiz.size
443
- # horiz.reject { |h| h.width < h_mean }
444
-
445
- #vert.delete_if { |v| !horiz.any? { |h| h.intersects?(v) } } unless horiz.empty?
446
- #horiz.delete_if { |h| !vert.any? { |v| v.intersects?(h) } } unless vert.empty?
447
-
448
- return horiz += vert
449
- end
450
-
451
-
452
-
453
-
454
- end
455
-
456
- end
1
+ require_relative './entities/zone_entity'
2
+ require_relative './entities/cell'
3
+ require_relative './entities/has_cells'
4
+ require_relative './entities/line'
5
+ require_relative './entities/page'
6
+ require_relative './entities/page_area'
7
+ require_relative './entities/ruling'
8
+ require_relative './entities/spreadsheet'
9
+ require_relative './entities/table'
10
+ require_relative './entities/text_chunk'
11
+ require_relative './entities/text_element'