tabula-extractor 0.6.6-java → 0.7.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/AUTHORS.md +1 -0
- data/README.md +27 -11
- data/bin/tabula +61 -19
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +137 -137
- data/ext/lsd.h +9 -9
- data/lib/tabula.rb +20 -3
- data/lib/tabula/core_ext.rb +261 -0
- data/lib/tabula/entities.rb +11 -456
- data/lib/tabula/entities/cell.rb +42 -0
- data/lib/tabula/entities/has_cells.rb +244 -0
- data/lib/tabula/entities/line.rb +39 -0
- data/lib/tabula/entities/page.rb +269 -0
- data/lib/tabula/entities/page_area.rb +7 -0
- data/lib/tabula/entities/ruling.rb +300 -0
- data/lib/tabula/entities/spreadsheet.rb +92 -0
- data/lib/tabula/entities/table.rb +81 -0
- data/lib/tabula/entities/text_chunk.rb +114 -0
- data/lib/tabula/entities/text_element.rb +112 -0
- data/lib/tabula/entities/zone_entity.rb +57 -0
- data/lib/tabula/extraction.rb +327 -0
- data/lib/tabula/line_segment_detector.rb +9 -7
- data/lib/tabula/pdf_line_extractor.rb +319 -0
- data/lib/tabula/pdf_render.rb +1 -5
- data/lib/tabula/spreadsheet_extractor.rb +52 -0
- data/lib/tabula/table_extractor.rb +50 -348
- data/lib/tabula/table_guesser.rb +21 -23
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +5 -6
- data/tabula-extractor.gemspec +1 -0
- data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +88 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +21 -0
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +50 -0
- data/test/test_bin_tabula.sh +7 -0
- data/test/tests.rb +476 -63
- metadata +79 -28
- data/lib/geom/point.rb +0 -21
- data/lib/geom/rectangle.rb +0 -101
- data/lib/geom/segment.rb +0 -82
- data/lib/tabula/pdf_dump.rb +0 -132
- data/lib/tabula/whitespace.rb +0 -50
- data/vertical_rulings_bug.rb +0 -29
data/ext/lsd.h
CHANGED
@@ -135,10 +135,10 @@
|
|
135
135
|
line segment number 'n+1' are obtained with
|
136
136
|
'out[7*n+0]' to 'out[7*n+6]'.
|
137
137
|
*/
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
138
|
+
float * LineSegmentDetection( int * n_out,
|
139
|
+
float * img, int X, int Y,
|
140
|
+
float scale, float sigma_scale, float quant,
|
141
|
+
float ang_th, float log_eps, float density_th,
|
142
142
|
int n_bins,
|
143
143
|
int ** reg_img, int * reg_x, int * reg_y );
|
144
144
|
|
@@ -203,8 +203,8 @@ double * LineSegmentDetection( int * n_out,
|
|
203
203
|
line segment number 'n+1' are obtained with
|
204
204
|
'out[7*n+0]' to 'out[7*n+6]'.
|
205
205
|
*/
|
206
|
-
|
207
|
-
|
206
|
+
float * lsd_scale_region( int * n_out,
|
207
|
+
float * img, int X, int Y, float scale,
|
208
208
|
int ** reg_img, int * reg_x, int * reg_y );
|
209
209
|
|
210
210
|
/*----------------------------------------------------------------------------*/
|
@@ -244,7 +244,7 @@ double * lsd_scale_region( int * n_out,
|
|
244
244
|
line segment number 'n+1' are obtained with
|
245
245
|
'out[7*n+0]' to 'out[7*n+6]'.
|
246
246
|
*/
|
247
|
-
|
247
|
+
float * lsd_scale(int * n_out, float * img, int X, int Y, float scale);
|
248
248
|
|
249
249
|
/*----------------------------------------------------------------------------*/
|
250
250
|
/** LSD Simple Interface
|
@@ -275,9 +275,9 @@ double * lsd_scale(int * n_out, double * img, int X, int Y, double scale);
|
|
275
275
|
line segment number 'n+1' are obtained with
|
276
276
|
'out[7*n+0]' to 'out[7*n+6]'.
|
277
277
|
*/
|
278
|
-
|
278
|
+
float * lsd(int * n_out, float * img, int X, int Y);
|
279
279
|
|
280
|
-
void free_values(
|
280
|
+
void free_values(float * p);
|
281
281
|
|
282
282
|
#endif /* !LSD_HEADER */
|
283
283
|
/*----------------------------------------------------------------------------*/
|
data/lib/tabula.rb
CHANGED
@@ -1,13 +1,30 @@
|
|
1
1
|
module Tabula
|
2
2
|
PDFBOX = 'pdfbox-app-2.0.0-SNAPSHOT.jar'
|
3
|
+
ONLY_SPACES_RE = Regexp.new('^\s+$')
|
3
4
|
end
|
4
5
|
|
6
|
+
require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
|
7
|
+
|
8
|
+
import 'java.util.logging.LogManager'
|
9
|
+
import 'java.util.logging.Level'
|
10
|
+
|
11
|
+
lm = LogManager.log_manager
|
12
|
+
lm.logger_names.each do |name|
|
13
|
+
if name == "" #rootlogger is apparently the logger PDFBox is talking to.
|
14
|
+
l = lm.get_logger(name)
|
15
|
+
l.level = Level::OFF
|
16
|
+
l.handlers.each do |h|
|
17
|
+
h.level = Level::OFF
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
|
5
23
|
require_relative './tabula/version'
|
24
|
+
require_relative './tabula/core_ext'
|
6
25
|
require_relative './tabula/entities'
|
7
|
-
require_relative './tabula/
|
26
|
+
require_relative './tabula/extraction'
|
8
27
|
require_relative './tabula/table_extractor'
|
9
28
|
require_relative './tabula/writers'
|
10
|
-
require_relative './tabula/table_guesser'
|
11
29
|
require_relative './tabula/line_segment_detector'
|
12
30
|
require_relative './tabula/pdf_render'
|
13
|
-
#require_relative './tabula/whitespace'
|
data/lib/tabula/core_ext.rb
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
java_import java.awt.geom.Point2D
|
2
|
+
java_import java.awt.geom.Line2D
|
3
|
+
java_import java.awt.geom.Rectangle2D
|
4
|
+
java_import java.awt.Rectangle
|
5
|
+
|
6
|
+
class Array
|
7
|
+
def rpad(padding, target_size)
|
8
|
+
if self.size < target_size
|
9
|
+
self + [padding] * (target_size - self.size)
|
10
|
+
else
|
11
|
+
self
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
|
1
17
|
module Enumerable
|
2
18
|
|
3
19
|
def sum
|
@@ -23,3 +39,248 @@ module Enumerable
|
|
23
39
|
end
|
24
40
|
|
25
41
|
end
|
42
|
+
|
43
|
+
class Point2D::Float
|
44
|
+
def inspect
|
45
|
+
toString
|
46
|
+
end
|
47
|
+
|
48
|
+
def to_json(*args)
|
49
|
+
[self.getX, self.getY].to_json(*args)
|
50
|
+
end
|
51
|
+
|
52
|
+
def hash
|
53
|
+
"#{self.getX},#{self.getY}".hash
|
54
|
+
end
|
55
|
+
|
56
|
+
def <=>(other)
|
57
|
+
return 1 if self.y > other.y
|
58
|
+
return -1 if self.y < other.y
|
59
|
+
return 1 if self.x > other.x
|
60
|
+
return -1 if self.x < other.x
|
61
|
+
return 0
|
62
|
+
end
|
63
|
+
|
64
|
+
def x_first_cmp(other)
|
65
|
+
return 1 if self.x > other.x
|
66
|
+
return -1 if self.x < other.x
|
67
|
+
return 1 if self.y > other.y
|
68
|
+
return -1 if self.y < other.y
|
69
|
+
return 0
|
70
|
+
end
|
71
|
+
|
72
|
+
def ==(other)
|
73
|
+
return self.x == other.x && self.y == other.y
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
77
|
+
|
78
|
+
class Line2D::Float
|
79
|
+
def to_json(*args)
|
80
|
+
[self.getX1, self.getY1, self.getX2, self.getY2].to_json(*args)
|
81
|
+
end
|
82
|
+
|
83
|
+
def inspect
|
84
|
+
"<Line2D::Float[(#{self.getX1},#{self.getY1}),(#{self.getX2},#{self.getY2})]>"
|
85
|
+
end
|
86
|
+
|
87
|
+
def rotate!(pointX, pointY, amount)
|
88
|
+
px1 = self.getX1 - pointX; px2 = self.getX2 - pointX
|
89
|
+
py1 = self.getY1 - pointY; py2 = self.getY2 - pointY
|
90
|
+
|
91
|
+
if amount == 90 || amount == -270
|
92
|
+
self.java_send :setLine, [Java::float, Java::float, Java::float, Java::float,], pointX - py2, pointY + px1, pointX - py1, pointY + px2
|
93
|
+
elsif amount == 270 || amount == -90
|
94
|
+
self.java_send :setLine, [Java::float, Java::float, Java::float, Java::float,], pointX + py1, pointY - px2, pointX + py2, pointY - px1
|
95
|
+
end
|
96
|
+
|
97
|
+
end
|
98
|
+
|
99
|
+
def transform!(affine_transform)
|
100
|
+
newP1, newP2 = Point2D::Float.new, Point2D::Float.new
|
101
|
+
affine_transform.transform(self.getP1, newP1)
|
102
|
+
affine_transform.transform(self.getP2, newP2)
|
103
|
+
setLine(newP1, newP2)
|
104
|
+
self
|
105
|
+
end
|
106
|
+
|
107
|
+
def snap!(cell_size)
|
108
|
+
newP1, newP2 = Point2D::Float.new, Point2D::Float.new
|
109
|
+
newP1.java_send :setLocation, [Java::float, Java::float], (self.getX1 / cell_size).round * cell_size, (self.getY1 / cell_size).round * cell_size
|
110
|
+
newP2.java_send :setLocation, [Java::float, Java::float], (self.getX2 / cell_size).round * cell_size, (self.getY2 / cell_size).round * cell_size
|
111
|
+
setLine(newP1, newP2)
|
112
|
+
end
|
113
|
+
|
114
|
+
def horizontal?(threshold=0.00001)
|
115
|
+
(self.getY2 - self.getY1).abs < threshold
|
116
|
+
end
|
117
|
+
|
118
|
+
def vertical?(threshold=0.00001)
|
119
|
+
(self.getX2 - self.getX1).abs < threshold
|
120
|
+
end
|
121
|
+
|
122
|
+
end
|
123
|
+
|
124
|
+
class Rectangle2D::Float
|
125
|
+
SIMILARITY_DIVISOR = 20
|
126
|
+
|
127
|
+
alias_method :top, :minY
|
128
|
+
alias_method :right, :maxX
|
129
|
+
alias_method :left, :minX
|
130
|
+
alias_method :bottom, :maxY
|
131
|
+
|
132
|
+
|
133
|
+
# Implement geometry stuff
|
134
|
+
#-------------------------
|
135
|
+
|
136
|
+
def dims(*format)
|
137
|
+
if format
|
138
|
+
format.map{|method| self.send(method)}
|
139
|
+
else
|
140
|
+
[self.x, self.y, self.width, self.height]
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
def top=(new_y)
|
145
|
+
delta_height = new_y - self.y
|
146
|
+
self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], self.x, new_y, self.width, (self.height - delta_height)
|
147
|
+
|
148
|
+
#used to be: (fixes test_vertical_rulings_splitting_words)
|
149
|
+
# self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], self.x, new_y, self.width, self.height
|
150
|
+
end
|
151
|
+
|
152
|
+
def bottom=(new_y2)
|
153
|
+
self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], self.x, self.y, self.width, new_y2 - self.y
|
154
|
+
end
|
155
|
+
|
156
|
+
def left=(new_x)
|
157
|
+
delta_width = new_x - self.x
|
158
|
+
self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], new_x, self.y, (self.width - delta_width), self.height
|
159
|
+
#used to be: (fixes test_vertical_rulings_splitting_words)
|
160
|
+
# self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], new_x, self.y, self.width, self.height
|
161
|
+
end
|
162
|
+
|
163
|
+
def right=(new_x2)
|
164
|
+
self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float,], self.x, self.y, new_x2 - self.x, self.height
|
165
|
+
end
|
166
|
+
|
167
|
+
def area
|
168
|
+
self.width * self.height
|
169
|
+
end
|
170
|
+
|
171
|
+
# [x, y]
|
172
|
+
def midpoint
|
173
|
+
[horizontal_midpoint, vertical_midpoint]
|
174
|
+
end
|
175
|
+
|
176
|
+
def horizontal_midpoint
|
177
|
+
self.left + (self.width / 2)
|
178
|
+
end
|
179
|
+
|
180
|
+
def vertical_midpoint
|
181
|
+
self.top + (self.height / 2)
|
182
|
+
end
|
183
|
+
|
184
|
+
def horizontal_distance(other)
|
185
|
+
(other.left - self.right).abs
|
186
|
+
end
|
187
|
+
|
188
|
+
def vertical_distance(other)
|
189
|
+
(other.bottom - self.bottom).abs
|
190
|
+
end
|
191
|
+
|
192
|
+
|
193
|
+
# Various ways that rectangles can overlap one another
|
194
|
+
#------------------------------
|
195
|
+
|
196
|
+
# Roughly, detects if self and other belong to the same line
|
197
|
+
def vertically_overlaps?(other)
|
198
|
+
vertical_overlap = [0, [self.bottom, other.bottom].min - [self.top, other.top].max].max
|
199
|
+
vertical_overlap > 0
|
200
|
+
end
|
201
|
+
|
202
|
+
# detects if self and other belong to the same column
|
203
|
+
def horizontally_overlaps?(other)
|
204
|
+
horizontal_overlap = [0, [self.right, other.right].min - [self.left, other.left].max].max
|
205
|
+
horizontal_overlap > 0
|
206
|
+
end
|
207
|
+
|
208
|
+
def overlaps?(other)
|
209
|
+
self.intersects(*other.dims(:x, :y, :width, :height))
|
210
|
+
end
|
211
|
+
|
212
|
+
def overlaps_with_ratio?(other, ratio_tolerance=0.00001)
|
213
|
+
self.overlap_ratio(other) > ratio_tolerance
|
214
|
+
end
|
215
|
+
|
216
|
+
def overlap_ratio(other)
|
217
|
+
intersection_width = [0, [self.right, other.right].min - [self.left, other.left].max].max
|
218
|
+
intersection_height = [0, [self.bottom, other.bottom].min - [self.top, other.top].max].max
|
219
|
+
intersection_area = [0, intersection_height * intersection_width].max
|
220
|
+
|
221
|
+
union_area = self.area + other.area - intersection_area
|
222
|
+
intersection_area / union_area
|
223
|
+
end
|
224
|
+
|
225
|
+
# as defined by PDF-TREX paper
|
226
|
+
def horizontal_overlap_ratio(other)
|
227
|
+
delta = [self.bottom - self.top, other.bottom - other.top].min
|
228
|
+
if [other.top, self.top, other.bottom, self.bottom].sorted?
|
229
|
+
(other.bottom - self.top) / delta
|
230
|
+
elsif [self.top, other.top, self.bottom, other.bottom].sorted?
|
231
|
+
(self.bottom - other.top) / delta
|
232
|
+
elsif [self.top, other.top, other.bottom, self.bottom].sorted?
|
233
|
+
(other.bottom - other.top) / delta
|
234
|
+
elsif [other.top, self.top, self.bottom, other.bottom].sorted?
|
235
|
+
(self.bottom - self.top) / delta
|
236
|
+
else
|
237
|
+
0
|
238
|
+
end
|
239
|
+
end
|
240
|
+
|
241
|
+
|
242
|
+
# Funky custom methods (i.e. not just geometry)
|
243
|
+
#----------------------------------------------
|
244
|
+
|
245
|
+
#used for "deduping" similar rectangles detected via CV.
|
246
|
+
def similarity_hash
|
247
|
+
[self.x.to_i / SIMILARITY_DIVISOR, self.y.to_i / SIMILARITY_DIVISOR, self.width.to_i / SIMILARITY_DIVISOR, self.height.to_i / SIMILARITY_DIVISOR].to_s
|
248
|
+
end
|
249
|
+
|
250
|
+
def self.unionize(non_overlapping_rectangles, next_rect)
|
251
|
+
#if next_rect doesn't overlap any of non_overlapping_rectangles
|
252
|
+
if !(overlapping = non_overlapping_rectangles.compact.select{|r| next_rect.overlaps? r}).empty? &&
|
253
|
+
!non_overlapping_rectangles.empty?
|
254
|
+
#remove all of those that it overlaps from non_overlapping_rectangles and
|
255
|
+
non_overlapping_rectangles -= overlapping
|
256
|
+
#add to non_overlapping_rectangles the bounding box of the overlapping rectangles.
|
257
|
+
non_overlapping_rectangles << overlapping.inject(next_rect) do |memo, overlap|
|
258
|
+
#all we're doing is unioning `overlap` and `memo` and setting that result to `memo`
|
259
|
+
union(overlap, memo, memo) #I </3 Java.
|
260
|
+
memo
|
261
|
+
end
|
262
|
+
else
|
263
|
+
non_overlapping_rectangles << next_rect
|
264
|
+
end
|
265
|
+
end
|
266
|
+
|
267
|
+
def to_h
|
268
|
+
hash = {}
|
269
|
+
[:top, :left, :width, :height].each do |m|
|
270
|
+
hash[m] = self.send(m)
|
271
|
+
end
|
272
|
+
hash
|
273
|
+
end
|
274
|
+
|
275
|
+
def inspect
|
276
|
+
"#<Rectangle2D dims:[#{top}, #{left}, #{bottom}, #{right}]>"
|
277
|
+
end
|
278
|
+
|
279
|
+
end
|
280
|
+
|
281
|
+
# used only in GetBounds2D in an intermediate step in HasCells#find_spreadsheets_from_cells
|
282
|
+
class Rectangle #java.awt.Rectangle
|
283
|
+
def inspect
|
284
|
+
"#<Rectangle dims:[x:#{x}, y:#{y}, w:#{width}, h:#{height}]>"
|
285
|
+
end
|
286
|
+
end
|
data/lib/tabula/entities.rb
CHANGED
@@ -1,456 +1,11 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
self.height = height
|
13
|
-
self.texts = []
|
14
|
-
end
|
15
|
-
|
16
|
-
def bottom
|
17
|
-
self.top + self.height
|
18
|
-
end
|
19
|
-
|
20
|
-
def right
|
21
|
-
self.left + self.width
|
22
|
-
end
|
23
|
-
|
24
|
-
# [x, y]
|
25
|
-
def midpoint
|
26
|
-
[self.left + (self.width / 2), self.top + (self.height / 2)]
|
27
|
-
end
|
28
|
-
|
29
|
-
def area
|
30
|
-
self.width * self.height
|
31
|
-
end
|
32
|
-
|
33
|
-
def merge!(other)
|
34
|
-
self.top = [self.top, other.top].min
|
35
|
-
self.left = [self.left, other.left].min
|
36
|
-
self.width = [self.right, other.right].max - left
|
37
|
-
self.height = [self.bottom, other.bottom].max - top
|
38
|
-
end
|
39
|
-
|
40
|
-
def horizontal_distance(other)
|
41
|
-
(other.left - self.right).abs
|
42
|
-
end
|
43
|
-
|
44
|
-
def vertical_distance(other)
|
45
|
-
(other.bottom - self.bottom).abs
|
46
|
-
end
|
47
|
-
|
48
|
-
# Roughly, detects if self and other belong to the same line
|
49
|
-
def vertically_overlaps?(other)
|
50
|
-
vertical_overlap = [0, [self.bottom, other.bottom].min - [self.top, other.top].max].max
|
51
|
-
vertical_overlap > 0
|
52
|
-
end
|
53
|
-
|
54
|
-
# detects if self and other belong to the same column
|
55
|
-
def horizontally_overlaps?(other)
|
56
|
-
horizontal_overlap = [0, [self.right, other.right].min - [self.left, other.left].max].max
|
57
|
-
horizontal_overlap > 0
|
58
|
-
end
|
59
|
-
|
60
|
-
def overlaps?(other, ratio_tolerance=0.00001)
|
61
|
-
self.overlap_ratio(other) > ratio_tolerance
|
62
|
-
end
|
63
|
-
|
64
|
-
def overlap_ratio(other)
|
65
|
-
intersection_width = [0, [self.right, other.right].min - [self.left, other.left].max].max
|
66
|
-
intersection_height = [0, [self.bottom, other.bottom].min - [self.top, other.top].max].max
|
67
|
-
intersection_area = [0, intersection_height * intersection_width].max
|
68
|
-
|
69
|
-
union_area = self.area + other.area - intersection_area
|
70
|
-
intersection_area / union_area
|
71
|
-
end
|
72
|
-
|
73
|
-
# as defined by PDF-TREX paper
|
74
|
-
def horizontal_overlap_ratio(other)
|
75
|
-
delta = [self.bottom - self.top, other.bottom - other.top].min
|
76
|
-
if [other.top, self.top, other.bottom, self.bottom].sorted?
|
77
|
-
(other.bottom - self.top) / delta
|
78
|
-
elsif [self.top, other.top, self.bottom, other.bottom].sorted?
|
79
|
-
(self.bottom - other.top) / delta
|
80
|
-
elsif [self.top, other.top, other.bottom, self.bottom].sorted?
|
81
|
-
(other.bottom - other.top) / delta
|
82
|
-
elsif [other.top, self.top, self.bottom, other.bottom].sorted?
|
83
|
-
(self.bottom - self.top) / delta
|
84
|
-
else
|
85
|
-
0
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
def to_h
|
90
|
-
hash = {}
|
91
|
-
[:top, :left, :width, :height].each do |m|
|
92
|
-
hash[m] = self.send(m)
|
93
|
-
end
|
94
|
-
hash
|
95
|
-
end
|
96
|
-
|
97
|
-
def to_json(options={})
|
98
|
-
self.to_h.to_json
|
99
|
-
end
|
100
|
-
end
|
101
|
-
|
102
|
-
class Page < ZoneEntity
|
103
|
-
attr_reader :rotation, :number
|
104
|
-
|
105
|
-
def initialize(width, height, rotation, number, texts=[])
|
106
|
-
super(0, 0, width, height)
|
107
|
-
@rotation = rotation
|
108
|
-
@number = number
|
109
|
-
self.texts = texts
|
110
|
-
end
|
111
|
-
|
112
|
-
# get text, optionally from a provided area in the page [top, left, bottom, right]
|
113
|
-
def get_text(area=nil)
|
114
|
-
area = [0, 0, width, height] if area.nil?
|
115
|
-
|
116
|
-
# spaces are not detected, b/c they have height == 0
|
117
|
-
# ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
|
118
|
-
# self.texts.select { |t| t.overlaps? ze }
|
119
|
-
self.texts.select do |t|
|
120
|
-
t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
|
121
|
-
end
|
122
|
-
end
|
123
|
-
|
124
|
-
def to_json(options={})
|
125
|
-
{ :width => self.width,
|
126
|
-
:height => self.height,
|
127
|
-
:number => self.number,
|
128
|
-
:rotation => self.rotation,
|
129
|
-
:texts => self.texts
|
130
|
-
}.to_json(options)
|
131
|
-
end
|
132
|
-
|
133
|
-
end
|
134
|
-
|
135
|
-
class TextElement < ZoneEntity
|
136
|
-
attr_accessor :font, :font_size, :text, :width_of_space
|
137
|
-
|
138
|
-
CHARACTER_DISTANCE_THRESHOLD = 1.5
|
139
|
-
TOLERANCE_FACTOR = 0.25 #25
|
140
|
-
|
141
|
-
def initialize(top, left, width, height, font, font_size, text, width_of_space)
|
142
|
-
super(top, left, width, height)
|
143
|
-
self.font = font
|
144
|
-
self.font_size = font_size
|
145
|
-
self.text = text
|
146
|
-
self.width_of_space = width_of_space
|
147
|
-
end
|
148
|
-
|
149
|
-
# more or less returns True if distance < tolerance
|
150
|
-
def should_merge?(other)
|
151
|
-
raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
|
152
|
-
overlaps = self.vertically_overlaps?(other)
|
153
|
-
|
154
|
-
tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
|
155
|
-
|
156
|
-
overlaps or
|
157
|
-
(self.height == 0 and other.height != 0) or
|
158
|
-
(other.height == 0 and self.height != 0) and
|
159
|
-
self.horizontal_distance(other) < tolerance
|
160
|
-
end
|
161
|
-
|
162
|
-
# more or less returns True if (tolerance <= distance < CHARACTER_DISTANCE_THRESHOLD*tolerance)
|
163
|
-
def should_add_space?(other)
|
164
|
-
raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
|
165
|
-
overlaps = self.vertically_overlaps?(other)
|
166
|
-
|
167
|
-
up_tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
|
168
|
-
down_tolerance = 0.90 #90?
|
169
|
-
|
170
|
-
dist = self.horizontal_distance(other).abs
|
171
|
-
|
172
|
-
rv = overlaps && (dist.between?(self.width_of_space * down_tolerance, self.width_of_space + up_tolerance))
|
173
|
-
rv
|
174
|
-
end
|
175
|
-
|
176
|
-
def merge!(other)
|
177
|
-
raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
|
178
|
-
# unless self.horizontally_overlaps?(other) or self.vertically_overlaps?(other)
|
179
|
-
# raise ArgumentError, "won't merge TextElements that don't overlap"
|
180
|
-
# end
|
181
|
-
if self.horizontally_overlaps?(other) and other.top < self.top
|
182
|
-
self.text = other.text + self.text
|
183
|
-
else
|
184
|
-
self.text << other.text
|
185
|
-
end
|
186
|
-
super(other)
|
187
|
-
end
|
188
|
-
|
189
|
-
def to_h
|
190
|
-
hash = super
|
191
|
-
[:font, :text].each do |m|
|
192
|
-
hash[m] = self.send(m)
|
193
|
-
end
|
194
|
-
hash
|
195
|
-
end
|
196
|
-
end
|
197
|
-
|
198
|
-
class Table
|
199
|
-
attr_reader :lines
|
200
|
-
def initialize(line_count, separators)
|
201
|
-
@separators = separators
|
202
|
-
@lines = (0...line_count).inject([]) { |m| m << Line.new }
|
203
|
-
end
|
204
|
-
|
205
|
-
def add_text_element(text_element, i, j)
|
206
|
-
if @lines.size <= i
|
207
|
-
@lines[i] = Line.new
|
208
|
-
end
|
209
|
-
if @lines[i].text_elements[j]
|
210
|
-
@lines[i].text_elements[j].merge!(text_element)
|
211
|
-
else
|
212
|
-
@lines[i].text_elements[j] = text_element
|
213
|
-
end
|
214
|
-
end
|
215
|
-
end
|
216
|
-
|
217
|
-
class Line < ZoneEntity
|
218
|
-
attr_accessor :text_elements
|
219
|
-
attr_reader :index
|
220
|
-
|
221
|
-
def initialize(index=nil)
|
222
|
-
self.text_elements = []
|
223
|
-
@index = index
|
224
|
-
end
|
225
|
-
|
226
|
-
def <<(t)
|
227
|
-
if self.text_elements.size == 0
|
228
|
-
self.text_elements << t
|
229
|
-
self.top = t.top
|
230
|
-
self.left = t.left
|
231
|
-
self.width = t.width
|
232
|
-
self.height = t.height
|
233
|
-
else
|
234
|
-
if in_same_column = self.text_elements.find { |te| te.horizontally_overlaps?(t) }
|
235
|
-
#sometimes a space needs to be added here
|
236
|
-
unless in_same_column.vertically_overlaps?(t)
|
237
|
-
t.text = " " + t.text
|
238
|
-
end
|
239
|
-
in_same_column.merge!(t)
|
240
|
-
else
|
241
|
-
self.text_elements << t
|
242
|
-
self.merge!(t)
|
243
|
-
end
|
244
|
-
end
|
245
|
-
end
|
246
|
-
|
247
|
-
|
248
|
-
end
|
249
|
-
|
250
|
-
class Column < ZoneEntity
|
251
|
-
attr_accessor :text_elements
|
252
|
-
|
253
|
-
def initialize(left, width, text_elements=[])
|
254
|
-
super(0, left, width, 0)
|
255
|
-
@text_elements = text_elements
|
256
|
-
end
|
257
|
-
|
258
|
-
def <<(te)
|
259
|
-
self.text_elements << te
|
260
|
-
self.update_boundaries!(te)
|
261
|
-
self.text_elements.sort_by! { |t| t.top }
|
262
|
-
end
|
263
|
-
|
264
|
-
def update_boundaries!(text_element)
|
265
|
-
self.merge!(text_element)
|
266
|
-
end
|
267
|
-
|
268
|
-
# this column can be merged with other_column?
|
269
|
-
def contains?(other_column)
|
270
|
-
self.horizontally_overlaps?(other_column)
|
271
|
-
end
|
272
|
-
|
273
|
-
def average_line_distance
|
274
|
-
# avg distance between lines
|
275
|
-
# this might help to MERGE lines that are shouldn't be split
|
276
|
-
# e.g. cells with > 1 lines of text
|
277
|
-
1.upto(self.text_elements.size - 1).map { |i|
|
278
|
-
self.text_elements[i].top - self.text_elements[i - 1].top
|
279
|
-
}.inject{ |sum, el| sum + el }.to_f / self.text_elements.size
|
280
|
-
end
|
281
|
-
|
282
|
-
def inspect
|
283
|
-
vars = (self.instance_variables - [:@text_elements]).map{ |v| "#{v}=#{instance_variable_get(v).inspect}" }
|
284
|
-
texts = self.text_elements.sort_by { |te| te.top }.map { |te| te.text }
|
285
|
-
"<#{self.class}: #{vars.join(', ')}, @text_elements=[#{texts.join('], [')}]>"
|
286
|
-
end
|
287
|
-
|
288
|
-
end
|
289
|
-
|
290
|
-
require_relative './core_ext'
|
291
|
-
|
292
|
-
class Ruling < ZoneEntity
|
293
|
-
# 2D line intersection test taken from comp.graphics.algorithms FAQ
|
294
|
-
def intersects?(other)
|
295
|
-
r = ((self.top-other.top)*(other.right-other.left) - (self.left-other.left)*(other.bottom-other.top)) \
|
296
|
-
/ ((self.right-self.left)*(other.bottom-other.top)-(self.bottom-self.top)*(other.right-other.left))
|
297
|
-
|
298
|
-
s = ((self.top-other.top)*(self.right-self.left) - (self.left-other.left)*(self.bottom-self.top)) \
|
299
|
-
/ ((self.right-self.left)*(other.bottom-other.top) - (self.bottom-self.top)*(other.right-other.left))
|
300
|
-
|
301
|
-
r >= 0 and r < 1 and s >= 0 and s < 1
|
302
|
-
end
|
303
|
-
|
304
|
-
def length
|
305
|
-
Math.sqrt( (self.right - self.left).abs ** 2 + (self.bottom - self.top).abs ** 2 )
|
306
|
-
end
|
307
|
-
|
308
|
-
def vertical?
|
309
|
-
left == right
|
310
|
-
end
|
311
|
-
|
312
|
-
def horizontal?
|
313
|
-
top == bottom
|
314
|
-
end
|
315
|
-
|
316
|
-
def right
|
317
|
-
left + width
|
318
|
-
end
|
319
|
-
def bottom
|
320
|
-
top + height
|
321
|
-
end
|
322
|
-
|
323
|
-
def to_json(arg)
|
324
|
-
[left, top, right, bottom].to_json
|
325
|
-
end
|
326
|
-
|
327
|
-
def to_xml
|
328
|
-
"<ruling x1=\"%.2f\" y1=\"%.2f\" x2=\"%.2f\" y2=\"%.2f\" />" \
|
329
|
-
% [left, top, right, bottom]
|
330
|
-
end
|
331
|
-
|
332
|
-
def self.clean_rulings(rulings, max_distance=4)
|
333
|
-
|
334
|
-
# merge horizontal and vertical lines
|
335
|
-
# TODO this should be iterative
|
336
|
-
|
337
|
-
skip = false
|
338
|
-
|
339
|
-
horiz = rulings.select { |r| r.horizontal? && r.width > max_distance }
|
340
|
-
.group_by(&:top)
|
341
|
-
.values.reduce([]) do |memo, rs|
|
342
|
-
|
343
|
-
rs = rs.sort_by(&:left)
|
344
|
-
if rs.size > 1
|
345
|
-
memo +=
|
346
|
-
rs.each_cons(2)
|
347
|
-
.chunk { |p| p[1].left - p[0].right < 7 }
|
348
|
-
.select { |c| c[0] }
|
349
|
-
.map { |group|
|
350
|
-
group = group.last.flatten.uniq
|
351
|
-
Tabula::Ruling.new(group[0].top,
|
352
|
-
group[0].left,
|
353
|
-
group[-1].right - group[0].left,
|
354
|
-
0)
|
355
|
-
}
|
356
|
-
Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
|
357
|
-
else
|
358
|
-
memo << rs.first
|
359
|
-
end
|
360
|
-
memo
|
361
|
-
end
|
362
|
-
.sort_by(&:top)
|
363
|
-
|
364
|
-
h = []
|
365
|
-
horiz.size.times do |i|
|
366
|
-
|
367
|
-
if i == horiz.size - 1
|
368
|
-
h << horiz[-1]
|
369
|
-
break
|
370
|
-
end
|
371
|
-
|
372
|
-
if skip
|
373
|
-
skip = false;
|
374
|
-
next
|
375
|
-
end
|
376
|
-
d = (horiz[i+1].top - horiz[i].top).abs
|
377
|
-
|
378
|
-
h << if d < 4 # THRESHOLD DISTANCE between horizontal lines
|
379
|
-
skip = true
|
380
|
-
Tabula::Ruling.new(horiz[i].top + d / 2, [horiz[i].left, horiz[i+1].left].min, [horiz[i+1].width.abs, horiz[i].width.abs].max, 0)
|
381
|
-
else
|
382
|
-
horiz[i]
|
383
|
-
end
|
384
|
-
end
|
385
|
-
horiz = h
|
386
|
-
|
387
|
-
vert = rulings.select { |r| r.vertical? && r.height > max_distance }
|
388
|
-
.group_by(&:left)
|
389
|
-
.values
|
390
|
-
.reduce([]) do |memo, rs|
|
391
|
-
|
392
|
-
rs = rs.sort_by(&:top)
|
393
|
-
|
394
|
-
if rs.size > 1
|
395
|
-
# Here be dragons:
|
396
|
-
# merge consecutive segments of lines that are close enough
|
397
|
-
memo +=
|
398
|
-
rs.each_cons(2)
|
399
|
-
.chunk { |p| p[1].top - p[0].bottom < 7 }
|
400
|
-
.select { |c| c[0] }
|
401
|
-
.map { |group|
|
402
|
-
group = group.last.flatten.uniq
|
403
|
-
Tabula::Ruling.new(group[0].top,
|
404
|
-
group[0].left,
|
405
|
-
0,
|
406
|
-
group[-1].bottom - group[0].top)
|
407
|
-
}
|
408
|
-
else
|
409
|
-
memo << rs.first
|
410
|
-
end
|
411
|
-
memo
|
412
|
-
end.sort_by(&:left)
|
413
|
-
|
414
|
-
# v = []
|
415
|
-
|
416
|
-
# vert.size.times do |i|
|
417
|
-
# if i == vert.size - 1
|
418
|
-
# v << vert[-1]
|
419
|
-
# break
|
420
|
-
# end
|
421
|
-
|
422
|
-
# if skip
|
423
|
-
# skip = false;
|
424
|
-
# next
|
425
|
-
# end
|
426
|
-
# d = (vert[i+1].left - vert[i].left).abs
|
427
|
-
|
428
|
-
# v << if d < 4 # THRESHOLD DISTANCE between vertical lines
|
429
|
-
# skip = true
|
430
|
-
# Tabula::Ruling.new([vert[i+1].top, vert[i].top].min, vert[i].left + d / 2, 0, [vert[i+1].height.abs, vert[i].height.abs].max)
|
431
|
-
# else
|
432
|
-
# vert[i]
|
433
|
-
# end
|
434
|
-
# end
|
435
|
-
# vert = v
|
436
|
-
|
437
|
-
|
438
|
-
# - only keep horizontal rulings that intersect with at least one vertical ruling
|
439
|
-
# - only keep vertical rulings that intersect with at least one horizontal ruling
|
440
|
-
# yeah, it's a naive heuristic. but hey, it works.
|
441
|
-
|
442
|
-
# h_mean = horiz.reduce(0) { |accum, i| accum + i.width } / horiz.size
|
443
|
-
# horiz.reject { |h| h.width < h_mean }
|
444
|
-
|
445
|
-
#vert.delete_if { |v| !horiz.any? { |h| h.intersects?(v) } } unless horiz.empty?
|
446
|
-
#horiz.delete_if { |h| !vert.any? { |v| v.intersects?(h) } } unless vert.empty?
|
447
|
-
|
448
|
-
return horiz += vert
|
449
|
-
end
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
end
|
455
|
-
|
456
|
-
end
|
1
|
+
require_relative './entities/zone_entity'
|
2
|
+
require_relative './entities/cell'
|
3
|
+
require_relative './entities/has_cells'
|
4
|
+
require_relative './entities/line'
|
5
|
+
require_relative './entities/page'
|
6
|
+
require_relative './entities/page_area'
|
7
|
+
require_relative './entities/ruling'
|
8
|
+
require_relative './entities/spreadsheet'
|
9
|
+
require_relative './entities/table'
|
10
|
+
require_relative './entities/text_chunk'
|
11
|
+
require_relative './entities/text_element'
|