tabula-extractor 0.0.1-java → 0.5.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +6 -0
- data/Gemfile +0 -3
- data/README.md +19 -2
- data/Rakefile +4 -5
- data/bin/tabula +27 -7
- data/ext/COPYING +661 -0
- data/ext/Makefile.OSX +15 -0
- data/ext/Makefile.defaults +9 -0
- data/ext/Makefile.linux32 +11 -0
- data/ext/Makefile.linux64 +12 -0
- data/ext/Makefile.mingw +10 -0
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.def +3 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/lsd.c +2270 -0
- data/ext/lsd.h +283 -0
- data/lib/tabula.rb +6 -0
- data/lib/tabula/core_ext.rb +21 -0
- data/lib/tabula/entities.rb +141 -20
- data/lib/tabula/line_segment_detector.rb +99 -0
- data/lib/tabula/pdf_dump.rb +10 -8
- data/lib/tabula/pdf_render.rb +64 -0
- data/lib/tabula/table_extractor.rb +19 -20
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +1 -1
- data/tabula-extractor.gemspec +3 -2
- data/target/{pdfbox-app-1.8.0.jar → pdfbox-app-2.0.0-SNAPSHOT.jar} +0 -0
- data/test/tests.rb +7 -6
- metadata +22 -5
data/ext/lsd.h
ADDED
@@ -0,0 +1,283 @@
|
|
1
|
+
/*----------------------------------------------------------------------------
|
2
|
+
|
3
|
+
LSD - Line Segment Detector on digital images
|
4
|
+
|
5
|
+
This code is part of the following publication and was subject
|
6
|
+
to peer review:
|
7
|
+
|
8
|
+
"LSD: a Line Segment Detector" by Rafael Grompone von Gioi,
|
9
|
+
Jeremie Jakubowicz, Jean-Michel Morel, and Gregory Randall,
|
10
|
+
Image Processing On Line, 2012. DOI:10.5201/ipol.2012.gjmr-lsd
|
11
|
+
http://dx.doi.org/10.5201/ipol.2012.gjmr-lsd
|
12
|
+
|
13
|
+
Copyright (c) 2007-2011 rafael grompone von gioi <grompone@gmail.com>
|
14
|
+
|
15
|
+
This program is free software: you can redistribute it and/or modify
|
16
|
+
it under the terms of the GNU Affero General Public License as
|
17
|
+
published by the Free Software Foundation, either version 3 of the
|
18
|
+
License, or (at your option) any later version.
|
19
|
+
|
20
|
+
This program is distributed in the hope that it will be useful,
|
21
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
22
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
23
|
+
GNU Affero General Public License for more details.
|
24
|
+
|
25
|
+
You should have received a copy of the GNU Affero General Public License
|
26
|
+
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
27
|
+
|
28
|
+
----------------------------------------------------------------------------*/
|
29
|
+
|
30
|
+
/*----------------------------------------------------------------------------*/
|
31
|
+
/** @file lsd.h
|
32
|
+
LSD module header
|
33
|
+
@author rafael grompone von gioi <grompone@gmail.com>
|
34
|
+
*/
|
35
|
+
/*----------------------------------------------------------------------------*/
|
36
|
+
#ifndef LSD_HEADER
|
37
|
+
#define LSD_HEADER
|
38
|
+
|
39
|
+
/*----------------------------------------------------------------------------*/
|
40
|
+
/** LSD Full Interface
|
41
|
+
|
42
|
+
@param n_out Pointer to an int where LSD will store the number of
|
43
|
+
line segments detected.
|
44
|
+
|
45
|
+
@param img Pointer to input image data. It must be an array of
|
46
|
+
doubles of size X x Y, and the pixel at coordinates
|
47
|
+
(x,y) is obtained by img[x+y*X].
|
48
|
+
|
49
|
+
@param X X size of the image: the number of columns.
|
50
|
+
|
51
|
+
@param Y Y size of the image: the number of rows.
|
52
|
+
|
53
|
+
@param scale When different from 1.0, LSD will scale the input image
|
54
|
+
by 'scale' factor by Gaussian filtering, before detecting
|
55
|
+
line segments.
|
56
|
+
Example: if scale=0.8, the input image will be subsampled
|
57
|
+
to 80% of its size, before the line segment detector
|
58
|
+
is applied.
|
59
|
+
Suggested value: 0.8
|
60
|
+
|
61
|
+
@param sigma_scale When scale!=1.0, the sigma of the Gaussian filter is:
|
62
|
+
sigma = sigma_scale / scale, if scale < 1.0
|
63
|
+
sigma = sigma_scale, if scale >= 1.0
|
64
|
+
Suggested value: 0.6
|
65
|
+
|
66
|
+
@param quant Bound to the quantization error on the gradient norm.
|
67
|
+
Example: if gray levels are quantized to integer steps,
|
68
|
+
the gradient (computed by finite differences) error
|
69
|
+
due to quantization will be bounded by 2.0, as the
|
70
|
+
worst case is when the errors are 1 and -1, which
|
71
|
+
gives an error of 2.0.
|
72
|
+
Suggested value: 2.0
|
73
|
+
|
74
|
+
@param ang_th Gradient angle tolerance in the region growing
|
75
|
+
algorithm, in degrees.
|
76
|
+
Suggested value: 22.5
|
77
|
+
|
78
|
+
@param log_eps Detection threshold, accept if -log10(NFA) > log_eps.
|
79
|
+
The larger the value, the more strict the detector is,
|
80
|
+
and will result in fewer detections.
|
81
|
+
(Note that, because of the 'minus sign', this
|
82
|
+
behavior is opposite to that of NFA.)
|
83
|
+
The value -log10(NFA) is equivalent but more
|
84
|
+
intuitive than NFA:
|
85
|
+
- -1.0 gives an average of 10 false detections on noise
|
86
|
+
- 0.0 gives an average of 1 false detection on noise
|
87
|
+
- 1.0 gives an average of 0.1 false detections on noise
|
88
|
+
- 2.0 gives an average of 0.01 false detections on noise
|
89
|
+
.
|
90
|
+
Suggested value: 0.0
|
91
|
+
|
92
|
+
@param density_th Minimal proportion of 'supporting' points in a rectangle.
|
93
|
+
Suggested value: 0.7
|
94
|
+
|
95
|
+
@param n_bins Number of bins used in the pseudo-ordering of gradient
|
96
|
+
modulus.
|
97
|
+
Suggested value: 1024
|
98
|
+
|
99
|
+
@param reg_img Optional output: if desired, LSD will return an
|
100
|
+
int image where each pixel indicates the line segment
|
101
|
+
to which it belongs. Unused pixels have the value '0',
|
102
|
+
while the used ones have the number of the line segment,
|
103
|
+
numbered 1,2,3,..., in the same order as in the
|
104
|
+
output list. If desired, a non NULL int** pointer must
|
105
|
+
be assigned, and LSD will make the pointer point
|
106
|
+
to an int array of size reg_x x reg_y, where the pixel
|
107
|
+
value at (x,y) is obtained with (*reg_img)[x+y*reg_x].
|
108
|
+
Note that the resulting image has the size of the image
|
109
|
+
used for the processing, that is, the size of the input
|
110
|
+
image scaled by the given factor 'scale'. If scale!=1
|
111
|
+
this size differs from XxY and that is the reason why
|
112
|
+
its value is given by reg_x and reg_y.
|
113
|
+
Suggested value: NULL
|
114
|
+
|
115
|
+
@param reg_x Pointer to an int where LSD will put the X size
|
116
|
+
of the 'reg_img' image, when asked for.
|
117
|
+
Suggested value: NULL
|
118
|
+
|
119
|
+
@param reg_y Pointer to an int where LSD will put the Y size
|
120
|
+
of the 'reg_img' image, when asked for.
|
121
|
+
Suggested value: NULL
|
122
|
+
|
123
|
+
@return A double array of size 7 x n_out, containing the list
|
124
|
+
of line segments detected. The array contains first
|
125
|
+
7 values of line segment number 1, then the 7 values
|
126
|
+
of line segment number 2, and so on, and it finishes
|
127
|
+
with the 7 values of line segment number n_out.
|
128
|
+
The seven values are:
|
129
|
+
- x1,y1,x2,y2,width,p,-log10(NFA)
|
130
|
+
.
|
131
|
+
for a line segment from coordinates (x1,y1) to (x2,y2),
|
132
|
+
a width 'width', an angle precision of p in (0,1) given
|
133
|
+
by angle_tolerance/180 degree, and NFA value 'NFA'.
|
134
|
+
If 'out' is the returned pointer, the 7 values of
|
135
|
+
line segment number 'n+1' are obtained with
|
136
|
+
'out[7*n+0]' to 'out[7*n+6]'.
|
137
|
+
*/
|
138
|
+
double * LineSegmentDetection( int * n_out,
|
139
|
+
double * img, int X, int Y,
|
140
|
+
double scale, double sigma_scale, double quant,
|
141
|
+
double ang_th, double log_eps, double density_th,
|
142
|
+
int n_bins,
|
143
|
+
int ** reg_img, int * reg_x, int * reg_y );
|
144
|
+
|
145
|
+
/*----------------------------------------------------------------------------*/
|
146
|
+
/** LSD Simple Interface with Scale and Region output.
|
147
|
+
|
148
|
+
@param n_out Pointer to an int where LSD will store the number of
|
149
|
+
line segments detected.
|
150
|
+
|
151
|
+
@param img Pointer to input image data. It must be an array of
|
152
|
+
doubles of size X x Y, and the pixel at coordinates
|
153
|
+
(x,y) is obtained by img[x+y*X].
|
154
|
+
|
155
|
+
@param X X size of the image: the number of columns.
|
156
|
+
|
157
|
+
@param Y Y size of the image: the number of rows.
|
158
|
+
|
159
|
+
@param scale When different from 1.0, LSD will scale the input image
|
160
|
+
by 'scale' factor by Gaussian filtering, before detecting
|
161
|
+
line segments.
|
162
|
+
Example: if scale=0.8, the input image will be subsampled
|
163
|
+
to 80% of its size, before the line segment detector
|
164
|
+
is applied.
|
165
|
+
Suggested value: 0.8
|
166
|
+
|
167
|
+
@param reg_img Optional output: if desired, LSD will return an
|
168
|
+
int image where each pixel indicates the line segment
|
169
|
+
to which it belongs. Unused pixels have the value '0',
|
170
|
+
while the used ones have the number of the line segment,
|
171
|
+
numbered 1,2,3,..., in the same order as in the
|
172
|
+
output list. If desired, a non NULL int** pointer must
|
173
|
+
be assigned, and LSD will make the pointer point
|
174
|
+
to an int array of size reg_x x reg_y, where the pixel
|
175
|
+
value at (x,y) is obtained with (*reg_img)[x+y*reg_x].
|
176
|
+
Note that the resulting image has the size of the image
|
177
|
+
used for the processing, that is, the size of the input
|
178
|
+
image scaled by the given factor 'scale'. If scale!=1
|
179
|
+
this size differs from XxY and that is the reason why
|
180
|
+
its value is given by reg_x and reg_y.
|
181
|
+
Suggested value: NULL
|
182
|
+
|
183
|
+
@param reg_x Pointer to an int where LSD will put the X size
|
184
|
+
of the 'reg_img' image, when asked for.
|
185
|
+
Suggested value: NULL
|
186
|
+
|
187
|
+
@param reg_y Pointer to an int where LSD will put the Y size
|
188
|
+
of the 'reg_img' image, when asked for.
|
189
|
+
Suggested value: NULL
|
190
|
+
|
191
|
+
@return A double array of size 7 x n_out, containing the list
|
192
|
+
of line segments detected. The array contains first
|
193
|
+
7 values of line segment number 1, then the 7 values
|
194
|
+
of line segment number 2, and so on, and it finishes
|
195
|
+
with the 7 values of line segment number n_out.
|
196
|
+
The seven values are:
|
197
|
+
- x1,y1,x2,y2,width,p,-log10(NFA)
|
198
|
+
.
|
199
|
+
for a line segment from coordinates (x1,y1) to (x2,y2),
|
200
|
+
a width 'width', an angle precision of p in (0,1) given
|
201
|
+
by angle_tolerance/180 degree, and NFA value 'NFA'.
|
202
|
+
If 'out' is the returned pointer, the 7 values of
|
203
|
+
line segment number 'n+1' are obtained with
|
204
|
+
'out[7*n+0]' to 'out[7*n+6]'.
|
205
|
+
*/
|
206
|
+
double * lsd_scale_region( int * n_out,
|
207
|
+
double * img, int X, int Y, double scale,
|
208
|
+
int ** reg_img, int * reg_x, int * reg_y );
|
209
|
+
|
210
|
+
/*----------------------------------------------------------------------------*/
|
211
|
+
/** LSD Simple Interface with Scale
|
212
|
+
|
213
|
+
@param n_out Pointer to an int where LSD will store the number of
|
214
|
+
line segments detected.
|
215
|
+
|
216
|
+
@param img Pointer to input image data. It must be an array of
|
217
|
+
doubles of size X x Y, and the pixel at coordinates
|
218
|
+
(x,y) is obtained by img[x+y*X].
|
219
|
+
|
220
|
+
@param X X size of the image: the number of columns.
|
221
|
+
|
222
|
+
@param Y Y size of the image: the number of rows.
|
223
|
+
|
224
|
+
@param scale When different from 1.0, LSD will scale the input image
|
225
|
+
by 'scale' factor by Gaussian filtering, before detecting
|
226
|
+
line segments.
|
227
|
+
Example: if scale=0.8, the input image will be subsampled
|
228
|
+
to 80% of its size, before the line segment detector
|
229
|
+
is applied.
|
230
|
+
Suggested value: 0.8
|
231
|
+
|
232
|
+
@return A double array of size 7 x n_out, containing the list
|
233
|
+
of line segments detected. The array contains first
|
234
|
+
7 values of line segment number 1, then the 7 values
|
235
|
+
of line segment number 2, and so on, and it finishes
|
236
|
+
with the 7 values of line segment number n_out.
|
237
|
+
The seven values are:
|
238
|
+
- x1,y1,x2,y2,width,p,-log10(NFA)
|
239
|
+
.
|
240
|
+
for a line segment from coordinates (x1,y1) to (x2,y2),
|
241
|
+
a width 'width', an angle precision of p in (0,1) given
|
242
|
+
by angle_tolerance/180 degree, and NFA value 'NFA'.
|
243
|
+
If 'out' is the returned pointer, the 7 values of
|
244
|
+
line segment number 'n+1' are obtained with
|
245
|
+
'out[7*n+0]' to 'out[7*n+6]'.
|
246
|
+
*/
|
247
|
+
double * lsd_scale(int * n_out, double * img, int X, int Y, double scale);
|
248
|
+
|
249
|
+
/*----------------------------------------------------------------------------*/
|
250
|
+
/** LSD Simple Interface
|
251
|
+
|
252
|
+
@param n_out Pointer to an int where LSD will store the number of
|
253
|
+
line segments detected.
|
254
|
+
|
255
|
+
@param img Pointer to input image data. It must be an array of
|
256
|
+
doubles of size X x Y, and the pixel at coordinates
|
257
|
+
(x,y) is obtained by img[x+y*X].
|
258
|
+
|
259
|
+
@param X X size of the image: the number of columns.
|
260
|
+
|
261
|
+
@param Y Y size of the image: the number of rows.
|
262
|
+
|
263
|
+
@return A double array of size 7 x n_out, containing the list
|
264
|
+
of line segments detected. The array contains first
|
265
|
+
7 values of line segment number 1, then the 7 values
|
266
|
+
of line segment number 2, and so on, and it finishes
|
267
|
+
with the 7 values of line segment number n_out.
|
268
|
+
The seven values are:
|
269
|
+
- x1,y1,x2,y2,width,p,-log10(NFA)
|
270
|
+
.
|
271
|
+
for a line segment from coordinates (x1,y1) to (x2,y2),
|
272
|
+
a width 'width', an angle precision of p in (0,1) given
|
273
|
+
by angle_tolerance/180 degree, and NFA value 'NFA'.
|
274
|
+
If 'out' is the returned pointer, the 7 values of
|
275
|
+
line segment number 'n+1' are obtained with
|
276
|
+
'out[7*n+0]' to 'out[7*n+6]'.
|
277
|
+
*/
|
278
|
+
double * lsd(int * n_out, double * img, int X, int Y);
|
279
|
+
|
280
|
+
void free_values(double * p);
|
281
|
+
|
282
|
+
#endif /* !LSD_HEADER */
|
283
|
+
/*----------------------------------------------------------------------------*/
|
data/lib/tabula.rb
CHANGED
@@ -1,5 +1,11 @@
|
|
1
|
+
module Tabula
|
2
|
+
PDFBOX = 'pdfbox-app-2.0.0-SNAPSHOT.jar'
|
3
|
+
end
|
4
|
+
|
1
5
|
require_relative './tabula/version'
|
2
6
|
require_relative './tabula/entities'
|
3
7
|
require_relative './tabula/pdf_dump'
|
4
8
|
require_relative './tabula/table_extractor'
|
5
9
|
require_relative './tabula/writers'
|
10
|
+
require_relative './tabula/line_segment_detector'
|
11
|
+
require_relative './tabula/pdf_render'
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Enumerable
|
2
|
+
|
3
|
+
def sum
|
4
|
+
self.inject(0){|accum, i| accum + i }
|
5
|
+
end
|
6
|
+
|
7
|
+
def mean
|
8
|
+
self.sum/self.length.to_f
|
9
|
+
end
|
10
|
+
|
11
|
+
def sample_variance
|
12
|
+
m = self.mean
|
13
|
+
sum = self.inject(0){|accum, i| accum +(i-m)**2 }
|
14
|
+
sum/(self.length - 1).to_f
|
15
|
+
end
|
16
|
+
|
17
|
+
def standard_deviation
|
18
|
+
return Math.sqrt(self.sample_variance)
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|
data/lib/tabula/entities.rb
CHANGED
@@ -96,8 +96,13 @@ module Tabula
|
|
96
96
|
# get text, optionally from a provided area in the page [top, left, bottom, right]
|
97
97
|
def get_text(area=nil)
|
98
98
|
area = [0, 0, width, height] if area.nil?
|
99
|
-
|
100
|
-
|
99
|
+
|
100
|
+
# spaces are not detected, b/c they have height == 0
|
101
|
+
# ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
|
102
|
+
# self.texts.select { |t| t.overlaps? ze }
|
103
|
+
self.texts.select { |t|
|
104
|
+
t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
|
105
|
+
}
|
101
106
|
end
|
102
107
|
|
103
108
|
def to_json(options={})
|
@@ -112,15 +117,17 @@ module Tabula
|
|
112
117
|
end
|
113
118
|
|
114
119
|
class TextElement < ZoneEntity
|
115
|
-
attr_accessor :font, :font_size, :text
|
120
|
+
attr_accessor :font, :font_size, :text, :width_of_space
|
116
121
|
|
117
122
|
CHARACTER_DISTANCE_THRESHOLD = 1.5
|
123
|
+
TOLERANCE_FACTOR = 0.25
|
118
124
|
|
119
|
-
def initialize(top, left, width, height, font, font_size, text)
|
125
|
+
def initialize(top, left, width, height, font, font_size, text, width_of_space)
|
120
126
|
super(top, left, width, height)
|
121
127
|
self.font = font
|
122
128
|
self.font_size = font_size
|
123
129
|
self.text = text
|
130
|
+
self.width_of_space = width_of_space
|
124
131
|
end
|
125
132
|
|
126
133
|
# more or less returns True if distance < tolerance
|
@@ -128,7 +135,7 @@ module Tabula
|
|
128
135
|
raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
|
129
136
|
overlaps = self.vertically_overlaps?(other)
|
130
137
|
|
131
|
-
tolerance = ((self.font_size + other.font_size) / 2) *
|
138
|
+
tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
|
132
139
|
|
133
140
|
overlaps or
|
134
141
|
(self.height == 0 and other.height != 0) or
|
@@ -141,13 +148,13 @@ module Tabula
|
|
141
148
|
raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
|
142
149
|
overlaps = self.vertically_overlaps?(other)
|
143
150
|
|
144
|
-
|
151
|
+
up_tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
|
152
|
+
down_tolerance = 0.95
|
145
153
|
|
146
|
-
dist = self.horizontal_distance(other)
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
((tolerance <= dist) and (dist < tolerance*CHARACTER_DISTANCE_THRESHOLD))
|
154
|
+
dist = self.horizontal_distance(other).abs
|
155
|
+
|
156
|
+
rv = overlaps && (dist.between?(self.width_of_space * down_tolerance, self.width_of_space + up_tolerance))
|
157
|
+
rv
|
151
158
|
end
|
152
159
|
|
153
160
|
def merge!(other)
|
@@ -235,25 +242,139 @@ module Tabula
|
|
235
242
|
def inspect
|
236
243
|
vars = (self.instance_variables - [:@text_elements]).map{ |v| "#{v}=#{instance_variable_get(v).inspect}" }
|
237
244
|
texts = self.text_elements.sort_by { |te| te.top }.map { |te| te.text }
|
238
|
-
"<#{self.class}: #{vars.join(', ')}, @text_elements
|
245
|
+
"<#{self.class}: #{vars.join(', ')}, @text_elements=[#{texts.join('], [')}]>"
|
239
246
|
end
|
240
247
|
|
241
248
|
end
|
242
249
|
|
250
|
+
require_relative './core_ext'
|
251
|
+
|
243
252
|
class Ruling < ZoneEntity
|
244
|
-
|
253
|
+
# 2D line intersection test taken from comp.graphics.algorithms FAQ
|
254
|
+
def intersects?(other)
|
255
|
+
r = ((self.top-other.top)*(other.right-other.left) - (self.left-other.left)*(other.bottom-other.top)) \
|
256
|
+
/ ((self.right-self.left)*(other.bottom-other.top)-(self.bottom-self.top)*(other.right-other.left))
|
245
257
|
|
246
|
-
|
247
|
-
|
248
|
-
|
258
|
+
s = ((self.top-other.top)*(self.right-self.left) - (self.left-other.left)*(self.bottom-self.top)) \
|
259
|
+
/ ((self.right-self.left)*(other.bottom-other.top) - (self.bottom-self.top)*(other.right-other.left))
|
260
|
+
|
261
|
+
r >= 0 and r < 1 and s >= 0 and s < 1
|
249
262
|
end
|
250
263
|
|
251
|
-
def
|
252
|
-
|
253
|
-
|
254
|
-
|
264
|
+
def vertical?
|
265
|
+
left == right
|
266
|
+
end
|
267
|
+
|
268
|
+
def horizontal?
|
269
|
+
top == bottom
|
270
|
+
end
|
271
|
+
|
272
|
+
def to_json(arg)
|
273
|
+
[left, top, right, bottom].to_json
|
274
|
+
end
|
275
|
+
|
276
|
+
def to_xml
|
277
|
+
"<ruling x1=\"%.2f\" y1=\"%.2f\" x2=\"%.2f\" y2=\"%.2f\" />" \
|
278
|
+
% [left, top, right, bottom]
|
279
|
+
end
|
280
|
+
|
281
|
+
def self.clean_rulings(rulings, max_distance=4)
|
282
|
+
|
283
|
+
# merge horizontal and vertical lines
|
284
|
+
# TODO this should be iterative
|
285
|
+
|
286
|
+
skip = false
|
287
|
+
|
288
|
+
horiz = rulings.select { |r| r.horizontal? && r.width > max_distance }
|
289
|
+
.group_by(&:top)
|
290
|
+
.values.reduce([]) { |memo, rs|
|
291
|
+
rs = rs.sort_by(&:left)
|
292
|
+
|
293
|
+
memo << if rs.size > 1
|
294
|
+
Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
|
295
|
+
else
|
296
|
+
rs.first
|
297
|
+
end
|
298
|
+
|
299
|
+
}
|
300
|
+
.sort_by(&:top)
|
301
|
+
|
302
|
+
h = []
|
303
|
+
horiz.size.times do |i|
|
304
|
+
|
305
|
+
if i == horiz.size - 1
|
306
|
+
h << horiz[-1]
|
307
|
+
break
|
308
|
+
end
|
309
|
+
|
310
|
+
if skip
|
311
|
+
skip = false;
|
312
|
+
next
|
313
|
+
end
|
314
|
+
d = (horiz[i+1].top - horiz[i].top).abs
|
315
|
+
|
316
|
+
h << if d < 4 # THRESHOLD DISTANCE between horizontal lines
|
317
|
+
skip = true
|
318
|
+
Tabula::Ruling.new(horiz[i].top + d / 2, [horiz[i].left, horiz[i+1].left].min, [horiz[i+1].width.abs, horiz[i].width.abs].max, 0)
|
319
|
+
else
|
320
|
+
horiz[i]
|
321
|
+
end
|
322
|
+
end
|
323
|
+
horiz = h
|
324
|
+
|
325
|
+
vert = rulings.select { |r| r.vertical? && r.height > max_distance }
|
326
|
+
.group_by(&:left)
|
327
|
+
.values.reduce([]) { |memo, rs|
|
328
|
+
|
329
|
+
rs = rs.sort_by(&:top)
|
330
|
+
memo << if rs.size > 1
|
331
|
+
Tabula::Ruling.new(rs[0].top, rs[0].left, 0, rs[-1].bottom - rs[0].top)
|
332
|
+
else rs.first
|
333
|
+
rs.first
|
334
|
+
end
|
335
|
+
}
|
336
|
+
.sort_by(&:left)
|
337
|
+
|
338
|
+
v = []
|
339
|
+
vert.size.times do |i|
|
340
|
+
|
341
|
+
if i == vert.size - 1
|
342
|
+
v << vert[-1]
|
343
|
+
break
|
344
|
+
end
|
345
|
+
|
346
|
+
if skip
|
347
|
+
skip = false;
|
348
|
+
next
|
349
|
+
end
|
350
|
+
d = (vert[i+1].left - vert[i].left).abs
|
351
|
+
|
352
|
+
v << if d < 4 # THRESHOLD DISTANCE between vertical lines
|
353
|
+
skip = true
|
354
|
+
Tabula::Ruling.new([vert[i+1].top, vert[i].top].min, vert[i].left + d / 2, 0, [vert[i+1].height.abs, vert[i].height.abs].max)
|
355
|
+
else
|
356
|
+
vert[i]
|
357
|
+
end
|
358
|
+
end
|
359
|
+
vert = v
|
360
|
+
|
361
|
+
|
362
|
+
# - only keep horizontal rulings that intersect with at least one vertical ruling
|
363
|
+
# - only keep vertical rulings that intersect with at least one horizontal ruling
|
364
|
+
# yeah, it's a naive heuristic. but hey, it works.
|
365
|
+
|
366
|
+
# h_mean = horiz.reduce(0) { |accum, i| accum + i.width } / horiz.size
|
367
|
+
# horiz.reject { |h| h.width < h_mean }
|
368
|
+
|
369
|
+
#vert.delete_if { |v| !horiz.any? { |h| h.intersects?(v) } } unless horiz.empty?
|
370
|
+
#horiz.delete_if { |h| !vert.any? { |v| v.intersects?(h) } } unless vert.empty?
|
371
|
+
|
372
|
+
return horiz += vert
|
255
373
|
end
|
256
374
|
|
375
|
+
|
376
|
+
|
377
|
+
|
257
378
|
end
|
258
379
|
|
259
380
|
end
|