tabula-extractor 0.0.1-java → 0.5.0-java

Sign up to get free protection for your applications and to get access to all the features.
data/ext/lsd.h ADDED
@@ -0,0 +1,283 @@
1
+ /*----------------------------------------------------------------------------
2
+
3
+ LSD - Line Segment Detector on digital images
4
+
5
+ This code is part of the following publication and was subject
6
+ to peer review:
7
+
8
+ "LSD: a Line Segment Detector" by Rafael Grompone von Gioi,
9
+ Jeremie Jakubowicz, Jean-Michel Morel, and Gregory Randall,
10
+ Image Processing On Line, 2012. DOI:10.5201/ipol.2012.gjmr-lsd
11
+ http://dx.doi.org/10.5201/ipol.2012.gjmr-lsd
12
+
13
+ Copyright (c) 2007-2011 rafael grompone von gioi <grompone@gmail.com>
14
+
15
+ This program is free software: you can redistribute it and/or modify
16
+ it under the terms of the GNU Affero General Public License as
17
+ published by the Free Software Foundation, either version 3 of the
18
+ License, or (at your option) any later version.
19
+
20
+ This program is distributed in the hope that it will be useful,
21
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
22
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23
+ GNU Affero General Public License for more details.
24
+
25
+ You should have received a copy of the GNU Affero General Public License
26
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
27
+
28
+ ----------------------------------------------------------------------------*/
29
+
30
+ /*----------------------------------------------------------------------------*/
31
+ /** @file lsd.h
32
+ LSD module header
33
+ @author rafael grompone von gioi <grompone@gmail.com>
34
+ */
35
+ /*----------------------------------------------------------------------------*/
36
+ #ifndef LSD_HEADER
37
+ #define LSD_HEADER
38
+
39
+ /*----------------------------------------------------------------------------*/
40
+ /** LSD Full Interface
41
+
42
+ @param n_out Pointer to an int where LSD will store the number of
43
+ line segments detected.
44
+
45
+ @param img Pointer to input image data. It must be an array of
46
+ doubles of size X x Y, and the pixel at coordinates
47
+ (x,y) is obtained by img[x+y*X].
48
+
49
+ @param X X size of the image: the number of columns.
50
+
51
+ @param Y Y size of the image: the number of rows.
52
+
53
+ @param scale When different from 1.0, LSD will scale the input image
54
+ by 'scale' factor by Gaussian filtering, before detecting
55
+ line segments.
56
+ Example: if scale=0.8, the input image will be subsampled
57
+ to 80% of its size, before the line segment detector
58
+ is applied.
59
+ Suggested value: 0.8
60
+
61
+ @param sigma_scale When scale!=1.0, the sigma of the Gaussian filter is:
62
+ sigma = sigma_scale / scale, if scale < 1.0
63
+ sigma = sigma_scale, if scale >= 1.0
64
+ Suggested value: 0.6
65
+
66
+ @param quant Bound to the quantization error on the gradient norm.
67
+ Example: if gray levels are quantized to integer steps,
68
+ the gradient (computed by finite differences) error
69
+ due to quantization will be bounded by 2.0, as the
70
+ worst case is when the errors are 1 and -1, which
71
+ gives an error of 2.0.
72
+ Suggested value: 2.0
73
+
74
+ @param ang_th Gradient angle tolerance in the region growing
75
+ algorithm, in degrees.
76
+ Suggested value: 22.5
77
+
78
+ @param log_eps Detection threshold, accept if -log10(NFA) > log_eps.
79
+ The larger the value, the more strict the detector is,
80
+ and will result in fewer detections.
81
+ (Note that the 'minus sign' makes this
82
+ behavior the opposite of that of NFA.)
83
+ The value -log10(NFA) is equivalent but more
84
+ intuitive than NFA:
85
+ - -1.0 gives an average of 10 false detections on noise
86
+ - 0.0 gives an average of 1 false detection on noise
87
+ - 1.0 gives an average of 0.1 false detections on noise
88
+ - 2.0 gives an average of 0.01 false detections on noise
89
+ .
90
+ Suggested value: 0.0
91
+
92
+ @param density_th Minimal proportion of 'supporting' points in a rectangle.
93
+ Suggested value: 0.7
94
+
95
+ @param n_bins Number of bins used in the pseudo-ordering of gradient
96
+ modulus.
97
+ Suggested value: 1024
98
+
99
+ @param reg_img Optional output: if desired, LSD will return an
100
+ int image where each pixel indicates the line segment
101
+ to which it belongs. Unused pixels have the value '0',
102
+ while the used ones have the number of the line segment,
103
+ numbered 1,2,3,..., in the same order as in the
104
+ output list. If desired, a non NULL int** pointer must
105
+ be assigned, and LSD will make the pointer point
106
+ to an int array of size reg_x x reg_y, where the pixel
107
+ value at (x,y) is obtained with (*reg_img)[x+y*reg_x].
108
+ Note that the resulting image has the size of the image
109
+ used for the processing, that is, the size of the input
110
+ image scaled by the given factor 'scale'. If scale!=1
111
+ this size differs from XxY and that is the reason why
112
+ its value is given by reg_x and reg_y.
113
+ Suggested value: NULL
114
+
115
+ @param reg_x Pointer to an int where LSD will put the X size
116
+ of the 'reg_img' image, when asked for.
117
+ Suggested value: NULL
118
+
119
+ @param reg_y Pointer to an int where LSD will put the Y size
120
+ of the 'reg_img' image, when asked for.
121
+ Suggested value: NULL
122
+
123
+ @return A double array of size 7 x n_out, containing the list
124
+ of line segments detected. The array contains first
125
+ 7 values of line segment number 1, then the 7 values
126
+ of line segment number 2, and so on, and it finishes
127
+ with the 7 values of line segment number n_out.
128
+ The seven values are:
129
+ - x1,y1,x2,y2,width,p,-log10(NFA)
130
+ .
131
+ for a line segment from coordinates (x1,y1) to (x2,y2),
132
+ a width 'width', an angle precision of p in (0,1) given
133
+ by angle_tolerance/180 degree, and NFA value 'NFA'.
134
+ If 'out' is the returned pointer, the 7 values of
135
+ line segment number 'n+1' are obtained with
136
+ 'out[7*n+0]' to 'out[7*n+6]'.
137
+ */
138
+ double * LineSegmentDetection( int * n_out,
139
+ double * img, int X, int Y,
140
+ double scale, double sigma_scale, double quant,
141
+ double ang_th, double log_eps, double density_th,
142
+ int n_bins,
143
+ int ** reg_img, int * reg_x, int * reg_y );
144
+
145
+ /*----------------------------------------------------------------------------*/
146
+ /** LSD Simple Interface with Scale and Region output.
147
+
148
+ @param n_out Pointer to an int where LSD will store the number of
149
+ line segments detected.
150
+
151
+ @param img Pointer to input image data. It must be an array of
152
+ doubles of size X x Y, and the pixel at coordinates
153
+ (x,y) is obtained by img[x+y*X].
154
+
155
+ @param X X size of the image: the number of columns.
156
+
157
+ @param Y Y size of the image: the number of rows.
158
+
159
+ @param scale When different from 1.0, LSD will scale the input image
160
+ by 'scale' factor by Gaussian filtering, before detecting
161
+ line segments.
162
+ Example: if scale=0.8, the input image will be subsampled
163
+ to 80% of its size, before the line segment detector
164
+ is applied.
165
+ Suggested value: 0.8
166
+
167
+ @param reg_img Optional output: if desired, LSD will return an
168
+ int image where each pixel indicates the line segment
169
+ to which it belongs. Unused pixels have the value '0',
170
+ while the used ones have the number of the line segment,
171
+ numbered 1,2,3,..., in the same order as in the
172
+ output list. If desired, a non NULL int** pointer must
173
+ be assigned, and LSD will make the pointer point
174
+ to an int array of size reg_x x reg_y, where the pixel
175
+ value at (x,y) is obtained with (*reg_img)[x+y*reg_x].
176
+ Note that the resulting image has the size of the image
177
+ used for the processing, that is, the size of the input
178
+ image scaled by the given factor 'scale'. If scale!=1
179
+ this size differs from XxY and that is the reason why
180
+ its value is given by reg_x and reg_y.
181
+ Suggested value: NULL
182
+
183
+ @param reg_x Pointer to an int where LSD will put the X size
184
+ of the 'reg_img' image, when asked for.
185
+ Suggested value: NULL
186
+
187
+ @param reg_y Pointer to an int where LSD will put the Y size
188
+ of the 'reg_img' image, when asked for.
189
+ Suggested value: NULL
190
+
191
+ @return A double array of size 7 x n_out, containing the list
192
+ of line segments detected. The array contains first
193
+ 7 values of line segment number 1, then the 7 values
194
+ of line segment number 2, and so on, and it finishes
195
+ with the 7 values of line segment number n_out.
196
+ The seven values are:
197
+ - x1,y1,x2,y2,width,p,-log10(NFA)
198
+ .
199
+ for a line segment from coordinates (x1,y1) to (x2,y2),
200
+ a width 'width', an angle precision of p in (0,1) given
201
+ by angle_tolerance/180 degree, and NFA value 'NFA'.
202
+ If 'out' is the returned pointer, the 7 values of
203
+ line segment number 'n+1' are obtained with
204
+ 'out[7*n+0]' to 'out[7*n+6]'.
205
+ */
206
+ double * lsd_scale_region( int * n_out,
207
+ double * img, int X, int Y, double scale,
208
+ int ** reg_img, int * reg_x, int * reg_y );
209
+
210
+ /*----------------------------------------------------------------------------*/
211
+ /** LSD Simple Interface with Scale
212
+
213
+ @param n_out Pointer to an int where LSD will store the number of
214
+ line segments detected.
215
+
216
+ @param img Pointer to input image data. It must be an array of
217
+ doubles of size X x Y, and the pixel at coordinates
218
+ (x,y) is obtained by img[x+y*X].
219
+
220
+ @param X X size of the image: the number of columns.
221
+
222
+ @param Y Y size of the image: the number of rows.
223
+
224
+ @param scale When different from 1.0, LSD will scale the input image
225
+ by 'scale' factor by Gaussian filtering, before detecting
226
+ line segments.
227
+ Example: if scale=0.8, the input image will be subsampled
228
+ to 80% of its size, before the line segment detector
229
+ is applied.
230
+ Suggested value: 0.8
231
+
232
+ @return A double array of size 7 x n_out, containing the list
233
+ of line segments detected. The array contains first
234
+ 7 values of line segment number 1, then the 7 values
235
+ of line segment number 2, and so on, and it finishes
236
+ with the 7 values of line segment number n_out.
237
+ The seven values are:
238
+ - x1,y1,x2,y2,width,p,-log10(NFA)
239
+ .
240
+ for a line segment from coordinates (x1,y1) to (x2,y2),
241
+ a width 'width', an angle precision of p in (0,1) given
242
+ by angle_tolerance/180 degree, and NFA value 'NFA'.
243
+ If 'out' is the returned pointer, the 7 values of
244
+ line segment number 'n+1' are obtained with
245
+ 'out[7*n+0]' to 'out[7*n+6]'.
246
+ */
247
+ double * lsd_scale(int * n_out, double * img, int X, int Y, double scale);
248
+
249
+ /*----------------------------------------------------------------------------*/
250
+ /** LSD Simple Interface
251
+
252
+ @param n_out Pointer to an int where LSD will store the number of
253
+ line segments detected.
254
+
255
+ @param img Pointer to input image data. It must be an array of
256
+ doubles of size X x Y, and the pixel at coordinates
257
+ (x,y) is obtained by img[x+y*X].
258
+
259
+ @param X X size of the image: the number of columns.
260
+
261
+ @param Y Y size of the image: the number of rows.
262
+
263
+ @return A double array of size 7 x n_out, containing the list
264
+ of line segments detected. The array contains first
265
+ 7 values of line segment number 1, then the 7 values
266
+ of line segment number 2, and so on, and it finishes
267
+ with the 7 values of line segment number n_out.
268
+ The seven values are:
269
+ - x1,y1,x2,y2,width,p,-log10(NFA)
270
+ .
271
+ for a line segment from coordinates (x1,y1) to (x2,y2),
272
+ a width 'width', an angle precision of p in (0,1) given
273
+ by angle_tolerance/180 degree, and NFA value 'NFA'.
274
+ If 'out' is the returned pointer, the 7 values of
275
+ line segment number 'n+1' are obtained with
276
+ 'out[7*n+0]' to 'out[7*n+6]'.
277
+ */
278
+ double * lsd(int * n_out, double * img, int X, int Y);
279
+
280
+ void free_values(double * p);
281
+
282
+ #endif /* !LSD_HEADER */
283
+ /*----------------------------------------------------------------------------*/
data/lib/tabula.rb CHANGED
@@ -1,5 +1,11 @@
1
+ module Tabula
2
+ PDFBOX = 'pdfbox-app-2.0.0-SNAPSHOT.jar'
3
+ end
4
+
1
5
  require_relative './tabula/version'
2
6
  require_relative './tabula/entities'
3
7
  require_relative './tabula/pdf_dump'
4
8
  require_relative './tabula/table_extractor'
5
9
  require_relative './tabula/writers'
10
+ require_relative './tabula/line_segment_detector'
11
+ require_relative './tabula/pdf_render'
@@ -0,0 +1,21 @@
1
+ module Enumerable
2
+
3
+ def sum
4
+ self.inject(0){|accum, i| accum + i }
5
+ end
6
+
7
+ def mean
8
+ self.sum/self.length.to_f
9
+ end
10
+
11
+ def sample_variance
12
+ m = self.mean
13
+ sum = self.inject(0){|accum, i| accum +(i-m)**2 }
14
+ sum/(self.length - 1).to_f
15
+ end
16
+
17
+ def standard_deviation
18
+ return Math.sqrt(self.sample_variance)
19
+ end
20
+
21
+ end
@@ -96,8 +96,13 @@ module Tabula
96
96
  # get text, optionally from a provided area in the page [top, left, bottom, right]
97
97
  def get_text(area=nil)
98
98
  area = [0, 0, width, height] if area.nil?
99
- ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
100
- self.texts.select { |t| t.overlaps? ze }
99
+
100
+ # spaces are not detected, b/c they have height == 0
101
+ # ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
102
+ # self.texts.select { |t| t.overlaps? ze }
103
+ self.texts.select { |t|
104
+ t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
105
+ }
101
106
  end
102
107
 
103
108
  def to_json(options={})
@@ -112,15 +117,17 @@ module Tabula
112
117
  end
113
118
 
114
119
  class TextElement < ZoneEntity
115
- attr_accessor :font, :font_size, :text
120
+ attr_accessor :font, :font_size, :text, :width_of_space
116
121
 
117
122
  CHARACTER_DISTANCE_THRESHOLD = 1.5
123
+ TOLERANCE_FACTOR = 0.25
118
124
 
119
- def initialize(top, left, width, height, font, font_size, text)
125
+ def initialize(top, left, width, height, font, font_size, text, width_of_space)
120
126
  super(top, left, width, height)
121
127
  self.font = font
122
128
  self.font_size = font_size
123
129
  self.text = text
130
+ self.width_of_space = width_of_space
124
131
  end
125
132
 
126
133
  # more or less returns True if distance < tolerance
@@ -128,7 +135,7 @@ module Tabula
128
135
  raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
129
136
  overlaps = self.vertically_overlaps?(other)
130
137
 
131
- tolerance = ((self.font_size + other.font_size) / 2) * 0.25
138
+ tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
132
139
 
133
140
  overlaps or
134
141
  (self.height == 0 and other.height != 0) or
@@ -141,13 +148,13 @@ module Tabula
141
148
  raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
142
149
  overlaps = self.vertically_overlaps?(other)
143
150
 
144
- tolerance = ((self.font_size + other.font_size) / 2) * 0.25
151
+ up_tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
152
+ down_tolerance = 0.95
145
153
 
146
- dist = self.horizontal_distance(other)
147
- overlaps or
148
- (self.height == 0 and other.height != 0) or
149
- (other.height == 0 and self.height != 0) and
150
- ((tolerance <= dist) and (dist < tolerance*CHARACTER_DISTANCE_THRESHOLD))
154
+ dist = self.horizontal_distance(other).abs
155
+
156
+ rv = overlaps && (dist.between?(self.width_of_space * down_tolerance, self.width_of_space + up_tolerance))
157
+ rv
151
158
  end
152
159
 
153
160
  def merge!(other)
@@ -235,25 +242,139 @@ module Tabula
235
242
  def inspect
236
243
  vars = (self.instance_variables - [:@text_elements]).map{ |v| "#{v}=#{instance_variable_get(v).inspect}" }
237
244
  texts = self.text_elements.sort_by { |te| te.top }.map { |te| te.text }
238
- "<#{self.class}: #{vars.join(', ')}, @text_elements=#{texts.join(', ')}>"
245
+ "<#{self.class}: #{vars.join(', ')}, @text_elements=[#{texts.join('], [')}]>"
239
246
  end
240
247
 
241
248
  end
242
249
 
250
+ require_relative './core_ext'
251
+
243
252
  class Ruling < ZoneEntity
244
- attr_accessor :color
253
+ # 2D line intersection test taken from comp.graphics.algorithms FAQ
254
+ def intersects?(other)
255
+ r = ((self.top-other.top)*(other.right-other.left) - (self.left-other.left)*(other.bottom-other.top)) \
256
+ / ((self.right-self.left)*(other.bottom-other.top)-(self.bottom-self.top)*(other.right-other.left))
245
257
 
246
- def initialize(top, left, width, height, color)
247
- super(top, left, width, height)
248
- self.color = color
258
+ s = ((self.top-other.top)*(self.right-self.left) - (self.left-other.left)*(self.bottom-self.top)) \
259
+ / ((self.right-self.left)*(other.bottom-other.top) - (self.bottom-self.top)*(other.right-other.left))
260
+
261
+ r >= 0 and r < 1 and s >= 0 and s < 1
249
262
  end
250
263
 
251
- def to_h
252
- hash = super
253
- hash[:color] = self.color
254
- hash
264
+ def vertical?
265
+ left == right
266
+ end
267
+
268
+ def horizontal?
269
+ top == bottom
270
+ end
271
+
272
+ def to_json(arg)
273
+ [left, top, right, bottom].to_json
274
+ end
275
+
276
+ def to_xml
277
+ "<ruling x1=\"%.2f\" y1=\"%.2f\" x2=\"%.2f\" y2=\"%.2f\" />" \
278
+ % [left, top, right, bottom]
279
+ end
280
+
281
+ def self.clean_rulings(rulings, max_distance=4)
282
+
283
+ # merge horizontal and vertical lines
284
+ # TODO this should be iterative
285
+
286
+ skip = false
287
+
288
+ horiz = rulings.select { |r| r.horizontal? && r.width > max_distance }
289
+ .group_by(&:top)
290
+ .values.reduce([]) { |memo, rs|
291
+ rs = rs.sort_by(&:left)
292
+
293
+ memo << if rs.size > 1
294
+ Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
295
+ else
296
+ rs.first
297
+ end
298
+
299
+ }
300
+ .sort_by(&:top)
301
+
302
+ h = []
303
+ horiz.size.times do |i|
304
+
305
+ if i == horiz.size - 1
306
+ h << horiz[-1]
307
+ break
308
+ end
309
+
310
+ if skip
311
+ skip = false;
312
+ next
313
+ end
314
+ d = (horiz[i+1].top - horiz[i].top).abs
315
+
316
+ h << if d < 4 # THRESHOLD DISTANCE between horizontal lines
317
+ skip = true
318
+ Tabula::Ruling.new(horiz[i].top + d / 2, [horiz[i].left, horiz[i+1].left].min, [horiz[i+1].width.abs, horiz[i].width.abs].max, 0)
319
+ else
320
+ horiz[i]
321
+ end
322
+ end
323
+ horiz = h
324
+
325
+ vert = rulings.select { |r| r.vertical? && r.height > max_distance }
326
+ .group_by(&:left)
327
+ .values.reduce([]) { |memo, rs|
328
+
329
+ rs = rs.sort_by(&:top)
330
+ memo << if rs.size > 1
331
+ Tabula::Ruling.new(rs[0].top, rs[0].left, 0, rs[-1].bottom - rs[0].top)
332
+ else rs.first
333
+ rs.first
334
+ end
335
+ }
336
+ .sort_by(&:left)
337
+
338
+ v = []
339
+ vert.size.times do |i|
340
+
341
+ if i == vert.size - 1
342
+ v << vert[-1]
343
+ break
344
+ end
345
+
346
+ if skip
347
+ skip = false;
348
+ next
349
+ end
350
+ d = (vert[i+1].left - vert[i].left).abs
351
+
352
+ v << if d < 4 # THRESHOLD DISTANCE between vertical lines
353
+ skip = true
354
+ Tabula::Ruling.new([vert[i+1].top, vert[i].top].min, vert[i].left + d / 2, 0, [vert[i+1].height.abs, vert[i].height.abs].max)
355
+ else
356
+ vert[i]
357
+ end
358
+ end
359
+ vert = v
360
+
361
+
362
+ # - only keep horizontal rulings that intersect with at least one vertical ruling
363
+ # - only keep vertical rulings that intersect with at least one horizontal ruling
364
+ # yeah, it's a naive heuristic. but hey, it works.
365
+
366
+ # h_mean = horiz.reduce(0) { |accum, i| accum + i.width } / horiz.size
367
+ # horiz.reject { |h| h.width < h_mean }
368
+
369
+ #vert.delete_if { |v| !horiz.any? { |h| h.intersects?(v) } } unless horiz.empty?
370
+ #horiz.delete_if { |h| !vert.any? { |v| v.intersects?(h) } } unless vert.empty?
371
+
372
+ return horiz += vert
255
373
  end
256
374
 
375
+
376
+
377
+
257
378
  end
258
379
 
259
380
  end