tabula-extractor 0.0.1-java → 0.5.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/lsd.h ADDED
@@ -0,0 +1,283 @@
1
+ /*----------------------------------------------------------------------------
2
+
3
+ LSD - Line Segment Detector on digital images
4
+
5
+ This code is part of the following publication and was subject
6
+ to peer review:
7
+
8
+ "LSD: a Line Segment Detector" by Rafael Grompone von Gioi,
9
+ Jeremie Jakubowicz, Jean-Michel Morel, and Gregory Randall,
10
+ Image Processing On Line, 2012. DOI:10.5201/ipol.2012.gjmr-lsd
11
+ http://dx.doi.org/10.5201/ipol.2012.gjmr-lsd
12
+
13
+ Copyright (c) 2007-2011 rafael grompone von gioi <grompone@gmail.com>
14
+
15
+ This program is free software: you can redistribute it and/or modify
16
+ it under the terms of the GNU Affero General Public License as
17
+ published by the Free Software Foundation, either version 3 of the
18
+ License, or (at your option) any later version.
19
+
20
+ This program is distributed in the hope that it will be useful,
21
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
22
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23
+ GNU Affero General Public License for more details.
24
+
25
+ You should have received a copy of the GNU Affero General Public License
26
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
27
+
28
+ ----------------------------------------------------------------------------*/
29
+
30
+ /*----------------------------------------------------------------------------*/
31
+ /** @file lsd.h
32
+ LSD module header
33
+ @author rafael grompone von gioi <grompone@gmail.com>
34
+ */
35
+ /*----------------------------------------------------------------------------*/
36
+ #ifndef LSD_HEADER
37
+ #define LSD_HEADER
38
+
39
+ /*----------------------------------------------------------------------------*/
40
+ /** LSD Full Interface
41
+
42
+ @param n_out Pointer to an int where LSD will store the number of
43
+ line segments detected.
44
+
45
+ @param img Pointer to input image data. It must be an array of
46
+ doubles of size X x Y, and the pixel at coordinates
47
+ (x,y) is obtained by img[x+y*X].
48
+
49
+ @param X X size of the image: the number of columns.
50
+
51
+ @param Y Y size of the image: the number of rows.
52
+
53
+ @param scale When different from 1.0, LSD will scale the input image
54
+ by 'scale' factor by Gaussian filtering, before detecting
55
+ line segments.
56
+ Example: if scale=0.8, the input image will be subsampled
57
+ to 80% of its size, before the line segment detector
58
+ is applied.
59
+ Suggested value: 0.8
60
+
61
+ @param sigma_scale When scale!=1.0, the sigma of the Gaussian filter is:
62
+ sigma = sigma_scale / scale, if scale < 1.0
63
+ sigma = sigma_scale, if scale >= 1.0
64
+ Suggested value: 0.6
65
+
66
+ @param quant Bound to the quantization error on the gradient norm.
67
+ Example: if gray levels are quantized to integer steps,
68
+ the gradient (computed by finite differences) error
69
+ due to quantization will be bounded by 2.0, as the
70
+ worst case is when the errors are 1 and -1, which
71
+ gives an error of 2.0.
72
+ Suggested value: 2.0
73
+
74
+ @param ang_th Gradient angle tolerance in the region growing
75
+ algorithm, in degrees.
76
+ Suggested value: 22.5
77
+
78
+ @param log_eps Detection threshold, accept if -log10(NFA) > log_eps.
79
+ The larger the value, the more strict the detector is,
80
+ and will result in fewer detections.
81
+ (Note that, because of the 'minus sign', this
81
+ behavior is the opposite of that of NFA.)
83
+ The value -log10(NFA) is equivalent but more
84
+ intuitive than NFA:
85
+ - -1.0 gives an average of 10 false detections on noise
86
+ - 0.0 gives an average of 1 false detection on noise
87
+ - 1.0 gives an average of 0.1 false detections on noise
88
+ - 2.0 gives an average of 0.01 false detections on noise
89
+ .
90
+ Suggested value: 0.0
91
+
92
+ @param density_th Minimal proportion of 'supporting' points in a rectangle.
93
+ Suggested value: 0.7
94
+
95
+ @param n_bins Number of bins used in the pseudo-ordering of gradient
96
+ modulus.
97
+ Suggested value: 1024
98
+
99
+ @param reg_img Optional output: if desired, LSD will return an
100
+ int image where each pixel indicates the line segment
101
+ to which it belongs. Unused pixels have the value '0',
102
+ while the used ones have the number of the line segment,
103
+ numbered 1,2,3,..., in the same order as in the
104
+ output list. If desired, a non NULL int** pointer must
105
+ be assigned, and LSD will make the pointer point
106
+ to an int array of size reg_x x reg_y, where the pixel
107
+ value at (x,y) is obtained with (*reg_img)[x+y*reg_x].
108
+ Note that the resulting image has the size of the image
109
+ used for the processing, that is, the size of the input
110
+ image scaled by the given factor 'scale'. If scale!=1
111
+ this size differs from XxY and that is the reason why
112
+ its value is given by reg_x and reg_y.
113
+ Suggested value: NULL
114
+
115
+ @param reg_x Pointer to an int where LSD will put the X size
116
+ of the 'reg_img' image, when asked for.
117
+ Suggested value: NULL
118
+
119
+ @param reg_y Pointer to an int where LSD will put the Y size
120
+ of the 'reg_img' image, when asked for.
121
+ Suggested value: NULL
122
+
123
+ @return A double array of size 7 x n_out, containing the list
124
+ of line segments detected. The array contains first
125
+ 7 values of line segment number 1, then the 7 values
126
+ of line segment number 2, and so on, and it finishes
126
+ with the 7 values of line segment number n_out.
128
+ The seven values are:
129
+ - x1,y1,x2,y2,width,p,-log10(NFA)
130
+ .
131
+ for a line segment from coordinates (x1,y1) to (x2,y2),
132
+ a width 'width', an angle precision of p in (0,1) given
133
+ by angle_tolerance/180 degree, and NFA value 'NFA'.
134
+ If 'out' is the returned pointer, the 7 values of
135
+ line segment number 'n+1' are obtained with
136
+ 'out[7*n+0]' to 'out[7*n+6]'.
137
+ */
138
+ double * LineSegmentDetection( int * n_out,
139
+ double * img, int X, int Y,
140
+ double scale, double sigma_scale, double quant,
141
+ double ang_th, double log_eps, double density_th,
142
+ int n_bins,
143
+ int ** reg_img, int * reg_x, int * reg_y );
144
+
145
+ /*----------------------------------------------------------------------------*/
146
+ /** LSD Simple Interface with Scale and Region output.
147
+
148
+ @param n_out Pointer to an int where LSD will store the number of
149
+ line segments detected.
150
+
151
+ @param img Pointer to input image data. It must be an array of
152
+ doubles of size X x Y, and the pixel at coordinates
153
+ (x,y) is obtained by img[x+y*X].
154
+
155
+ @param X X size of the image: the number of columns.
156
+
157
+ @param Y Y size of the image: the number of rows.
158
+
159
+ @param scale When different from 1.0, LSD will scale the input image
160
+ by 'scale' factor by Gaussian filtering, before detecting
161
+ line segments.
162
+ Example: if scale=0.8, the input image will be subsampled
163
+ to 80% of its size, before the line segment detector
164
+ is applied.
165
+ Suggested value: 0.8
166
+
167
+ @param reg_img Optional output: if desired, LSD will return an
168
+ int image where each pixel indicates the line segment
169
+ to which it belongs. Unused pixels have the value '0',
170
+ while the used ones have the number of the line segment,
171
+ numbered 1,2,3,..., in the same order as in the
172
+ output list. If desired, a non NULL int** pointer must
173
+ be assigned, and LSD will make the pointer point
174
+ to an int array of size reg_x x reg_y, where the pixel
175
+ value at (x,y) is obtained with (*reg_img)[x+y*reg_x].
176
+ Note that the resulting image has the size of the image
177
+ used for the processing, that is, the size of the input
178
+ image scaled by the given factor 'scale'. If scale!=1
179
+ this size differs from XxY and that is the reason why
180
+ its value is given by reg_x and reg_y.
181
+ Suggested value: NULL
182
+
183
+ @param reg_x Pointer to an int where LSD will put the X size
184
+ of the 'reg_img' image, when asked for.
185
+ Suggested value: NULL
186
+
187
+ @param reg_y Pointer to an int where LSD will put the Y size
188
+ of the 'reg_img' image, when asked for.
189
+ Suggested value: NULL
190
+
191
+ @return A double array of size 7 x n_out, containing the list
192
+ of line segments detected. The array contains first
193
+ 7 values of line segment number 1, then the 7 values
194
+ of line segment number 2, and so on, and it finishes
194
+ with the 7 values of line segment number n_out.
196
+ The seven values are:
197
+ - x1,y1,x2,y2,width,p,-log10(NFA)
198
+ .
199
+ for a line segment from coordinates (x1,y1) to (x2,y2),
200
+ a width 'width', an angle precision of p in (0,1) given
201
+ by angle_tolerance/180 degree, and NFA value 'NFA'.
202
+ If 'out' is the returned pointer, the 7 values of
203
+ line segment number 'n+1' are obtained with
204
+ 'out[7*n+0]' to 'out[7*n+6]'.
205
+ */
206
+ double * lsd_scale_region( int * n_out,
207
+ double * img, int X, int Y, double scale,
208
+ int ** reg_img, int * reg_x, int * reg_y );
209
+
210
+ /*----------------------------------------------------------------------------*/
211
+ /** LSD Simple Interface with Scale
212
+
213
+ @param n_out Pointer to an int where LSD will store the number of
214
+ line segments detected.
215
+
216
+ @param img Pointer to input image data. It must be an array of
217
+ doubles of size X x Y, and the pixel at coordinates
218
+ (x,y) is obtained by img[x+y*X].
219
+
220
+ @param X X size of the image: the number of columns.
221
+
222
+ @param Y Y size of the image: the number of rows.
223
+
224
+ @param scale When different from 1.0, LSD will scale the input image
225
+ by 'scale' factor by Gaussian filtering, before detecting
226
+ line segments.
227
+ Example: if scale=0.8, the input image will be subsampled
228
+ to 80% of its size, before the line segment detector
229
+ is applied.
230
+ Suggested value: 0.8
231
+
232
+ @return A double array of size 7 x n_out, containing the list
233
+ of line segments detected. The array contains first
234
+ 7 values of line segment number 1, then the 7 values
235
+ of line segment number 2, and so on, and it finishes
235
+ with the 7 values of line segment number n_out.
237
+ The seven values are:
238
+ - x1,y1,x2,y2,width,p,-log10(NFA)
239
+ .
240
+ for a line segment from coordinates (x1,y1) to (x2,y2),
241
+ a width 'width', an angle precision of p in (0,1) given
242
+ by angle_tolerance/180 degree, and NFA value 'NFA'.
243
+ If 'out' is the returned pointer, the 7 values of
244
+ line segment number 'n+1' are obtained with
245
+ 'out[7*n+0]' to 'out[7*n+6]'.
246
+ */
247
+ double * lsd_scale(int * n_out, double * img, int X, int Y, double scale);
248
+
249
+ /*----------------------------------------------------------------------------*/
250
+ /** LSD Simple Interface
251
+
252
+ @param n_out Pointer to an int where LSD will store the number of
253
+ line segments detected.
254
+
255
+ @param img Pointer to input image data. It must be an array of
256
+ doubles of size X x Y, and the pixel at coordinates
257
+ (x,y) is obtained by img[x+y*X].
258
+
259
+ @param X X size of the image: the number of columns.
260
+
261
+ @param Y Y size of the image: the number of rows.
262
+
263
+ @return A double array of size 7 x n_out, containing the list
264
+ of line segments detected. The array contains first
265
+ 7 values of line segment number 1, then the 7 values
266
+ of line segment number 2, and so on, and it finishes
266
+ with the 7 values of line segment number n_out.
268
+ The seven values are:
269
+ - x1,y1,x2,y2,width,p,-log10(NFA)
270
+ .
271
+ for a line segment from coordinates (x1,y1) to (x2,y2),
272
+ a width 'width', an angle precision of p in (0,1) given
273
+ by angle_tolerance/180 degree, and NFA value 'NFA'.
274
+ If 'out' is the returned pointer, the 7 values of
275
+ line segment number 'n+1' are obtained with
276
+ 'out[7*n+0]' to 'out[7*n+6]'.
277
+ */
278
+ double * lsd(int * n_out, double * img, int X, int Y);
279
+
280
+ void free_values(double * p);
281
+
282
+ #endif /* !LSD_HEADER */
283
+ /*----------------------------------------------------------------------------*/
data/lib/tabula.rb CHANGED
@@ -1,5 +1,11 @@
1
+ module Tabula
2
+ PDFBOX = 'pdfbox-app-2.0.0-SNAPSHOT.jar'
3
+ end
4
+
1
5
  require_relative './tabula/version'
2
6
  require_relative './tabula/entities'
3
7
  require_relative './tabula/pdf_dump'
4
8
  require_relative './tabula/table_extractor'
5
9
  require_relative './tabula/writers'
10
+ require_relative './tabula/line_segment_detector'
11
+ require_relative './tabula/pdf_render'
@@ -0,0 +1,21 @@
1
+ module Enumerable
2
+
3
+ def sum
4
+ self.inject(0){|accum, i| accum + i }
5
+ end
6
+
7
+ def mean
8
+ self.sum/self.length.to_f
9
+ end
10
+
11
+ def sample_variance
12
+ m = self.mean
13
+ sum = self.inject(0){|accum, i| accum +(i-m)**2 }
14
+ sum/(self.length - 1).to_f
15
+ end
16
+
17
+ def standard_deviation
18
+ return Math.sqrt(self.sample_variance)
19
+ end
20
+
21
+ end
@@ -96,8 +96,13 @@ module Tabula
96
96
  # get text, optionally from a provided area in the page [top, left, bottom, right]
97
97
  def get_text(area=nil)
98
98
  area = [0, 0, width, height] if area.nil?
99
- ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
100
- self.texts.select { |t| t.overlaps? ze }
99
+
100
+ # spaces are not detected, b/c they have height == 0
101
+ # ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
102
+ # self.texts.select { |t| t.overlaps? ze }
103
+ self.texts.select { |t|
104
+ t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
105
+ }
101
106
  end
102
107
 
103
108
  def to_json(options={})
@@ -112,15 +117,17 @@ module Tabula
112
117
  end
113
118
 
114
119
  class TextElement < ZoneEntity
115
- attr_accessor :font, :font_size, :text
120
+ attr_accessor :font, :font_size, :text, :width_of_space
116
121
 
117
122
  CHARACTER_DISTANCE_THRESHOLD = 1.5
123
+ TOLERANCE_FACTOR = 0.25
118
124
 
119
- def initialize(top, left, width, height, font, font_size, text)
125
+ def initialize(top, left, width, height, font, font_size, text, width_of_space)
120
126
  super(top, left, width, height)
121
127
  self.font = font
122
128
  self.font_size = font_size
123
129
  self.text = text
130
+ self.width_of_space = width_of_space
124
131
  end
125
132
 
126
133
  # more or less returns True if distance < tolerance
@@ -128,7 +135,7 @@ module Tabula
128
135
  raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
129
136
  overlaps = self.vertically_overlaps?(other)
130
137
 
131
- tolerance = ((self.font_size + other.font_size) / 2) * 0.25
138
+ tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
132
139
 
133
140
  overlaps or
134
141
  (self.height == 0 and other.height != 0) or
@@ -141,13 +148,13 @@ module Tabula
141
148
  raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
142
149
  overlaps = self.vertically_overlaps?(other)
143
150
 
144
- tolerance = ((self.font_size + other.font_size) / 2) * 0.25
151
+ up_tolerance = ((self.font_size + other.font_size) / 2) * TOLERANCE_FACTOR
152
+ down_tolerance = 0.95
145
153
 
146
- dist = self.horizontal_distance(other)
147
- overlaps or
148
- (self.height == 0 and other.height != 0) or
149
- (other.height == 0 and self.height != 0) and
150
- ((tolerance <= dist) and (dist < tolerance*CHARACTER_DISTANCE_THRESHOLD))
154
+ dist = self.horizontal_distance(other).abs
155
+
156
+ rv = overlaps && (dist.between?(self.width_of_space * down_tolerance, self.width_of_space + up_tolerance))
157
+ rv
151
158
  end
152
159
 
153
160
  def merge!(other)
@@ -235,25 +242,139 @@ module Tabula
235
242
  def inspect
236
243
  vars = (self.instance_variables - [:@text_elements]).map{ |v| "#{v}=#{instance_variable_get(v).inspect}" }
237
244
  texts = self.text_elements.sort_by { |te| te.top }.map { |te| te.text }
238
- "<#{self.class}: #{vars.join(', ')}, @text_elements=#{texts.join(', ')}>"
245
+ "<#{self.class}: #{vars.join(', ')}, @text_elements=[#{texts.join('], [')}]>"
239
246
  end
240
247
 
241
248
  end
242
249
 
250
+ require_relative './core_ext'
251
+
243
252
  class Ruling < ZoneEntity
244
- attr_accessor :color
253
+ # 2D line intersection test taken from comp.graphics.algorithms FAQ
254
+ def intersects?(other)
255
+ r = ((self.top-other.top)*(other.right-other.left) - (self.left-other.left)*(other.bottom-other.top)) \
256
+ / ((self.right-self.left)*(other.bottom-other.top)-(self.bottom-self.top)*(other.right-other.left))
245
257
 
246
- def initialize(top, left, width, height, color)
247
- super(top, left, width, height)
248
- self.color = color
258
+ s = ((self.top-other.top)*(self.right-self.left) - (self.left-other.left)*(self.bottom-self.top)) \
259
+ / ((self.right-self.left)*(other.bottom-other.top) - (self.bottom-self.top)*(other.right-other.left))
260
+
261
+ r >= 0 and r < 1 and s >= 0 and s < 1
249
262
  end
250
263
 
251
- def to_h
252
- hash = super
253
- hash[:color] = self.color
254
- hash
264
+ def vertical?
265
+ left == right
266
+ end
267
+
268
+ def horizontal?
269
+ top == bottom
270
+ end
271
+
272
+ def to_json(arg)
273
+ [left, top, right, bottom].to_json
274
+ end
275
+
276
+ def to_xml
277
+ "<ruling x1=\"%.2f\" y1=\"%.2f\" x2=\"%.2f\" y2=\"%.2f\" />" \
278
+ % [left, top, right, bottom]
279
+ end
280
+
281
+ def self.clean_rulings(rulings, max_distance=4)
282
+
283
+ # merge horizontal and vertical lines
284
+ # TODO this should be iterative
285
+
286
+ skip = false
287
+
288
+ horiz = rulings.select { |r| r.horizontal? && r.width > max_distance }
289
+ .group_by(&:top)
290
+ .values.reduce([]) { |memo, rs|
291
+ rs = rs.sort_by(&:left)
292
+
293
+ memo << if rs.size > 1
294
+ Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
295
+ else
296
+ rs.first
297
+ end
298
+
299
+ }
300
+ .sort_by(&:top)
301
+
302
+ h = []
303
+ horiz.size.times do |i|
304
+
305
+ if i == horiz.size - 1
306
+ h << horiz[-1]
307
+ break
308
+ end
309
+
310
+ if skip
311
+ skip = false;
312
+ next
313
+ end
314
+ d = (horiz[i+1].top - horiz[i].top).abs
315
+
316
+ h << if d < 4 # THRESHOLD DISTANCE between horizontal lines
317
+ skip = true
318
+ Tabula::Ruling.new(horiz[i].top + d / 2, [horiz[i].left, horiz[i+1].left].min, [horiz[i+1].width.abs, horiz[i].width.abs].max, 0)
319
+ else
320
+ horiz[i]
321
+ end
322
+ end
323
+ horiz = h
324
+
325
+ vert = rulings.select { |r| r.vertical? && r.height > max_distance }
326
+ .group_by(&:left)
327
+ .values.reduce([]) { |memo, rs|
328
+
329
+ rs = rs.sort_by(&:top)
330
+ memo << if rs.size > 1
331
+ Tabula::Ruling.new(rs[0].top, rs[0].left, 0, rs[-1].bottom - rs[0].top)
332
+ else rs.first
333
+ rs.first
334
+ end
335
+ }
336
+ .sort_by(&:left)
337
+
338
+ v = []
339
+ vert.size.times do |i|
340
+
341
+ if i == vert.size - 1
342
+ v << vert[-1]
343
+ break
344
+ end
345
+
346
+ if skip
347
+ skip = false;
348
+ next
349
+ end
350
+ d = (vert[i+1].left - vert[i].left).abs
351
+
352
+ v << if d < 4 # THRESHOLD DISTANCE between vertical lines
353
+ skip = true
354
+ Tabula::Ruling.new([vert[i+1].top, vert[i].top].min, vert[i].left + d / 2, 0, [vert[i+1].height.abs, vert[i].height.abs].max)
355
+ else
356
+ vert[i]
357
+ end
358
+ end
359
+ vert = v
360
+
361
+
362
+ # - only keep horizontal rulings that intersect with at least one vertical ruling
363
+ # - only keep vertical rulings that intersect with at least one horizontal ruling
364
+ # yeah, it's a naive heuristic. but hey, it works.
365
+
366
+ # h_mean = horiz.reduce(0) { |accum, i| accum + i.width } / horiz.size
367
+ # horiz.reject { |h| h.width < h_mean }
368
+
369
+ #vert.delete_if { |v| !horiz.any? { |h| h.intersects?(v) } } unless horiz.empty?
370
+ #horiz.delete_if { |h| !vert.any? { |v| v.intersects?(h) } } unless vert.empty?
371
+
372
+ return horiz += vert
255
373
  end
256
374
 
375
+
376
+
377
+
257
378
  end
258
379
 
259
380
  end