tabula-extractor 0.5.0-java → 0.5.1-java
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/ext/Makefile.mingw64 +10 -0
- data/ext/liblsd64.dll +0 -0
- data/lib/tabula/entities.rb +72 -47
- data/lib/tabula/line_segment_detector.rb +19 -4
- data/lib/tabula/version.rb +1 -1
- metadata +4 -2
@@ -0,0 +1,10 @@
|
|
1
|
+
include Makefile.defaults
|
2
|
+
|
3
|
+
#CC = /usr/local/gcc-4.8.0-qt-4.8.4-for-mingw32/win32-gcc/bin/i586-mingw32-gcc
|
4
|
+
CC = /usr/bin/x86_64-w64-mingw32-gcc
|
5
|
+
CFLAGS := -Wall -Werror
|
6
|
+
|
7
|
+
lib: lib$(NAME).$(VERSION).dll
|
8
|
+
|
9
|
+
lib$(NAME).$(VERSION).dll: $(NAME).o
|
10
|
+
$(CC) -shared -o lib$(NAME)64.dll liblsd.def $^
|
data/ext/liblsd64.dll
ADDED
Binary file
|
data/lib/tabula/entities.rb
CHANGED
@@ -99,8 +99,8 @@ module Tabula
|
|
99
99
|
|
100
100
|
# spaces are not detected, b/c they have height == 0
|
101
101
|
# ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
|
102
|
-
# self.texts.select { |t| t.overlaps? ze }
|
103
|
-
self.texts.select { |t|
|
102
|
+
# self.texts.select { |t| t.overlaps? ze }
|
103
|
+
self.texts.select { |t|
|
104
104
|
t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
|
105
105
|
}
|
106
106
|
end
|
@@ -152,7 +152,7 @@ module Tabula
|
|
152
152
|
down_tolerance = 0.95
|
153
153
|
|
154
154
|
dist = self.horizontal_distance(other).abs
|
155
|
-
|
155
|
+
|
156
156
|
rv = overlaps && (dist.between?(self.width_of_space * down_tolerance, self.width_of_space + up_tolerance))
|
157
157
|
rv
|
158
158
|
end
|
@@ -287,20 +287,31 @@ module Tabula
|
|
287
287
|
|
288
288
|
horiz = rulings.select { |r| r.horizontal? && r.width > max_distance }
|
289
289
|
.group_by(&:top)
|
290
|
-
.values.reduce([])
|
291
|
-
rs = rs.sort_by(&:left)
|
292
|
-
|
293
|
-
memo << if rs.size > 1
|
294
|
-
Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
|
295
|
-
else
|
296
|
-
rs.first
|
297
|
-
end
|
290
|
+
.values.reduce([]) do |memo, rs|
|
298
291
|
|
299
|
-
|
292
|
+
rs = rs.sort_by(&:left)
|
293
|
+
if rs.size > 1
|
294
|
+
memo +=
|
295
|
+
rs.each_cons(2)
|
296
|
+
.chunk { |p| p[1].left - p[0].right < 7 }
|
297
|
+
.select { |c| c[0] }
|
298
|
+
.map { |group|
|
299
|
+
group = group.last.flatten.uniq
|
300
|
+
Tabula::Ruling.new(group[0].top,
|
301
|
+
group[0].left,
|
302
|
+
group[-1].right - group[0].left,
|
303
|
+
0)
|
304
|
+
}
|
305
|
+
Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
|
306
|
+
else
|
307
|
+
memo << rs.first
|
308
|
+
end
|
309
|
+
memo
|
310
|
+
end
|
300
311
|
.sort_by(&:top)
|
301
312
|
|
302
313
|
h = []
|
303
|
-
horiz.size.times do |i|
|
314
|
+
horiz.size.times do |i|
|
304
315
|
|
305
316
|
if i == horiz.size - 1
|
306
317
|
h << horiz[-1]
|
@@ -308,7 +319,7 @@ module Tabula
|
|
308
319
|
end
|
309
320
|
|
310
321
|
if skip
|
311
|
-
skip = false;
|
322
|
+
skip = false;
|
312
323
|
next
|
313
324
|
end
|
314
325
|
d = (horiz[i+1].top - horiz[i].top).abs
|
@@ -324,41 +335,55 @@ module Tabula
|
|
324
335
|
|
325
336
|
vert = rulings.select { |r| r.vertical? && r.height > max_distance }
|
326
337
|
.group_by(&:left)
|
327
|
-
.values
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
338
|
+
.values
|
339
|
+
.reduce([]) do |memo, rs|
|
340
|
+
|
341
|
+
rs = rs.sort_by(&:top)
|
342
|
+
|
343
|
+
if rs.size > 1
|
344
|
+
# Here be dragons:
|
345
|
+
# merge consecutive segments of lines that are close enough
|
346
|
+
memo +=
|
347
|
+
rs.each_cons(2)
|
348
|
+
.chunk { |p| p[1].top - p[0].bottom < 7 }
|
349
|
+
.select { |c| c[0] }
|
350
|
+
.map { |group|
|
351
|
+
group = group.last.flatten.uniq
|
352
|
+
Tabula::Ruling.new(group[0].top,
|
353
|
+
group[0].left,
|
354
|
+
0,
|
355
|
+
group[-1].bottom - group[0].top)
|
356
|
+
}
|
357
|
+
else
|
358
|
+
memo << rs.first
|
359
|
+
end
|
360
|
+
memo
|
361
|
+
end.sort_by(&:left)
|
362
|
+
|
363
|
+
# v = []
|
364
|
+
|
365
|
+
# vert.size.times do |i|
|
366
|
+
# if i == vert.size - 1
|
367
|
+
# v << vert[-1]
|
368
|
+
# break
|
369
|
+
# end
|
370
|
+
|
371
|
+
# if skip
|
372
|
+
# skip = false;
|
373
|
+
# next
|
374
|
+
# end
|
375
|
+
# d = (vert[i+1].left - vert[i].left).abs
|
376
|
+
|
377
|
+
# v << if d < 4 # THRESHOLD DISTANCE between vertical lines
|
378
|
+
# skip = true
|
379
|
+
# Tabula::Ruling.new([vert[i+1].top, vert[i].top].min, vert[i].left + d / 2, 0, [vert[i+1].height.abs, vert[i].height.abs].max)
|
380
|
+
# else
|
381
|
+
# vert[i]
|
382
|
+
# end
|
383
|
+
# end
|
384
|
+
# vert = v
|
351
385
|
|
352
|
-
v << if d < 4 # THRESHOLD DISTANCE between vertical lines
|
353
|
-
skip = true
|
354
|
-
Tabula::Ruling.new([vert[i+1].top, vert[i].top].min, vert[i].left + d / 2, 0, [vert[i+1].height.abs, vert[i].height.abs].max)
|
355
|
-
else
|
356
|
-
vert[i]
|
357
|
-
end
|
358
|
-
end
|
359
|
-
vert = v
|
360
386
|
|
361
|
-
|
362
387
|
# - only keep horizontal rulings that intersect with at least one vertical ruling
|
363
388
|
# - only keep vertical rulings that intersect with at least one horizontal ruling
|
364
389
|
# yeah, it's a naive heuristic. but hey, it works.
|
@@ -16,7 +16,11 @@ module Tabula
|
|
16
16
|
extend FFI::Library
|
17
17
|
ffi_lib File.expand_path('../../ext/' + case RbConfig::CONFIG['host_os']
|
18
18
|
when /mswin|msys|mingw|cygwin|bccwin|wince|emc/
|
19
|
-
'liblsd.dll'
|
19
|
+
if RbConfig::CONFIG['host_cpu'] == 'x86_64'
|
20
|
+
'liblsd64.dll'
|
21
|
+
else
|
22
|
+
'liblsd.dll'
|
23
|
+
end
|
20
24
|
when /darwin|mac os/
|
21
25
|
'liblsd.dylib'
|
22
26
|
when /linux/
|
@@ -33,16 +37,27 @@ module Tabula
|
|
33
37
|
attach_function :lsd, [ :pointer, :buffer_in, :int, :int ], :pointer
|
34
38
|
attach_function :free_values, [ :pointer ], :void
|
35
39
|
|
36
|
-
|
40
|
+
DETECT_LINES_DEFAULTS = {
|
41
|
+
:scale_factor => nil,
|
42
|
+
:image_size => 2048
|
43
|
+
}
|
44
|
+
|
45
|
+
def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
|
46
|
+
options = DETECT_LINES_DEFAULTS.merge(options)
|
47
|
+
|
37
48
|
pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
|
38
|
-
|
49
|
+
page = pdf_file.getDocumentCatalog.getAllPages[page_number - 1]
|
50
|
+
bi = Tabula::Render.pageToBufferedImage(page,
|
51
|
+
options[:image_size])
|
39
52
|
pdf_file.close
|
40
|
-
detect_lines(bi,
|
53
|
+
detect_lines(bi,
|
54
|
+
options[:scale_factor] || (page.findCropBox.width / options[:image_size]))
|
41
55
|
end
|
42
56
|
|
43
57
|
# image can be either a string (path to image) or a Java::JavaAwtImage::BufferedImage
|
44
58
|
# image to pixels: http://stackoverflow.com/questions/6524196/java-get-pixel-array-from-image
|
45
59
|
def LSD.detect_lines(image, scale_factor=1)
|
60
|
+
|
46
61
|
bimage = if image.class == Java::JavaAwtImage::BufferedImage
|
47
62
|
image
|
48
63
|
elsif image.class == String
|
data/lib/tabula/version.rb
CHANGED
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.5.0
|
5
|
+
version: 0.5.1
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Manuel Aristarán
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-06-
|
12
|
+
date: 2013-06-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: minitest
|
@@ -84,11 +84,13 @@ files:
|
|
84
84
|
- ext/Makefile.linux32
|
85
85
|
- ext/Makefile.linux64
|
86
86
|
- ext/Makefile.mingw
|
87
|
+
- ext/Makefile.mingw64
|
87
88
|
- ext/liblsd-linux32.so
|
88
89
|
- ext/liblsd-linux64.so
|
89
90
|
- ext/liblsd.def
|
90
91
|
- ext/liblsd.dll
|
91
92
|
- ext/liblsd.dylib
|
93
|
+
- ext/liblsd64.dll
|
92
94
|
- ext/lsd.c
|
93
95
|
- ext/lsd.h
|
94
96
|
- lib/tabula.rb
|