tabula-extractor 0.5.0-java → 0.5.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/ext/Makefile.mingw64 +10 -0
- data/ext/liblsd64.dll +0 -0
- data/lib/tabula/entities.rb +72 -47
- data/lib/tabula/line_segment_detector.rb +19 -4
- data/lib/tabula/version.rb +1 -1
- metadata +4 -2
@@ -0,0 +1,10 @@
|
|
1
|
+
include Makefile.defaults
|
2
|
+
|
3
|
+
#CC = /usr/local/gcc-4.8.0-qt-4.8.4-for-mingw32/win32-gcc/bin/i586-mingw32-gcc
|
4
|
+
CC = /usr/bin/x86_64-w64-mingw32-gcc
|
5
|
+
CFLAGS := -Wall -Werror
|
6
|
+
|
7
|
+
lib: lib$(NAME).$(VERSION).dll
|
8
|
+
|
9
|
+
lib$(NAME).$(VERSION).dll: $(NAME).o
|
10
|
+
$(CC) -shared -o lib$(NAME)64.dll liblsd.def $^
|
data/ext/liblsd64.dll
ADDED
Binary file
|
data/lib/tabula/entities.rb
CHANGED
@@ -99,8 +99,8 @@ module Tabula
|
|
99
99
|
|
100
100
|
# spaces are not detected, b/c they have height == 0
|
101
101
|
# ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
|
102
|
-
# self.texts.select { |t| t.overlaps? ze }
|
103
|
-
self.texts.select { |t|
|
102
|
+
# self.texts.select { |t| t.overlaps? ze }
|
103
|
+
self.texts.select { |t|
|
104
104
|
t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
|
105
105
|
}
|
106
106
|
end
|
@@ -152,7 +152,7 @@ module Tabula
|
|
152
152
|
down_tolerance = 0.95
|
153
153
|
|
154
154
|
dist = self.horizontal_distance(other).abs
|
155
|
-
|
155
|
+
|
156
156
|
rv = overlaps && (dist.between?(self.width_of_space * down_tolerance, self.width_of_space + up_tolerance))
|
157
157
|
rv
|
158
158
|
end
|
@@ -287,20 +287,31 @@ module Tabula
|
|
287
287
|
|
288
288
|
horiz = rulings.select { |r| r.horizontal? && r.width > max_distance }
|
289
289
|
.group_by(&:top)
|
290
|
-
.values.reduce([])
|
291
|
-
rs = rs.sort_by(&:left)
|
292
|
-
|
293
|
-
memo << if rs.size > 1
|
294
|
-
Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
|
295
|
-
else
|
296
|
-
rs.first
|
297
|
-
end
|
290
|
+
.values.reduce([]) do |memo, rs|
|
298
291
|
|
299
|
-
|
292
|
+
rs = rs.sort_by(&:left)
|
293
|
+
if rs.size > 1
|
294
|
+
memo +=
|
295
|
+
rs.each_cons(2)
|
296
|
+
.chunk { |p| p[1].left - p[0].right < 7 }
|
297
|
+
.select { |c| c[0] }
|
298
|
+
.map { |group|
|
299
|
+
group = group.last.flatten.uniq
|
300
|
+
Tabula::Ruling.new(group[0].top,
|
301
|
+
group[0].left,
|
302
|
+
group[-1].right - group[0].left,
|
303
|
+
0)
|
304
|
+
}
|
305
|
+
Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
|
306
|
+
else
|
307
|
+
memo << rs.first
|
308
|
+
end
|
309
|
+
memo
|
310
|
+
end
|
300
311
|
.sort_by(&:top)
|
301
312
|
|
302
313
|
h = []
|
303
|
-
horiz.size.times do |i|
|
314
|
+
horiz.size.times do |i|
|
304
315
|
|
305
316
|
if i == horiz.size - 1
|
306
317
|
h << horiz[-1]
|
@@ -308,7 +319,7 @@ module Tabula
|
|
308
319
|
end
|
309
320
|
|
310
321
|
if skip
|
311
|
-
skip = false;
|
322
|
+
skip = false;
|
312
323
|
next
|
313
324
|
end
|
314
325
|
d = (horiz[i+1].top - horiz[i].top).abs
|
@@ -324,41 +335,55 @@ module Tabula
|
|
324
335
|
|
325
336
|
vert = rulings.select { |r| r.vertical? && r.height > max_distance }
|
326
337
|
.group_by(&:left)
|
327
|
-
.values
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
338
|
+
.values
|
339
|
+
.reduce([]) do |memo, rs|
|
340
|
+
|
341
|
+
rs = rs.sort_by(&:top)
|
342
|
+
|
343
|
+
if rs.size > 1
|
344
|
+
# Here be dragons:
|
345
|
+
# merge consecutive segments of lines that are close enough
|
346
|
+
memo +=
|
347
|
+
rs.each_cons(2)
|
348
|
+
.chunk { |p| p[1].top - p[0].bottom < 7 }
|
349
|
+
.select { |c| c[0] }
|
350
|
+
.map { |group|
|
351
|
+
group = group.last.flatten.uniq
|
352
|
+
Tabula::Ruling.new(group[0].top,
|
353
|
+
group[0].left,
|
354
|
+
0,
|
355
|
+
group[-1].bottom - group[0].top)
|
356
|
+
}
|
357
|
+
else
|
358
|
+
memo << rs.first
|
359
|
+
end
|
360
|
+
memo
|
361
|
+
end.sort_by(&:left)
|
362
|
+
|
363
|
+
# v = []
|
364
|
+
|
365
|
+
# vert.size.times do |i|
|
366
|
+
# if i == vert.size - 1
|
367
|
+
# v << vert[-1]
|
368
|
+
# break
|
369
|
+
# end
|
370
|
+
|
371
|
+
# if skip
|
372
|
+
# skip = false;
|
373
|
+
# next
|
374
|
+
# end
|
375
|
+
# d = (vert[i+1].left - vert[i].left).abs
|
376
|
+
|
377
|
+
# v << if d < 4 # THRESHOLD DISTANCE between vertical lines
|
378
|
+
# skip = true
|
379
|
+
# Tabula::Ruling.new([vert[i+1].top, vert[i].top].min, vert[i].left + d / 2, 0, [vert[i+1].height.abs, vert[i].height.abs].max)
|
380
|
+
# else
|
381
|
+
# vert[i]
|
382
|
+
# end
|
383
|
+
# end
|
384
|
+
# vert = v
|
351
385
|
|
352
|
-
v << if d < 4 # THRESHOLD DISTANCE between vertical lines
|
353
|
-
skip = true
|
354
|
-
Tabula::Ruling.new([vert[i+1].top, vert[i].top].min, vert[i].left + d / 2, 0, [vert[i+1].height.abs, vert[i].height.abs].max)
|
355
|
-
else
|
356
|
-
vert[i]
|
357
|
-
end
|
358
|
-
end
|
359
|
-
vert = v
|
360
386
|
|
361
|
-
|
362
387
|
# - only keep horizontal rulings that intersect with at least one vertical ruling
|
363
388
|
# - only keep vertical rulings that intersect with at least one horizontal ruling
|
364
389
|
# yeah, it's a naive heuristic. but hey, it works.
|
@@ -16,7 +16,11 @@ module Tabula
|
|
16
16
|
extend FFI::Library
|
17
17
|
ffi_lib File.expand_path('../../ext/' + case RbConfig::CONFIG['host_os']
|
18
18
|
when /mswin|msys|mingw|cygwin|bccwin|wince|emc/
|
19
|
-
'liblsd.dll'
|
19
|
+
if RbConfig::CONFIG['host_cpu'] == 'x86_64'
|
20
|
+
'liblsd64.dll'
|
21
|
+
else
|
22
|
+
'liblsd.dll'
|
23
|
+
end
|
20
24
|
when /darwin|mac os/
|
21
25
|
'liblsd.dylib'
|
22
26
|
when /linux/
|
@@ -33,16 +37,27 @@ module Tabula
|
|
33
37
|
attach_function :lsd, [ :pointer, :buffer_in, :int, :int ], :pointer
|
34
38
|
attach_function :free_values, [ :pointer ], :void
|
35
39
|
|
36
|
-
|
40
|
+
DETECT_LINES_DEFAULTS = {
|
41
|
+
:scale_factor => nil,
|
42
|
+
:image_size => 2048
|
43
|
+
}
|
44
|
+
|
45
|
+
def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
|
46
|
+
options = DETECT_LINES_DEFAULTS.merge(options)
|
47
|
+
|
37
48
|
pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
|
38
|
-
|
49
|
+
page = pdf_file.getDocumentCatalog.getAllPages[page_number - 1]
|
50
|
+
bi = Tabula::Render.pageToBufferedImage(page,
|
51
|
+
options[:image_size])
|
39
52
|
pdf_file.close
|
40
|
-
detect_lines(bi,
|
53
|
+
detect_lines(bi,
|
54
|
+
options[:scale_factor] || (page.findCropBox.width / options[:image_size]))
|
41
55
|
end
|
42
56
|
|
43
57
|
# image can be either a string (path to image) or a Java::JavaAwtImage::BufferedImage
|
44
58
|
# image to pixels: http://stackoverflow.com/questions/6524196/java-get-pixel-array-from-image
|
45
59
|
def LSD.detect_lines(image, scale_factor=1)
|
60
|
+
|
46
61
|
bimage = if image.class == Java::JavaAwtImage::BufferedImage
|
47
62
|
image
|
48
63
|
elsif image.class == String
|
data/lib/tabula/version.rb
CHANGED
metadata
CHANGED
@@ -2,14 +2,14 @@
|
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.5.0
|
5
|
+
version: 0.5.1
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Manuel Aristarán
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-06-
|
12
|
+
date: 2013-06-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: minitest
|
@@ -84,11 +84,13 @@ files:
|
|
84
84
|
- ext/Makefile.linux32
|
85
85
|
- ext/Makefile.linux64
|
86
86
|
- ext/Makefile.mingw
|
87
|
+
- ext/Makefile.mingw64
|
87
88
|
- ext/liblsd-linux32.so
|
88
89
|
- ext/liblsd-linux64.so
|
89
90
|
- ext/liblsd.def
|
90
91
|
- ext/liblsd.dll
|
91
92
|
- ext/liblsd.dylib
|
93
|
+
- ext/liblsd64.dll
|
92
94
|
- ext/lsd.c
|
93
95
|
- ext/lsd.h
|
94
96
|
- lib/tabula.rb
|