tabula-extractor 0.5.0-java → 0.5.1-java

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -1,3 +1,5 @@
1
+ .DS_Store
2
+ *.swp
1
3
  *.gem
2
4
  *.rbc
3
5
  .bundle
@@ -0,0 +1,10 @@
1
+ include Makefile.defaults
2
+
3
+ #CC = /usr/local/gcc-4.8.0-qt-4.8.4-for-mingw32/win32-gcc/bin/i586-mingw32-gcc
4
+ CC = /usr/bin/x86_64-w64-mingw32-gcc
5
+ CFLAGS := -Wall -Werror
6
+
7
+ lib: lib$(NAME).$(VERSION).dll
8
+
9
+ lib$(NAME).$(VERSION).dll: $(NAME).o
10
+ $(CC) -shared -o lib$(NAME)64.dll liblsd.def $^
data/ext/liblsd64.dll ADDED
Binary file
@@ -99,8 +99,8 @@ module Tabula
99
99
 
100
100
  # spaces are not detected, b/c they have height == 0
101
101
  # ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
102
- # self.texts.select { |t| t.overlaps? ze }
103
- self.texts.select { |t|
102
+ # self.texts.select { |t| t.overlaps? ze }
103
+ self.texts.select { |t|
104
104
  t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
105
105
  }
106
106
  end
@@ -152,7 +152,7 @@ module Tabula
152
152
  down_tolerance = 0.95
153
153
 
154
154
  dist = self.horizontal_distance(other).abs
155
-
155
+
156
156
  rv = overlaps && (dist.between?(self.width_of_space * down_tolerance, self.width_of_space + up_tolerance))
157
157
  rv
158
158
  end
@@ -287,20 +287,31 @@ module Tabula
287
287
 
288
288
  horiz = rulings.select { |r| r.horizontal? && r.width > max_distance }
289
289
  .group_by(&:top)
290
- .values.reduce([]) { |memo, rs|
291
- rs = rs.sort_by(&:left)
292
-
293
- memo << if rs.size > 1
294
- Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
295
- else
296
- rs.first
297
- end
290
+ .values.reduce([]) do |memo, rs|
298
291
 
299
- }
292
+ rs = rs.sort_by(&:left)
293
+ if rs.size > 1
294
+ memo +=
295
+ rs.each_cons(2)
296
+ .chunk { |p| p[1].left - p[0].right < 7 }
297
+ .select { |c| c[0] }
298
+ .map { |group|
299
+ group = group.last.flatten.uniq
300
+ Tabula::Ruling.new(group[0].top,
301
+ group[0].left,
302
+ group[-1].right - group[0].left,
303
+ 0)
304
+ }
305
+ Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
306
+ else
307
+ memo << rs.first
308
+ end
309
+ memo
310
+ end
300
311
  .sort_by(&:top)
301
312
 
302
313
  h = []
303
- horiz.size.times do |i|
314
+ horiz.size.times do |i|
304
315
 
305
316
  if i == horiz.size - 1
306
317
  h << horiz[-1]
@@ -308,7 +319,7 @@ module Tabula
308
319
  end
309
320
 
310
321
  if skip
311
- skip = false;
322
+ skip = false;
312
323
  next
313
324
  end
314
325
  d = (horiz[i+1].top - horiz[i].top).abs
@@ -324,41 +335,55 @@ module Tabula
324
335
 
325
336
  vert = rulings.select { |r| r.vertical? && r.height > max_distance }
326
337
  .group_by(&:left)
327
- .values.reduce([]) { |memo, rs|
328
-
329
- rs = rs.sort_by(&:top)
330
- memo << if rs.size > 1
331
- Tabula::Ruling.new(rs[0].top, rs[0].left, 0, rs[-1].bottom - rs[0].top)
332
- else rs.first
333
- rs.first
334
- end
335
- }
336
- .sort_by(&:left)
337
-
338
- v = []
339
- vert.size.times do |i|
340
-
341
- if i == vert.size - 1
342
- v << vert[-1]
343
- break
344
- end
345
-
346
- if skip
347
- skip = false;
348
- next
349
- end
350
- d = (vert[i+1].left - vert[i].left).abs
338
+ .values
339
+ .reduce([]) do |memo, rs|
340
+
341
+ rs = rs.sort_by(&:top)
342
+
343
+ if rs.size > 1
344
+ # Here be dragons:
345
+ # merge consecutive segments of lines that are close enough
346
+ memo +=
347
+ rs.each_cons(2)
348
+ .chunk { |p| p[1].top - p[0].bottom < 7 }
349
+ .select { |c| c[0] }
350
+ .map { |group|
351
+ group = group.last.flatten.uniq
352
+ Tabula::Ruling.new(group[0].top,
353
+ group[0].left,
354
+ 0,
355
+ group[-1].bottom - group[0].top)
356
+ }
357
+ else
358
+ memo << rs.first
359
+ end
360
+ memo
361
+ end.sort_by(&:left)
362
+
363
+ # v = []
364
+
365
+ # vert.size.times do |i|
366
+ # if i == vert.size - 1
367
+ # v << vert[-1]
368
+ # break
369
+ # end
370
+
371
+ # if skip
372
+ # skip = false;
373
+ # next
374
+ # end
375
+ # d = (vert[i+1].left - vert[i].left).abs
376
+
377
+ # v << if d < 4 # THRESHOLD DISTANCE between vertical lines
378
+ # skip = true
379
+ # Tabula::Ruling.new([vert[i+1].top, vert[i].top].min, vert[i].left + d / 2, 0, [vert[i+1].height.abs, vert[i].height.abs].max)
380
+ # else
381
+ # vert[i]
382
+ # end
383
+ # end
384
+ # vert = v
351
385
 
352
- v << if d < 4 # THRESHOLD DISTANCE between vertical lines
353
- skip = true
354
- Tabula::Ruling.new([vert[i+1].top, vert[i].top].min, vert[i].left + d / 2, 0, [vert[i+1].height.abs, vert[i].height.abs].max)
355
- else
356
- vert[i]
357
- end
358
- end
359
- vert = v
360
386
 
361
-
362
387
  # - only keep horizontal rulings that intersect with at least one vertical ruling
363
388
  # - only keep vertical rulings that intersect with at least one horizontal ruling
364
389
  # yeah, it's a naive heuristic. but hey, it works.
@@ -16,7 +16,11 @@ module Tabula
16
16
  extend FFI::Library
17
17
  ffi_lib File.expand_path('../../ext/' + case RbConfig::CONFIG['host_os']
18
18
  when /mswin|msys|mingw|cygwin|bccwin|wince|emc/
19
- 'liblsd.dll'
19
+ if RbConfig::CONFIG['host_cpu'] == 'x86_64'
20
+ 'liblsd64.dll'
21
+ else
22
+ 'liblsd.dll'
23
+ end
20
24
  when /darwin|mac os/
21
25
  'liblsd.dylib'
22
26
  when /linux/
@@ -33,16 +37,27 @@ module Tabula
33
37
  attach_function :lsd, [ :pointer, :buffer_in, :int, :int ], :pointer
34
38
  attach_function :free_values, [ :pointer ], :void
35
39
 
36
- def LSD.detect_lines_in_pdf_page(pdf_path, page_number, scale_factor=1)
40
+ DETECT_LINES_DEFAULTS = {
41
+ :scale_factor => nil,
42
+ :image_size => 2048
43
+ }
44
+
45
+ def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
46
+ options = DETECT_LINES_DEFAULTS.merge(options)
47
+
37
48
  pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
38
- bi = Tabula::Render.pageToBufferedImage(pdf_file.getDocumentCatalog.getAllPages[page_number - 1])
49
+ page = pdf_file.getDocumentCatalog.getAllPages[page_number - 1]
50
+ bi = Tabula::Render.pageToBufferedImage(page,
51
+ options[:image_size])
39
52
  pdf_file.close
40
- detect_lines(bi,scale_factor)
53
+ detect_lines(bi,
54
+ options[:scale_factor] || (page.findCropBox.width / options[:image_size]))
41
55
  end
42
56
 
43
57
  # image can be either a string (path to image) or a Java::JavaAwtImage::BufferedImage
44
58
  # image to pixels: http://stackoverflow.com/questions/6524196/java-get-pixel-array-from-image
45
59
  def LSD.detect_lines(image, scale_factor=1)
60
+
46
61
  bimage = if image.class == Java::JavaAwtImage::BufferedImage
47
62
  image
48
63
  elsif image.class == String
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.5.0'
2
+ VERSION = '0.5.1'
3
3
  end
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.5.0
5
+ version: 0.5.1
6
6
  platform: java
7
7
  authors:
8
8
  - Manuel Aristarán
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-06-12 00:00:00.000000000 Z
12
+ date: 2013-06-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: minitest
@@ -84,11 +84,13 @@ files:
84
84
  - ext/Makefile.linux32
85
85
  - ext/Makefile.linux64
86
86
  - ext/Makefile.mingw
87
+ - ext/Makefile.mingw64
87
88
  - ext/liblsd-linux32.so
88
89
  - ext/liblsd-linux64.so
89
90
  - ext/liblsd.def
90
91
  - ext/liblsd.dll
91
92
  - ext/liblsd.dylib
93
+ - ext/liblsd64.dll
92
94
  - ext/lsd.c
93
95
  - ext/lsd.h
94
96
  - lib/tabula.rb