tabula-extractor 0.5.0-java → 0.5.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -1,3 +1,5 @@
1
+ .DS_Store
2
+ *.swp
1
3
  *.gem
2
4
  *.rbc
3
5
  .bundle
@@ -0,0 +1,10 @@
1
+ include Makefile.defaults
2
+
3
+ #CC = /usr/local/gcc-4.8.0-qt-4.8.4-for-mingw32/win32-gcc/bin/i586-mingw32-gcc
4
+ CC = /usr/bin/x86_64-w64-mingw32-gcc
5
+ CFLAGS := -Wall -Werror
6
+
7
+ lib: lib$(NAME).$(VERSION).dll
8
+
9
+ lib$(NAME).$(VERSION).dll: $(NAME).o
10
+ $(CC) -shared -o lib$(NAME)64.dll liblsd.def $^
data/ext/liblsd64.dll ADDED
Binary file
@@ -99,8 +99,8 @@ module Tabula
99
99
 
100
100
  # spaces are not detected, b/c they have height == 0
101
101
  # ze = ZoneEntity.new(area[0], area[1], area[3] - area[1], area[2] - area[0])
102
- # self.texts.select { |t| t.overlaps? ze }
103
- self.texts.select { |t|
102
+ # self.texts.select { |t| t.overlaps? ze }
103
+ self.texts.select { |t|
104
104
  t.top > area[0] && t.top + t.height < area[2] && t.left > area[1] && t.left + t.width < area[3]
105
105
  }
106
106
  end
@@ -152,7 +152,7 @@ module Tabula
152
152
  down_tolerance = 0.95
153
153
 
154
154
  dist = self.horizontal_distance(other).abs
155
-
155
+
156
156
  rv = overlaps && (dist.between?(self.width_of_space * down_tolerance, self.width_of_space + up_tolerance))
157
157
  rv
158
158
  end
@@ -287,20 +287,31 @@ module Tabula
287
287
 
288
288
  horiz = rulings.select { |r| r.horizontal? && r.width > max_distance }
289
289
  .group_by(&:top)
290
- .values.reduce([]) { |memo, rs|
291
- rs = rs.sort_by(&:left)
292
-
293
- memo << if rs.size > 1
294
- Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
295
- else
296
- rs.first
297
- end
290
+ .values.reduce([]) do |memo, rs|
298
291
 
299
- }
292
+ rs = rs.sort_by(&:left)
293
+ if rs.size > 1
294
+ memo +=
295
+ rs.each_cons(2)
296
+ .chunk { |p| p[1].left - p[0].right < 7 }
297
+ .select { |c| c[0] }
298
+ .map { |group|
299
+ group = group.last.flatten.uniq
300
+ Tabula::Ruling.new(group[0].top,
301
+ group[0].left,
302
+ group[-1].right - group[0].left,
303
+ 0)
304
+ }
305
+ Tabula::Ruling.new(rs[0].top, rs[0].left, rs[-1].right - rs[0].left, 0)
306
+ else
307
+ memo << rs.first
308
+ end
309
+ memo
310
+ end
300
311
  .sort_by(&:top)
301
312
 
302
313
  h = []
303
- horiz.size.times do |i|
314
+ horiz.size.times do |i|
304
315
 
305
316
  if i == horiz.size - 1
306
317
  h << horiz[-1]
@@ -308,7 +319,7 @@ module Tabula
308
319
  end
309
320
 
310
321
  if skip
311
- skip = false;
322
+ skip = false;
312
323
  next
313
324
  end
314
325
  d = (horiz[i+1].top - horiz[i].top).abs
@@ -324,41 +335,55 @@ module Tabula
324
335
 
325
336
  vert = rulings.select { |r| r.vertical? && r.height > max_distance }
326
337
  .group_by(&:left)
327
- .values.reduce([]) { |memo, rs|
328
-
329
- rs = rs.sort_by(&:top)
330
- memo << if rs.size > 1
331
- Tabula::Ruling.new(rs[0].top, rs[0].left, 0, rs[-1].bottom - rs[0].top)
332
- else rs.first
333
- rs.first
334
- end
335
- }
336
- .sort_by(&:left)
337
-
338
- v = []
339
- vert.size.times do |i|
340
-
341
- if i == vert.size - 1
342
- v << vert[-1]
343
- break
344
- end
345
-
346
- if skip
347
- skip = false;
348
- next
349
- end
350
- d = (vert[i+1].left - vert[i].left).abs
338
+ .values
339
+ .reduce([]) do |memo, rs|
340
+
341
+ rs = rs.sort_by(&:top)
342
+
343
+ if rs.size > 1
344
+ # Here be dragons:
345
+ # merge consecutive segments of lines that are close enough
346
+ memo +=
347
+ rs.each_cons(2)
348
+ .chunk { |p| p[1].top - p[0].bottom < 7 }
349
+ .select { |c| c[0] }
350
+ .map { |group|
351
+ group = group.last.flatten.uniq
352
+ Tabula::Ruling.new(group[0].top,
353
+ group[0].left,
354
+ 0,
355
+ group[-1].bottom - group[0].top)
356
+ }
357
+ else
358
+ memo << rs.first
359
+ end
360
+ memo
361
+ end.sort_by(&:left)
362
+
363
+ # v = []
364
+
365
+ # vert.size.times do |i|
366
+ # if i == vert.size - 1
367
+ # v << vert[-1]
368
+ # break
369
+ # end
370
+
371
+ # if skip
372
+ # skip = false;
373
+ # next
374
+ # end
375
+ # d = (vert[i+1].left - vert[i].left).abs
376
+
377
+ # v << if d < 4 # THRESHOLD DISTANCE between vertical lines
378
+ # skip = true
379
+ # Tabula::Ruling.new([vert[i+1].top, vert[i].top].min, vert[i].left + d / 2, 0, [vert[i+1].height.abs, vert[i].height.abs].max)
380
+ # else
381
+ # vert[i]
382
+ # end
383
+ # end
384
+ # vert = v
351
385
 
352
- v << if d < 4 # THRESHOLD DISTANCE between vertical lines
353
- skip = true
354
- Tabula::Ruling.new([vert[i+1].top, vert[i].top].min, vert[i].left + d / 2, 0, [vert[i+1].height.abs, vert[i].height.abs].max)
355
- else
356
- vert[i]
357
- end
358
- end
359
- vert = v
360
386
 
361
-
362
387
  # - only keep horizontal rulings that intersect with at least one vertical ruling
363
388
  # - only keep vertical rulings that intersect with at least one horizontal ruling
364
389
  # yeah, it's a naive heuristic. but hey, it works.
@@ -16,7 +16,11 @@ module Tabula
16
16
  extend FFI::Library
17
17
  ffi_lib File.expand_path('../../ext/' + case RbConfig::CONFIG['host_os']
18
18
  when /mswin|msys|mingw|cygwin|bccwin|wince|emc/
19
- 'liblsd.dll'
19
+ if RbConfig::CONFIG['host_cpu'] == 'x86_64'
20
+ 'liblsd64.dll'
21
+ else
22
+ 'liblsd.dll'
23
+ end
20
24
  when /darwin|mac os/
21
25
  'liblsd.dylib'
22
26
  when /linux/
@@ -33,16 +37,27 @@ module Tabula
33
37
  attach_function :lsd, [ :pointer, :buffer_in, :int, :int ], :pointer
34
38
  attach_function :free_values, [ :pointer ], :void
35
39
 
36
- def LSD.detect_lines_in_pdf_page(pdf_path, page_number, scale_factor=1)
40
+ DETECT_LINES_DEFAULTS = {
41
+ :scale_factor => nil,
42
+ :image_size => 2048
43
+ }
44
+
45
+ def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
46
+ options = DETECT_LINES_DEFAULTS.merge(options)
47
+
37
48
  pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
38
- bi = Tabula::Render.pageToBufferedImage(pdf_file.getDocumentCatalog.getAllPages[page_number - 1])
49
+ page = pdf_file.getDocumentCatalog.getAllPages[page_number - 1]
50
+ bi = Tabula::Render.pageToBufferedImage(page,
51
+ options[:image_size])
39
52
  pdf_file.close
40
- detect_lines(bi,scale_factor)
53
+ detect_lines(bi,
54
+ options[:scale_factor] || (page.findCropBox.width / options[:image_size]))
41
55
  end
42
56
 
43
57
  # image can be either a string (path to image) or a Java::JavaAwtImage::BufferedImage
44
58
  # image to pixels: http://stackoverflow.com/questions/6524196/java-get-pixel-array-from-image
45
59
  def LSD.detect_lines(image, scale_factor=1)
60
+
46
61
  bimage = if image.class == Java::JavaAwtImage::BufferedImage
47
62
  image
48
63
  elsif image.class == String
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.5.0'
2
+ VERSION = '0.5.1'
3
3
  end
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.5.0
5
+ version: 0.5.1
6
6
  platform: java
7
7
  authors:
8
8
  - Manuel Aristarán
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-06-12 00:00:00.000000000 Z
12
+ date: 2013-06-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: minitest
@@ -84,11 +84,13 @@ files:
84
84
  - ext/Makefile.linux32
85
85
  - ext/Makefile.linux64
86
86
  - ext/Makefile.mingw
87
+ - ext/Makefile.mingw64
87
88
  - ext/liblsd-linux32.so
88
89
  - ext/liblsd-linux64.so
89
90
  - ext/liblsd.def
90
91
  - ext/liblsd.dll
91
92
  - ext/liblsd.dylib
93
+ - ext/liblsd64.dll
92
94
  - ext/lsd.c
93
95
  - ext/lsd.h
94
96
  - lib/tabula.rb