tabula-extractor 0.6.4-java → 0.6.5-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -43,6 +43,18 @@ module Tabula
43
43
  :image_size => 2048
44
44
  }
45
45
 
46
+ def LSD.detect_lines_in_pdf(pdf_path, options={})
47
+ options = DETECT_LINES_DEFAULTS.merge(options)
48
+
49
+ pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
50
+ lines = pdf_file.getDocumentCatalog.getAllPages.to_a.map do |page|
51
+ bi = Tabula::Render.pageToBufferedImage(page, options[:image_size])
52
+ detect_lines(bi, options[:scale_factor] || (page.findCropBox.width / options[:image_size]))
53
+ end
54
+ pdf_file.close
55
+ lines
56
+ end
57
+
46
58
  def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
47
59
  options = DETECT_LINES_DEFAULTS.merge(options)
48
60
 
@@ -67,10 +79,6 @@ module Tabula
67
79
  raise ArgumentError, 'image must be a string or a BufferedImage'
68
80
  end
69
81
 
70
- ImageIO.write(bimage,
71
- 'png',
72
- java.io.File.new("/tmp/white.png"))
73
-
74
82
  image = LSD.image_to_image_double(bimage)
75
83
 
76
84
  lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
@@ -40,8 +40,6 @@ module Tabula
40
40
  self.characters = []; self.fonts = {}
41
41
  end
42
42
 
43
-
44
-
45
43
  def processTextPosition(text)
46
44
  # return if text.getCharacter == ' '
47
45
 
@@ -182,6 +182,10 @@ module Tabula
182
182
  default_options = {:separators => []}
183
183
  options = default_options.merge(options)
184
184
 
185
+ if text_elements.empty?
186
+ return []
187
+ end
188
+
185
189
  extractor = TableExtractor.new(text_elements, options).text_elements
186
190
  lines = group_by_lines(text_elements)
187
191
  top = lines[0].text_elements.map(&:top).min
@@ -211,11 +215,11 @@ module Tabula
211
215
  end
212
216
  end
213
217
 
214
- table.lines.map do |l|
218
+ table.lines.map { |l|
215
219
  l.text_elements.map! { |te|
216
220
  te.nil? ? TextElement.new(nil, nil, nil, nil, nil, nil, '', nil) : te
217
221
  }
218
- end
222
+ }.sort_by { |l| l.map { |te| te.top or 0 }.max }
219
223
 
220
224
  end
221
225
 
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.6.4'
2
+ VERSION = '0.6.5'
3
3
  end
@@ -11,6 +11,7 @@ Gem::Specification.new do |s|
11
11
  s.homepage = "https://github.com/jazzido/tabula-extractor"
12
12
  s.summary = %q{extract tables from PDF files}
13
13
  s.description = %q{extract tables from PDF files}
14
+ s.license = 'MIT'
14
15
 
15
16
  s.platform = 'java'
16
17
 
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.6.4
5
+ version: 0.6.5
6
6
  platform: java
7
7
  authors:
8
8
  - Manuel Aristarán
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2013-07-09 00:00:00.000000000 Z
14
+ date: 2013-07-23 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: minitest
@@ -133,7 +133,8 @@ files:
133
133
  - test/data/tabla_subsidios.pdf
134
134
  - test/tests.rb
135
135
  homepage: https://github.com/jazzido/tabula-extractor
136
- licenses: []
136
+ licenses:
137
+ - MIT
137
138
  post_install_message:
138
139
  rdoc_options: []
139
140
  require_paths: