tabula-extractor 0.6.4-java → 0.6.5-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -43,6 +43,18 @@ module Tabula
43
43
  :image_size => 2048
44
44
  }
45
45
 
46
+ def LSD.detect_lines_in_pdf(pdf_path, options={})
47
+ options = DETECT_LINES_DEFAULTS.merge(options)
48
+
49
+ pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
50
+ lines = pdf_file.getDocumentCatalog.getAllPages.to_a.map do |page|
51
+ bi = Tabula::Render.pageToBufferedImage(page, options[:image_size])
52
+ detect_lines(bi, options[:scale_factor] || (page.findCropBox.width / options[:image_size]))
53
+ end
54
+ pdf_file.close
55
+ lines
56
+ end
57
+
46
58
  def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
47
59
  options = DETECT_LINES_DEFAULTS.merge(options)
48
60
 
@@ -67,10 +79,6 @@ module Tabula
67
79
  raise ArgumentError, 'image must be a string or a BufferedImage'
68
80
  end
69
81
 
70
- ImageIO.write(bimage,
71
- 'png',
72
- java.io.File.new("/tmp/white.png"))
73
-
74
82
  image = LSD.image_to_image_double(bimage)
75
83
 
76
84
  lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
@@ -40,8 +40,6 @@ module Tabula
40
40
  self.characters = []; self.fonts = {}
41
41
  end
42
42
 
43
-
44
-
45
43
  def processTextPosition(text)
46
44
  # return if text.getCharacter == ' '
47
45
 
@@ -182,6 +182,10 @@ module Tabula
182
182
  default_options = {:separators => []}
183
183
  options = default_options.merge(options)
184
184
 
185
+ if text_elements.empty?
186
+ return []
187
+ end
188
+
185
189
  extractor = TableExtractor.new(text_elements, options).text_elements
186
190
  lines = group_by_lines(text_elements)
187
191
  top = lines[0].text_elements.map(&:top).min
@@ -211,11 +215,11 @@ module Tabula
211
215
  end
212
216
  end
213
217
 
214
- table.lines.map do |l|
218
+ table.lines.map { |l|
215
219
  l.text_elements.map! { |te|
216
220
  te.nil? ? TextElement.new(nil, nil, nil, nil, nil, nil, '', nil) : te
217
221
  }
218
- end
222
+ }.sort_by { |l| l.map { |te| te.top or 0 }.max }
219
223
 
220
224
  end
221
225
 
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.6.4'
2
+ VERSION = '0.6.5'
3
3
  end
@@ -11,6 +11,7 @@ Gem::Specification.new do |s|
11
11
  s.homepage = "https://github.com/jazzido/tabula-extractor"
12
12
  s.summary = %q{extract tables from PDF files}
13
13
  s.description = %q{extract tables from PDF files}
14
+ s.license = 'MIT'
14
15
 
15
16
  s.platform = 'java'
16
17
 
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.6.4
5
+ version: 0.6.5
6
6
  platform: java
7
7
  authors:
8
8
  - Manuel Aristarán
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2013-07-09 00:00:00.000000000 Z
14
+ date: 2013-07-23 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: minitest
@@ -133,7 +133,8 @@ files:
133
133
  - test/data/tabla_subsidios.pdf
134
134
  - test/tests.rb
135
135
  homepage: https://github.com/jazzido/tabula-extractor
136
- licenses: []
136
+ licenses:
137
+ - MIT
137
138
  post_install_message:
138
139
  rdoc_options: []
139
140
  require_paths: