tabula-extractor 0.6.4-java → 0.6.5-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/tabula/line_segment_detector.rb +12 -4
- data/lib/tabula/pdf_dump.rb +0 -2
- data/lib/tabula/table_extractor.rb +6 -2
- data/lib/tabula/version.rb +1 -1
- data/tabula-extractor.gemspec +1 -0
- metadata +4 -3
@@ -43,6 +43,18 @@ module Tabula
|
|
43
43
|
:image_size => 2048
|
44
44
|
}
|
45
45
|
|
46
|
+
def LSD.detect_lines_in_pdf(pdf_path, options={})
|
47
|
+
options = DETECT_LINES_DEFAULTS.merge(options)
|
48
|
+
|
49
|
+
pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
|
50
|
+
lines = pdf_file.getDocumentCatalog.getAllPages.to_a.map do |page|
|
51
|
+
bi = Tabula::Render.pageToBufferedImage(page, options[:image_size])
|
52
|
+
detect_lines(bi, options[:scale_factor] || (page.findCropBox.width / options[:image_size]))
|
53
|
+
end
|
54
|
+
pdf_file.close
|
55
|
+
lines
|
56
|
+
end
|
57
|
+
|
46
58
|
def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
|
47
59
|
options = DETECT_LINES_DEFAULTS.merge(options)
|
48
60
|
|
@@ -67,10 +79,6 @@ module Tabula
|
|
67
79
|
raise ArgumentError, 'image must be a string or a BufferedImage'
|
68
80
|
end
|
69
81
|
|
70
|
-
ImageIO.write(bimage,
|
71
|
-
'png',
|
72
|
-
java.io.File.new("/tmp/white.png"))
|
73
|
-
|
74
82
|
image = LSD.image_to_image_double(bimage)
|
75
83
|
|
76
84
|
lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
|
data/lib/tabula/pdf_dump.rb
CHANGED
@@ -182,6 +182,10 @@ module Tabula
|
|
182
182
|
default_options = {:separators => []}
|
183
183
|
options = default_options.merge(options)
|
184
184
|
|
185
|
+
if text_elements.empty?
|
186
|
+
return []
|
187
|
+
end
|
188
|
+
|
185
189
|
extractor = TableExtractor.new(text_elements, options).text_elements
|
186
190
|
lines = group_by_lines(text_elements)
|
187
191
|
top = lines[0].text_elements.map(&:top).min
|
@@ -211,11 +215,11 @@ module Tabula
|
|
211
215
|
end
|
212
216
|
end
|
213
217
|
|
214
|
-
table.lines.map
|
218
|
+
table.lines.map { |l|
|
215
219
|
l.text_elements.map! { |te|
|
216
220
|
te.nil? ? TextElement.new(nil, nil, nil, nil, nil, nil, '', nil) : te
|
217
221
|
}
|
218
|
-
|
222
|
+
}.sort_by { |l| l.map { |te| te.top or 0 }.max }
|
219
223
|
|
220
224
|
end
|
221
225
|
|
data/lib/tabula/version.rb
CHANGED
data/tabula-extractor.gemspec
CHANGED
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.6.4
|
5
|
+
version: 0.6.5
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Manuel Aristarán
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2013-07-
|
14
|
+
date: 2013-07-23 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: minitest
|
@@ -133,7 +133,8 @@ files:
|
|
133
133
|
- test/data/tabla_subsidios.pdf
|
134
134
|
- test/tests.rb
|
135
135
|
homepage: https://github.com/jazzido/tabula-extractor
|
136
|
-
licenses:
|
136
|
+
licenses:
|
137
|
+
- MIT
|
137
138
|
post_install_message:
|
138
139
|
rdoc_options: []
|
139
140
|
require_paths:
|