tabula-extractor 0.6.4-java → 0.6.5-java
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/tabula/line_segment_detector.rb +12 -4
- data/lib/tabula/pdf_dump.rb +0 -2
- data/lib/tabula/table_extractor.rb +6 -2
- data/lib/tabula/version.rb +1 -1
- data/tabula-extractor.gemspec +1 -0
- metadata +4 -3
data/lib/tabula/line_segment_detector.rb
CHANGED
@@ -43,6 +43,18 @@ module Tabula
|
|
43
43
|
:image_size => 2048
|
44
44
|
}
|
45
45
|
|
46
|
+
def LSD.detect_lines_in_pdf(pdf_path, options={})
|
47
|
+
options = DETECT_LINES_DEFAULTS.merge(options)
|
48
|
+
|
49
|
+
pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
|
50
|
+
lines = pdf_file.getDocumentCatalog.getAllPages.to_a.map do |page|
|
51
|
+
bi = Tabula::Render.pageToBufferedImage(page, options[:image_size])
|
52
|
+
detect_lines(bi, options[:scale_factor] || (page.findCropBox.width / options[:image_size]))
|
53
|
+
end
|
54
|
+
pdf_file.close
|
55
|
+
lines
|
56
|
+
end
|
57
|
+
|
46
58
|
def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
|
47
59
|
options = DETECT_LINES_DEFAULTS.merge(options)
|
48
60
|
|
@@ -67,10 +79,6 @@ module Tabula
|
|
67
79
|
raise ArgumentError, 'image must be a string or a BufferedImage'
|
68
80
|
end
|
69
81
|
|
70
|
-
ImageIO.write(bimage,
|
71
|
-
'png',
|
72
|
-
java.io.File.new("/tmp/white.png"))
|
73
|
-
|
74
82
|
image = LSD.image_to_image_double(bimage)
|
75
83
|
|
76
84
|
lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
|
data/lib/tabula/pdf_dump.rb
CHANGED
data/lib/tabula/table_extractor.rb
CHANGED
@@ -182,6 +182,10 @@ module Tabula
|
|
182
182
|
default_options = {:separators => []}
|
183
183
|
options = default_options.merge(options)
|
184
184
|
|
185
|
+
if text_elements.empty?
|
186
|
+
return []
|
187
|
+
end
|
188
|
+
|
185
189
|
extractor = TableExtractor.new(text_elements, options).text_elements
|
186
190
|
lines = group_by_lines(text_elements)
|
187
191
|
top = lines[0].text_elements.map(&:top).min
|
@@ -211,11 +215,11 @@ module Tabula
|
|
211
215
|
end
|
212
216
|
end
|
213
217
|
|
214
|
-
table.lines.map
|
218
|
+
table.lines.map { |l|
|
215
219
|
l.text_elements.map! { |te|
|
216
220
|
te.nil? ? TextElement.new(nil, nil, nil, nil, nil, nil, '', nil) : te
|
217
221
|
}
|
218
|
-
|
222
|
+
}.sort_by { |l| l.map { |te| te.top or 0 }.max }
|
219
223
|
|
220
224
|
end
|
221
225
|
|
data/lib/tabula/version.rb
CHANGED
data/tabula-extractor.gemspec
CHANGED
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.6.4
|
5
|
+
version: 0.6.5
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Manuel Aristarán
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2013-07-
|
14
|
+
date: 2013-07-23 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: minitest
|
@@ -133,7 +133,8 @@ files:
|
|
133
133
|
- test/data/tabla_subsidios.pdf
|
134
134
|
- test/tests.rb
|
135
135
|
homepage: https://github.com/jazzido/tabula-extractor
|
136
|
-
licenses: []
|
136
|
+
licenses:
|
137
|
+
- MIT
|
137
138
|
post_install_message:
|
138
139
|
rdoc_options: []
|
139
140
|
require_paths:
|