tabula-extractor 0.6.1-java → 0.6.3-java
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -1
- data/bin/tabula +7 -1
- data/lib/tabula/pdf_render.rb +6 -3
- data/lib/tabula/version.rb +1 -1
- data/test/tests.rb +7 -6
- metadata +16 -26
data/Rakefile
CHANGED
data/bin/tabula
CHANGED
@@ -61,7 +61,13 @@ def main
|
|
61
61
|
out = opts[:outfile] == '-' ? $stdout : File.new(opts[:outfile], 'w')
|
62
62
|
extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]))
|
63
63
|
extractor.extract.each_with_index do |page, page_index|
|
64
|
-
|
64
|
+
if opts[:guess]
|
65
|
+
lines = Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(filename, page_index))
|
66
|
+
page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
|
67
|
+
page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
|
68
|
+
else
|
69
|
+
page_areas = [area]
|
70
|
+
end
|
65
71
|
|
66
72
|
page_areas.each do |page_area|
|
67
73
|
text = page.get_text( page_area )
|
data/lib/tabula/pdf_render.rb
CHANGED
@@ -20,7 +20,8 @@ module Tabula
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
-
|
23
|
+
#ugh jruby; suppresses "ambiguous method" warning that arises due to Java's overloaded constructor.
|
24
|
+
TRANSPARENT_WHITE = java.awt.Color.java_class.constructor(Java::int, Java::int, Java::int, Java::int).new_instance(255, 255, 255, 0)
|
24
25
|
|
25
26
|
# 2048 width is important, if this is too small, thin lines won't be drawn.
|
26
27
|
def self.pageToBufferedImage(page, width=2048, pageDrawerClass=PageDrawerNoText)
|
@@ -30,8 +31,10 @@ module Tabula
|
|
30
31
|
rotation = java.lang.Math.toRadians(page.findRotation)
|
31
32
|
|
32
33
|
scaling = width / (rotation == 0 ? widthPt : heightPt)
|
33
|
-
widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
|
34
|
-
|
34
|
+
#widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
|
35
|
+
widthPx, heightPx = (java.lang.Math.java_send :round, [Java::float], widthPt * scaling ), (java.lang.Math.java_send :round, [Java::float], heightPt * scaling)
|
36
|
+
|
37
|
+
|
35
38
|
retval = if rotation != 0
|
36
39
|
BufferedImage.new(heightPx, widthPx, BufferedImage::TYPE_BYTE_GRAY)
|
37
40
|
else
|
data/lib/tabula/version.rb
CHANGED
data/test/tests.rb
CHANGED
@@ -16,14 +16,14 @@ class TestPagesInfoExtractor < Minitest::Test
|
|
16
16
|
|
17
17
|
i = 0
|
18
18
|
extractor.pages.each do |page|
|
19
|
-
assert_instance_of Tabula::Page, page
|
19
|
+
assert_instance_of Tabula::Page, page
|
20
20
|
i += 1
|
21
21
|
end
|
22
22
|
assert_equal 2, i
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
-
class TestTableGuesser <
|
26
|
+
class TestTableGuesser < Minitest::Test
|
27
27
|
end
|
28
28
|
|
29
29
|
class TestDumper < Minitest::Test
|
@@ -63,8 +63,8 @@ class TestExtractor < Minitest::Test
|
|
63
63
|
end
|
64
64
|
|
65
65
|
def test_forest_disclosure_report_dont_regress
|
66
|
-
# this is the current state of the expected output. Ideally the output should be like
|
67
|
-
# test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
|
66
|
+
# this is the current state of the expected output. Ideally the output should be like
|
67
|
+
# test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
|
68
68
|
# and a solution for half-x-height-offset lines.
|
69
69
|
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
70
70
|
character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
|
@@ -82,7 +82,7 @@ class TestExtractor < Minitest::Test
|
|
82
82
|
['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
|
83
83
|
['TOTAL', '', '', '','$20.39'],
|
84
84
|
['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
|
85
|
-
["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
|
85
|
+
["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
|
86
86
|
['TOTAL', '', '', '', '$5,010.33'],
|
87
87
|
['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
|
88
88
|
['TOTAL', '', '', '', '$193.67'],
|
@@ -101,13 +101,14 @@ class TestExtractor < Minitest::Test
|
|
101
101
|
characters = character_extractor.extract.next.get_text([170, 28, 185, 833])
|
102
102
|
#top left bottom right
|
103
103
|
expected = [
|
104
|
-
["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
|
104
|
+
["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
|
105
105
|
]
|
106
106
|
|
107
107
|
assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
|
108
108
|
end
|
109
109
|
|
110
110
|
def test_forest_disclosure_report
|
111
|
+
skip "Skipping until we support multiline cells"
|
111
112
|
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
112
113
|
character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
|
113
114
|
lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.6.1
|
5
|
+
version: 0.6.3
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Manuel Aristarán
|
@@ -11,23 +11,21 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2013-06-
|
14
|
+
date: 2013-06-29 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: minitest
|
18
18
|
version_requirements: !ruby/object:Gem::Requirement
|
19
19
|
requirements:
|
20
|
-
- -
|
20
|
+
- - '>='
|
21
21
|
- !ruby/object:Gem::Version
|
22
|
-
version:
|
23
|
-
MA==
|
22
|
+
version: '0'
|
24
23
|
none: false
|
25
24
|
requirement: !ruby/object:Gem::Requirement
|
26
25
|
requirements:
|
27
|
-
- -
|
26
|
+
- - '>='
|
28
27
|
- !ruby/object:Gem::Version
|
29
|
-
version:
|
30
|
-
MA==
|
28
|
+
version: '0'
|
31
29
|
none: false
|
32
30
|
prerelease: false
|
33
31
|
type: :development
|
@@ -35,13 +33,13 @@ dependencies:
|
|
35
33
|
name: bundler
|
36
34
|
version_requirements: !ruby/object:Gem::Requirement
|
37
35
|
requirements:
|
38
|
-
- -
|
36
|
+
- - '>='
|
39
37
|
- !ruby/object:Gem::Version
|
40
38
|
version: 1.3.4
|
41
39
|
none: false
|
42
40
|
requirement: !ruby/object:Gem::Requirement
|
43
41
|
requirements:
|
44
|
-
- -
|
42
|
+
- - '>='
|
45
43
|
- !ruby/object:Gem::Version
|
46
44
|
version: 1.3.4
|
47
45
|
none: false
|
@@ -51,13 +49,13 @@ dependencies:
|
|
51
49
|
name: trollop
|
52
50
|
version_requirements: !ruby/object:Gem::Requirement
|
53
51
|
requirements:
|
54
|
-
- -
|
52
|
+
- - ~>
|
55
53
|
- !ruby/object:Gem::Version
|
56
54
|
version: '2.0'
|
57
55
|
none: false
|
58
56
|
requirement: !ruby/object:Gem::Requirement
|
59
57
|
requirements:
|
60
|
-
- -
|
58
|
+
- - ~>
|
61
59
|
- !ruby/object:Gem::Version
|
62
60
|
version: '2.0'
|
63
61
|
none: false
|
@@ -71,8 +69,8 @@ executables:
|
|
71
69
|
extensions: []
|
72
70
|
extra_rdoc_files: []
|
73
71
|
files:
|
74
|
-
-
|
75
|
-
-
|
72
|
+
- .gitignore
|
73
|
+
- .travis.yml
|
76
74
|
- AUTHORS.md
|
77
75
|
- Gemfile
|
78
76
|
- LICENSE.md
|
@@ -126,23 +124,15 @@ require_paths:
|
|
126
124
|
- lib
|
127
125
|
required_ruby_version: !ruby/object:Gem::Requirement
|
128
126
|
requirements:
|
129
|
-
- -
|
127
|
+
- - '>='
|
130
128
|
- !ruby/object:Gem::Version
|
131
|
-
|
132
|
-
- 0
|
133
|
-
hash: 2
|
134
|
-
version: !binary |-
|
135
|
-
MA==
|
129
|
+
version: '0'
|
136
130
|
none: false
|
137
131
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
138
132
|
requirements:
|
139
|
-
- -
|
133
|
+
- - '>='
|
140
134
|
- !ruby/object:Gem::Version
|
141
|
-
|
142
|
-
- 0
|
143
|
-
hash: 2
|
144
|
-
version: !binary |-
|
145
|
-
MA==
|
135
|
+
version: '0'
|
146
136
|
none: false
|
147
137
|
requirements: []
|
148
138
|
rubyforge_project:
|