tabula-extractor 0.6.1-java → 0.6.3-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/bin/tabula +7 -1
- data/lib/tabula/pdf_render.rb +6 -3
- data/lib/tabula/version.rb +1 -1
- data/test/tests.rb +7 -6
- metadata +16 -26
data/Rakefile
CHANGED
data/bin/tabula
CHANGED
@@ -61,7 +61,13 @@ def main
|
|
61
61
|
out = opts[:outfile] == '-' ? $stdout : File.new(opts[:outfile], 'w')
|
62
62
|
extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]))
|
63
63
|
extractor.extract.each_with_index do |page, page_index|
|
64
|
-
|
64
|
+
if opts[:guess]
|
65
|
+
lines = Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(filename, page_index))
|
66
|
+
page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
|
67
|
+
page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
|
68
|
+
else
|
69
|
+
page_areas = [area]
|
70
|
+
end
|
65
71
|
|
66
72
|
page_areas.each do |page_area|
|
67
73
|
text = page.get_text( page_area )
|
data/lib/tabula/pdf_render.rb
CHANGED
@@ -20,7 +20,8 @@ module Tabula
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
-
|
23
|
+
#ugh jruby; suppresses "ambiguous method" warning that arises due to Java's overloaded constructor.
|
24
|
+
TRANSPARENT_WHITE = java.awt.Color.java_class.constructor(Java::int, Java::int, Java::int, Java::int).new_instance(255, 255, 255, 0)
|
24
25
|
|
25
26
|
# 2048 width is important, if this is too small, thin lines won't be drawn.
|
26
27
|
def self.pageToBufferedImage(page, width=2048, pageDrawerClass=PageDrawerNoText)
|
@@ -30,8 +31,10 @@ module Tabula
|
|
30
31
|
rotation = java.lang.Math.toRadians(page.findRotation)
|
31
32
|
|
32
33
|
scaling = width / (rotation == 0 ? widthPt : heightPt)
|
33
|
-
widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
|
34
|
-
|
34
|
+
#widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
|
35
|
+
widthPx, heightPx = (java.lang.Math.java_send :round, [Java::float], widthPt * scaling ), (java.lang.Math.java_send :round, [Java::float], heightPt * scaling)
|
36
|
+
|
37
|
+
|
35
38
|
retval = if rotation != 0
|
36
39
|
BufferedImage.new(heightPx, widthPx, BufferedImage::TYPE_BYTE_GRAY)
|
37
40
|
else
|
data/lib/tabula/version.rb
CHANGED
data/test/tests.rb
CHANGED
@@ -16,14 +16,14 @@ class TestPagesInfoExtractor < Minitest::Test
|
|
16
16
|
|
17
17
|
i = 0
|
18
18
|
extractor.pages.each do |page|
|
19
|
-
assert_instance_of Tabula::Page, page
|
19
|
+
assert_instance_of Tabula::Page, page
|
20
20
|
i += 1
|
21
21
|
end
|
22
22
|
assert_equal 2, i
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
-
class TestTableGuesser <
|
26
|
+
class TestTableGuesser < Minitest::Test
|
27
27
|
end
|
28
28
|
|
29
29
|
class TestDumper < Minitest::Test
|
@@ -63,8 +63,8 @@ class TestExtractor < Minitest::Test
|
|
63
63
|
end
|
64
64
|
|
65
65
|
def test_forest_disclosure_report_dont_regress
|
66
|
-
# this is the current state of the expected output. Ideally the output should be like
|
67
|
-
# test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
|
66
|
+
# this is the current state of the expected output. Ideally the output should be like
|
67
|
+
# test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
|
68
68
|
# and a solution for half-x-height-offset lines.
|
69
69
|
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
70
70
|
character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
|
@@ -82,7 +82,7 @@ class TestExtractor < Minitest::Test
|
|
82
82
|
['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
|
83
83
|
['TOTAL', '', '', '','$20.39'],
|
84
84
|
['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
|
85
|
-
["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
|
85
|
+
["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
|
86
86
|
['TOTAL', '', '', '', '$5,010.33'],
|
87
87
|
['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
|
88
88
|
['TOTAL', '', '', '', '$193.67'],
|
@@ -101,13 +101,14 @@ class TestExtractor < Minitest::Test
|
|
101
101
|
characters = character_extractor.extract.next.get_text([170, 28, 185, 833])
|
102
102
|
#top left bottom right
|
103
103
|
expected = [
|
104
|
-
["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
|
104
|
+
["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
|
105
105
|
]
|
106
106
|
|
107
107
|
assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
|
108
108
|
end
|
109
109
|
|
110
110
|
def test_forest_disclosure_report
|
111
|
+
skip "Skipping until we support multiline cells"
|
111
112
|
pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
|
112
113
|
character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
|
113
114
|
lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.6.1
|
5
|
+
version: 0.6.3
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- Manuel Aristarán
|
@@ -11,23 +11,21 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2013-06-
|
14
|
+
date: 2013-06-29 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: minitest
|
18
18
|
version_requirements: !ruby/object:Gem::Requirement
|
19
19
|
requirements:
|
20
|
-
- -
|
20
|
+
- - '>='
|
21
21
|
- !ruby/object:Gem::Version
|
22
|
-
version:
|
23
|
-
MA==
|
22
|
+
version: '0'
|
24
23
|
none: false
|
25
24
|
requirement: !ruby/object:Gem::Requirement
|
26
25
|
requirements:
|
27
|
-
- -
|
26
|
+
- - '>='
|
28
27
|
- !ruby/object:Gem::Version
|
29
|
-
version:
|
30
|
-
MA==
|
28
|
+
version: '0'
|
31
29
|
none: false
|
32
30
|
prerelease: false
|
33
31
|
type: :development
|
@@ -35,13 +33,13 @@ dependencies:
|
|
35
33
|
name: bundler
|
36
34
|
version_requirements: !ruby/object:Gem::Requirement
|
37
35
|
requirements:
|
38
|
-
- -
|
36
|
+
- - '>='
|
39
37
|
- !ruby/object:Gem::Version
|
40
38
|
version: 1.3.4
|
41
39
|
none: false
|
42
40
|
requirement: !ruby/object:Gem::Requirement
|
43
41
|
requirements:
|
44
|
-
- -
|
42
|
+
- - '>='
|
45
43
|
- !ruby/object:Gem::Version
|
46
44
|
version: 1.3.4
|
47
45
|
none: false
|
@@ -51,13 +49,13 @@ dependencies:
|
|
51
49
|
name: trollop
|
52
50
|
version_requirements: !ruby/object:Gem::Requirement
|
53
51
|
requirements:
|
54
|
-
- -
|
52
|
+
- - ~>
|
55
53
|
- !ruby/object:Gem::Version
|
56
54
|
version: '2.0'
|
57
55
|
none: false
|
58
56
|
requirement: !ruby/object:Gem::Requirement
|
59
57
|
requirements:
|
60
|
-
- -
|
58
|
+
- - ~>
|
61
59
|
- !ruby/object:Gem::Version
|
62
60
|
version: '2.0'
|
63
61
|
none: false
|
@@ -71,8 +69,8 @@ executables:
|
|
71
69
|
extensions: []
|
72
70
|
extra_rdoc_files: []
|
73
71
|
files:
|
74
|
-
-
|
75
|
-
-
|
72
|
+
- .gitignore
|
73
|
+
- .travis.yml
|
76
74
|
- AUTHORS.md
|
77
75
|
- Gemfile
|
78
76
|
- LICENSE.md
|
@@ -126,23 +124,15 @@ require_paths:
|
|
126
124
|
- lib
|
127
125
|
required_ruby_version: !ruby/object:Gem::Requirement
|
128
126
|
requirements:
|
129
|
-
- -
|
127
|
+
- - '>='
|
130
128
|
- !ruby/object:Gem::Version
|
131
|
-
|
132
|
-
- 0
|
133
|
-
hash: 2
|
134
|
-
version: !binary |-
|
135
|
-
MA==
|
129
|
+
version: '0'
|
136
130
|
none: false
|
137
131
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
138
132
|
requirements:
|
139
|
-
- -
|
133
|
+
- - '>='
|
140
134
|
- !ruby/object:Gem::Version
|
141
|
-
|
142
|
-
- 0
|
143
|
-
hash: 2
|
144
|
-
version: !binary |-
|
145
|
-
MA==
|
135
|
+
version: '0'
|
146
136
|
none: false
|
147
137
|
requirements: []
|
148
138
|
rubyforge_project:
|