tabula-extractor 0.6.1-java → 0.6.3-java

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake'
6
6
  Bundler::GemHelper.install_tasks
7
7
 
8
8
  task :test do
9
- ruby 'test/tests.rb'
9
+ ruby %{-J-Xmx512m test/tests.rb}
10
10
  end
11
11
 
12
12
  task :default => [:test]
data/bin/tabula CHANGED
@@ -61,7 +61,13 @@ def main
61
61
  out = opts[:outfile] == '-' ? $stdout : File.new(opts[:outfile], 'w')
62
62
  extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]))
63
63
  extractor.extract.each_with_index do |page, page_index|
64
- page_areas = opts[:guess] ? Tabula::TableGuesser::find_rects_on_page(Tabula::TableGuesser::load_pdf(filename), page_index) : [area]
64
+ if opts[:guess]
65
+ lines = Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(filename, page_index))
66
+ page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
67
+ page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
68
+ else
69
+ page_areas = [area]
70
+ end
65
71
 
66
72
  page_areas.each do |page_area|
67
73
  text = page.get_text( page_area )
@@ -20,7 +20,8 @@ module Tabula
20
20
  end
21
21
  end
22
22
 
23
- TRANSPARENT_WHITE = java.awt.Color.new(255, 255, 255, 0)
23
+ #ugh jruby; suppresses "ambiguous method" warning that arises due to Java's overloaded constructor.
24
+ TRANSPARENT_WHITE = java.awt.Color.java_class.constructor(Java::int, Java::int, Java::int, Java::int).new_instance(255, 255, 255, 0)
24
25
 
25
26
  # 2048 width is important, if this is too small, thin lines won't be drawn.
26
27
  def self.pageToBufferedImage(page, width=2048, pageDrawerClass=PageDrawerNoText)
@@ -30,8 +31,10 @@ module Tabula
30
31
  rotation = java.lang.Math.toRadians(page.findRotation)
31
32
 
32
33
  scaling = width / (rotation == 0 ? widthPt : heightPt)
33
- widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
34
-
34
+ #widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
35
+ widthPx, heightPx = (java.lang.Math.java_send :round, [Java::float], widthPt * scaling ), (java.lang.Math.java_send :round, [Java::float], heightPt * scaling)
36
+
37
+
35
38
  retval = if rotation != 0
36
39
  BufferedImage.new(heightPx, widthPx, BufferedImage::TYPE_BYTE_GRAY)
37
40
  else
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.6.1'
2
+ VERSION = '0.6.3'
3
3
  end
data/test/tests.rb CHANGED
@@ -16,14 +16,14 @@ class TestPagesInfoExtractor < Minitest::Test
16
16
 
17
17
  i = 0
18
18
  extractor.pages.each do |page|
19
- assert_instance_of Tabula::Page, page
19
+ assert_instance_of Tabula::Page, page
20
20
  i += 1
21
21
  end
22
22
  assert_equal 2, i
23
23
  end
24
24
  end
25
25
 
26
- class TestTableGuesser < MiniTest::Unit::TestCase
26
+ class TestTableGuesser < Minitest::Test
27
27
  end
28
28
 
29
29
  class TestDumper < Minitest::Test
@@ -63,8 +63,8 @@ class TestExtractor < Minitest::Test
63
63
  end
64
64
 
65
65
  def test_forest_disclosure_report_dont_regress
66
- # this is the current state of the expected output. Ideally the output should be like
67
- # test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
66
+ # this is the current state of the expected output. Ideally the output should be like
67
+ # test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
68
68
  # and a solution for half-x-height-offset lines.
69
69
  pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
70
70
  character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
@@ -82,7 +82,7 @@ class TestExtractor < Minitest::Test
82
82
  ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
83
83
  ['TOTAL', '', '', '','$20.39'],
84
84
  ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
85
- ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
85
+ ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
86
86
  ['TOTAL', '', '', '', '$5,010.33'],
87
87
  ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
88
88
  ['TOTAL', '', '', '', '$193.67'],
@@ -101,13 +101,14 @@ class TestExtractor < Minitest::Test
101
101
  characters = character_extractor.extract.next.get_text([170, 28, 185, 833])
102
102
  #top left bottom right
103
103
  expected = [
104
- ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
104
+ ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
105
105
  ]
106
106
 
107
107
  assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
108
108
  end
109
109
 
110
110
  def test_forest_disclosure_report
111
+ skip "Skipping until we support multiline cells"
111
112
  pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
112
113
  character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
113
114
  lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.6.1
5
+ version: 0.6.3
6
6
  platform: java
7
7
  authors:
8
8
  - Manuel Aristarán
@@ -11,23 +11,21 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2013-06-18 00:00:00.000000000 Z
14
+ date: 2013-06-29 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: minitest
18
18
  version_requirements: !ruby/object:Gem::Requirement
19
19
  requirements:
20
- - - ">="
20
+ - - '>='
21
21
  - !ruby/object:Gem::Version
22
- version: !binary |-
23
- MA==
22
+ version: '0'
24
23
  none: false
25
24
  requirement: !ruby/object:Gem::Requirement
26
25
  requirements:
27
- - - ">="
26
+ - - '>='
28
27
  - !ruby/object:Gem::Version
29
- version: !binary |-
30
- MA==
28
+ version: '0'
31
29
  none: false
32
30
  prerelease: false
33
31
  type: :development
@@ -35,13 +33,13 @@ dependencies:
35
33
  name: bundler
36
34
  version_requirements: !ruby/object:Gem::Requirement
37
35
  requirements:
38
- - - ">="
36
+ - - '>='
39
37
  - !ruby/object:Gem::Version
40
38
  version: 1.3.4
41
39
  none: false
42
40
  requirement: !ruby/object:Gem::Requirement
43
41
  requirements:
44
- - - ">="
42
+ - - '>='
45
43
  - !ruby/object:Gem::Version
46
44
  version: 1.3.4
47
45
  none: false
@@ -51,13 +49,13 @@ dependencies:
51
49
  name: trollop
52
50
  version_requirements: !ruby/object:Gem::Requirement
53
51
  requirements:
54
- - - "~>"
52
+ - - ~>
55
53
  - !ruby/object:Gem::Version
56
54
  version: '2.0'
57
55
  none: false
58
56
  requirement: !ruby/object:Gem::Requirement
59
57
  requirements:
60
- - - "~>"
58
+ - - ~>
61
59
  - !ruby/object:Gem::Version
62
60
  version: '2.0'
63
61
  none: false
@@ -71,8 +69,8 @@ executables:
71
69
  extensions: []
72
70
  extra_rdoc_files: []
73
71
  files:
74
- - ".gitignore"
75
- - ".travis.yml"
72
+ - .gitignore
73
+ - .travis.yml
76
74
  - AUTHORS.md
77
75
  - Gemfile
78
76
  - LICENSE.md
@@ -126,23 +124,15 @@ require_paths:
126
124
  - lib
127
125
  required_ruby_version: !ruby/object:Gem::Requirement
128
126
  requirements:
129
- - - ">="
127
+ - - '>='
130
128
  - !ruby/object:Gem::Version
131
- segments:
132
- - 0
133
- hash: 2
134
- version: !binary |-
135
- MA==
129
+ version: '0'
136
130
  none: false
137
131
  required_rubygems_version: !ruby/object:Gem::Requirement
138
132
  requirements:
139
- - - ">="
133
+ - - '>='
140
134
  - !ruby/object:Gem::Version
141
- segments:
142
- - 0
143
- hash: 2
144
- version: !binary |-
145
- MA==
135
+ version: '0'
146
136
  none: false
147
137
  requirements: []
148
138
  rubyforge_project: