tabula-extractor 0.6.1-java → 0.6.3-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake'
6
6
  Bundler::GemHelper.install_tasks
7
7
 
8
8
  task :test do
9
- ruby 'test/tests.rb'
9
+ ruby %{-J-Xmx512m test/tests.rb}
10
10
  end
11
11
 
12
12
  task :default => [:test]
data/bin/tabula CHANGED
@@ -61,7 +61,13 @@ def main
61
61
  out = opts[:outfile] == '-' ? $stdout : File.new(opts[:outfile], 'w')
62
62
  extractor = Tabula::Extraction::CharacterExtractor.new(filename, parse_pages_arg(opts[:pages]))
63
63
  extractor.extract.each_with_index do |page, page_index|
64
- page_areas = opts[:guess] ? Tabula::TableGuesser::find_rects_on_page(Tabula::TableGuesser::load_pdf(filename), page_index) : [area]
64
+ if opts[:guess]
65
+ lines = Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(filename, page_index))
66
+ page_areas = Tabula::TableGuesser::find_rects_from_lines(lines)
67
+ page_areas.map!{|rect| rect.dims(:top, :left, :bottom, :right)}
68
+ else
69
+ page_areas = [area]
70
+ end
65
71
 
66
72
  page_areas.each do |page_area|
67
73
  text = page.get_text( page_area )
@@ -20,7 +20,8 @@ module Tabula
20
20
  end
21
21
  end
22
22
 
23
- TRANSPARENT_WHITE = java.awt.Color.new(255, 255, 255, 0)
23
+ #ugh jruby; suppresses "ambiguous method" warning that arises due to Java's overloaded constructor.
24
+ TRANSPARENT_WHITE = java.awt.Color.java_class.constructor(Java::int, Java::int, Java::int, Java::int).new_instance(255, 255, 255, 0)
24
25
 
25
26
  # 2048 width is important, if this is too small, thin lines won't be drawn.
26
27
  def self.pageToBufferedImage(page, width=2048, pageDrawerClass=PageDrawerNoText)
@@ -30,8 +31,10 @@ module Tabula
30
31
  rotation = java.lang.Math.toRadians(page.findRotation)
31
32
 
32
33
  scaling = width / (rotation == 0 ? widthPt : heightPt)
33
- widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
34
-
34
+ #widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
35
+ widthPx, heightPx = (java.lang.Math.java_send :round, [Java::float], widthPt * scaling ), (java.lang.Math.java_send :round, [Java::float], heightPt * scaling)
36
+
37
+
35
38
  retval = if rotation != 0
36
39
  BufferedImage.new(heightPx, widthPx, BufferedImage::TYPE_BYTE_GRAY)
37
40
  else
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.6.1'
2
+ VERSION = '0.6.3'
3
3
  end
data/test/tests.rb CHANGED
@@ -16,14 +16,14 @@ class TestPagesInfoExtractor < Minitest::Test
16
16
 
17
17
  i = 0
18
18
  extractor.pages.each do |page|
19
- assert_instance_of Tabula::Page, page
19
+ assert_instance_of Tabula::Page, page
20
20
  i += 1
21
21
  end
22
22
  assert_equal 2, i
23
23
  end
24
24
  end
25
25
 
26
- class TestTableGuesser < MiniTest::Unit::TestCase
26
+ class TestTableGuesser < Minitest::Test
27
27
  end
28
28
 
29
29
  class TestDumper < Minitest::Test
@@ -63,8 +63,8 @@ class TestExtractor < Minitest::Test
63
63
  end
64
64
 
65
65
  def test_forest_disclosure_report_dont_regress
66
- # this is the current state of the expected output. Ideally the output should be like
67
- # test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
66
+ # this is the current state of the expected output. Ideally the output should be like
67
+ # test_forest_disclosure_report, with spaces around the & in Regional Pulmonary & Sleep
68
68
  # and a solution for half-x-height-offset lines.
69
69
  pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
70
70
  character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
@@ -82,7 +82,7 @@ class TestExtractor < Minitest::Test
82
82
  ['AARON, JOHN', '', 'CLARKSVILLE, TN', 'MEALS', '$20.39'],
83
83
  ['TOTAL', '', '', '','$20.39'],
84
84
  ['AARON, JOSHUA, N', '', 'WEST GROVE, PA', 'MEALS', '$310.33'],
85
- ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
85
+ ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
86
86
  ['TOTAL', '', '', '', '$5,010.33'],
87
87
  ['AARON, MAUREEN, M', '', 'MARTINSVILLE, VA', 'MEALS', '$193.67'],
88
88
  ['TOTAL', '', '', '', '$193.67'],
@@ -101,13 +101,14 @@ class TestExtractor < Minitest::Test
101
101
  characters = character_extractor.extract.next.get_text([170, 28, 185, 833])
102
102
  #top left bottom right
103
103
  expected = [
104
- ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
104
+ ["", "REGIONAL PULMONARY & SLEEP", "", "", ""], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE", "", "", ""],
105
105
  ]
106
106
 
107
107
  assert_equal expected, lines_to_array(Tabula.make_table_with_vertical_rulings(characters, :vertical_rulings => vertical_rulings))
108
108
  end
109
109
 
110
110
  def test_forest_disclosure_report
111
+ skip "Skipping until we support multiline cells"
111
112
  pdf_file_path = File.expand_path('data/frx_2012_disclosure.pdf', File.dirname(__FILE__))
112
113
  character_extractor = Tabula::Extraction::CharacterExtractor.new(pdf_file_path)
113
114
  lines = Tabula::TableGuesser.find_lines_on_page(pdf_file_path, 0)
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.6.1
5
+ version: 0.6.3
6
6
  platform: java
7
7
  authors:
8
8
  - Manuel Aristarán
@@ -11,23 +11,21 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2013-06-18 00:00:00.000000000 Z
14
+ date: 2013-06-29 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: minitest
18
18
  version_requirements: !ruby/object:Gem::Requirement
19
19
  requirements:
20
- - - ">="
20
+ - - '>='
21
21
  - !ruby/object:Gem::Version
22
- version: !binary |-
23
- MA==
22
+ version: '0'
24
23
  none: false
25
24
  requirement: !ruby/object:Gem::Requirement
26
25
  requirements:
27
- - - ">="
26
+ - - '>='
28
27
  - !ruby/object:Gem::Version
29
- version: !binary |-
30
- MA==
28
+ version: '0'
31
29
  none: false
32
30
  prerelease: false
33
31
  type: :development
@@ -35,13 +33,13 @@ dependencies:
35
33
  name: bundler
36
34
  version_requirements: !ruby/object:Gem::Requirement
37
35
  requirements:
38
- - - ">="
36
+ - - '>='
39
37
  - !ruby/object:Gem::Version
40
38
  version: 1.3.4
41
39
  none: false
42
40
  requirement: !ruby/object:Gem::Requirement
43
41
  requirements:
44
- - - ">="
42
+ - - '>='
45
43
  - !ruby/object:Gem::Version
46
44
  version: 1.3.4
47
45
  none: false
@@ -51,13 +49,13 @@ dependencies:
51
49
  name: trollop
52
50
  version_requirements: !ruby/object:Gem::Requirement
53
51
  requirements:
54
- - - "~>"
52
+ - - ~>
55
53
  - !ruby/object:Gem::Version
56
54
  version: '2.0'
57
55
  none: false
58
56
  requirement: !ruby/object:Gem::Requirement
59
57
  requirements:
60
- - - "~>"
58
+ - - ~>
61
59
  - !ruby/object:Gem::Version
62
60
  version: '2.0'
63
61
  none: false
@@ -71,8 +69,8 @@ executables:
71
69
  extensions: []
72
70
  extra_rdoc_files: []
73
71
  files:
74
- - ".gitignore"
75
- - ".travis.yml"
72
+ - .gitignore
73
+ - .travis.yml
76
74
  - AUTHORS.md
77
75
  - Gemfile
78
76
  - LICENSE.md
@@ -126,23 +124,15 @@ require_paths:
126
124
  - lib
127
125
  required_ruby_version: !ruby/object:Gem::Requirement
128
126
  requirements:
129
- - - ">="
127
+ - - '>='
130
128
  - !ruby/object:Gem::Version
131
- segments:
132
- - 0
133
- hash: 2
134
- version: !binary |-
135
- MA==
129
+ version: '0'
136
130
  none: false
137
131
  required_rubygems_version: !ruby/object:Gem::Requirement
138
132
  requirements:
139
- - - ">="
133
+ - - '>='
140
134
  - !ruby/object:Gem::Version
141
- segments:
142
- - 0
143
- hash: 2
144
- version: !binary |-
145
- MA==
135
+ version: '0'
146
136
  none: false
147
137
  requirements: []
148
138
  rubyforge_project: