tabula-extractor 0.7.2-java → 0.7.4-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +4 -8
  4. data/bin/tabula +3 -3
  5. data/lib/tabula.rb +9 -5
  6. data/lib/tabula/entities.rb +1 -0
  7. data/lib/tabula/entities/cell.rb +6 -4
  8. data/lib/tabula/entities/has_cells.rb +22 -78
  9. data/lib/tabula/entities/line.rb +52 -6
  10. data/lib/tabula/entities/page.rb +43 -50
  11. data/lib/tabula/entities/ruling.rb +83 -105
  12. data/lib/tabula/entities/spreadsheet.rb +74 -11
  13. data/lib/tabula/entities/table.rb +55 -37
  14. data/lib/tabula/entities/tabular.rb +42 -0
  15. data/lib/tabula/entities/text_chunk.rb +55 -52
  16. data/lib/tabula/entities/text_element.rb +129 -62
  17. data/lib/tabula/entities/zone_entity.rb +15 -6
  18. data/lib/tabula/extraction.rb +114 -49
  19. data/lib/tabula/line_segment_detector.rb +0 -5
  20. data/lib/tabula/table_extractor.rb +32 -37
  21. data/lib/tabula/version.rb +1 -1
  22. data/tabula-extractor.gemspec +2 -5
  23. metadata +13 -95
  24. data/ext/COPYING +0 -661
  25. data/ext/Makefile.OSX +0 -18
  26. data/ext/Makefile.defaults +0 -9
  27. data/ext/Makefile.linux32 +0 -11
  28. data/ext/Makefile.linux64 +0 -12
  29. data/ext/Makefile.mingw +0 -10
  30. data/ext/Makefile.mingw64 +0 -10
  31. data/ext/liblsd-linux32.so +0 -0
  32. data/ext/liblsd-linux64.so +0 -0
  33. data/ext/liblsd.def +0 -3
  34. data/ext/liblsd.dll +0 -0
  35. data/ext/liblsd.dylib +0 -0
  36. data/ext/liblsd64.dll +0 -0
  37. data/ext/lsd.c +0 -2270
  38. data/ext/lsd.h +0 -283
  39. data/test/data/47008204D_USA.page4.pdf +0 -0
  40. data/test/data/560015757GV_China.page1.pdf +0 -0
  41. data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
  42. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  43. data/test/data/S2MNCEbirdisland.pdf +0 -0
  44. data/test/data/argentina_diputados_voting_record.pdf +0 -0
  45. data/test/data/bo_page24.pdf +0 -0
  46. data/test/data/campaign_donors.pdf +0 -0
  47. data/test/data/frx_2012_disclosure.pdf +0 -0
  48. data/test/data/frx_2012_disclosure.tsv +0 -88
  49. data/test/data/gre.pdf +0 -0
  50. data/test/data/no_tables.pdf +0 -0
  51. data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
  52. data/test/data/puertos1.pdf +0 -0
  53. data/test/data/spanning_cells.csv +0 -21
  54. data/test/data/spanning_cells.pdf +0 -0
  55. data/test/data/strongschools.pdf +0 -0
  56. data/test/data/sydney_disclosure_contract.pdf +0 -0
  57. data/test/data/tabla_subsidios.pdf +0 -0
  58. data/test/data/vertical_rulings_bug.pdf +0 -0
  59. data/test/data/vietnam3.pdf +0 -0
  60. data/test/data/wc2012.pdf +0 -0
  61. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  62. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  63. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  64. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  65. data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
  66. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  67. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  68. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  69. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  70. data/test/heuristic.rb +0 -50
  71. data/test/test_bin_tabula.sh +0 -7
  72. data/test/tests.rb +0 -603
@@ -31,6 +31,8 @@ module Tabula
31
31
  :extract_ruling_lines => true
32
32
  }
33
33
 
34
+ # TODO: the +pages+ constructor argument does not make sense
35
+ # now that we have +extract_page+ and +extract_pages+
34
36
  def initialize(pdf_filename, pages=[1], password='', options={})
35
37
  raise Errno::ENOENT unless File.exists?(pdf_filename)
36
38
  @pdf_filename = pdf_filename
@@ -47,39 +49,67 @@ module Tabula
47
49
  @transformed_clipping_path = nil
48
50
  self.clipping_paths = []
49
51
  @rulings = []
50
- @min_char_width = @min_char_height = 1000000
52
+ @min_char_width = @min_char_height = Float::MAX
51
53
  end
52
54
 
53
- def extract
55
+ def close!
56
+ self.ensure_open!
57
+ @pdf_file.close
58
+ @pdf_file_closed = true
59
+ end
60
+
61
+ def ensure_open!
62
+ raise "Document is closed" if @pdf_file_closed
63
+ end
64
+
65
+ ##
66
+ # extract objects from a page. Returns an instance of +Tabula::Page+
67
+ # (+page_number+ is 1-based. i.e., first page is number 1)
68
+ def extract_page(page_number)
69
+ self.ensure_open!
70
+
71
+ if page_number-1 >= @all_pages.size || (page_number) < 0
72
+ raise IndexError, "Page #{page_number} doesn't exist. Skipping. Valid pages are 1..#{@all_pages.size}"
73
+ end
74
+
75
+ page = @all_pages.get(page_number-1)
76
+ contents = page.getContents
77
+ return nil if contents.nil?
78
+
79
+ self.clear!
80
+ self.drawPage(page)
81
+ Tabula::Page.new(@pdf_filename,
82
+ page.findCropBox.width,
83
+ page.findCropBox.height,
84
+ page.getRotation.to_i,
85
+ page_number, #one-indexed, just like +page_number+ is.
86
+ self.characters,
87
+ self.rulings,
88
+ @min_char_width,
89
+ @min_char_height)
90
+ end
91
+
92
+ def extract(pages=nil)
93
+ self.ensure_open!
94
+ pages = if pages == :all
95
+ (1..@all_pages.size)
96
+ elsif pages.nil?
97
+ @pages
98
+ else
99
+ pages
100
+ end
101
+
54
102
  Enumerator.new do |y|
55
- begin
56
- @pages.each do |i|
57
- page = @all_pages.get(i-1)
58
- contents = page.getContents
59
- next if contents.nil?
60
-
61
- self.clear!
62
- self.drawPage(page)
63
- p = Tabula::Page.new(@pdf_filename,
64
- page.findCropBox.width,
65
- page.findCropBox.height,
66
- page.getRotation.to_i,
67
- i, #one-indexed, just like `i` is.
68
- self.characters,
69
- self.rulings,
70
- @min_char_width,
71
- @min_char_height)
72
- y.yield p
73
- end
74
- ensure
75
- @pdf_file.close
76
- end # begin
103
+ pages.each do |i|
104
+ y.yield self.extract_page(i)
105
+ end
77
106
  end
78
107
  end
79
108
 
80
109
  def clear!
81
110
  self.characters.clear
82
111
  self.clipping_paths.clear
112
+ @min_char_width = @min_char_height = Float::MAX
83
113
  @page_transform = nil
84
114
  @rulings.clear
85
115
  end
@@ -118,8 +148,14 @@ module Tabula
118
148
 
119
149
  path = self.pathToList(self.getLinePath)
120
150
 
151
+ # skip paths whose first operation is not a MOVETO
152
+ # or contains operations other than LINETO, MOVETO or CLOSE
121
153
  if path[0][0] != java.awt.geom.PathIterator::SEG_MOVETO \
122
- || path[1..-1].any? { |p| p.first != java.awt.geom.PathIterator::SEG_LINETO && p.first != java.awt.geom.PathIterator::SEG_MOVETO && p.first != java.awt.geom.PathIterator::SEG_CLOSE }
154
+ || path[1..-1].any? { |p|
155
+ p.first != java.awt.geom.PathIterator::SEG_LINETO \
156
+ && p.first != java.awt.geom.PathIterator::SEG_MOVETO \
157
+ && p.first != java.awt.geom.PathIterator::SEG_CLOSE
158
+ }
123
159
  self.getLinePath.reset
124
160
  return
125
161
  end
@@ -129,26 +165,57 @@ module Tabula
129
165
  strokeColorComps = filter_by_color || self.getGraphicsState.getStrokingColor.getJavaColor.getRGBColorComponents(nil)
130
166
  color_filter = self.options[:line_color_filter]
131
167
 
168
+ if !color_filter.nil? && !color_filter.call(strokeColorComps)
169
+ self.getLinePath.reset
170
+ return
171
+ end
172
+
173
+ # skip the first path operation save it as the starting position
132
174
  first = path.shift
133
- start_pos = java.awt.geom.Point2D::Float.new(first[1][0], first[1][1])
175
+ # last_move
176
+ start_pos = last_move = java.awt.geom.Point2D::Float.new(first[1][0], first[1][1])
177
+
178
+ end_pos = nil
134
179
 
135
180
  path.each do |p|
136
- end_pos = java.awt.geom.Point2D::Float.new(p[1][0], p[1][1])
137
- line = (start_pos <=> end_pos) == -1 \
138
- ? java.awt.geom.Line2D::Float.new(start_pos, end_pos) \
139
- : java.awt.geom.Line2D::Float.new(end_pos, start_pos)
140
-
141
- if p[0] == java.awt.geom.PathIterator::SEG_LINETO \
142
- && (color_filter.nil? ? true : color_filter.call(strokeColorComps)) \
143
- && line.intersects(ccp_bounds)
144
- # convert line to rectangle for clipping it to the current clippath
145
- # sucks, but awt doesn't have methods for this
146
- tmp = line.getBounds2D.createIntersection(ccp_bounds).getBounds2D
147
- @rulings << ::Tabula::Ruling.new(tmp.getY,
148
- tmp.getX,
149
- tmp.getWidth,
150
- tmp.getHeight,
151
- filter_by_color.to_a)
181
+ case p[0]
182
+ when java.awt.geom.PathIterator::SEG_LINETO
183
+ end_pos = java.awt.geom.Point2D::Float.new(p[1][0], p[1][1])
184
+ line = (start_pos <=> end_pos) == -1 \
185
+ ? java.awt.geom.Line2D::Float.new(start_pos, end_pos) \
186
+ : java.awt.geom.Line2D::Float.new(end_pos, start_pos)
187
+
188
+ if line.intersects(ccp_bounds)
189
+ # convert line to rectangle for clipping it to the current clippath
190
+ # sucks, but awt doesn't have methods for this
191
+ tmp = line.getBounds2D.createIntersection(ccp_bounds).getBounds2D
192
+ @rulings << ::Tabula::Ruling.new(tmp.getY,
193
+ tmp.getX,
194
+ tmp.getWidth,
195
+ tmp.getHeight,
196
+ filter_by_color.to_a)
197
+ end
198
+ when java.awt.geom.PathIterator::SEG_MOVETO
199
+ last_move = java.awt.geom.Point2D::Float.new(p[1][0], p[1][1])
200
+ when java.awt.geom.PathIterator::SEG_CLOSE
201
+ # according to PathIterator docs:
202
+ # "the preceding subpath should be closed by appending a line segment
203
+ # back to the point corresponding to the most recent SEG_MOVETO."
204
+
205
+ line = (end_pos <=> last_move) == -1 \
206
+ ? java.awt.geom.Line2D::Float.new(end_pos, last_move) \
207
+ : java.awt.geom.Line2D::Float.new(last_move, end_pos)
208
+
209
+ if line.intersects(ccp_bounds)
210
+ # convert line to rectangle for clipping it to the current clippath
211
+ # sucks, but awt doesn't have methods for this
212
+ tmp = line.getBounds2D.createIntersection(ccp_bounds).getBounds2D
213
+ @rulings << ::Tabula::Ruling.new(tmp.getY,
214
+ tmp.getX,
215
+ tmp.getWidth,
216
+ tmp.getHeight,
217
+ filter_by_color.to_a)
218
+ end
152
219
  end
153
220
  start_pos = end_pos
154
221
  end
@@ -201,22 +268,21 @@ module Tabula
201
268
  c = text.getCharacter
202
269
  h = text.getHeightDir.round(2)
203
270
 
204
- if c == ' ' || c == ' ' # replace non-breaking space for space
271
+ if c == ' ' # replace non-breaking space for space
205
272
  c = ' '
206
- h = text.getWidth.round(2)
207
273
  end
208
274
 
209
275
  te = Tabula::TextElement.new(text.getY.round(2) - h,
210
276
  text.getX.round(2),
211
- text.getWidth.round(2),
277
+ text.getWidthDirAdj,
212
278
  # ugly hack follows: we need spaces to have a height, so we can
213
279
  # test for vertical overlap. height == width seems a safe bet.
214
- h,
280
+ text.getHeightDir,
215
281
  text.getFont,
216
- text.getFontSize.round(2),
282
+ text.getFontSize,
217
283
  c,
218
284
  # workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
219
- text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace,
285
+ (text.getWidthOfSpace.nan? || text.getWidthOfSpace == 0) ? self.currentSpaceWidth : text.getWidthOfSpace,
220
286
  text.getDir)
221
287
 
222
288
  ccp_bounds = self.currentClippingPath
@@ -246,7 +312,6 @@ module Tabula
246
312
  end
247
313
 
248
314
  def rulings
249
- return [] if @rulings.empty?
250
315
  @rulings.reject { |l| (l.left == l.right && l.top == l.bottom) || [l.top, l.left, l.bottom, l.right].any? { |p| p < 0 } }
251
316
  end
252
317
 
@@ -1,11 +1,6 @@
1
- require 'java'
2
1
  require 'rbconfig'
3
-
4
2
  require 'ffi'
5
3
 
6
- require_relative './entities'
7
- require_relative './pdf_render'
8
- require_relative './extraction'
9
4
 
10
5
  java_import javax.imageio.ImageIO
11
6
  java_import java.awt.image.BufferedImage
@@ -42,10 +42,12 @@ module Tabula
42
42
  page = [page]
43
43
  end
44
44
 
45
- pdf_page = Extraction::ObjectExtractor.new(pdf_path,
46
- page,
47
- options[:password]) \
48
- .extract.next
45
+ extractor = Extraction::ObjectExtractor.new(pdf_path,
46
+ page,
47
+ options[:password])
48
+
49
+ pdf_page = extractor.extract.next
50
+ extractor.close!
49
51
 
50
52
  if ["spreadsheet", "original"].include? options[:extraction_method]
51
53
  use_spreadsheet_extraction_method = options[:extraction_method] == "spreadsheet"
@@ -54,39 +56,32 @@ module Tabula
54
56
  end
55
57
 
56
58
  if use_spreadsheet_extraction_method
57
- table = pdf_page.get_area(area).spreadsheets.inject(&:+)
58
- else
59
- use_detected_lines = false
60
- if options[:detect_ruling_lines] && options[:vertical_rulings].empty?
61
- detected_vertical_rulings = Ruling.crop_rulings_to_area(pdf_page.vertical_ruling_lines,
62
- area)
63
-
64
- # only use lines if at least 80% of them cover at least 90%
65
- # of the height of area of interest
66
-
67
- # TODO this heuristic SUCKS
68
- # what if only a couple columns is delimited with vertical rulings?
69
- # ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
70
- # idea: detect columns without considering rulings, detect vertical rulings
71
- # calculate ratio and try to come up with a threshold
72
- use_detected_lines = detected_vertical_rulings.size > 2 \
73
- && (detected_vertical_rulings.count { |vl|
74
- vl.height / area.height > 0.9
75
- } / detected_vertical_rulings.size.to_f) >= 0.8
76
-
77
- end
78
-
79
- table = pdf_page.get_area(area).get_table(:vertical_rulings => use_detected_lines ? detected_vertical_rulings : options[:vertical_rulings])
80
-
81
- # fixes up the table a little bit, replacing nils with empty TextElements
82
- # and sorting the lines.
83
- table.lines.each do |l|
84
- l.text_elements = l.text_elements.map do |te|
85
- te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
86
- end
87
- end
88
- table.lines.sort_by! { |l| l.text_elements.map { |te| te.top or 0 }.max }
89
- table
59
+ return (spreadsheets = pdf_page.get_area(area).spreadsheets).empty? ? Spreadsheet.empty(pdf_page) : spreadsheets.inject(&:+)
90
60
  end
61
+
62
+ use_detected_lines = false
63
+ if options[:detect_ruling_lines] && options[:vertical_rulings].empty?
64
+ detected_vertical_rulings = Ruling.crop_rulings_to_area(pdf_page.vertical_ruling_lines,
65
+ area)
66
+
67
+ # only use lines if at least 80% of them cover at least 90%
68
+ # of the height of area of interest
69
+
70
+ # TODO this heuristic SUCKS
71
+ # what if only a couple columns is delimited with vertical rulings?
72
+ # ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
73
+ # idea: detect columns without considering rulings, detect vertical rulings
74
+ # calculate ratio and try to come up with a threshold
75
+ use_detected_lines = detected_vertical_rulings.size > 2 \
76
+ && (detected_vertical_rulings.count { |vl|
77
+ vl.height / area.height > 0.9
78
+ } / detected_vertical_rulings.size.to_f) >= 0.8
79
+
80
+ end
81
+
82
+ pdf_page
83
+ .get_area(area)
84
+ .get_table(:vertical_rulings => use_detected_lines ? detected_vertical_rulings : options[:vertical_rulings])
85
+
91
86
  end
92
87
  end
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.7.2'
2
+ VERSION = '0.7.4'
3
3
  end
@@ -15,17 +15,14 @@ Gem::Specification.new do |s|
15
15
 
16
16
  s.platform = 'java'
17
17
 
18
- shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll', 'liblsd64.dll'].map { |f| 'ext/' + f }
19
- s.files = `git ls-files`.split("\n") + shared_libs.map.reject { |f| !File.exists?(f) }
20
- s.test_files = `git ls-files -- {test,features}/*`.split("\n")
18
+ s.files = `git ls-files`.split("\n").reject { |f| f =~ /^test\// }
21
19
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
22
20
  s.require_paths = ["lib"]
23
21
 
24
- s.add_development_dependency 'minitest'
25
22
  s.add_development_dependency 'bundler', '>= 1.3.4'
26
23
  s.add_development_dependency 'ruby-debug'
27
24
  s.add_development_dependency 'pry'
25
+ s.add_development_dependency 'minitest'
28
26
 
29
27
  s.add_runtime_dependency "trollop", ["~> 2.0"]
30
- # s.add_runtime_dependency "algorithms", ["~> 0.6.1"]
31
28
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.2
4
+ version: 0.7.4
5
5
  platform: java
6
6
  authors:
7
7
  - Manuel Aristarán
@@ -10,38 +10,38 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2014-01-20 00:00:00.000000000 Z
13
+ date: 2014-05-09 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
- name: minitest
16
+ name: bundler
17
17
  version_requirements: !ruby/object:Gem::Requirement
18
18
  requirements:
19
19
  - - '>='
20
20
  - !ruby/object:Gem::Version
21
- version: '0'
21
+ version: 1.3.4
22
22
  requirement: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - '>='
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: 1.3.4
27
27
  prerelease: false
28
28
  type: :development
29
29
  - !ruby/object:Gem::Dependency
30
- name: bundler
30
+ name: ruby-debug
31
31
  version_requirements: !ruby/object:Gem::Requirement
32
32
  requirements:
33
33
  - - '>='
34
34
  - !ruby/object:Gem::Version
35
- version: 1.3.4
35
+ version: '0'
36
36
  requirement: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
- version: 1.3.4
40
+ version: '0'
41
41
  prerelease: false
42
42
  type: :development
43
43
  - !ruby/object:Gem::Dependency
44
- name: ruby-debug
44
+ name: pry
45
45
  version_requirements: !ruby/object:Gem::Requirement
46
46
  requirements:
47
47
  - - '>='
@@ -55,7 +55,7 @@ dependencies:
55
55
  prerelease: false
56
56
  type: :development
57
57
  - !ruby/object:Gem::Dependency
58
- name: pry
58
+ name: minitest
59
59
  version_requirements: !ruby/object:Gem::Requirement
60
60
  requirements:
61
61
  - - '>='
@@ -99,21 +99,6 @@ files:
99
99
  - README.md
100
100
  - Rakefile
101
101
  - bin/tabula
102
- - ext/COPYING
103
- - ext/Makefile.OSX
104
- - ext/Makefile.defaults
105
- - ext/Makefile.linux32
106
- - ext/Makefile.linux64
107
- - ext/Makefile.mingw
108
- - ext/Makefile.mingw64
109
- - ext/liblsd-linux32.so
110
- - ext/liblsd-linux64.so
111
- - ext/liblsd.def
112
- - ext/liblsd.dll
113
- - ext/liblsd.dylib
114
- - ext/liblsd64.dll
115
- - ext/lsd.c
116
- - ext/lsd.h
117
102
  - lib/tabula.rb
118
103
  - lib/tabula/core_ext.rb
119
104
  - lib/tabula/entities.rb
@@ -125,6 +110,7 @@ files:
125
110
  - lib/tabula/entities/ruling.rb
126
111
  - lib/tabula/entities/spreadsheet.rb
127
112
  - lib/tabula/entities/table.rb
113
+ - lib/tabula/entities/tabular.rb
128
114
  - lib/tabula/entities/text_chunk.rb
129
115
  - lib/tabula/entities/text_element.rb
130
116
  - lib/tabula/entities/text_element_index.rb
@@ -143,40 +129,6 @@ files:
143
129
  - target/pdfbox-app-2.0.0-SNAPSHOT.jar
144
130
  - target/slf4j-api-1.6.3.jar
145
131
  - target/trove4j-3.0.3.jar
146
- - test/data/47008204D_USA.page4.pdf
147
- - test/data/560015757GV_China.page1.pdf
148
- - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
149
- - test/data/GSK_2012_Q4.page437.pdf
150
- - test/data/S2MNCEbirdisland.pdf
151
- - test/data/argentina_diputados_voting_record.pdf
152
- - test/data/bo_page24.pdf
153
- - test/data/campaign_donors.pdf
154
- - test/data/frx_2012_disclosure.pdf
155
- - test/data/frx_2012_disclosure.tsv
156
- - test/data/gre.pdf
157
- - test/data/no_tables.pdf
158
- - test/data/nyc_2013fiscalreporttables.pdf
159
- - test/data/puertos1.pdf
160
- - test/data/spanning_cells.csv
161
- - test/data/spanning_cells.pdf
162
- - test/data/strongschools.pdf
163
- - test/data/sydney_disclosure_contract.pdf
164
- - test/data/tabla_subsidios.pdf
165
- - test/data/vertical_rulings_bug.pdf
166
- - test/data/vietnam3.pdf
167
- - test/data/wc2012.pdf
168
- - test/heuristic-test-set/original/560015757GV_China.page1.pdf
169
- - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
170
- - test/heuristic-test-set/original/bo_page24.pdf
171
- - test/heuristic-test-set/original/campaign_donors.pdf
172
- - test/heuristic-test-set/original/cs076pct.pdf
173
- - test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
174
- - test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
175
- - test/heuristic-test-set/spreadsheet/strongschools.pdf
176
- - test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
177
- - test/heuristic.rb
178
- - test/test_bin_tabula.sh
179
- - test/tests.rb
180
132
  homepage: https://github.com/jazzido/tabula-extractor
181
133
  licenses:
182
134
  - MIT
@@ -197,42 +149,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
197
149
  version: '0'
198
150
  requirements: []
199
151
  rubyforge_project:
200
- rubygems_version: 2.1.9
152
+ rubygems_version: 2.2.2
201
153
  signing_key:
202
154
  specification_version: 4
203
155
  summary: extract tables from PDF files
204
- test_files:
205
- - test/data/47008204D_USA.page4.pdf
206
- - test/data/560015757GV_China.page1.pdf
207
- - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
208
- - test/data/GSK_2012_Q4.page437.pdf
209
- - test/data/S2MNCEbirdisland.pdf
210
- - test/data/argentina_diputados_voting_record.pdf
211
- - test/data/bo_page24.pdf
212
- - test/data/campaign_donors.pdf
213
- - test/data/frx_2012_disclosure.pdf
214
- - test/data/frx_2012_disclosure.tsv
215
- - test/data/gre.pdf
216
- - test/data/no_tables.pdf
217
- - test/data/nyc_2013fiscalreporttables.pdf
218
- - test/data/puertos1.pdf
219
- - test/data/spanning_cells.csv
220
- - test/data/spanning_cells.pdf
221
- - test/data/strongschools.pdf
222
- - test/data/sydney_disclosure_contract.pdf
223
- - test/data/tabla_subsidios.pdf
224
- - test/data/vertical_rulings_bug.pdf
225
- - test/data/vietnam3.pdf
226
- - test/data/wc2012.pdf
227
- - test/heuristic-test-set/original/560015757GV_China.page1.pdf
228
- - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
229
- - test/heuristic-test-set/original/bo_page24.pdf
230
- - test/heuristic-test-set/original/campaign_donors.pdf
231
- - test/heuristic-test-set/original/cs076pct.pdf
232
- - test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
233
- - test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
234
- - test/heuristic-test-set/spreadsheet/strongschools.pdf
235
- - test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
236
- - test/heuristic.rb
237
- - test/test_bin_tabula.sh
238
- - test/tests.rb
156
+ test_files: []