tabula-extractor 0.7.2-java → 0.7.4-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +4 -8
  4. data/bin/tabula +3 -3
  5. data/lib/tabula.rb +9 -5
  6. data/lib/tabula/entities.rb +1 -0
  7. data/lib/tabula/entities/cell.rb +6 -4
  8. data/lib/tabula/entities/has_cells.rb +22 -78
  9. data/lib/tabula/entities/line.rb +52 -6
  10. data/lib/tabula/entities/page.rb +43 -50
  11. data/lib/tabula/entities/ruling.rb +83 -105
  12. data/lib/tabula/entities/spreadsheet.rb +74 -11
  13. data/lib/tabula/entities/table.rb +55 -37
  14. data/lib/tabula/entities/tabular.rb +42 -0
  15. data/lib/tabula/entities/text_chunk.rb +55 -52
  16. data/lib/tabula/entities/text_element.rb +129 -62
  17. data/lib/tabula/entities/zone_entity.rb +15 -6
  18. data/lib/tabula/extraction.rb +114 -49
  19. data/lib/tabula/line_segment_detector.rb +0 -5
  20. data/lib/tabula/table_extractor.rb +32 -37
  21. data/lib/tabula/version.rb +1 -1
  22. data/tabula-extractor.gemspec +2 -5
  23. metadata +13 -95
  24. data/ext/COPYING +0 -661
  25. data/ext/Makefile.OSX +0 -18
  26. data/ext/Makefile.defaults +0 -9
  27. data/ext/Makefile.linux32 +0 -11
  28. data/ext/Makefile.linux64 +0 -12
  29. data/ext/Makefile.mingw +0 -10
  30. data/ext/Makefile.mingw64 +0 -10
  31. data/ext/liblsd-linux32.so +0 -0
  32. data/ext/liblsd-linux64.so +0 -0
  33. data/ext/liblsd.def +0 -3
  34. data/ext/liblsd.dll +0 -0
  35. data/ext/liblsd.dylib +0 -0
  36. data/ext/liblsd64.dll +0 -0
  37. data/ext/lsd.c +0 -2270
  38. data/ext/lsd.h +0 -283
  39. data/test/data/47008204D_USA.page4.pdf +0 -0
  40. data/test/data/560015757GV_China.page1.pdf +0 -0
  41. data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
  42. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  43. data/test/data/S2MNCEbirdisland.pdf +0 -0
  44. data/test/data/argentina_diputados_voting_record.pdf +0 -0
  45. data/test/data/bo_page24.pdf +0 -0
  46. data/test/data/campaign_donors.pdf +0 -0
  47. data/test/data/frx_2012_disclosure.pdf +0 -0
  48. data/test/data/frx_2012_disclosure.tsv +0 -88
  49. data/test/data/gre.pdf +0 -0
  50. data/test/data/no_tables.pdf +0 -0
  51. data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
  52. data/test/data/puertos1.pdf +0 -0
  53. data/test/data/spanning_cells.csv +0 -21
  54. data/test/data/spanning_cells.pdf +0 -0
  55. data/test/data/strongschools.pdf +0 -0
  56. data/test/data/sydney_disclosure_contract.pdf +0 -0
  57. data/test/data/tabla_subsidios.pdf +0 -0
  58. data/test/data/vertical_rulings_bug.pdf +0 -0
  59. data/test/data/vietnam3.pdf +0 -0
  60. data/test/data/wc2012.pdf +0 -0
  61. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  62. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  63. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  64. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  65. data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
  66. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  67. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  68. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  69. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  70. data/test/heuristic.rb +0 -50
  71. data/test/test_bin_tabula.sh +0 -7
  72. data/test/tests.rb +0 -603
@@ -31,6 +31,8 @@ module Tabula
31
31
  :extract_ruling_lines => true
32
32
  }
33
33
 
34
+ # TODO: the +pages+ constructor argument does not make sense
35
+ # now that we have +extract_page+ and +extract_pages+
34
36
  def initialize(pdf_filename, pages=[1], password='', options={})
35
37
  raise Errno::ENOENT unless File.exists?(pdf_filename)
36
38
  @pdf_filename = pdf_filename
@@ -47,39 +49,67 @@ module Tabula
47
49
  @transformed_clipping_path = nil
48
50
  self.clipping_paths = []
49
51
  @rulings = []
50
- @min_char_width = @min_char_height = 1000000
52
+ @min_char_width = @min_char_height = Float::MAX
51
53
  end
52
54
 
53
- def extract
55
+ def close!
56
+ self.ensure_open!
57
+ @pdf_file.close
58
+ @pdf_file_closed = true
59
+ end
60
+
61
+ def ensure_open!
62
+ raise "Document is closed" if @pdf_file_closed
63
+ end
64
+
65
+ ##
66
+ # extract objects from a page. Returns an instance of +Tabula::Page+
67
+ # (+page_number+ is 1-based, i.e., the first page is number 1)
68
+ def extract_page(page_number)
69
+ self.ensure_open!
70
+
71
+ if page_number-1 >= @all_pages.size || (page_number) < 0
72
+ raise IndexError, "Page #{page_number} doesn't exist. Skipping. Valid pages are 1..#{@all_pages.size}"
73
+ end
74
+
75
+ page = @all_pages.get(page_number-1)
76
+ contents = page.getContents
77
+ return nil if contents.nil?
78
+
79
+ self.clear!
80
+ self.drawPage(page)
81
+ Tabula::Page.new(@pdf_filename,
82
+ page.findCropBox.width,
83
+ page.findCropBox.height,
84
+ page.getRotation.to_i,
85
+ page_number, #one-indexed, just like +page_number+ is.
86
+ self.characters,
87
+ self.rulings,
88
+ @min_char_width,
89
+ @min_char_height)
90
+ end
91
+
92
+ def extract(pages=nil)
93
+ self.ensure_open!
94
+ pages = if pages == :all
95
+ (1..@all_pages.size)
96
+ elsif pages.nil?
97
+ @pages
98
+ else
99
+ pages
100
+ end
101
+
54
102
  Enumerator.new do |y|
55
- begin
56
- @pages.each do |i|
57
- page = @all_pages.get(i-1)
58
- contents = page.getContents
59
- next if contents.nil?
60
-
61
- self.clear!
62
- self.drawPage(page)
63
- p = Tabula::Page.new(@pdf_filename,
64
- page.findCropBox.width,
65
- page.findCropBox.height,
66
- page.getRotation.to_i,
67
- i, #one-indexed, just like `i` is.
68
- self.characters,
69
- self.rulings,
70
- @min_char_width,
71
- @min_char_height)
72
- y.yield p
73
- end
74
- ensure
75
- @pdf_file.close
76
- end # begin
103
+ pages.each do |i|
104
+ y.yield self.extract_page(i)
105
+ end
77
106
  end
78
107
  end
79
108
 
80
109
  def clear!
81
110
  self.characters.clear
82
111
  self.clipping_paths.clear
112
+ @min_char_width = @min_char_height = Float::MAX
83
113
  @page_transform = nil
84
114
  @rulings.clear
85
115
  end
@@ -118,8 +148,14 @@ module Tabula
118
148
 
119
149
  path = self.pathToList(self.getLinePath)
120
150
 
151
+ # skip paths whose first operation is not a MOVETO
152
+ # or that contain operations other than LINETO, MOVETO or CLOSE
121
153
  if path[0][0] != java.awt.geom.PathIterator::SEG_MOVETO \
122
- || path[1..-1].any? { |p| p.first != java.awt.geom.PathIterator::SEG_LINETO && p.first != java.awt.geom.PathIterator::SEG_MOVETO && p.first != java.awt.geom.PathIterator::SEG_CLOSE }
154
+ || path[1..-1].any? { |p|
155
+ p.first != java.awt.geom.PathIterator::SEG_LINETO \
156
+ && p.first != java.awt.geom.PathIterator::SEG_MOVETO \
157
+ && p.first != java.awt.geom.PathIterator::SEG_CLOSE
158
+ }
123
159
  self.getLinePath.reset
124
160
  return
125
161
  end
@@ -129,26 +165,57 @@ module Tabula
129
165
  strokeColorComps = filter_by_color || self.getGraphicsState.getStrokingColor.getJavaColor.getRGBColorComponents(nil)
130
166
  color_filter = self.options[:line_color_filter]
131
167
 
168
+ if !color_filter.nil? && !color_filter.call(strokeColorComps)
169
+ self.getLinePath.reset
170
+ return
171
+ end
172
+
173
+ # skip the first path operation and save it as the starting position
132
174
  first = path.shift
133
- start_pos = java.awt.geom.Point2D::Float.new(first[1][0], first[1][1])
175
+ # last_move
176
+ start_pos = last_move = java.awt.geom.Point2D::Float.new(first[1][0], first[1][1])
177
+
178
+ end_pos = nil
134
179
 
135
180
  path.each do |p|
136
- end_pos = java.awt.geom.Point2D::Float.new(p[1][0], p[1][1])
137
- line = (start_pos <=> end_pos) == -1 \
138
- ? java.awt.geom.Line2D::Float.new(start_pos, end_pos) \
139
- : java.awt.geom.Line2D::Float.new(end_pos, start_pos)
140
-
141
- if p[0] == java.awt.geom.PathIterator::SEG_LINETO \
142
- && (color_filter.nil? ? true : color_filter.call(strokeColorComps)) \
143
- && line.intersects(ccp_bounds)
144
- # convert line to rectangle for clipping it to the current clippath
145
- # sucks, but awt doesn't have methods for this
146
- tmp = line.getBounds2D.createIntersection(ccp_bounds).getBounds2D
147
- @rulings << ::Tabula::Ruling.new(tmp.getY,
148
- tmp.getX,
149
- tmp.getWidth,
150
- tmp.getHeight,
151
- filter_by_color.to_a)
181
+ case p[0]
182
+ when java.awt.geom.PathIterator::SEG_LINETO
183
+ end_pos = java.awt.geom.Point2D::Float.new(p[1][0], p[1][1])
184
+ line = (start_pos <=> end_pos) == -1 \
185
+ ? java.awt.geom.Line2D::Float.new(start_pos, end_pos) \
186
+ : java.awt.geom.Line2D::Float.new(end_pos, start_pos)
187
+
188
+ if line.intersects(ccp_bounds)
189
+ # convert line to rectangle for clipping it to the current clippath
190
+ # sucks, but awt doesn't have methods for this
191
+ tmp = line.getBounds2D.createIntersection(ccp_bounds).getBounds2D
192
+ @rulings << ::Tabula::Ruling.new(tmp.getY,
193
+ tmp.getX,
194
+ tmp.getWidth,
195
+ tmp.getHeight,
196
+ filter_by_color.to_a)
197
+ end
198
+ when java.awt.geom.PathIterator::SEG_MOVETO
199
+ last_move = java.awt.geom.Point2D::Float.new(p[1][0], p[1][1])
200
+ when java.awt.geom.PathIterator::SEG_CLOSE
201
+ # according to PathIterator docs:
202
+ # "the preceding subpath should be closed by appending a line segment
203
+ # back to the point corresponding to the most recent SEG_MOVETO."
204
+
205
+ line = (end_pos <=> last_move) == -1 \
206
+ ? java.awt.geom.Line2D::Float.new(end_pos, last_move) \
207
+ : java.awt.geom.Line2D::Float.new(last_move, end_pos)
208
+
209
+ if line.intersects(ccp_bounds)
210
+ # convert line to rectangle for clipping it to the current clippath
211
+ # sucks, but awt doesn't have methods for this
212
+ tmp = line.getBounds2D.createIntersection(ccp_bounds).getBounds2D
213
+ @rulings << ::Tabula::Ruling.new(tmp.getY,
214
+ tmp.getX,
215
+ tmp.getWidth,
216
+ tmp.getHeight,
217
+ filter_by_color.to_a)
218
+ end
152
219
  end
153
220
  start_pos = end_pos
154
221
  end
@@ -201,22 +268,21 @@ module Tabula
201
268
  c = text.getCharacter
202
269
  h = text.getHeightDir.round(2)
203
270
 
204
- if c == ' ' || c == ' ' # replace non-breaking space for space
271
+ if c == ' ' # replace non-breaking space for space
205
272
  c = ' '
206
- h = text.getWidth.round(2)
207
273
  end
208
274
 
209
275
  te = Tabula::TextElement.new(text.getY.round(2) - h,
210
276
  text.getX.round(2),
211
- text.getWidth.round(2),
277
+ text.getWidthDirAdj,
212
278
  # ugly hack follows: we need spaces to have a height, so we can
213
279
  # test for vertical overlap. height == width seems a safe bet.
214
- h,
280
+ text.getHeightDir,
215
281
  text.getFont,
216
- text.getFontSize.round(2),
282
+ text.getFontSize,
217
283
  c,
218
284
  # workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
219
- text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace,
285
+ (text.getWidthOfSpace.nan? || text.getWidthOfSpace == 0) ? self.currentSpaceWidth : text.getWidthOfSpace,
220
286
  text.getDir)
221
287
 
222
288
  ccp_bounds = self.currentClippingPath
@@ -246,7 +312,6 @@ module Tabula
246
312
  end
247
313
 
248
314
  def rulings
249
- return [] if @rulings.empty?
250
315
  @rulings.reject { |l| (l.left == l.right && l.top == l.bottom) || [l.top, l.left, l.bottom, l.right].any? { |p| p < 0 } }
251
316
  end
252
317
 
@@ -1,11 +1,6 @@
1
- require 'java'
2
1
  require 'rbconfig'
3
-
4
2
  require 'ffi'
5
3
 
6
- require_relative './entities'
7
- require_relative './pdf_render'
8
- require_relative './extraction'
9
4
 
10
5
  java_import javax.imageio.ImageIO
11
6
  java_import java.awt.image.BufferedImage
@@ -42,10 +42,12 @@ module Tabula
42
42
  page = [page]
43
43
  end
44
44
 
45
- pdf_page = Extraction::ObjectExtractor.new(pdf_path,
46
- page,
47
- options[:password]) \
48
- .extract.next
45
+ extractor = Extraction::ObjectExtractor.new(pdf_path,
46
+ page,
47
+ options[:password])
48
+
49
+ pdf_page = extractor.extract.next
50
+ extractor.close!
49
51
 
50
52
  if ["spreadsheet", "original"].include? options[:extraction_method]
51
53
  use_spreadsheet_extraction_method = options[:extraction_method] == "spreadsheet"
@@ -54,39 +56,32 @@ module Tabula
54
56
  end
55
57
 
56
58
  if use_spreadsheet_extraction_method
57
- table = pdf_page.get_area(area).spreadsheets.inject(&:+)
58
- else
59
- use_detected_lines = false
60
- if options[:detect_ruling_lines] && options[:vertical_rulings].empty?
61
- detected_vertical_rulings = Ruling.crop_rulings_to_area(pdf_page.vertical_ruling_lines,
62
- area)
63
-
64
- # only use lines if at least 80% of them cover at least 90%
65
- # of the height of area of interest
66
-
67
- # TODO this heuristic SUCKS
68
- # what if only a couple columns is delimited with vertical rulings?
69
- # ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
70
- # idea: detect columns without considering rulings, detect vertical rulings
71
- # calculate ratio and try to come up with a threshold
72
- use_detected_lines = detected_vertical_rulings.size > 2 \
73
- && (detected_vertical_rulings.count { |vl|
74
- vl.height / area.height > 0.9
75
- } / detected_vertical_rulings.size.to_f) >= 0.8
76
-
77
- end
78
-
79
- table = pdf_page.get_area(area).get_table(:vertical_rulings => use_detected_lines ? detected_vertical_rulings : options[:vertical_rulings])
80
-
81
- # fixes up the table a little bit, replacing nils with empty TextElements
82
- # and sorting the lines.
83
- table.lines.each do |l|
84
- l.text_elements = l.text_elements.map do |te|
85
- te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
86
- end
87
- end
88
- table.lines.sort_by! { |l| l.text_elements.map { |te| te.top or 0 }.max }
89
- table
59
+ return (spreadsheets = pdf_page.get_area(area).spreadsheets).empty? ? Spreadsheet.empty(pdf_page) : spreadsheets.inject(&:+)
90
60
  end
61
+
62
+ use_detected_lines = false
63
+ if options[:detect_ruling_lines] && options[:vertical_rulings].empty?
64
+ detected_vertical_rulings = Ruling.crop_rulings_to_area(pdf_page.vertical_ruling_lines,
65
+ area)
66
+
67
+ # only use lines if at least 80% of them cover at least 90%
68
+ # of the height of area of interest
69
+
70
+ # TODO this heuristic SUCKS
71
+ # what if only a couple columns is delimited with vertical rulings?
72
+ # ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
73
+ # idea: detect columns without considering rulings, detect vertical rulings
74
+ # calculate ratio and try to come up with a threshold
75
+ use_detected_lines = detected_vertical_rulings.size > 2 \
76
+ && (detected_vertical_rulings.count { |vl|
77
+ vl.height / area.height > 0.9
78
+ } / detected_vertical_rulings.size.to_f) >= 0.8
79
+
80
+ end
81
+
82
+ pdf_page
83
+ .get_area(area)
84
+ .get_table(:vertical_rulings => use_detected_lines ? detected_vertical_rulings : options[:vertical_rulings])
85
+
91
86
  end
92
87
  end
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.7.2'
2
+ VERSION = '0.7.4'
3
3
  end
@@ -15,17 +15,14 @@ Gem::Specification.new do |s|
15
15
 
16
16
  s.platform = 'java'
17
17
 
18
- shared_libs = ['liblsd.dylib', 'liblsd-linux64.so', 'liblsd-linux32.so', 'liblsd.dll', 'liblsd64.dll'].map { |f| 'ext/' + f }
19
- s.files = `git ls-files`.split("\n") + shared_libs.map.reject { |f| !File.exists?(f) }
20
- s.test_files = `git ls-files -- {test,features}/*`.split("\n")
18
+ s.files = `git ls-files`.split("\n").reject { |f| f =~ /^test\// }
21
19
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
22
20
  s.require_paths = ["lib"]
23
21
 
24
- s.add_development_dependency 'minitest'
25
22
  s.add_development_dependency 'bundler', '>= 1.3.4'
26
23
  s.add_development_dependency 'ruby-debug'
27
24
  s.add_development_dependency 'pry'
25
+ s.add_development_dependency 'minitest'
28
26
 
29
27
  s.add_runtime_dependency "trollop", ["~> 2.0"]
30
- # s.add_runtime_dependency "algorithms", ["~> 0.6.1"]
31
28
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.2
4
+ version: 0.7.4
5
5
  platform: java
6
6
  authors:
7
7
  - Manuel Aristarán
@@ -10,38 +10,38 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2014-01-20 00:00:00.000000000 Z
13
+ date: 2014-05-09 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
- name: minitest
16
+ name: bundler
17
17
  version_requirements: !ruby/object:Gem::Requirement
18
18
  requirements:
19
19
  - - '>='
20
20
  - !ruby/object:Gem::Version
21
- version: '0'
21
+ version: 1.3.4
22
22
  requirement: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - '>='
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: 1.3.4
27
27
  prerelease: false
28
28
  type: :development
29
29
  - !ruby/object:Gem::Dependency
30
- name: bundler
30
+ name: ruby-debug
31
31
  version_requirements: !ruby/object:Gem::Requirement
32
32
  requirements:
33
33
  - - '>='
34
34
  - !ruby/object:Gem::Version
35
- version: 1.3.4
35
+ version: '0'
36
36
  requirement: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
- version: 1.3.4
40
+ version: '0'
41
41
  prerelease: false
42
42
  type: :development
43
43
  - !ruby/object:Gem::Dependency
44
- name: ruby-debug
44
+ name: pry
45
45
  version_requirements: !ruby/object:Gem::Requirement
46
46
  requirements:
47
47
  - - '>='
@@ -55,7 +55,7 @@ dependencies:
55
55
  prerelease: false
56
56
  type: :development
57
57
  - !ruby/object:Gem::Dependency
58
- name: pry
58
+ name: minitest
59
59
  version_requirements: !ruby/object:Gem::Requirement
60
60
  requirements:
61
61
  - - '>='
@@ -99,21 +99,6 @@ files:
99
99
  - README.md
100
100
  - Rakefile
101
101
  - bin/tabula
102
- - ext/COPYING
103
- - ext/Makefile.OSX
104
- - ext/Makefile.defaults
105
- - ext/Makefile.linux32
106
- - ext/Makefile.linux64
107
- - ext/Makefile.mingw
108
- - ext/Makefile.mingw64
109
- - ext/liblsd-linux32.so
110
- - ext/liblsd-linux64.so
111
- - ext/liblsd.def
112
- - ext/liblsd.dll
113
- - ext/liblsd.dylib
114
- - ext/liblsd64.dll
115
- - ext/lsd.c
116
- - ext/lsd.h
117
102
  - lib/tabula.rb
118
103
  - lib/tabula/core_ext.rb
119
104
  - lib/tabula/entities.rb
@@ -125,6 +110,7 @@ files:
125
110
  - lib/tabula/entities/ruling.rb
126
111
  - lib/tabula/entities/spreadsheet.rb
127
112
  - lib/tabula/entities/table.rb
113
+ - lib/tabula/entities/tabular.rb
128
114
  - lib/tabula/entities/text_chunk.rb
129
115
  - lib/tabula/entities/text_element.rb
130
116
  - lib/tabula/entities/text_element_index.rb
@@ -143,40 +129,6 @@ files:
143
129
  - target/pdfbox-app-2.0.0-SNAPSHOT.jar
144
130
  - target/slf4j-api-1.6.3.jar
145
131
  - target/trove4j-3.0.3.jar
146
- - test/data/47008204D_USA.page4.pdf
147
- - test/data/560015757GV_China.page1.pdf
148
- - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
149
- - test/data/GSK_2012_Q4.page437.pdf
150
- - test/data/S2MNCEbirdisland.pdf
151
- - test/data/argentina_diputados_voting_record.pdf
152
- - test/data/bo_page24.pdf
153
- - test/data/campaign_donors.pdf
154
- - test/data/frx_2012_disclosure.pdf
155
- - test/data/frx_2012_disclosure.tsv
156
- - test/data/gre.pdf
157
- - test/data/no_tables.pdf
158
- - test/data/nyc_2013fiscalreporttables.pdf
159
- - test/data/puertos1.pdf
160
- - test/data/spanning_cells.csv
161
- - test/data/spanning_cells.pdf
162
- - test/data/strongschools.pdf
163
- - test/data/sydney_disclosure_contract.pdf
164
- - test/data/tabla_subsidios.pdf
165
- - test/data/vertical_rulings_bug.pdf
166
- - test/data/vietnam3.pdf
167
- - test/data/wc2012.pdf
168
- - test/heuristic-test-set/original/560015757GV_China.page1.pdf
169
- - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
170
- - test/heuristic-test-set/original/bo_page24.pdf
171
- - test/heuristic-test-set/original/campaign_donors.pdf
172
- - test/heuristic-test-set/original/cs076pct.pdf
173
- - test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
174
- - test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
175
- - test/heuristic-test-set/spreadsheet/strongschools.pdf
176
- - test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
177
- - test/heuristic.rb
178
- - test/test_bin_tabula.sh
179
- - test/tests.rb
180
132
  homepage: https://github.com/jazzido/tabula-extractor
181
133
  licenses:
182
134
  - MIT
@@ -197,42 +149,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
197
149
  version: '0'
198
150
  requirements: []
199
151
  rubyforge_project:
200
- rubygems_version: 2.1.9
152
+ rubygems_version: 2.2.2
201
153
  signing_key:
202
154
  specification_version: 4
203
155
  summary: extract tables from PDF files
204
- test_files:
205
- - test/data/47008204D_USA.page4.pdf
206
- - test/data/560015757GV_China.page1.pdf
207
- - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
208
- - test/data/GSK_2012_Q4.page437.pdf
209
- - test/data/S2MNCEbirdisland.pdf
210
- - test/data/argentina_diputados_voting_record.pdf
211
- - test/data/bo_page24.pdf
212
- - test/data/campaign_donors.pdf
213
- - test/data/frx_2012_disclosure.pdf
214
- - test/data/frx_2012_disclosure.tsv
215
- - test/data/gre.pdf
216
- - test/data/no_tables.pdf
217
- - test/data/nyc_2013fiscalreporttables.pdf
218
- - test/data/puertos1.pdf
219
- - test/data/spanning_cells.csv
220
- - test/data/spanning_cells.pdf
221
- - test/data/strongschools.pdf
222
- - test/data/sydney_disclosure_contract.pdf
223
- - test/data/tabla_subsidios.pdf
224
- - test/data/vertical_rulings_bug.pdf
225
- - test/data/vietnam3.pdf
226
- - test/data/wc2012.pdf
227
- - test/heuristic-test-set/original/560015757GV_China.page1.pdf
228
- - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
229
- - test/heuristic-test-set/original/bo_page24.pdf
230
- - test/heuristic-test-set/original/campaign_donors.pdf
231
- - test/heuristic-test-set/original/cs076pct.pdf
232
- - test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf
233
- - test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf
234
- - test/heuristic-test-set/spreadsheet/strongschools.pdf
235
- - test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf
236
- - test/heuristic.rb
237
- - test/test_bin_tabula.sh
238
- - test/tests.rb
156
+ test_files: []