tabula-extractor 0.7.1-java → 0.7.2-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: fea5751c9b78705fbfda50be1e29011d40249b04
4
- data.tar.gz: 2d7bb156a073636467b34e44b3f8b6a364920354
3
+ metadata.gz: 089f1213abcf17bb66c982d40b2145a0f452297c
4
+ data.tar.gz: 80151791aae887fe11108e3f39c03e06eb29cdea
5
5
  SHA512:
6
- metadata.gz: d8d1f1932397b599b3d724bb4e8ca5fd73f516e198a391625031ab83f8e5f0a2743d51dcc02953ee5a53c0664330c8e8e1f5309f72493e9f2e232068b06085a1
7
- data.tar.gz: e292d6320db6a42799418b6651f63a88446f85c6787659597e77b9f7c38e698b69a10c78edbdeea49baedc0d92f77602b847aef90ab1f8f2ab10356493dbb397
6
+ metadata.gz: 28a058d979fc405094a416fbcd05c65f3cbd25e8269ce8e22f023a27fe0e5f3d4ef39f8b1e60c00ffc964bd858eb379ae71826f9eb4e70447b4181be96d97efe
7
+ data.tar.gz: c601699c5639a72a0ce4149b50f523cebe914a6767f3dd449979dde6c9a6c7ae269a270f7da0bfd33d92a52199d3a74866aef69121e6033f593c48341f3dc9f4
data/.gitignore CHANGED
@@ -13,6 +13,8 @@ spec/reports
13
13
  test/tmp
14
14
  test/version_tmp
15
15
  tmp
16
+ /*.pdf
17
+ /*.csv
16
18
 
17
19
  # YARD artifacts
18
20
  .yardoc
data/README.md CHANGED
@@ -44,6 +44,7 @@ Tabula helps you extract tables from PDFs
44
44
  extraction (if there are ruling lines separating each
45
45
  cell, as in a PDF of an Excel spreadsheet)
46
46
  --silent, -i: Suppress all stderr output.
47
+ --use-line-returns, -u: Use embedded line returns in cells.
47
48
  --version, -v: Print version and exit
48
49
  --help, -h: Show this message
49
50
  ```
@@ -52,6 +53,27 @@ Tabula helps you extract tables from PDFs
52
53
 
53
54
  `tabula-extractor` is a RubyGem that you can use to programmatically extract tabular data, using the Tabula engine, in your scripts or applications. We don't have docs yet, but [the tests](test/tests.rb) are a good source of information.
54
55
 
56
+ Here's a very basic example:
57
+
58
+ ````ruby
59
+ require 'tabula'
60
+
61
+ pdf_file_path = "whatever.pdf"
62
+ outfilename = "whatever.csv"
63
+
64
+ out = open(outfilename, 'w')
65
+
66
+ extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
67
+ extractor.extract.each do |pdf_page|
68
+ pdf_page.spreadsheets.each do |spreadsheet|
69
+ out << spreadsheet.to_csv
70
+ out << "\n\n"
71
+ end
72
+ end
73
+ out.close
74
+
75
+ ````
76
+
55
77
  ## Notes
56
78
 
57
79
  `tabula-extractor` uses [LSD: a Line Segment Detector](http://www.ipol.im/pub/art/2012/gjmr-lsd/) by Rafael Grompone von Gioi, Jérémie Jakubowicz, Jean-Michel Morel and Gregory Randall.
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake'
6
6
  Bundler::GemHelper.install_tasks
7
7
 
8
8
  task :test do
9
- ruby %{--debug -X-C -J-Xmx512m test/tests.rb}
9
+ ruby %{-X+C -J-Xmx512m test/tests.rb}
10
10
  end
11
11
 
12
12
  task :default => [:test]
data/bin/tabula CHANGED
@@ -47,6 +47,7 @@ EOS
47
47
  opt :spreadsheet, "Force PDF to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)"
48
48
  opt :no_spreadsheet, "Force PDF not to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)"
49
49
  opt :silent, 'Suppress all stderr output.'
50
+ opt :use_line_returns, 'Use embedded line returns in cells. (Only in spreadsheet mode.)'
50
51
  end
51
52
 
52
53
  if !opts[:columns].nil?
@@ -94,6 +95,12 @@ def main
94
95
  else
95
96
  nil
96
97
  end
98
+ use_line_returns = if opts[:use_line_returns]
99
+ true
100
+ else
101
+ false
102
+ end
103
+
97
104
  extractor = Tabula::Extraction::ObjectExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
98
105
  extractor.extract.each_with_index do |pdf_page, page_index|
99
106
 
@@ -111,7 +118,7 @@ def main
111
118
  STDERR.puts "Page #{pdf_page.number(:one_indexed)}: #{spreadsheet.dims(:top, :left, :bottom, :right)}"
112
119
  end
113
120
  end
114
- tables = pdf_page.spreadsheets.map(&:rows)
121
+ tables = pdf_page.spreadsheets(:use_line_returns=> use_line_returns).map(&:rows)
115
122
  else
116
123
  STDERR.puts "Page #{pdf_page.number(:one_indexed)}: #{page_area.to_s}" if opts[:debug]
117
124
  if opts[:guess]
@@ -4,6 +4,10 @@ module Tabula
4
4
  end
5
5
 
6
6
  require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
7
+ require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
8
+ require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
9
+ require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
10
+
7
11
 
8
12
  import 'java.util.logging.LogManager'
9
13
  import 'java.util.logging.Level'
@@ -121,7 +121,7 @@ class Line2D::Float
121
121
 
122
122
  end
123
123
 
124
- class Rectangle2D::Float
124
+ class Rectangle2D
125
125
  SIMILARITY_DIVISOR = 20
126
126
 
127
127
  alias_method :top, :minY
@@ -2,6 +2,7 @@ require_relative './entities/zone_entity'
2
2
  require_relative './entities/cell'
3
3
  require_relative './entities/has_cells'
4
4
  require_relative './entities/line'
5
+ require_relative './entities/text_element_index'
5
6
  require_relative './entities/page'
6
7
  require_relative './entities/page_area'
7
8
  require_relative './entities/ruling'
@@ -15,11 +15,14 @@ module Tabula
15
15
  @ruling_lines = ruling_lines
16
16
  @file_path = file_path
17
17
  @number_one_indexed = number
18
- self.texts = texts
19
18
  @cells = []
20
19
  @spreadsheets = nil
21
20
  @min_char_width = min_char_width
22
21
  @min_char_height = min_char_height
22
+ @spatial_index = TextElementIndex.new
23
+
24
+ self.texts = texts
25
+ self.texts.each { |te| @spatial_index << te }
23
26
  end
24
27
 
25
28
  def min_char_width
@@ -54,10 +57,10 @@ module Tabula
54
57
  def get_table(options={})
55
58
  options = {:vertical_rulings => []}.merge(options)
56
59
  if texts.empty?
57
- return []
60
+ return Tabula::Table.new(0, [])
58
61
  end
59
62
 
60
- text_chunks = TextElement.merge_words(self.texts, options).sort
63
+ text_chunks = TextElement.merge_words(self.texts.sort, options).sort
61
64
 
62
65
  lines = TextChunk.group_by_lines(text_chunks)
63
66
 
@@ -65,7 +68,8 @@ module Tabula
65
68
  columns = options[:vertical_rulings].map(&:left) #pixel locations, not entities
66
69
  separators = columns.sort.reverse
67
70
  else
68
- columns = TextChunk.column_positions(text_chunks)
71
+ columns = TextChunk.column_positions(lines.first.text_elements.min_by(&:top).top,
72
+ text_chunks)
69
73
  separators = columns[1..-1].sort.reverse
70
74
  end
71
75
 
@@ -123,8 +127,8 @@ module Tabula
123
127
  spreadsheets(options).each do |spreadsheet|
124
128
  spreadsheet.cells.each do |cell|
125
129
  cell.text_elements = page.get_cell_text(cell)
126
- spreadsheet.cells_resolved = true
127
130
  end
131
+ spreadsheet.cells_resolved = true
128
132
  end
129
133
  end
130
134
 
@@ -176,9 +180,17 @@ module Tabula
176
180
  if area.nil?
177
181
  texts
178
182
  else
179
- texts.select do |t|
180
- area.contains(t)
181
- end
183
+ @spatial_index.contains(area)
184
+ end
185
+ end
186
+
187
+ def fill_in_cell_texts!(areas)
188
+ texts.each do |t|
189
+ area = areas.find{|a| a.contains(t) }
190
+ area.text_elements << t unless area.nil?
191
+ end
192
+ areas.each do |area|
193
+ area.text_elements = TextElement.merge_words(area.text_elements)
182
194
  end
183
195
  end
184
196
 
@@ -53,7 +53,7 @@ module Tabula
53
53
  if array_of_rows.size > 2
54
54
  if array_of_rows[0].map(&:left).uniq.size < array_of_rows[1].map(&:left).uniq.size
55
55
  missing_spots = array_of_rows[1].map(&:left) - array_of_rows[0].map(&:left)
56
- # puts missing_spots.inspect
56
+
57
57
  missing_spots.each do |missing_spot|
58
58
  missing_spot_placeholder = Cell.new(array_of_rows[0][0].top, missing_spot, 0, 0)
59
59
  missing_spot_placeholder.placeholder = true
@@ -92,5 +92,17 @@ module Tabula
92
92
  'data' => rows,
93
93
  }.to_json(*a)
94
94
  end
95
+
96
+ def to_csv
97
+ out = StringIO.new
98
+ Tabula::Writers.CSV(rows, out)
99
+ out.string
100
+ end
101
+
102
+ def to_tsv
103
+ out = StringIO.new
104
+ Tabula::Writers.TSV(rows, out)
105
+ out.string
106
+ end
95
107
  end
96
108
  end
@@ -31,11 +31,9 @@ module Tabula
31
31
 
32
32
  ##
33
33
  # calculate estimated columns from an iterable of TextChunk
34
- def self.column_positions(text_chunks)
34
+ def self.column_positions(top, text_chunks)
35
35
  right = 0
36
36
  columns = []
37
- lines = TextChunk.group_by_lines(text_chunks)
38
- top = lines.first.text_elements.map(&:top).min
39
37
 
40
38
  text_chunks.each do |te|
41
39
  next if te.text =~ ONLY_SPACES_RE
@@ -2,16 +2,17 @@ module Tabula
2
2
  ##
3
3
  # a Glyph
4
4
  class TextElement < ZoneEntity
5
- attr_accessor :font, :font_size, :text, :width_of_space
5
+ attr_accessor :font, :font_size, :text, :width_of_space, :direction
6
6
 
7
7
  TOLERANCE_FACTOR = 0.25
8
8
 
9
- def initialize(top, left, width, height, font, font_size, text, width_of_space)
9
+ def initialize(top, left, width, height, font, font_size, text, width_of_space, direction=0)
10
10
  super(top, left, width, height)
11
11
  self.font = font
12
12
  self.font_size = font_size
13
13
  self.text = text
14
14
  self.width_of_space = width_of_space
15
+ self.direction = direction
15
16
  end
16
17
 
17
18
  EMPTY = TextElement.new(0, 0, 0, 0, nil, 0, '', 0)
@@ -31,40 +32,45 @@ module Tabula
31
32
  current_chunk = chunks.last
32
33
  prev_char = current_chunk.text_elements.last
33
34
 
34
- # any vertical ruling goes across prev_char and char?
35
- across_vertical_ruling = vertical_ruling_locations.any? { |loc|
36
- prev_char.left < loc && char.left > loc
37
- }
38
-
39
- # should we add a space?
40
- if (prev_char.text != " ") && (char.text != " ") \
41
- && !across_vertical_ruling \
42
- && prev_char.should_add_space?(char)
43
-
44
- sp = self.new(prev_char.top,
45
- prev_char.right,
46
- prev_char.width_of_space,
47
- prev_char.width_of_space, # width == height for spaces
48
- prev_char.font,
49
- prev_char.font_size,
50
- ' ',
51
- prev_char.width_of_space)
52
- chunks.last << sp
53
- prev_char = sp
54
- end
55
-
56
- # should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
57
- # that they ought to be merged by that account.
58
- # we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
59
- # Why are both of those `.left`?, you might ask. The intuition is that a letter
60
- # that starts on the left of a vertical ruling ought to remain on the left of it.
61
- if !across_vertical_ruling && prev_char.should_merge?(char)
62
- chunks.last << char
35
+ # if same char AND overlapped, skip
36
+ if prev_char.text == char.text && prev_char.overlaps_with_ratio?(char, 0.85)
37
+ chunks
63
38
  else
64
- # create a new chunk
65
- chunks << TextChunk.create_from_text_element(char)
39
+ # any vertical ruling goes across prev_char and char?
40
+ across_vertical_ruling = vertical_ruling_locations.any? { |loc|
41
+ prev_char.left < loc && char.left > loc
42
+ }
43
+
44
+ # should we add a space?
45
+ if (prev_char.text != " ") && (char.text != " ") \
46
+ && !across_vertical_ruling \
47
+ && prev_char.should_add_space?(char)
48
+
49
+ sp = self.new(prev_char.top,
50
+ prev_char.right,
51
+ prev_char.width_of_space,
52
+ prev_char.width_of_space, # width == height for spaces
53
+ prev_char.font,
54
+ prev_char.font_size,
55
+ ' ',
56
+ prev_char.width_of_space)
57
+ chunks.last << sp
58
+ prev_char = sp
59
+ end
60
+
61
+ # should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
62
+ # that they ought to be merged by that account.
63
+ # we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
64
+ # Why are both of those `.left`?, you might ask. The intuition is that a letter
65
+ # that starts on the left of a vertical ruling ought to remain on the left of it.
66
+ if !across_vertical_ruling && prev_char.should_merge?(char)
67
+ chunks.last << char
68
+ else
69
+ # create a new chunk
70
+ chunks << TextChunk.create_from_text_element(char)
71
+ end
72
+ chunks
66
73
  end
67
- chunks
68
74
  end
69
75
  end
70
76
 
@@ -108,5 +114,17 @@ module Tabula
108
114
  def ==(other)
109
115
  self.text.strip == other.text.strip
110
116
  end
117
+
118
+ # sort in lexicographic (reading) order
119
+ def <=>(other)
120
+ if self.vertically_overlaps?(other)
121
+ self.left <=> other.left
122
+ elsif self.top < other.top
123
+ -1
124
+ else
125
+ 1
126
+ end
127
+ end
128
+
111
129
  end
112
130
  end
@@ -0,0 +1,55 @@
1
+ module Tabula
2
+ class TextElementIndex < Java::ComInfomatiqJsiRtree::RTree
3
+
4
+ attr_reader :te_dict
5
+
6
+ class SaveToListProcedure
7
+ include Java::GnuTroveProcedure::TIntProcedure
8
+
9
+ attr_reader :list
10
+
11
+ def initialize(parent)
12
+ @parent = parent
13
+ @list = []
14
+ end
15
+
16
+ def execute(id)
17
+ @list << @parent.te_dict[id]
18
+ return true
19
+ end
20
+
21
+ def reset!
22
+ @list = []
23
+ end
24
+
25
+ end
26
+
27
+ def initialize
28
+ super
29
+ self.init(nil)
30
+ @te_dict = {}
31
+ @save_to_list = SaveToListProcedure.new(self)
32
+ end
33
+
34
+ def <<(text_element)
35
+ r = Java::ComInfomatiqJsi::Rectangle.new(text_element.left,
36
+ text_element.top,
37
+ text_element.right,
38
+ text_element.bottom)
39
+ @te_dict[text_element.object_id] = text_element
40
+ self.add(r, text_element.object_id)
41
+ end
42
+
43
+ def contains(zone_entity)
44
+ r = Java::ComInfomatiqJsi::Rectangle.new(zone_entity.left,
45
+ zone_entity.top,
46
+ zone_entity.right,
47
+ zone_entity.bottom)
48
+ @save_to_list.reset!
49
+ super(r, @save_to_list)
50
+
51
+ # sort in lexicographic (reading) order
52
+ @save_to_list.list.sort
53
+ end
54
+ end
55
+ end
@@ -203,12 +203,12 @@ module Tabula
203
203
 
204
204
  if c == ' ' || c == ' ' # replace non-breaking space for space
205
205
  c = ' '
206
- h = text.getWidthDirAdj.round(2)
206
+ h = text.getWidth.round(2)
207
207
  end
208
208
 
209
- te = Tabula::TextElement.new(text.getYDirAdj.round(2) - h,
210
- text.getXDirAdj.round(2),
211
- text.getWidthDirAdj.round(2),
209
+ te = Tabula::TextElement.new(text.getY.round(2) - h,
210
+ text.getX.round(2),
211
+ text.getWidth.round(2),
212
212
  # ugly hack follows: we need spaces to have a height, so we can
213
213
  # test for vertical overlap. height == width seems a safe bet.
214
214
  h,
@@ -216,7 +216,8 @@ module Tabula
216
216
  text.getFontSize.round(2),
217
217
  c,
218
218
  # workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
219
- text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace)
219
+ text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace,
220
+ text.getDir)
220
221
 
221
222
  ccp_bounds = self.currentClippingPath
222
223
 
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.7.1'
2
+ VERSION = '0.7.2'
3
3
  end
Binary file
Binary file
@@ -206,7 +206,8 @@ class TestExtractor < Minitest::Test
206
206
  table = Tabula.extract_table(pdf_file_path,
207
207
  1,
208
208
  [106.01, 48.09, 227.31, 551.89],
209
- :detect_ruling_lines => true)
209
+ :detect_ruling_lines => true,
210
+ :extraction_method => "original")
210
211
 
211
212
  expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
212
213
 
@@ -527,6 +528,49 @@ class TestExtractor < Minitest::Test
527
528
  assert_equal ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"], table.last
528
529
  end
529
530
  end
531
+
532
+ def test_remove_repeated_text
533
+ top, left, bottom, right = 106.07142857142858, 50.91428571428572, 141.42857142857144, 755.2285714285715
534
+
535
+ table = Tabula.extract_table(File.expand_path('data/nyc_2013fiscalreporttables.pdf', File.dirname(__FILE__)),
536
+ 1,
537
+ [top,left,bottom,right],
538
+ :detect_ruling_lines => false,
539
+ :extraction_method => 'original')
540
+
541
+ ary = table_to_array(table)
542
+ assert_equal ary[1][1], "$ 18,969,610"
543
+ assert_equal ary[1][2], "$ 18,157,722"
544
+ end
545
+
546
+ def test_remove_overlapping_text
547
+ # one of those PDFs that put characters on top of another to make text "bold"
548
+ top,left,bottom,right = 399.98571428571427, 36.06428571428571, 425.1214285714285, 544.2428571428571
549
+ table = Tabula.extract_table(File.expand_path('data/wc2012.pdf', File.dirname(__FILE__)),
550
+ 1,
551
+ [top,left,bottom,right],
552
+ :detect_ruling_lines => false,
553
+ :extraction_method => 'original')
554
+
555
+ ary = table_to_array(table)
556
+ assert_equal ary.first.first, "Community development"
557
+ end
558
+
559
+ def test_cells_including_line_returns
560
+ data = []
561
+ pdf_file_path = "./test/data/sydney_disclosure_contract.pdf"
562
+ Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
563
+ pdf_page.spreadsheets.each do |spreadsheet|
564
+ spreadsheet.cells.each do |cell|
565
+ cell.text_elements = pdf_page.get_cell_text(cell)
566
+ cell.options = ({:use_line_returns => true, :cell_debug => 0})
567
+ data << cell.text
568
+ end
569
+ end
570
+ end
571
+ assert_equal ["1295", "Name: Reino International Pty Ltd trading as Duncan Solutions \nAddress: 15/39 Herbet Street, St Leonards NSW 2065", "N/A", "Effective Date: 13 May 2013 \nDuration: 15 Weeks", "Supply, Installation and Maintenance of Parking Ticket Machines", "$3,148,800.00exgst", "N/A", "N/A", "Open Tender \nTender evaluation criteria included: \n- The schedule of prices \n- Compliance with technical specifications/Technical assessment \n- Operational Plan including maintenance procedures"], data
572
+ end
573
+
530
574
  end
531
575
 
532
576
  class TestIsTabularHeuristic < Minitest::Test
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.1
4
+ version: 0.7.2
5
5
  platform: java
6
6
  authors:
7
7
  - Manuel Aristarán
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2014-01-18 00:00:00.000000000 Z
13
+ date: 2014-01-20 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: minitest
@@ -127,6 +127,7 @@ files:
127
127
  - lib/tabula/entities/table.rb
128
128
  - lib/tabula/entities/text_chunk.rb
129
129
  - lib/tabula/entities/text_element.rb
130
+ - lib/tabula/entities/text_element_index.rb
130
131
  - lib/tabula/entities/zone_entity.rb
131
132
  - lib/tabula/extraction.rb
132
133
  - lib/tabula/line_segment_detector.rb
@@ -138,7 +139,10 @@ files:
138
139
  - lib/tabula/version.rb
139
140
  - lib/tabula/writers.rb
140
141
  - tabula-extractor.gemspec
142
+ - target/jsi-1.1.0-SNAPSHOT.jar
141
143
  - target/pdfbox-app-2.0.0-SNAPSHOT.jar
144
+ - target/slf4j-api-1.6.3.jar
145
+ - target/trove4j-3.0.3.jar
142
146
  - test/data/47008204D_USA.page4.pdf
143
147
  - test/data/560015757GV_China.page1.pdf
144
148
  - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
@@ -151,13 +155,16 @@ files:
151
155
  - test/data/frx_2012_disclosure.tsv
152
156
  - test/data/gre.pdf
153
157
  - test/data/no_tables.pdf
158
+ - test/data/nyc_2013fiscalreporttables.pdf
154
159
  - test/data/puertos1.pdf
155
160
  - test/data/spanning_cells.csv
156
161
  - test/data/spanning_cells.pdf
157
162
  - test/data/strongschools.pdf
163
+ - test/data/sydney_disclosure_contract.pdf
158
164
  - test/data/tabla_subsidios.pdf
159
165
  - test/data/vertical_rulings_bug.pdf
160
166
  - test/data/vietnam3.pdf
167
+ - test/data/wc2012.pdf
161
168
  - test/heuristic-test-set/original/560015757GV_China.page1.pdf
162
169
  - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
163
170
  - test/heuristic-test-set/original/bo_page24.pdf
@@ -190,7 +197,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
190
197
  version: '0'
191
198
  requirements: []
192
199
  rubyforge_project:
193
- rubygems_version: 2.2.1
200
+ rubygems_version: 2.1.9
194
201
  signing_key:
195
202
  specification_version: 4
196
203
  summary: extract tables from PDF files
@@ -207,13 +214,16 @@ test_files:
207
214
  - test/data/frx_2012_disclosure.tsv
208
215
  - test/data/gre.pdf
209
216
  - test/data/no_tables.pdf
217
+ - test/data/nyc_2013fiscalreporttables.pdf
210
218
  - test/data/puertos1.pdf
211
219
  - test/data/spanning_cells.csv
212
220
  - test/data/spanning_cells.pdf
213
221
  - test/data/strongschools.pdf
222
+ - test/data/sydney_disclosure_contract.pdf
214
223
  - test/data/tabla_subsidios.pdf
215
224
  - test/data/vertical_rulings_bug.pdf
216
225
  - test/data/vietnam3.pdf
226
+ - test/data/wc2012.pdf
217
227
  - test/heuristic-test-set/original/560015757GV_China.page1.pdf
218
228
  - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
219
229
  - test/heuristic-test-set/original/bo_page24.pdf