tabula-extractor 0.7.1-java → 0.7.2-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: fea5751c9b78705fbfda50be1e29011d40249b04
4
- data.tar.gz: 2d7bb156a073636467b34e44b3f8b6a364920354
3
+ metadata.gz: 089f1213abcf17bb66c982d40b2145a0f452297c
4
+ data.tar.gz: 80151791aae887fe11108e3f39c03e06eb29cdea
5
5
  SHA512:
6
- metadata.gz: d8d1f1932397b599b3d724bb4e8ca5fd73f516e198a391625031ab83f8e5f0a2743d51dcc02953ee5a53c0664330c8e8e1f5309f72493e9f2e232068b06085a1
7
- data.tar.gz: e292d6320db6a42799418b6651f63a88446f85c6787659597e77b9f7c38e698b69a10c78edbdeea49baedc0d92f77602b847aef90ab1f8f2ab10356493dbb397
6
+ metadata.gz: 28a058d979fc405094a416fbcd05c65f3cbd25e8269ce8e22f023a27fe0e5f3d4ef39f8b1e60c00ffc964bd858eb379ae71826f9eb4e70447b4181be96d97efe
7
+ data.tar.gz: c601699c5639a72a0ce4149b50f523cebe914a6767f3dd449979dde6c9a6c7ae269a270f7da0bfd33d92a52199d3a74866aef69121e6033f593c48341f3dc9f4
data/.gitignore CHANGED
@@ -13,6 +13,8 @@ spec/reports
13
13
  test/tmp
14
14
  test/version_tmp
15
15
  tmp
16
+ /*.pdf
17
+ /*.csv
16
18
 
17
19
  # YARD artifacts
18
20
  .yardoc
data/README.md CHANGED
@@ -44,6 +44,7 @@ Tabula helps you extract tables from PDFs
44
44
  extraction (if there are ruling lines separating each
45
45
  cell, as in a PDF of an Excel spreadsheet)
46
46
  --silent, -i: Suppress all stderr output.
47
+ --use-line-returns, -u: Use embedded line returns in cells.
47
48
  --version, -v: Print version and exit
48
49
  --help, -h: Show this message
49
50
  ```
@@ -52,6 +53,27 @@ Tabula helps you extract tables from PDFs
52
53
 
53
54
  `tabula-extractor` is a RubyGem that you can use to programmatically extract tabular data, using the Tabula engine, in your scripts or applications. We don't have docs yet, but [the tests](test/tests.rb) are a good source of information.
54
55
 
56
+ Here's a very basic example:
57
+
58
+ ````ruby
59
+ require 'tabula'
60
+
61
+ pdf_file_path = "whatever.pdf"
62
+ outfilename = "whatever.csv"
63
+
64
+ out = open(outfilename, 'w')
65
+
66
+ extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
67
+ extractor.extract.each do |pdf_page|
68
+ pdf_page.spreadsheets.each do |spreadsheet|
69
+ out << spreadsheet.to_csv
70
+ out << "\n\n"
71
+ end
72
+ end
73
+ out.close
74
+
75
+ ````
76
+
55
77
  ## Notes
56
78
 
57
79
  `tabula-extractor` uses [LSD: a Line Segment Detector](http://www.ipol.im/pub/art/2012/gjmr-lsd/) by Rafael Grompone von Gioi, Jérémie Jakubowicz, Jean-Michel Morel and Gregory Randall.
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake'
6
6
  Bundler::GemHelper.install_tasks
7
7
 
8
8
  task :test do
9
- ruby %{--debug -X-C -J-Xmx512m test/tests.rb}
9
+ ruby %{-X+C -J-Xmx512m test/tests.rb}
10
10
  end
11
11
 
12
12
  task :default => [:test]
data/bin/tabula CHANGED
@@ -47,6 +47,7 @@ EOS
47
47
  opt :spreadsheet, "Force PDF to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)"
48
48
  opt :no_spreadsheet, "Force PDF not to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)"
49
49
  opt :silent, 'Suppress all stderr output.'
50
+ opt :use_line_returns, 'Use embedded line returns in cells. (Only in spreadsheet mode.)'
50
51
  end
51
52
 
52
53
  if !opts[:columns].nil?
@@ -94,6 +95,12 @@ def main
94
95
  else
95
96
  nil
96
97
  end
98
+ use_line_returns = if opts[:use_line_returns]
99
+ true
100
+ else
101
+ false
102
+ end
103
+
97
104
  extractor = Tabula::Extraction::ObjectExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
98
105
  extractor.extract.each_with_index do |pdf_page, page_index|
99
106
 
@@ -111,7 +118,7 @@ def main
111
118
  STDERR.puts "Page #{pdf_page.number(:one_indexed)}: #{spreadsheet.dims(:top, :left, :bottom, :right)}"
112
119
  end
113
120
  end
114
- tables = pdf_page.spreadsheets.map(&:rows)
121
+ tables = pdf_page.spreadsheets(:use_line_returns=> use_line_returns).map(&:rows)
115
122
  else
116
123
  STDERR.puts "Page #{pdf_page.number(:one_indexed)}: #{page_area.to_s}" if opts[:debug]
117
124
  if opts[:guess]
@@ -4,6 +4,10 @@ module Tabula
4
4
  end
5
5
 
6
6
  require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
7
+ require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
8
+ require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
9
+ require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
10
+
7
11
 
8
12
  import 'java.util.logging.LogManager'
9
13
  import 'java.util.logging.Level'
@@ -121,7 +121,7 @@ class Line2D::Float
121
121
 
122
122
  end
123
123
 
124
- class Rectangle2D::Float
124
+ class Rectangle2D
125
125
  SIMILARITY_DIVISOR = 20
126
126
 
127
127
  alias_method :top, :minY
@@ -2,6 +2,7 @@ require_relative './entities/zone_entity'
2
2
  require_relative './entities/cell'
3
3
  require_relative './entities/has_cells'
4
4
  require_relative './entities/line'
5
+ require_relative './entities/text_element_index'
5
6
  require_relative './entities/page'
6
7
  require_relative './entities/page_area'
7
8
  require_relative './entities/ruling'
@@ -15,11 +15,14 @@ module Tabula
15
15
  @ruling_lines = ruling_lines
16
16
  @file_path = file_path
17
17
  @number_one_indexed = number
18
- self.texts = texts
19
18
  @cells = []
20
19
  @spreadsheets = nil
21
20
  @min_char_width = min_char_width
22
21
  @min_char_height = min_char_height
22
+ @spatial_index = TextElementIndex.new
23
+
24
+ self.texts = texts
25
+ self.texts.each { |te| @spatial_index << te }
23
26
  end
24
27
 
25
28
  def min_char_width
@@ -54,10 +57,10 @@ module Tabula
54
57
  def get_table(options={})
55
58
  options = {:vertical_rulings => []}.merge(options)
56
59
  if texts.empty?
57
- return []
60
+ return Tabula::Table.new(0, [])
58
61
  end
59
62
 
60
- text_chunks = TextElement.merge_words(self.texts, options).sort
63
+ text_chunks = TextElement.merge_words(self.texts.sort, options).sort
61
64
 
62
65
  lines = TextChunk.group_by_lines(text_chunks)
63
66
 
@@ -65,7 +68,8 @@ module Tabula
65
68
  columns = options[:vertical_rulings].map(&:left) #pixel locations, not entities
66
69
  separators = columns.sort.reverse
67
70
  else
68
- columns = TextChunk.column_positions(text_chunks)
71
+ columns = TextChunk.column_positions(lines.first.text_elements.min_by(&:top).top,
72
+ text_chunks)
69
73
  separators = columns[1..-1].sort.reverse
70
74
  end
71
75
 
@@ -123,8 +127,8 @@ module Tabula
123
127
  spreadsheets(options).each do |spreadsheet|
124
128
  spreadsheet.cells.each do |cell|
125
129
  cell.text_elements = page.get_cell_text(cell)
126
- spreadsheet.cells_resolved = true
127
130
  end
131
+ spreadsheet.cells_resolved = true
128
132
  end
129
133
  end
130
134
 
@@ -176,9 +180,17 @@ module Tabula
176
180
  if area.nil?
177
181
  texts
178
182
  else
179
- texts.select do |t|
180
- area.contains(t)
181
- end
183
+ @spatial_index.contains(area)
184
+ end
185
+ end
186
+
187
+ def fill_in_cell_texts!(areas)
188
+ texts.each do |t|
189
+ area = areas.find{|a| a.contains(t) }
190
+ area.text_elements << t unless area.nil?
191
+ end
192
+ areas.each do |area|
193
+ area.text_elements = TextElement.merge_words(area.text_elements)
182
194
  end
183
195
  end
184
196
 
@@ -53,7 +53,7 @@ module Tabula
53
53
  if array_of_rows.size > 2
54
54
  if array_of_rows[0].map(&:left).uniq.size < array_of_rows[1].map(&:left).uniq.size
55
55
  missing_spots = array_of_rows[1].map(&:left) - array_of_rows[0].map(&:left)
56
- # puts missing_spots.inspect
56
+
57
57
  missing_spots.each do |missing_spot|
58
58
  missing_spot_placeholder = Cell.new(array_of_rows[0][0].top, missing_spot, 0, 0)
59
59
  missing_spot_placeholder.placeholder = true
@@ -92,5 +92,17 @@ module Tabula
92
92
  'data' => rows,
93
93
  }.to_json(*a)
94
94
  end
95
+
96
+ def to_csv
97
+ out = StringIO.new
98
+ Tabula::Writers.CSV(rows, out)
99
+ out.string
100
+ end
101
+
102
+ def to_tsv
103
+ out = StringIO.new
104
+ Tabula::Writers.TSV(rows, out)
105
+ out.string
106
+ end
95
107
  end
96
108
  end
@@ -31,11 +31,9 @@ module Tabula
31
31
 
32
32
  ##
33
33
  # calculate estimated columns from an iterable of TextChunk
34
- def self.column_positions(text_chunks)
34
+ def self.column_positions(top, text_chunks)
35
35
  right = 0
36
36
  columns = []
37
- lines = TextChunk.group_by_lines(text_chunks)
38
- top = lines.first.text_elements.map(&:top).min
39
37
 
40
38
  text_chunks.each do |te|
41
39
  next if te.text =~ ONLY_SPACES_RE
@@ -2,16 +2,17 @@ module Tabula
2
2
  ##
3
3
  # a Glyph
4
4
  class TextElement < ZoneEntity
5
- attr_accessor :font, :font_size, :text, :width_of_space
5
+ attr_accessor :font, :font_size, :text, :width_of_space, :direction
6
6
 
7
7
  TOLERANCE_FACTOR = 0.25
8
8
 
9
- def initialize(top, left, width, height, font, font_size, text, width_of_space)
9
+ def initialize(top, left, width, height, font, font_size, text, width_of_space, direction=0)
10
10
  super(top, left, width, height)
11
11
  self.font = font
12
12
  self.font_size = font_size
13
13
  self.text = text
14
14
  self.width_of_space = width_of_space
15
+ self.direction = direction
15
16
  end
16
17
 
17
18
  EMPTY = TextElement.new(0, 0, 0, 0, nil, 0, '', 0)
@@ -31,40 +32,45 @@ module Tabula
31
32
  current_chunk = chunks.last
32
33
  prev_char = current_chunk.text_elements.last
33
34
 
34
- # any vertical ruling goes across prev_char and char?
35
- across_vertical_ruling = vertical_ruling_locations.any? { |loc|
36
- prev_char.left < loc && char.left > loc
37
- }
38
-
39
- # should we add a space?
40
- if (prev_char.text != " ") && (char.text != " ") \
41
- && !across_vertical_ruling \
42
- && prev_char.should_add_space?(char)
43
-
44
- sp = self.new(prev_char.top,
45
- prev_char.right,
46
- prev_char.width_of_space,
47
- prev_char.width_of_space, # width == height for spaces
48
- prev_char.font,
49
- prev_char.font_size,
50
- ' ',
51
- prev_char.width_of_space)
52
- chunks.last << sp
53
- prev_char = sp
54
- end
55
-
56
- # should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
57
- # that they ought to be merged by that account.
58
- # we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
59
- # Why are both of those `.left`?, you might ask. The intuition is that a letter
60
- # that starts on the left of a vertical ruling ought to remain on the left of it.
61
- if !across_vertical_ruling && prev_char.should_merge?(char)
62
- chunks.last << char
35
+ # if same char AND overlapped, skip
36
+ if prev_char.text == char.text && prev_char.overlaps_with_ratio?(char, 0.85)
37
+ chunks
63
38
  else
64
- # create a new chunk
65
- chunks << TextChunk.create_from_text_element(char)
39
+ # any vertical ruling goes across prev_char and char?
40
+ across_vertical_ruling = vertical_ruling_locations.any? { |loc|
41
+ prev_char.left < loc && char.left > loc
42
+ }
43
+
44
+ # should we add a space?
45
+ if (prev_char.text != " ") && (char.text != " ") \
46
+ && !across_vertical_ruling \
47
+ && prev_char.should_add_space?(char)
48
+
49
+ sp = self.new(prev_char.top,
50
+ prev_char.right,
51
+ prev_char.width_of_space,
52
+ prev_char.width_of_space, # width == height for spaces
53
+ prev_char.font,
54
+ prev_char.font_size,
55
+ ' ',
56
+ prev_char.width_of_space)
57
+ chunks.last << sp
58
+ prev_char = sp
59
+ end
60
+
61
+ # should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
62
+ # that they ought to be merged by that account.
63
+ # we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
64
+ # Why are both of those `.left`?, you might ask. The intuition is that a letter
65
+ # that starts on the left of a vertical ruling ought to remain on the left of it.
66
+ if !across_vertical_ruling && prev_char.should_merge?(char)
67
+ chunks.last << char
68
+ else
69
+ # create a new chunk
70
+ chunks << TextChunk.create_from_text_element(char)
71
+ end
72
+ chunks
66
73
  end
67
- chunks
68
74
  end
69
75
  end
70
76
 
@@ -108,5 +114,17 @@ module Tabula
108
114
  def ==(other)
109
115
  self.text.strip == other.text.strip
110
116
  end
117
+
118
+ # sort in lexicographic (reading) order
119
+ def <=>(other)
120
+ if self.vertically_overlaps?(other)
121
+ self.left <=> other.left
122
+ elsif self.top < other.top
123
+ -1
124
+ else
125
+ 1
126
+ end
127
+ end
128
+
111
129
  end
112
130
  end
@@ -0,0 +1,55 @@
1
+ module Tabula
2
+ class TextElementIndex < Java::ComInfomatiqJsiRtree::RTree
3
+
4
+ attr_reader :te_dict
5
+
6
+ class SaveToListProcedure
7
+ include Java::GnuTroveProcedure::TIntProcedure
8
+
9
+ attr_reader :list
10
+
11
+ def initialize(parent)
12
+ @parent = parent
13
+ @list = []
14
+ end
15
+
16
+ def execute(id)
17
+ @list << @parent.te_dict[id]
18
+ return true
19
+ end
20
+
21
+ def reset!
22
+ @list = []
23
+ end
24
+
25
+ end
26
+
27
+ def initialize
28
+ super
29
+ self.init(nil)
30
+ @te_dict = {}
31
+ @save_to_list = SaveToListProcedure.new(self)
32
+ end
33
+
34
+ def <<(text_element)
35
+ r = Java::ComInfomatiqJsi::Rectangle.new(text_element.left,
36
+ text_element.top,
37
+ text_element.right,
38
+ text_element.bottom)
39
+ @te_dict[text_element.object_id] = text_element
40
+ self.add(r, text_element.object_id)
41
+ end
42
+
43
+ def contains(zone_entity)
44
+ r = Java::ComInfomatiqJsi::Rectangle.new(zone_entity.left,
45
+ zone_entity.top,
46
+ zone_entity.right,
47
+ zone_entity.bottom)
48
+ @save_to_list.reset!
49
+ super(r, @save_to_list)
50
+
51
+ # sort in lexicographic (reading) order
52
+ @save_to_list.list.sort
53
+ end
54
+ end
55
+ end
@@ -203,12 +203,12 @@ module Tabula
203
203
 
204
204
  if c == ' ' || c == ' ' # replace non-breaking space for space
205
205
  c = ' '
206
- h = text.getWidthDirAdj.round(2)
206
+ h = text.getWidth.round(2)
207
207
  end
208
208
 
209
- te = Tabula::TextElement.new(text.getYDirAdj.round(2) - h,
210
- text.getXDirAdj.round(2),
211
- text.getWidthDirAdj.round(2),
209
+ te = Tabula::TextElement.new(text.getY.round(2) - h,
210
+ text.getX.round(2),
211
+ text.getWidth.round(2),
212
212
  # ugly hack follows: we need spaces to have a height, so we can
213
213
  # test for vertical overlap. height == width seems a safe bet.
214
214
  h,
@@ -216,7 +216,8 @@ module Tabula
216
216
  text.getFontSize.round(2),
217
217
  c,
218
218
  # workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
219
- text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace)
219
+ text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace,
220
+ text.getDir)
220
221
 
221
222
  ccp_bounds = self.currentClippingPath
222
223
 
@@ -1,3 +1,3 @@
1
1
  module Tabula
2
- VERSION = '0.7.1'
2
+ VERSION = '0.7.2'
3
3
  end
Binary file
Binary file
@@ -206,7 +206,8 @@ class TestExtractor < Minitest::Test
206
206
  table = Tabula.extract_table(pdf_file_path,
207
207
  1,
208
208
  [106.01, 48.09, 227.31, 551.89],
209
- :detect_ruling_lines => true)
209
+ :detect_ruling_lines => true,
210
+ :extraction_method => "original")
210
211
 
211
212
  expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
212
213
 
@@ -527,6 +528,49 @@ class TestExtractor < Minitest::Test
527
528
  assert_equal ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"], table.last
528
529
  end
529
530
  end
531
+
532
+ def test_remove_repeated_text
533
+ top, left, bottom, right = 106.07142857142858, 50.91428571428572, 141.42857142857144, 755.2285714285715
534
+
535
+ table = Tabula.extract_table(File.expand_path('data/nyc_2013fiscalreporttables.pdf', File.dirname(__FILE__)),
536
+ 1,
537
+ [top,left,bottom,right],
538
+ :detect_ruling_lines => false,
539
+ :extraction_method => 'original')
540
+
541
+ ary = table_to_array(table)
542
+ assert_equal ary[1][1], "$ 18,969,610"
543
+ assert_equal ary[1][2], "$ 18,157,722"
544
+ end
545
+
546
+ def test_remove_overlapping_text
547
+ # one of those PDFs that put characters on top of another to make text "bold"
548
+ top,left,bottom,right = 399.98571428571427, 36.06428571428571, 425.1214285714285, 544.2428571428571
549
+ table = Tabula.extract_table(File.expand_path('data/wc2012.pdf', File.dirname(__FILE__)),
550
+ 1,
551
+ [top,left,bottom,right],
552
+ :detect_ruling_lines => false,
553
+ :extraction_method => 'original')
554
+
555
+ ary = table_to_array(table)
556
+ assert_equal ary.first.first, "Community development"
557
+ end
558
+
559
+ def test_cells_including_line_returns
560
+ data = []
561
+ pdf_file_path = "./test/data/sydney_disclosure_contract.pdf"
562
+ Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
563
+ pdf_page.spreadsheets.each do |spreadsheet|
564
+ spreadsheet.cells.each do |cell|
565
+ cell.text_elements = pdf_page.get_cell_text(cell)
566
+ cell.options = ({:use_line_returns => true, :cell_debug => 0})
567
+ data << cell.text
568
+ end
569
+ end
570
+ end
571
+ assert_equal ["1295", "Name: Reino International Pty Ltd trading as Duncan Solutions \nAddress: 15/39 Herbet Street, St Leonards NSW 2065", "N/A", "Effective Date: 13 May 2013 \nDuration: 15 Weeks", "Supply, Installation and Maintenance of Parking Ticket Machines", "$3,148,800.00exgst", "N/A", "N/A", "Open Tender \nTender evaluation criteria included: \n- The schedule of prices \n- Compliance with technical specifications/Technical assessment \n- Operational Plan including maintenance procedures"], data
572
+ end
573
+
530
574
  end
531
575
 
532
576
  class TestIsTabularHeuristic < Minitest::Test
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tabula-extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.1
4
+ version: 0.7.2
5
5
  platform: java
6
6
  authors:
7
7
  - Manuel Aristarán
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2014-01-18 00:00:00.000000000 Z
13
+ date: 2014-01-20 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: minitest
@@ -127,6 +127,7 @@ files:
127
127
  - lib/tabula/entities/table.rb
128
128
  - lib/tabula/entities/text_chunk.rb
129
129
  - lib/tabula/entities/text_element.rb
130
+ - lib/tabula/entities/text_element_index.rb
130
131
  - lib/tabula/entities/zone_entity.rb
131
132
  - lib/tabula/extraction.rb
132
133
  - lib/tabula/line_segment_detector.rb
@@ -138,7 +139,10 @@ files:
138
139
  - lib/tabula/version.rb
139
140
  - lib/tabula/writers.rb
140
141
  - tabula-extractor.gemspec
142
+ - target/jsi-1.1.0-SNAPSHOT.jar
141
143
  - target/pdfbox-app-2.0.0-SNAPSHOT.jar
144
+ - target/slf4j-api-1.6.3.jar
145
+ - target/trove4j-3.0.3.jar
142
146
  - test/data/47008204D_USA.page4.pdf
143
147
  - test/data/560015757GV_China.page1.pdf
144
148
  - test/data/ClinicalResearchDisclosureReport2012Q2.pdf
@@ -151,13 +155,16 @@ files:
151
155
  - test/data/frx_2012_disclosure.tsv
152
156
  - test/data/gre.pdf
153
157
  - test/data/no_tables.pdf
158
+ - test/data/nyc_2013fiscalreporttables.pdf
154
159
  - test/data/puertos1.pdf
155
160
  - test/data/spanning_cells.csv
156
161
  - test/data/spanning_cells.pdf
157
162
  - test/data/strongschools.pdf
163
+ - test/data/sydney_disclosure_contract.pdf
158
164
  - test/data/tabla_subsidios.pdf
159
165
  - test/data/vertical_rulings_bug.pdf
160
166
  - test/data/vietnam3.pdf
167
+ - test/data/wc2012.pdf
161
168
  - test/heuristic-test-set/original/560015757GV_China.page1.pdf
162
169
  - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
163
170
  - test/heuristic-test-set/original/bo_page24.pdf
@@ -190,7 +197,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
190
197
  version: '0'
191
198
  requirements: []
192
199
  rubyforge_project:
193
- rubygems_version: 2.2.1
200
+ rubygems_version: 2.1.9
194
201
  signing_key:
195
202
  specification_version: 4
196
203
  summary: extract tables from PDF files
@@ -207,13 +214,16 @@ test_files:
207
214
  - test/data/frx_2012_disclosure.tsv
208
215
  - test/data/gre.pdf
209
216
  - test/data/no_tables.pdf
217
+ - test/data/nyc_2013fiscalreporttables.pdf
210
218
  - test/data/puertos1.pdf
211
219
  - test/data/spanning_cells.csv
212
220
  - test/data/spanning_cells.pdf
213
221
  - test/data/strongschools.pdf
222
+ - test/data/sydney_disclosure_contract.pdf
214
223
  - test/data/tabla_subsidios.pdf
215
224
  - test/data/vertical_rulings_bug.pdf
216
225
  - test/data/vietnam3.pdf
226
+ - test/data/wc2012.pdf
217
227
  - test/heuristic-test-set/original/560015757GV_China.page1.pdf
218
228
  - test/heuristic-test-set/original/S2MNCEbirdisland.pdf
219
229
  - test/heuristic-test-set/original/bo_page24.pdf