tabula-extractor 0.7.1-java → 0.7.2-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +22 -0
- data/Rakefile +1 -1
- data/bin/tabula +8 -1
- data/lib/tabula.rb +4 -0
- data/lib/tabula/core_ext.rb +1 -1
- data/lib/tabula/entities.rb +1 -0
- data/lib/tabula/entities/page.rb +20 -8
- data/lib/tabula/entities/spreadsheet.rb +1 -1
- data/lib/tabula/entities/table.rb +12 -0
- data/lib/tabula/entities/text_chunk.rb +1 -3
- data/lib/tabula/entities/text_element.rb +52 -34
- data/lib/tabula/entities/text_element_index.rb +55 -0
- data/lib/tabula/extraction.rb +6 -5
- data/lib/tabula/version.rb +1 -1
- data/target/jsi-1.1.0-SNAPSHOT.jar +0 -0
- data/target/slf4j-api-1.6.3.jar +0 -0
- data/target/trove4j-3.0.3.jar +0 -0
- data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
- data/test/data/sydney_disclosure_contract.pdf +0 -0
- data/test/data/wc2012.pdf +0 -0
- data/test/tests.rb +45 -1
- metadata +13 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 089f1213abcf17bb66c982d40b2145a0f452297c
|
4
|
+
data.tar.gz: 80151791aae887fe11108e3f39c03e06eb29cdea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 28a058d979fc405094a416fbcd05c65f3cbd25e8269ce8e22f023a27fe0e5f3d4ef39f8b1e60c00ffc964bd858eb379ae71826f9eb4e70447b4181be96d97efe
|
7
|
+
data.tar.gz: c601699c5639a72a0ce4149b50f523cebe914a6767f3dd449979dde6c9a6c7ae269a270f7da0bfd33d92a52199d3a74866aef69121e6033f593c48341f3dc9f4
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -44,6 +44,7 @@ Tabula helps you extract tables from PDFs
|
|
44
44
|
extraction (if there are ruling lines separating each
|
45
45
|
cell, as in a PDF of an Excel spreadsheet)
|
46
46
|
--silent, -i: Suppress all stderr output.
|
47
|
+
--use-line-returns, -u: Use embedded line returns in cells. (Only in spreadsheet mode.)
|
47
48
|
--version, -v: Print version and exit
|
48
49
|
--help, -h: Show this message
|
49
50
|
```
|
@@ -52,6 +53,27 @@ Tabula helps you extract tables from PDFs
|
|
52
53
|
|
53
54
|
`tabula-extractor` is a RubyGem that you can use to programmatically extract tabular data, using the Tabula engine, in your scripts or applications. We don't have docs yet, but [the tests](test/tests.rb) are a good source of information.
|
54
55
|
|
56
|
+
Here's a very basic example:
|
57
|
+
|
58
|
+
````ruby
|
59
|
+
require 'tabula'
|
60
|
+
|
61
|
+
pdf_file_path = "whatever.pdf"
|
62
|
+
outfilename = "whatever.csv"
|
63
|
+
|
64
|
+
out = open(outfilename, 'w')
|
65
|
+
|
66
|
+
extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
|
67
|
+
extractor.extract.each do |pdf_page|
|
68
|
+
pdf_page.spreadsheets.each do |spreadsheet|
|
69
|
+
out << spreadsheet.to_csv
|
70
|
+
out << "\n\n"
|
71
|
+
end
|
72
|
+
end
|
73
|
+
out.close
|
74
|
+
|
75
|
+
````
|
76
|
+
|
55
77
|
## Notes
|
56
78
|
|
57
79
|
`tabula-extractor` uses [LSD: a Line Segment Detector](http://www.ipol.im/pub/art/2012/gjmr-lsd/) by Rafael Grompone von Gioi, Jérémie Jakubowicz, Jean-Michel Morel and Gregory Randall.
|
data/Rakefile
CHANGED
data/bin/tabula
CHANGED
@@ -47,6 +47,7 @@ EOS
|
|
47
47
|
opt :spreadsheet, "Force PDF to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)"
|
48
48
|
opt :no_spreadsheet, "Force PDF not to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)"
|
49
49
|
opt :silent, 'Suppress all stderr output.'
|
50
|
+
opt :use_line_returns, 'Use embedded line returns in cells. (Only in spreadsheet mode.)'
|
50
51
|
end
|
51
52
|
|
52
53
|
if !opts[:columns].nil?
|
@@ -94,6 +95,12 @@ def main
|
|
94
95
|
else
|
95
96
|
nil
|
96
97
|
end
|
98
|
+
use_line_returns = if opts[:use_line_returns]
|
99
|
+
true
|
100
|
+
else
|
101
|
+
false
|
102
|
+
end
|
103
|
+
|
97
104
|
extractor = Tabula::Extraction::ObjectExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
|
98
105
|
extractor.extract.each_with_index do |pdf_page, page_index|
|
99
106
|
|
@@ -111,7 +118,7 @@ def main
|
|
111
118
|
STDERR.puts "Page #{pdf_page.number(:one_indexed)}: #{spreadsheet.dims(:top, :left, :bottom, :right)}"
|
112
119
|
end
|
113
120
|
end
|
114
|
-
tables = pdf_page.spreadsheets.map(&:rows)
|
121
|
+
tables = pdf_page.spreadsheets(:use_line_returns=> use_line_returns).map(&:rows)
|
115
122
|
else
|
116
123
|
STDERR.puts "Page #{pdf_page.number(:one_indexed)}: #{page_area.to_s}" if opts[:debug]
|
117
124
|
if opts[:guess]
|
data/lib/tabula.rb
CHANGED
@@ -4,6 +4,10 @@ module Tabula
|
|
4
4
|
end
|
5
5
|
|
6
6
|
require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
|
7
|
+
require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
|
8
|
+
require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
|
9
|
+
require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
|
10
|
+
|
7
11
|
|
8
12
|
import 'java.util.logging.LogManager'
|
9
13
|
import 'java.util.logging.Level'
|
data/lib/tabula/core_ext.rb
CHANGED
data/lib/tabula/entities.rb
CHANGED
@@ -2,6 +2,7 @@ require_relative './entities/zone_entity'
|
|
2
2
|
require_relative './entities/cell'
|
3
3
|
require_relative './entities/has_cells'
|
4
4
|
require_relative './entities/line'
|
5
|
+
require_relative './entities/text_element_index'
|
5
6
|
require_relative './entities/page'
|
6
7
|
require_relative './entities/page_area'
|
7
8
|
require_relative './entities/ruling'
|
data/lib/tabula/entities/page.rb
CHANGED
@@ -15,11 +15,14 @@ module Tabula
|
|
15
15
|
@ruling_lines = ruling_lines
|
16
16
|
@file_path = file_path
|
17
17
|
@number_one_indexed = number
|
18
|
-
self.texts = texts
|
19
18
|
@cells = []
|
20
19
|
@spreadsheets = nil
|
21
20
|
@min_char_width = min_char_width
|
22
21
|
@min_char_height = min_char_height
|
22
|
+
@spatial_index = TextElementIndex.new
|
23
|
+
|
24
|
+
self.texts = texts
|
25
|
+
self.texts.each { |te| @spatial_index << te }
|
23
26
|
end
|
24
27
|
|
25
28
|
def min_char_width
|
@@ -54,10 +57,10 @@ module Tabula
|
|
54
57
|
def get_table(options={})
|
55
58
|
options = {:vertical_rulings => []}.merge(options)
|
56
59
|
if texts.empty?
|
57
|
-
return []
|
60
|
+
return Tabula::Table.new(0, [])
|
58
61
|
end
|
59
62
|
|
60
|
-
text_chunks = TextElement.merge_words(self.texts, options).sort
|
63
|
+
text_chunks = TextElement.merge_words(self.texts.sort, options).sort
|
61
64
|
|
62
65
|
lines = TextChunk.group_by_lines(text_chunks)
|
63
66
|
|
@@ -65,7 +68,8 @@ module Tabula
|
|
65
68
|
columns = options[:vertical_rulings].map(&:left) #pixel locations, not entities
|
66
69
|
separators = columns.sort.reverse
|
67
70
|
else
|
68
|
-
columns = TextChunk.column_positions(
|
71
|
+
columns = TextChunk.column_positions(lines.first.text_elements.min_by(&:top).top,
|
72
|
+
text_chunks)
|
69
73
|
separators = columns[1..-1].sort.reverse
|
70
74
|
end
|
71
75
|
|
@@ -123,8 +127,8 @@ module Tabula
|
|
123
127
|
spreadsheets(options).each do |spreadsheet|
|
124
128
|
spreadsheet.cells.each do |cell|
|
125
129
|
cell.text_elements = page.get_cell_text(cell)
|
126
|
-
spreadsheet.cells_resolved = true
|
127
130
|
end
|
131
|
+
spreadsheet.cells_resolved = true
|
128
132
|
end
|
129
133
|
end
|
130
134
|
|
@@ -176,9 +180,17 @@ module Tabula
|
|
176
180
|
if area.nil?
|
177
181
|
texts
|
178
182
|
else
|
179
|
-
|
180
|
-
|
181
|
-
|
183
|
+
@spatial_index.contains(area)
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
187
|
+
def fill_in_cell_texts!(areas)
|
188
|
+
texts.each do |t|
|
189
|
+
area = areas.find{|a| a.contains(t) }
|
190
|
+
area.text_elements << t unless area.nil?
|
191
|
+
end
|
192
|
+
areas.each do |area|
|
193
|
+
area.text_elements = TextElement.merge_words(area.text_elements)
|
182
194
|
end
|
183
195
|
end
|
184
196
|
|
@@ -53,7 +53,7 @@ module Tabula
|
|
53
53
|
if array_of_rows.size > 2
|
54
54
|
if array_of_rows[0].map(&:left).uniq.size < array_of_rows[1].map(&:left).uniq.size
|
55
55
|
missing_spots = array_of_rows[1].map(&:left) - array_of_rows[0].map(&:left)
|
56
|
-
|
56
|
+
|
57
57
|
missing_spots.each do |missing_spot|
|
58
58
|
missing_spot_placeholder = Cell.new(array_of_rows[0][0].top, missing_spot, 0, 0)
|
59
59
|
missing_spot_placeholder.placeholder = true
|
@@ -92,5 +92,17 @@ module Tabula
|
|
92
92
|
'data' => rows,
|
93
93
|
}.to_json(*a)
|
94
94
|
end
|
95
|
+
|
96
|
+
def to_csv
|
97
|
+
out = StringIO.new
|
98
|
+
Tabula::Writers.CSV(rows, out)
|
99
|
+
out.string
|
100
|
+
end
|
101
|
+
|
102
|
+
def to_tsv
|
103
|
+
out = StringIO.new
|
104
|
+
Tabula::Writers.TSV(rows, out)
|
105
|
+
out.string
|
106
|
+
end
|
95
107
|
end
|
96
108
|
end
|
@@ -31,11 +31,9 @@ module Tabula
|
|
31
31
|
|
32
32
|
##
|
33
33
|
# calculate estimated columns from an iterable of TextChunk
|
34
|
-
def self.column_positions(text_chunks)
|
34
|
+
def self.column_positions(top, text_chunks)
|
35
35
|
right = 0
|
36
36
|
columns = []
|
37
|
-
lines = TextChunk.group_by_lines(text_chunks)
|
38
|
-
top = lines.first.text_elements.map(&:top).min
|
39
37
|
|
40
38
|
text_chunks.each do |te|
|
41
39
|
next if te.text =~ ONLY_SPACES_RE
|
@@ -2,16 +2,17 @@ module Tabula
|
|
2
2
|
##
|
3
3
|
# a Glyph
|
4
4
|
class TextElement < ZoneEntity
|
5
|
-
attr_accessor :font, :font_size, :text, :width_of_space
|
5
|
+
attr_accessor :font, :font_size, :text, :width_of_space, :direction
|
6
6
|
|
7
7
|
TOLERANCE_FACTOR = 0.25
|
8
8
|
|
9
|
-
def initialize(top, left, width, height, font, font_size, text, width_of_space)
|
9
|
+
def initialize(top, left, width, height, font, font_size, text, width_of_space, direction=0)
|
10
10
|
super(top, left, width, height)
|
11
11
|
self.font = font
|
12
12
|
self.font_size = font_size
|
13
13
|
self.text = text
|
14
14
|
self.width_of_space = width_of_space
|
15
|
+
self.direction = direction
|
15
16
|
end
|
16
17
|
|
17
18
|
EMPTY = TextElement.new(0, 0, 0, 0, nil, 0, '', 0)
|
@@ -31,40 +32,45 @@ module Tabula
|
|
31
32
|
current_chunk = chunks.last
|
32
33
|
prev_char = current_chunk.text_elements.last
|
33
34
|
|
34
|
-
#
|
35
|
-
|
36
|
-
|
37
|
-
}
|
38
|
-
|
39
|
-
# should we add a space?
|
40
|
-
if (prev_char.text != " ") && (char.text != " ") \
|
41
|
-
&& !across_vertical_ruling \
|
42
|
-
&& prev_char.should_add_space?(char)
|
43
|
-
|
44
|
-
sp = self.new(prev_char.top,
|
45
|
-
prev_char.right,
|
46
|
-
prev_char.width_of_space,
|
47
|
-
prev_char.width_of_space, # width == height for spaces
|
48
|
-
prev_char.font,
|
49
|
-
prev_char.font_size,
|
50
|
-
' ',
|
51
|
-
prev_char.width_of_space)
|
52
|
-
chunks.last << sp
|
53
|
-
prev_char = sp
|
54
|
-
end
|
55
|
-
|
56
|
-
# should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
|
57
|
-
# that they ought to be merged by that account.
|
58
|
-
# we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
|
59
|
-
# Why are both of those `.left`?, you might ask. The intuition is that a letter
|
60
|
-
# that starts on the left of a vertical ruling ought to remain on the left of it.
|
61
|
-
if !across_vertical_ruling && prev_char.should_merge?(char)
|
62
|
-
chunks.last << char
|
35
|
+
# if same char AND overlapped, skip
|
36
|
+
if prev_char.text == char.text && prev_char.overlaps_with_ratio?(char, 0.85)
|
37
|
+
chunks
|
63
38
|
else
|
64
|
-
#
|
65
|
-
|
39
|
+
# any vertical ruling goes across prev_char and char?
|
40
|
+
across_vertical_ruling = vertical_ruling_locations.any? { |loc|
|
41
|
+
prev_char.left < loc && char.left > loc
|
42
|
+
}
|
43
|
+
|
44
|
+
# should we add a space?
|
45
|
+
if (prev_char.text != " ") && (char.text != " ") \
|
46
|
+
&& !across_vertical_ruling \
|
47
|
+
&& prev_char.should_add_space?(char)
|
48
|
+
|
49
|
+
sp = self.new(prev_char.top,
|
50
|
+
prev_char.right,
|
51
|
+
prev_char.width_of_space,
|
52
|
+
prev_char.width_of_space, # width == height for spaces
|
53
|
+
prev_char.font,
|
54
|
+
prev_char.font_size,
|
55
|
+
' ',
|
56
|
+
prev_char.width_of_space)
|
57
|
+
chunks.last << sp
|
58
|
+
prev_char = sp
|
59
|
+
end
|
60
|
+
|
61
|
+
# should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
|
62
|
+
# that they ought to be merged by that account.
|
63
|
+
# we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
|
64
|
+
# Why are both of those `.left`?, you might ask. The intuition is that a letter
|
65
|
+
# that starts on the left of a vertical ruling ought to remain on the left of it.
|
66
|
+
if !across_vertical_ruling && prev_char.should_merge?(char)
|
67
|
+
chunks.last << char
|
68
|
+
else
|
69
|
+
# create a new chunk
|
70
|
+
chunks << TextChunk.create_from_text_element(char)
|
71
|
+
end
|
72
|
+
chunks
|
66
73
|
end
|
67
|
-
chunks
|
68
74
|
end
|
69
75
|
end
|
70
76
|
|
@@ -108,5 +114,17 @@ module Tabula
|
|
108
114
|
def ==(other)
|
109
115
|
self.text.strip == other.text.strip
|
110
116
|
end
|
117
|
+
|
118
|
+
# sort in lexicographic (reading) order
|
119
|
+
def <=>(other)
|
120
|
+
if self.vertically_overlaps?(other)
|
121
|
+
self.left <=> other.left
|
122
|
+
elsif self.top < other.top
|
123
|
+
-1
|
124
|
+
else
|
125
|
+
1
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
111
129
|
end
|
112
130
|
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Tabula
|
2
|
+
class TextElementIndex < Java::ComInfomatiqJsiRtree::RTree
|
3
|
+
|
4
|
+
attr_reader :te_dict
|
5
|
+
|
6
|
+
class SaveToListProcedure
|
7
|
+
include Java::GnuTroveProcedure::TIntProcedure
|
8
|
+
|
9
|
+
attr_reader :list
|
10
|
+
|
11
|
+
def initialize(parent)
|
12
|
+
@parent = parent
|
13
|
+
@list = []
|
14
|
+
end
|
15
|
+
|
16
|
+
def execute(id)
|
17
|
+
@list << @parent.te_dict[id]
|
18
|
+
return true
|
19
|
+
end
|
20
|
+
|
21
|
+
def reset!
|
22
|
+
@list = []
|
23
|
+
end
|
24
|
+
|
25
|
+
end
|
26
|
+
|
27
|
+
def initialize
|
28
|
+
super
|
29
|
+
self.init(nil)
|
30
|
+
@te_dict = {}
|
31
|
+
@save_to_list = SaveToListProcedure.new(self)
|
32
|
+
end
|
33
|
+
|
34
|
+
def <<(text_element)
|
35
|
+
r = Java::ComInfomatiqJsi::Rectangle.new(text_element.left,
|
36
|
+
text_element.top,
|
37
|
+
text_element.right,
|
38
|
+
text_element.bottom)
|
39
|
+
@te_dict[text_element.object_id] = text_element
|
40
|
+
self.add(r, text_element.object_id)
|
41
|
+
end
|
42
|
+
|
43
|
+
def contains(zone_entity)
|
44
|
+
r = Java::ComInfomatiqJsi::Rectangle.new(zone_entity.left,
|
45
|
+
zone_entity.top,
|
46
|
+
zone_entity.right,
|
47
|
+
zone_entity.bottom)
|
48
|
+
@save_to_list.reset!
|
49
|
+
super(r, @save_to_list)
|
50
|
+
|
51
|
+
# sort in lexicographic (reading) order
|
52
|
+
@save_to_list.list.sort
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
data/lib/tabula/extraction.rb
CHANGED
@@ -203,12 +203,12 @@ module Tabula
|
|
203
203
|
|
204
204
|
if c == ' ' || c == ' ' # replace non-breaking space for space
|
205
205
|
c = ' '
|
206
|
-
h = text.
|
206
|
+
h = text.getWidth.round(2)
|
207
207
|
end
|
208
208
|
|
209
|
-
te = Tabula::TextElement.new(text.
|
210
|
-
text.
|
211
|
-
text.
|
209
|
+
te = Tabula::TextElement.new(text.getY.round(2) - h,
|
210
|
+
text.getX.round(2),
|
211
|
+
text.getWidth.round(2),
|
212
212
|
# ugly hack follows: we need spaces to have a height, so we can
|
213
213
|
# test for vertical overlap. height == width seems a safe bet.
|
214
214
|
h,
|
@@ -216,7 +216,8 @@ module Tabula
|
|
216
216
|
text.getFontSize.round(2),
|
217
217
|
c,
|
218
218
|
# workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
|
219
|
-
text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace
|
219
|
+
text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace,
|
220
|
+
text.getDir)
|
220
221
|
|
221
222
|
ccp_bounds = self.currentClippingPath
|
222
223
|
|
data/lib/tabula/version.rb
CHANGED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/test/tests.rb
CHANGED
@@ -206,7 +206,8 @@ class TestExtractor < Minitest::Test
|
|
206
206
|
table = Tabula.extract_table(pdf_file_path,
|
207
207
|
1,
|
208
208
|
[106.01, 48.09, 227.31, 551.89],
|
209
|
-
:detect_ruling_lines => true
|
209
|
+
:detect_ruling_lines => true,
|
210
|
+
:extraction_method => "original")
|
210
211
|
|
211
212
|
expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
|
212
213
|
|
@@ -527,6 +528,49 @@ class TestExtractor < Minitest::Test
|
|
527
528
|
assert_equal ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"], table.last
|
528
529
|
end
|
529
530
|
end
|
531
|
+
|
532
|
+
def test_remove_repeated_text
|
533
|
+
top, left, bottom, right = 106.07142857142858, 50.91428571428572, 141.42857142857144, 755.2285714285715
|
534
|
+
|
535
|
+
table = Tabula.extract_table(File.expand_path('data/nyc_2013fiscalreporttables.pdf', File.dirname(__FILE__)),
|
536
|
+
1,
|
537
|
+
[top,left,bottom,right],
|
538
|
+
:detect_ruling_lines => false,
|
539
|
+
:extraction_method => 'original')
|
540
|
+
|
541
|
+
ary = table_to_array(table)
|
542
|
+
assert_equal ary[1][1], "$ 18,969,610"
|
543
|
+
assert_equal ary[1][2], "$ 18,157,722"
|
544
|
+
end
|
545
|
+
|
546
|
+
def test_remove_overlapping_text
|
547
|
+
# one of those PDFs that put characters on top of another to make text "bold"
|
548
|
+
top,left,bottom,right = 399.98571428571427, 36.06428571428571, 425.1214285714285, 544.2428571428571
|
549
|
+
table = Tabula.extract_table(File.expand_path('data/wc2012.pdf', File.dirname(__FILE__)),
|
550
|
+
1,
|
551
|
+
[top,left,bottom,right],
|
552
|
+
:detect_ruling_lines => false,
|
553
|
+
:extraction_method => 'original')
|
554
|
+
|
555
|
+
ary = table_to_array(table)
|
556
|
+
assert_equal ary.first.first, "Community development"
|
557
|
+
end
|
558
|
+
|
559
|
+
def test_cells_including_line_returns
|
560
|
+
data = []
|
561
|
+
pdf_file_path = "./test/data/sydney_disclosure_contract.pdf"
|
562
|
+
Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
|
563
|
+
pdf_page.spreadsheets.each do |spreadsheet|
|
564
|
+
spreadsheet.cells.each do |cell|
|
565
|
+
cell.text_elements = pdf_page.get_cell_text(cell)
|
566
|
+
cell.options = ({:use_line_returns => true, :cell_debug => 0})
|
567
|
+
data << cell.text
|
568
|
+
end
|
569
|
+
end
|
570
|
+
end
|
571
|
+
assert_equal ["1295", "Name: Reino International Pty Ltd trading as Duncan Solutions \nAddress: 15/39 Herbet Street, St Leonards NSW 2065", "N/A", "Effective Date: 13 May 2013 \nDuration: 15 Weeks", "Supply, Installation and Maintenance of Parking Ticket Machines", "$3,148,800.00exgst", "N/A", "N/A", "Open Tender \nTender evaluation criteria included: \n- The schedule of prices \n- Compliance with technical specifications/Technical assessment \n- Operational Plan including maintenance procedures"], data
|
572
|
+
end
|
573
|
+
|
530
574
|
end
|
531
575
|
|
532
576
|
class TestIsTabularHeuristic < Minitest::Test
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.1
|
4
|
+
version: 0.7.2
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Manuel Aristarán
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2014-01-
|
13
|
+
date: 2014-01-20 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: minitest
|
@@ -127,6 +127,7 @@ files:
|
|
127
127
|
- lib/tabula/entities/table.rb
|
128
128
|
- lib/tabula/entities/text_chunk.rb
|
129
129
|
- lib/tabula/entities/text_element.rb
|
130
|
+
- lib/tabula/entities/text_element_index.rb
|
130
131
|
- lib/tabula/entities/zone_entity.rb
|
131
132
|
- lib/tabula/extraction.rb
|
132
133
|
- lib/tabula/line_segment_detector.rb
|
@@ -138,7 +139,10 @@ files:
|
|
138
139
|
- lib/tabula/version.rb
|
139
140
|
- lib/tabula/writers.rb
|
140
141
|
- tabula-extractor.gemspec
|
142
|
+
- target/jsi-1.1.0-SNAPSHOT.jar
|
141
143
|
- target/pdfbox-app-2.0.0-SNAPSHOT.jar
|
144
|
+
- target/slf4j-api-1.6.3.jar
|
145
|
+
- target/trove4j-3.0.3.jar
|
142
146
|
- test/data/47008204D_USA.page4.pdf
|
143
147
|
- test/data/560015757GV_China.page1.pdf
|
144
148
|
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
@@ -151,13 +155,16 @@ files:
|
|
151
155
|
- test/data/frx_2012_disclosure.tsv
|
152
156
|
- test/data/gre.pdf
|
153
157
|
- test/data/no_tables.pdf
|
158
|
+
- test/data/nyc_2013fiscalreporttables.pdf
|
154
159
|
- test/data/puertos1.pdf
|
155
160
|
- test/data/spanning_cells.csv
|
156
161
|
- test/data/spanning_cells.pdf
|
157
162
|
- test/data/strongschools.pdf
|
163
|
+
- test/data/sydney_disclosure_contract.pdf
|
158
164
|
- test/data/tabla_subsidios.pdf
|
159
165
|
- test/data/vertical_rulings_bug.pdf
|
160
166
|
- test/data/vietnam3.pdf
|
167
|
+
- test/data/wc2012.pdf
|
161
168
|
- test/heuristic-test-set/original/560015757GV_China.page1.pdf
|
162
169
|
- test/heuristic-test-set/original/S2MNCEbirdisland.pdf
|
163
170
|
- test/heuristic-test-set/original/bo_page24.pdf
|
@@ -190,7 +197,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
190
197
|
version: '0'
|
191
198
|
requirements: []
|
192
199
|
rubyforge_project:
|
193
|
-
rubygems_version: 2.
|
200
|
+
rubygems_version: 2.1.9
|
194
201
|
signing_key:
|
195
202
|
specification_version: 4
|
196
203
|
summary: extract tables from PDF files
|
@@ -207,13 +214,16 @@ test_files:
|
|
207
214
|
- test/data/frx_2012_disclosure.tsv
|
208
215
|
- test/data/gre.pdf
|
209
216
|
- test/data/no_tables.pdf
|
217
|
+
- test/data/nyc_2013fiscalreporttables.pdf
|
210
218
|
- test/data/puertos1.pdf
|
211
219
|
- test/data/spanning_cells.csv
|
212
220
|
- test/data/spanning_cells.pdf
|
213
221
|
- test/data/strongschools.pdf
|
222
|
+
- test/data/sydney_disclosure_contract.pdf
|
214
223
|
- test/data/tabla_subsidios.pdf
|
215
224
|
- test/data/vertical_rulings_bug.pdf
|
216
225
|
- test/data/vietnam3.pdf
|
226
|
+
- test/data/wc2012.pdf
|
217
227
|
- test/heuristic-test-set/original/560015757GV_China.page1.pdf
|
218
228
|
- test/heuristic-test-set/original/S2MNCEbirdisland.pdf
|
219
229
|
- test/heuristic-test-set/original/bo_page24.pdf
|