tabula-extractor 0.7.1-java → 0.7.2-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +22 -0
- data/Rakefile +1 -1
- data/bin/tabula +8 -1
- data/lib/tabula.rb +4 -0
- data/lib/tabula/core_ext.rb +1 -1
- data/lib/tabula/entities.rb +1 -0
- data/lib/tabula/entities/page.rb +20 -8
- data/lib/tabula/entities/spreadsheet.rb +1 -1
- data/lib/tabula/entities/table.rb +12 -0
- data/lib/tabula/entities/text_chunk.rb +1 -3
- data/lib/tabula/entities/text_element.rb +52 -34
- data/lib/tabula/entities/text_element_index.rb +55 -0
- data/lib/tabula/extraction.rb +6 -5
- data/lib/tabula/version.rb +1 -1
- data/target/jsi-1.1.0-SNAPSHOT.jar +0 -0
- data/target/slf4j-api-1.6.3.jar +0 -0
- data/target/trove4j-3.0.3.jar +0 -0
- data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
- data/test/data/sydney_disclosure_contract.pdf +0 -0
- data/test/data/wc2012.pdf +0 -0
- data/test/tests.rb +45 -1
- metadata +13 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 089f1213abcf17bb66c982d40b2145a0f452297c
|
4
|
+
data.tar.gz: 80151791aae887fe11108e3f39c03e06eb29cdea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 28a058d979fc405094a416fbcd05c65f3cbd25e8269ce8e22f023a27fe0e5f3d4ef39f8b1e60c00ffc964bd858eb379ae71826f9eb4e70447b4181be96d97efe
|
7
|
+
data.tar.gz: c601699c5639a72a0ce4149b50f523cebe914a6767f3dd449979dde6c9a6c7ae269a270f7da0bfd33d92a52199d3a74866aef69121e6033f593c48341f3dc9f4
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -44,6 +44,7 @@ Tabula helps you extract tables from PDFs
|
|
44
44
|
extraction (if there are ruling lines separating each
|
45
45
|
cell, as in a PDF of an Excel spreadsheet)
|
46
46
|
--silent, -i: Suppress all stderr output.
|
47
|
+
--use-line-returns, -u: Use embedded line returns in cells.
|
47
48
|
--version, -v: Print version and exit
|
48
49
|
--help, -h: Show this message
|
49
50
|
```
|
@@ -52,6 +53,27 @@ Tabula helps you extract tables from PDFs
|
|
52
53
|
|
53
54
|
`tabula-extractor` is a RubyGem that you can use to programmatically extract tabular data, using the Tabula engine, in your scripts or applications. We don't have docs yet, but [the tests](test/tests.rb) are a good source of information.
|
54
55
|
|
56
|
+
Here's a very basic example:
|
57
|
+
|
58
|
+
````ruby
|
59
|
+
require 'tabula'
|
60
|
+
|
61
|
+
pdf_file_path = "whatever.pdf"
|
62
|
+
outfilename = "whatever.csv"
|
63
|
+
|
64
|
+
out = open(outfilename, 'w')
|
65
|
+
|
66
|
+
extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
|
67
|
+
extractor.extract.each do |pdf_page|
|
68
|
+
pdf_page.spreadsheets.each do |spreadsheet|
|
69
|
+
out << spreadsheet.to_csv
|
70
|
+
out << "\n\n"
|
71
|
+
end
|
72
|
+
end
|
73
|
+
out.close
|
74
|
+
|
75
|
+
````
|
76
|
+
|
55
77
|
## Notes
|
56
78
|
|
57
79
|
`tabula-extractor` uses [LSD: a Line Segment Detector](http://www.ipol.im/pub/art/2012/gjmr-lsd/) by Rafael Grompone von Gioi, Jérémie Jakubowicz, Jean-Michel Morel and Gregory Randall.
|
data/Rakefile
CHANGED
data/bin/tabula
CHANGED
@@ -47,6 +47,7 @@ EOS
|
|
47
47
|
opt :spreadsheet, "Force PDF to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)"
|
48
48
|
opt :no_spreadsheet, "Force PDF not to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)"
|
49
49
|
opt :silent, 'Suppress all stderr output.'
|
50
|
+
opt :use_line_returns, 'Use embedded line returns in cells. (Only in spreadsheet mode.)'
|
50
51
|
end
|
51
52
|
|
52
53
|
if !opts[:columns].nil?
|
@@ -94,6 +95,12 @@ def main
|
|
94
95
|
else
|
95
96
|
nil
|
96
97
|
end
|
98
|
+
use_line_returns = if opts[:use_line_returns]
|
99
|
+
true
|
100
|
+
else
|
101
|
+
false
|
102
|
+
end
|
103
|
+
|
97
104
|
extractor = Tabula::Extraction::ObjectExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
|
98
105
|
extractor.extract.each_with_index do |pdf_page, page_index|
|
99
106
|
|
@@ -111,7 +118,7 @@ def main
|
|
111
118
|
STDERR.puts "Page #{pdf_page.number(:one_indexed)}: #{spreadsheet.dims(:top, :left, :bottom, :right)}"
|
112
119
|
end
|
113
120
|
end
|
114
|
-
tables = pdf_page.spreadsheets.map(&:rows)
|
121
|
+
tables = pdf_page.spreadsheets(:use_line_returns=> use_line_returns).map(&:rows)
|
115
122
|
else
|
116
123
|
STDERR.puts "Page #{pdf_page.number(:one_indexed)}: #{page_area.to_s}" if opts[:debug]
|
117
124
|
if opts[:guess]
|
data/lib/tabula.rb
CHANGED
@@ -4,6 +4,10 @@ module Tabula
|
|
4
4
|
end
|
5
5
|
|
6
6
|
require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
|
7
|
+
require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
|
8
|
+
require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
|
9
|
+
require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
|
10
|
+
|
7
11
|
|
8
12
|
import 'java.util.logging.LogManager'
|
9
13
|
import 'java.util.logging.Level'
|
data/lib/tabula/core_ext.rb
CHANGED
data/lib/tabula/entities.rb
CHANGED
@@ -2,6 +2,7 @@ require_relative './entities/zone_entity'
|
|
2
2
|
require_relative './entities/cell'
|
3
3
|
require_relative './entities/has_cells'
|
4
4
|
require_relative './entities/line'
|
5
|
+
require_relative './entities/text_element_index'
|
5
6
|
require_relative './entities/page'
|
6
7
|
require_relative './entities/page_area'
|
7
8
|
require_relative './entities/ruling'
|
data/lib/tabula/entities/page.rb
CHANGED
@@ -15,11 +15,14 @@ module Tabula
|
|
15
15
|
@ruling_lines = ruling_lines
|
16
16
|
@file_path = file_path
|
17
17
|
@number_one_indexed = number
|
18
|
-
self.texts = texts
|
19
18
|
@cells = []
|
20
19
|
@spreadsheets = nil
|
21
20
|
@min_char_width = min_char_width
|
22
21
|
@min_char_height = min_char_height
|
22
|
+
@spatial_index = TextElementIndex.new
|
23
|
+
|
24
|
+
self.texts = texts
|
25
|
+
self.texts.each { |te| @spatial_index << te }
|
23
26
|
end
|
24
27
|
|
25
28
|
def min_char_width
|
@@ -54,10 +57,10 @@ module Tabula
|
|
54
57
|
def get_table(options={})
|
55
58
|
options = {:vertical_rulings => []}.merge(options)
|
56
59
|
if texts.empty?
|
57
|
-
return []
|
60
|
+
return Tabula::Table.new(0, [])
|
58
61
|
end
|
59
62
|
|
60
|
-
text_chunks = TextElement.merge_words(self.texts, options).sort
|
63
|
+
text_chunks = TextElement.merge_words(self.texts.sort, options).sort
|
61
64
|
|
62
65
|
lines = TextChunk.group_by_lines(text_chunks)
|
63
66
|
|
@@ -65,7 +68,8 @@ module Tabula
|
|
65
68
|
columns = options[:vertical_rulings].map(&:left) #pixel locations, not entities
|
66
69
|
separators = columns.sort.reverse
|
67
70
|
else
|
68
|
-
columns = TextChunk.column_positions(
|
71
|
+
columns = TextChunk.column_positions(lines.first.text_elements.min_by(&:top).top,
|
72
|
+
text_chunks)
|
69
73
|
separators = columns[1..-1].sort.reverse
|
70
74
|
end
|
71
75
|
|
@@ -123,8 +127,8 @@ module Tabula
|
|
123
127
|
spreadsheets(options).each do |spreadsheet|
|
124
128
|
spreadsheet.cells.each do |cell|
|
125
129
|
cell.text_elements = page.get_cell_text(cell)
|
126
|
-
spreadsheet.cells_resolved = true
|
127
130
|
end
|
131
|
+
spreadsheet.cells_resolved = true
|
128
132
|
end
|
129
133
|
end
|
130
134
|
|
@@ -176,9 +180,17 @@ module Tabula
|
|
176
180
|
if area.nil?
|
177
181
|
texts
|
178
182
|
else
|
179
|
-
|
180
|
-
|
181
|
-
|
183
|
+
@spatial_index.contains(area)
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
187
|
+
def fill_in_cell_texts!(areas)
|
188
|
+
texts.each do |t|
|
189
|
+
area = areas.find{|a| a.contains(t) }
|
190
|
+
area.text_elements << t unless area.nil?
|
191
|
+
end
|
192
|
+
areas.each do |area|
|
193
|
+
area.text_elements = TextElement.merge_words(area.text_elements)
|
182
194
|
end
|
183
195
|
end
|
184
196
|
|
@@ -53,7 +53,7 @@ module Tabula
|
|
53
53
|
if array_of_rows.size > 2
|
54
54
|
if array_of_rows[0].map(&:left).uniq.size < array_of_rows[1].map(&:left).uniq.size
|
55
55
|
missing_spots = array_of_rows[1].map(&:left) - array_of_rows[0].map(&:left)
|
56
|
-
|
56
|
+
|
57
57
|
missing_spots.each do |missing_spot|
|
58
58
|
missing_spot_placeholder = Cell.new(array_of_rows[0][0].top, missing_spot, 0, 0)
|
59
59
|
missing_spot_placeholder.placeholder = true
|
@@ -92,5 +92,17 @@ module Tabula
|
|
92
92
|
'data' => rows,
|
93
93
|
}.to_json(*a)
|
94
94
|
end
|
95
|
+
|
96
|
+
def to_csv
|
97
|
+
out = StringIO.new
|
98
|
+
Tabula::Writers.CSV(rows, out)
|
99
|
+
out.string
|
100
|
+
end
|
101
|
+
|
102
|
+
def to_tsv
|
103
|
+
out = StringIO.new
|
104
|
+
Tabula::Writers.TSV(rows, out)
|
105
|
+
out.string
|
106
|
+
end
|
95
107
|
end
|
96
108
|
end
|
@@ -31,11 +31,9 @@ module Tabula
|
|
31
31
|
|
32
32
|
##
|
33
33
|
# calculate estimated columns from an iterable of TextChunk
|
34
|
-
def self.column_positions(text_chunks)
|
34
|
+
def self.column_positions(top, text_chunks)
|
35
35
|
right = 0
|
36
36
|
columns = []
|
37
|
-
lines = TextChunk.group_by_lines(text_chunks)
|
38
|
-
top = lines.first.text_elements.map(&:top).min
|
39
37
|
|
40
38
|
text_chunks.each do |te|
|
41
39
|
next if te.text =~ ONLY_SPACES_RE
|
@@ -2,16 +2,17 @@ module Tabula
|
|
2
2
|
##
|
3
3
|
# a Glyph
|
4
4
|
class TextElement < ZoneEntity
|
5
|
-
attr_accessor :font, :font_size, :text, :width_of_space
|
5
|
+
attr_accessor :font, :font_size, :text, :width_of_space, :direction
|
6
6
|
|
7
7
|
TOLERANCE_FACTOR = 0.25
|
8
8
|
|
9
|
-
def initialize(top, left, width, height, font, font_size, text, width_of_space)
|
9
|
+
def initialize(top, left, width, height, font, font_size, text, width_of_space, direction=0)
|
10
10
|
super(top, left, width, height)
|
11
11
|
self.font = font
|
12
12
|
self.font_size = font_size
|
13
13
|
self.text = text
|
14
14
|
self.width_of_space = width_of_space
|
15
|
+
self.direction = direction
|
15
16
|
end
|
16
17
|
|
17
18
|
EMPTY = TextElement.new(0, 0, 0, 0, nil, 0, '', 0)
|
@@ -31,40 +32,45 @@ module Tabula
|
|
31
32
|
current_chunk = chunks.last
|
32
33
|
prev_char = current_chunk.text_elements.last
|
33
34
|
|
34
|
-
#
|
35
|
-
|
36
|
-
|
37
|
-
}
|
38
|
-
|
39
|
-
# should we add a space?
|
40
|
-
if (prev_char.text != " ") && (char.text != " ") \
|
41
|
-
&& !across_vertical_ruling \
|
42
|
-
&& prev_char.should_add_space?(char)
|
43
|
-
|
44
|
-
sp = self.new(prev_char.top,
|
45
|
-
prev_char.right,
|
46
|
-
prev_char.width_of_space,
|
47
|
-
prev_char.width_of_space, # width == height for spaces
|
48
|
-
prev_char.font,
|
49
|
-
prev_char.font_size,
|
50
|
-
' ',
|
51
|
-
prev_char.width_of_space)
|
52
|
-
chunks.last << sp
|
53
|
-
prev_char = sp
|
54
|
-
end
|
55
|
-
|
56
|
-
# should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
|
57
|
-
# that they ought to be merged by that account.
|
58
|
-
# we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
|
59
|
-
# Why are both of those `.left`?, you might ask. The intuition is that a letter
|
60
|
-
# that starts on the left of a vertical ruling ought to remain on the left of it.
|
61
|
-
if !across_vertical_ruling && prev_char.should_merge?(char)
|
62
|
-
chunks.last << char
|
35
|
+
# if same char AND overlapped, skip
|
36
|
+
if prev_char.text == char.text && prev_char.overlaps_with_ratio?(char, 0.85)
|
37
|
+
chunks
|
63
38
|
else
|
64
|
-
#
|
65
|
-
|
39
|
+
# any vertical ruling goes across prev_char and char?
|
40
|
+
across_vertical_ruling = vertical_ruling_locations.any? { |loc|
|
41
|
+
prev_char.left < loc && char.left > loc
|
42
|
+
}
|
43
|
+
|
44
|
+
# should we add a space?
|
45
|
+
if (prev_char.text != " ") && (char.text != " ") \
|
46
|
+
&& !across_vertical_ruling \
|
47
|
+
&& prev_char.should_add_space?(char)
|
48
|
+
|
49
|
+
sp = self.new(prev_char.top,
|
50
|
+
prev_char.right,
|
51
|
+
prev_char.width_of_space,
|
52
|
+
prev_char.width_of_space, # width == height for spaces
|
53
|
+
prev_char.font,
|
54
|
+
prev_char.font_size,
|
55
|
+
' ',
|
56
|
+
prev_char.width_of_space)
|
57
|
+
chunks.last << sp
|
58
|
+
prev_char = sp
|
59
|
+
end
|
60
|
+
|
61
|
+
# should_merge? isn't aware of vertical rulings, so even if two text elements are close enough
|
62
|
+
# that they ought to be merged by that account.
|
63
|
+
# we still shouldn't merge them if the two elements are on opposite sides of a vertical ruling.
|
64
|
+
# Why are both of those `.left`?, you might ask. The intuition is that a letter
|
65
|
+
# that starts on the left of a vertical ruling ought to remain on the left of it.
|
66
|
+
if !across_vertical_ruling && prev_char.should_merge?(char)
|
67
|
+
chunks.last << char
|
68
|
+
else
|
69
|
+
# create a new chunk
|
70
|
+
chunks << TextChunk.create_from_text_element(char)
|
71
|
+
end
|
72
|
+
chunks
|
66
73
|
end
|
67
|
-
chunks
|
68
74
|
end
|
69
75
|
end
|
70
76
|
|
@@ -108,5 +114,17 @@ module Tabula
|
|
108
114
|
def ==(other)
|
109
115
|
self.text.strip == other.text.strip
|
110
116
|
end
|
117
|
+
|
118
|
+
# sort in lexicographic (reading) order
|
119
|
+
def <=>(other)
|
120
|
+
if self.vertically_overlaps?(other)
|
121
|
+
self.left <=> other.left
|
122
|
+
elsif self.top < other.top
|
123
|
+
-1
|
124
|
+
else
|
125
|
+
1
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
111
129
|
end
|
112
130
|
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Tabula
|
2
|
+
class TextElementIndex < Java::ComInfomatiqJsiRtree::RTree
|
3
|
+
|
4
|
+
attr_reader :te_dict
|
5
|
+
|
6
|
+
class SaveToListProcedure
|
7
|
+
include Java::GnuTroveProcedure::TIntProcedure
|
8
|
+
|
9
|
+
attr_reader :list
|
10
|
+
|
11
|
+
def initialize(parent)
|
12
|
+
@parent = parent
|
13
|
+
@list = []
|
14
|
+
end
|
15
|
+
|
16
|
+
def execute(id)
|
17
|
+
@list << @parent.te_dict[id]
|
18
|
+
return true
|
19
|
+
end
|
20
|
+
|
21
|
+
def reset!
|
22
|
+
@list = []
|
23
|
+
end
|
24
|
+
|
25
|
+
end
|
26
|
+
|
27
|
+
def initialize
|
28
|
+
super
|
29
|
+
self.init(nil)
|
30
|
+
@te_dict = {}
|
31
|
+
@save_to_list = SaveToListProcedure.new(self)
|
32
|
+
end
|
33
|
+
|
34
|
+
def <<(text_element)
|
35
|
+
r = Java::ComInfomatiqJsi::Rectangle.new(text_element.left,
|
36
|
+
text_element.top,
|
37
|
+
text_element.right,
|
38
|
+
text_element.bottom)
|
39
|
+
@te_dict[text_element.object_id] = text_element
|
40
|
+
self.add(r, text_element.object_id)
|
41
|
+
end
|
42
|
+
|
43
|
+
def contains(zone_entity)
|
44
|
+
r = Java::ComInfomatiqJsi::Rectangle.new(zone_entity.left,
|
45
|
+
zone_entity.top,
|
46
|
+
zone_entity.right,
|
47
|
+
zone_entity.bottom)
|
48
|
+
@save_to_list.reset!
|
49
|
+
super(r, @save_to_list)
|
50
|
+
|
51
|
+
# sort in lexicographic (reading) order
|
52
|
+
@save_to_list.list.sort
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
data/lib/tabula/extraction.rb
CHANGED
@@ -203,12 +203,12 @@ module Tabula
|
|
203
203
|
|
204
204
|
if c == ' ' || c == ' ' # replace non-breaking space for space
|
205
205
|
c = ' '
|
206
|
-
h = text.
|
206
|
+
h = text.getWidth.round(2)
|
207
207
|
end
|
208
208
|
|
209
|
-
te = Tabula::TextElement.new(text.
|
210
|
-
text.
|
211
|
-
text.
|
209
|
+
te = Tabula::TextElement.new(text.getY.round(2) - h,
|
210
|
+
text.getX.round(2),
|
211
|
+
text.getWidth.round(2),
|
212
212
|
# ugly hack follows: we need spaces to have a height, so we can
|
213
213
|
# test for vertical overlap. height == width seems a safe bet.
|
214
214
|
h,
|
@@ -216,7 +216,8 @@ module Tabula
|
|
216
216
|
text.getFontSize.round(2),
|
217
217
|
c,
|
218
218
|
# workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
|
219
|
-
text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace
|
219
|
+
text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace,
|
220
|
+
text.getDir)
|
220
221
|
|
221
222
|
ccp_bounds = self.currentClippingPath
|
222
223
|
|
data/lib/tabula/version.rb
CHANGED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/test/tests.rb
CHANGED
@@ -206,7 +206,8 @@ class TestExtractor < Minitest::Test
|
|
206
206
|
table = Tabula.extract_table(pdf_file_path,
|
207
207
|
1,
|
208
208
|
[106.01, 48.09, 227.31, 551.89],
|
209
|
-
:detect_ruling_lines => true
|
209
|
+
:detect_ruling_lines => true,
|
210
|
+
:extraction_method => "original")
|
210
211
|
|
211
212
|
expected = Tabula::Table.new_from_array([["AANONSEN, DEBORAH, A", "", "STATEN ISLAND, NY", "MEALS", "$85.00"], ["TOTAL", "", "", "", "$85.00"], ["AARON, CAREN, T", "", "RICHMOND, VA", "EDUCATIONAL ITEMS", "$78.80"], ["AARON, CAREN, T", "", "RICHMOND, VA", "MEALS", "$392.45"], ["TOTAL", "", "", "", "$471.25"], ["AARON, JOHN", "", "CLARKSVILLE, TN", "MEALS", "$20.39"], ["TOTAL", "", "", "", "$20.39"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "MEALS", "$310.33"], ["", "REGIONAL PULMONARY & SLEEP"], ["AARON, JOSHUA, N", "", "WEST GROVE, PA", "SPEAKING FEES", "$4,700.00"], ["", "MEDICINE"], ["TOTAL", "", "", "", "$5,010.33"], ["AARON, MAUREEN, M", "", "MARTINSVILLE, VA", "MEALS", "$193.67"], ["TOTAL", "", "", "", "$193.67"], ["AARON, MICHAEL, L", "", "WEST ISLIP, NY", "MEALS", "$19.50"], ["TOTAL", "", "", "", "$19.50"], ["AARON, MICHAEL, R", "", "BROOKLYN, NY", "MEALS", "$65.92"]])
|
212
213
|
|
@@ -527,6 +528,49 @@ class TestExtractor < Minitest::Test
|
|
527
528
|
assert_equal ["TOTAL", "453,515", "895,111", "456,431", "718,382", "487,183", "886,211", "494,220", "816,623", "495,580", "810,565", "627,469", "1,248,804", "540,367"], table.last
|
528
529
|
end
|
529
530
|
end
|
531
|
+
|
532
|
+
def test_remove_repeated_text
|
533
|
+
top, left, bottom, right = 106.07142857142858, 50.91428571428572, 141.42857142857144, 755.2285714285715
|
534
|
+
|
535
|
+
table = Tabula.extract_table(File.expand_path('data/nyc_2013fiscalreporttables.pdf', File.dirname(__FILE__)),
|
536
|
+
1,
|
537
|
+
[top,left,bottom,right],
|
538
|
+
:detect_ruling_lines => false,
|
539
|
+
:extraction_method => 'original')
|
540
|
+
|
541
|
+
ary = table_to_array(table)
|
542
|
+
assert_equal ary[1][1], "$ 18,969,610"
|
543
|
+
assert_equal ary[1][2], "$ 18,157,722"
|
544
|
+
end
|
545
|
+
|
546
|
+
def test_remove_overlapping_text
|
547
|
+
# one of those PDFs that put characters on top of another to make text "bold"
|
548
|
+
top,left,bottom,right = 399.98571428571427, 36.06428571428571, 425.1214285714285, 544.2428571428571
|
549
|
+
table = Tabula.extract_table(File.expand_path('data/wc2012.pdf', File.dirname(__FILE__)),
|
550
|
+
1,
|
551
|
+
[top,left,bottom,right],
|
552
|
+
:detect_ruling_lines => false,
|
553
|
+
:extraction_method => 'original')
|
554
|
+
|
555
|
+
ary = table_to_array(table)
|
556
|
+
assert_equal ary.first.first, "Community development"
|
557
|
+
end
|
558
|
+
|
559
|
+
def test_cells_including_line_returns
|
560
|
+
data = []
|
561
|
+
pdf_file_path = "./test/data/sydney_disclosure_contract.pdf"
|
562
|
+
Tabula::Extraction::ObjectExtractor.new(pdf_file_path, [1]).extract.each do |pdf_page|
|
563
|
+
pdf_page.spreadsheets.each do |spreadsheet|
|
564
|
+
spreadsheet.cells.each do |cell|
|
565
|
+
cell.text_elements = pdf_page.get_cell_text(cell)
|
566
|
+
cell.options = ({:use_line_returns => true, :cell_debug => 0})
|
567
|
+
data << cell.text
|
568
|
+
end
|
569
|
+
end
|
570
|
+
end
|
571
|
+
assert_equal ["1295", "Name: Reino International Pty Ltd trading as Duncan Solutions \nAddress: 15/39 Herbet Street, St Leonards NSW 2065", "N/A", "Effective Date: 13 May 2013 \nDuration: 15 Weeks", "Supply, Installation and Maintenance of Parking Ticket Machines", "$3,148,800.00exgst", "N/A", "N/A", "Open Tender \nTender evaluation criteria included: \n- The schedule of prices \n- Compliance with technical specifications/Technical assessment \n- Operational Plan including maintenance procedures"], data
|
572
|
+
end
|
573
|
+
|
530
574
|
end
|
531
575
|
|
532
576
|
class TestIsTabularHeuristic < Minitest::Test
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tabula-extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.
|
4
|
+
version: 0.7.2
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Manuel Aristarán
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2014-01-
|
13
|
+
date: 2014-01-20 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: minitest
|
@@ -127,6 +127,7 @@ files:
|
|
127
127
|
- lib/tabula/entities/table.rb
|
128
128
|
- lib/tabula/entities/text_chunk.rb
|
129
129
|
- lib/tabula/entities/text_element.rb
|
130
|
+
- lib/tabula/entities/text_element_index.rb
|
130
131
|
- lib/tabula/entities/zone_entity.rb
|
131
132
|
- lib/tabula/extraction.rb
|
132
133
|
- lib/tabula/line_segment_detector.rb
|
@@ -138,7 +139,10 @@ files:
|
|
138
139
|
- lib/tabula/version.rb
|
139
140
|
- lib/tabula/writers.rb
|
140
141
|
- tabula-extractor.gemspec
|
142
|
+
- target/jsi-1.1.0-SNAPSHOT.jar
|
141
143
|
- target/pdfbox-app-2.0.0-SNAPSHOT.jar
|
144
|
+
- target/slf4j-api-1.6.3.jar
|
145
|
+
- target/trove4j-3.0.3.jar
|
142
146
|
- test/data/47008204D_USA.page4.pdf
|
143
147
|
- test/data/560015757GV_China.page1.pdf
|
144
148
|
- test/data/ClinicalResearchDisclosureReport2012Q2.pdf
|
@@ -151,13 +155,16 @@ files:
|
|
151
155
|
- test/data/frx_2012_disclosure.tsv
|
152
156
|
- test/data/gre.pdf
|
153
157
|
- test/data/no_tables.pdf
|
158
|
+
- test/data/nyc_2013fiscalreporttables.pdf
|
154
159
|
- test/data/puertos1.pdf
|
155
160
|
- test/data/spanning_cells.csv
|
156
161
|
- test/data/spanning_cells.pdf
|
157
162
|
- test/data/strongschools.pdf
|
163
|
+
- test/data/sydney_disclosure_contract.pdf
|
158
164
|
- test/data/tabla_subsidios.pdf
|
159
165
|
- test/data/vertical_rulings_bug.pdf
|
160
166
|
- test/data/vietnam3.pdf
|
167
|
+
- test/data/wc2012.pdf
|
161
168
|
- test/heuristic-test-set/original/560015757GV_China.page1.pdf
|
162
169
|
- test/heuristic-test-set/original/S2MNCEbirdisland.pdf
|
163
170
|
- test/heuristic-test-set/original/bo_page24.pdf
|
@@ -190,7 +197,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
190
197
|
version: '0'
|
191
198
|
requirements: []
|
192
199
|
rubyforge_project:
|
193
|
-
rubygems_version: 2.
|
200
|
+
rubygems_version: 2.1.9
|
194
201
|
signing_key:
|
195
202
|
specification_version: 4
|
196
203
|
summary: extract tables from PDF files
|
@@ -207,13 +214,16 @@ test_files:
|
|
207
214
|
- test/data/frx_2012_disclosure.tsv
|
208
215
|
- test/data/gre.pdf
|
209
216
|
- test/data/no_tables.pdf
|
217
|
+
- test/data/nyc_2013fiscalreporttables.pdf
|
210
218
|
- test/data/puertos1.pdf
|
211
219
|
- test/data/spanning_cells.csv
|
212
220
|
- test/data/spanning_cells.pdf
|
213
221
|
- test/data/strongschools.pdf
|
222
|
+
- test/data/sydney_disclosure_contract.pdf
|
214
223
|
- test/data/tabla_subsidios.pdf
|
215
224
|
- test/data/vertical_rulings_bug.pdf
|
216
225
|
- test/data/vietnam3.pdf
|
226
|
+
- test/data/wc2012.pdf
|
217
227
|
- test/heuristic-test-set/original/560015757GV_China.page1.pdf
|
218
228
|
- test/heuristic-test-set/original/S2MNCEbirdisland.pdf
|
219
229
|
- test/heuristic-test-set/original/bo_page24.pdf
|