tabula-extractor 0.7.2-java → 0.7.4-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +4 -8
- data/bin/tabula +3 -3
- data/lib/tabula.rb +9 -5
- data/lib/tabula/entities.rb +1 -0
- data/lib/tabula/entities/cell.rb +6 -4
- data/lib/tabula/entities/has_cells.rb +22 -78
- data/lib/tabula/entities/line.rb +52 -6
- data/lib/tabula/entities/page.rb +43 -50
- data/lib/tabula/entities/ruling.rb +83 -105
- data/lib/tabula/entities/spreadsheet.rb +74 -11
- data/lib/tabula/entities/table.rb +55 -37
- data/lib/tabula/entities/tabular.rb +42 -0
- data/lib/tabula/entities/text_chunk.rb +55 -52
- data/lib/tabula/entities/text_element.rb +129 -62
- data/lib/tabula/entities/zone_entity.rb +15 -6
- data/lib/tabula/extraction.rb +114 -49
- data/lib/tabula/line_segment_detector.rb +0 -5
- data/lib/tabula/table_extractor.rb +32 -37
- data/lib/tabula/version.rb +1 -1
- data/tabula-extractor.gemspec +2 -5
- metadata +13 -95
- data/ext/COPYING +0 -661
- data/ext/Makefile.OSX +0 -18
- data/ext/Makefile.defaults +0 -9
- data/ext/Makefile.linux32 +0 -11
- data/ext/Makefile.linux64 +0 -12
- data/ext/Makefile.mingw +0 -10
- data/ext/Makefile.mingw64 +0 -10
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.def +0 -3
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +0 -2270
- data/ext/lsd.h +0 -283
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/argentina_diputados_voting_record.pdf +0 -0
- data/test/data/bo_page24.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +0 -88
- data/test/data/gre.pdf +0 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +0 -21
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/test/data/sydney_disclosure_contract.pdf +0 -0
- data/test/data/tabla_subsidios.pdf +0 -0
- data/test/data/vertical_rulings_bug.pdf +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/data/wc2012.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +0 -50
- data/test/test_bin_tabula.sh +0 -7
- data/test/tests.rb +0 -603
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 935de0f0dc43fa388a86cc091dc540b74b6ce31f
|
4
|
+
data.tar.gz: 67fa5fda6450c3b1659af3c61c8027843be5c082
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 191054f79148535bf359c81c72d35b717f71f97ee3c3bedd4c2af66e4332afb98f3071afe4c9ed9e894586e3a20722769742f17fc02b9a5d5d954a4fae50803d
|
7
|
+
data.tar.gz: 711f993194c402d1bca016f0fe13ccaeb8e4eafc6b67c2de0fa8b3cef1e7e3ae5b4cdefc2b251b64467747e7af26f80bb54bf57d4424ea50bb2dd26db7e27570
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -7,7 +7,7 @@ Extract tables from PDF files. `tabula-extractor` is the table extraction engine
|
|
7
7
|
|
8
8
|
## Installation
|
9
9
|
|
10
|
-
|
10
|
+
`tabula-extractor` only works with JRuby 1.7 or newer. [Install JRuby](http://jruby.org/getting-started) and run
|
11
11
|
|
12
12
|
``
|
13
13
|
jruby -S gem install tabula-extractor
|
@@ -57,12 +57,12 @@ Here's a very basic example:
|
|
57
57
|
|
58
58
|
````ruby
|
59
59
|
require 'tabula'
|
60
|
-
|
60
|
+
|
61
61
|
pdf_file_path = "whatever.pdf"
|
62
62
|
outfilename = "whatever.csv"
|
63
|
-
|
63
|
+
|
64
64
|
out = open(outfilename, 'w')
|
65
|
-
|
65
|
+
|
66
66
|
extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
|
67
67
|
extractor.extract.each do |pdf_page|
|
68
68
|
pdf_page.spreadsheets.each do |spreadsheet|
|
@@ -73,7 +73,3 @@ end
|
|
73
73
|
out.close
|
74
74
|
|
75
75
|
````
|
76
|
-
|
77
|
-
## Notes
|
78
|
-
|
79
|
-
`tabula-extractor` uses [LSD: a Line Segment Detector](http://www.ipol.im/pub/art/2012/gjmr-lsd/) by Rafael Grompone von Gioi, Jérémie Jakubowicz, Jean-Michel Morel and Gregory Randall.
|
data/bin/tabula
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/usr/bin/env jruby
|
1
|
+
#!/usr/bin/env jruby -J-Djava.awt.headless=true
|
2
2
|
# encoding: utf-8
|
3
3
|
require 'trollop'
|
4
4
|
require_relative '../lib/tabula'
|
@@ -9,7 +9,7 @@ def parse_pages_arg(pages_arg)
|
|
9
9
|
if(pages_arg == 'all')
|
10
10
|
return :all
|
11
11
|
end
|
12
|
-
|
12
|
+
|
13
13
|
ranges = pages_arg.split(',').map(&:strip)
|
14
14
|
pages = []
|
15
15
|
ranges.each do |range|
|
@@ -100,7 +100,7 @@ def main
|
|
100
100
|
else
|
101
101
|
false
|
102
102
|
end
|
103
|
-
|
103
|
+
|
104
104
|
extractor = Tabula::Extraction::ObjectExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
|
105
105
|
extractor.extract.each_with_index do |pdf_page, page_index|
|
106
106
|
|
data/lib/tabula.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
module Tabula
|
2
2
|
PDFBOX = 'pdfbox-app-2.0.0-SNAPSHOT.jar'
|
3
3
|
ONLY_SPACES_RE = Regexp.new('^\s+$')
|
4
|
+
SAME_CHAR_RE = Regexp.new('^(.)\1+$')
|
4
5
|
end
|
5
6
|
|
6
7
|
require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
|
@@ -8,7 +9,6 @@ require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
|
|
8
9
|
require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
|
9
10
|
require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
|
10
11
|
|
11
|
-
|
12
12
|
import 'java.util.logging.LogManager'
|
13
13
|
import 'java.util.logging.Level'
|
14
14
|
|
@@ -22,13 +22,17 @@ lm.logger_names.each do |name|
|
|
22
22
|
end
|
23
23
|
end
|
24
24
|
end
|
25
|
-
|
26
|
-
|
27
25
|
require_relative './tabula/version'
|
28
26
|
require_relative './tabula/core_ext'
|
27
|
+
|
29
28
|
require_relative './tabula/entities'
|
30
29
|
require_relative './tabula/extraction'
|
31
30
|
require_relative './tabula/table_extractor'
|
32
31
|
require_relative './tabula/writers'
|
33
|
-
|
34
|
-
|
32
|
+
|
33
|
+
module Tabula
|
34
|
+
autoload :LSD , File.expand_path('tabula/line_segment_detector.rb', File.dirname(__FILE__))
|
35
|
+
autoload :Render , File.expand_path('tabula/pdf_render.rb', File.dirname(__FILE__))
|
36
|
+
end
|
37
|
+
|
38
|
+
require_relative './tabula/table_extractor'
|
data/lib/tabula/entities.rb
CHANGED
data/lib/tabula/entities/cell.rb
CHANGED
@@ -15,7 +15,7 @@ module Tabula
|
|
15
15
|
@placeholder = false
|
16
16
|
@spanning = false
|
17
17
|
@text_elements = []
|
18
|
-
@options = ({:use_line_returns =>
|
18
|
+
@options = ({:use_line_returns => true, :cell_debug => NORMAL}).merge options
|
19
19
|
end
|
20
20
|
|
21
21
|
def self.new_from_points(topleft, bottomright, options={})
|
@@ -29,11 +29,13 @@ module Tabula
|
|
29
29
|
output = ""
|
30
30
|
text_elements.sort #use the default sort for ZoneEntity
|
31
31
|
text_elements.group_by(&:top).values.each do |row|
|
32
|
-
output << row.map{|el| el.text}.join('') + (@options[:use_line_returns] ? "\
|
33
|
-
|
32
|
+
output << row.map{|el| el.text}.join('') + (@options[:use_line_returns] ? "\r" : '')
|
33
|
+
# per @bchartoff, https://github.com/jazzido/tabula-extractor/pull/65#issuecomment-32899336
|
34
|
+
# line returns as \r behave better in Excel.
|
35
|
+
end
|
34
36
|
if (output.empty? && @options[:cell_debug] >= DEBUG) || @options[:cell_debug] >= SUPERDEBUG
|
35
37
|
text_output = output.dup
|
36
|
-
output = "top: #{top} left: #{left} \n w: #{width} h: #{height}"
|
38
|
+
output = "top: #{top} left: #{left} \n w: #{width} h: #{height}"
|
37
39
|
output += " \n #{text_output}"
|
38
40
|
end
|
39
41
|
output.strip
|
@@ -6,27 +6,30 @@ module Tabula
|
|
6
6
|
# subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors; ruling_lines reader
|
7
7
|
module HasCells
|
8
8
|
|
9
|
-
|
9
|
+
ARBITRARY_MAGIC_HEURISTIC_NUMBER = 0.65
|
10
10
|
|
11
11
|
def is_tabular?
|
12
|
+
ratio = heuristic_ratio
|
13
|
+
return ratio > ARBITRARY_MAGIC_HEURISTIC_NUMBER && ratio < (1 / ARBITRARY_MAGIC_HEURISTIC_NUMBER)
|
14
|
+
end
|
15
|
+
|
16
|
+
def heuristic_ratio
|
12
17
|
#spreadsheet extraction
|
13
18
|
spreadsheet = spreadsheets.first
|
14
|
-
return
|
19
|
+
return Float::NAN if spreadsheet.nil?
|
15
20
|
rows_defined_by_lines = spreadsheet.rows.size #rows filled in automatically
|
16
21
|
columns_defined_by_lines = spreadsheet.cols.size
|
17
22
|
|
18
23
|
table = self.get_table
|
19
24
|
columns_defined_without_lines = table.cols.size
|
20
25
|
rows_defined_without_lines = table.rows.size
|
21
|
-
|
22
|
-
|
23
|
-
return ratio > ANOTHER_MAGIC_NUMBER && ratio < (1 / ANOTHER_MAGIC_NUMBER)
|
26
|
+
((columns_defined_by_lines.to_f / columns_defined_without_lines) + (rows_defined_by_lines.to_f / rows_defined_without_lines)) / 2
|
24
27
|
end
|
25
28
|
|
26
29
|
# finds cells from the ruling lines on the page.
|
27
30
|
# implements Nurminen thesis algorithm cf. https://github.com/jazzido/tabula-extractor/issues/16
|
28
31
|
# subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors
|
29
|
-
def find_cells!(options={})
|
32
|
+
def find_cells!(horizontal_ruling_lines, vertical_ruling_lines, options={})
|
30
33
|
# All lines need to been sorted from up to down,
|
31
34
|
# and left to right in ascending order
|
32
35
|
|
@@ -39,9 +42,10 @@ module Tabula
|
|
39
42
|
# depending on the Point2D default sort here.
|
40
43
|
intersection_points_array = intersection_points.keys.sort
|
41
44
|
|
42
|
-
|
45
|
+
intersection_points_array.each_with_index do |topLeft, i|
|
43
46
|
# Fetch all points on the same vertical and horizontal
|
44
47
|
# line with current crossing point
|
48
|
+
horizontal, vertical = intersection_points[topLeft]
|
45
49
|
|
46
50
|
# this lets us go to the next intersection_point in intersection_points_array
|
47
51
|
# it is bad and I feel bad.
|
@@ -64,19 +68,19 @@ module Tabula
|
|
64
68
|
# point;
|
65
69
|
next unless horizontal.colinear?(y_point)
|
66
70
|
#Hypothetical bottom right point of rectangle
|
67
|
-
btmRight = Point2D::Float.new(
|
71
|
+
btmRight = Point2D::Float.new(y_point.x, x_point.y)
|
68
72
|
if intersection_points.include?(btmRight)
|
69
|
-
intersection_points[btmRight]
|
70
|
-
|
73
|
+
btmRightHorizontal, btmRightVertical = intersection_points[btmRight]
|
74
|
+
|
75
|
+
if btmRightHorizontal.colinear?( x_point ) &&
|
71
76
|
btmRightVertical.colinear?( y_point )
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
end
|
77
|
+
# Rectangle is confirmed to have 4 sides
|
78
|
+
cellsFound << Cell.new_from_points( topLeft, btmRight, options)
|
79
|
+
# Each crossing point can be the top left corner
|
80
|
+
# of only a single rectangle
|
81
|
+
#next crossing-point; we need to "next" out of the outer loop here
|
82
|
+
# to avoid creating non-minimal cells, I htink.
|
83
|
+
throw :cellCreated
|
80
84
|
end
|
81
85
|
end
|
82
86
|
end
|
@@ -87,66 +91,6 @@ module Tabula
|
|
87
91
|
cellsFound
|
88
92
|
end
|
89
93
|
|
90
|
-
#############################
|
91
|
-
# Chapter 2, Spanning Cells #
|
92
|
-
#############################
|
93
|
-
#if c is a "spanning cell", that is
|
94
|
-
# if there are N>0 vertical lines strictly between this cell's left and right
|
95
|
-
#insert N placeholder cells after it with zero size (but same top)
|
96
|
-
|
97
|
-
# subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors
|
98
|
-
def add_spanning_cells!
|
99
|
-
#rounding: because Cell.new_from_points, using in #find_cells above, has
|
100
|
-
# a float precision error where, for instance, a cell whose x2 coord is
|
101
|
-
# supposed to be 160.137451171875 comes out as 160.13745498657227 because
|
102
|
-
# of minus. :(
|
103
|
-
vertical_uniq_locs = vertical_ruling_lines.map{|l| l.left.round(5)}.uniq #already sorted
|
104
|
-
horizontal_uniq_locs = horizontal_ruling_lines.map{|l| l.top.round(5)}.uniq #already sorted
|
105
|
-
|
106
|
-
cells.each do |c|
|
107
|
-
vertical_rulings_spanned_over = vertical_uniq_locs.select{|l| l > c.left.round(5) && l < c.right.round(5) }
|
108
|
-
horizontal_rulings_spanned_over = horizontal_uniq_locs.select{|t| t > c.top.round(5) && t < c.bottom.round(5) }
|
109
|
-
|
110
|
-
unless vertical_rulings_spanned_over.empty?
|
111
|
-
c.spanning = true
|
112
|
-
vertical_rulings_spanned_over.each do |spanned_over_line_loc|
|
113
|
-
placeholder = Cell.new(c.top, spanned_over_line_loc, 0, c.height)
|
114
|
-
placeholder.placeholder = true
|
115
|
-
cells << placeholder
|
116
|
-
end
|
117
|
-
end
|
118
|
-
unless horizontal_rulings_spanned_over.empty?
|
119
|
-
c.spanning = true
|
120
|
-
horizontal_rulings_spanned_over.each do |spanned_over_line_loc|
|
121
|
-
placeholder = Cell.new(spanned_over_line_loc, c.left, c.width, 0)
|
122
|
-
placeholder.placeholder = true
|
123
|
-
cells << placeholder
|
124
|
-
end
|
125
|
-
end
|
126
|
-
|
127
|
-
#if there's a spanning cell that's spans over both rows and columns, then it has "double placeholder" cells
|
128
|
-
# e.g. -------------------
|
129
|
-
# | C | C | C | C | (this is some pretty sweet ASCII art, eh?)
|
130
|
-
# |-----------------|
|
131
|
-
# | C | C | C | C |
|
132
|
-
# |-----------------|
|
133
|
-
# | C | SC P | C | where MC is the "spanning cell" that holds all the text within its bounds
|
134
|
-
# |---- + ----| P is a "placeholder" cell with either zero width or zero height
|
135
|
-
# | C | P DP | C | DP is a "double placeholder" cell with zero width and zero height
|
136
|
-
# |---- + ----| C is an ordinary cell.
|
137
|
-
# | C | P DP | C |
|
138
|
-
# |-----------------|
|
139
|
-
|
140
|
-
unless (double_placeholders = vertical_rulings_spanned_over.product(horizontal_rulings_spanned_over)).empty?
|
141
|
-
double_placeholders.each do |vert_spanned_over, horiz_spanned_over|
|
142
|
-
placeholder = Cell.new(horiz_spanned_over, vert_spanned_over, 0, 0)
|
143
|
-
placeholder.placeholder = true
|
144
|
-
cells << placeholder
|
145
|
-
end
|
146
|
-
end
|
147
|
-
end
|
148
|
-
end
|
149
|
-
|
150
94
|
#TODO:
|
151
95
|
#returns array of Spreadsheet objects constructed (or spreadsheet_areas => cells)
|
152
96
|
#maybe placeholders should be added after cells is split into spreadsheets
|
data/lib/tabula/entities/line.rb
CHANGED
@@ -3,6 +3,8 @@ module Tabula
|
|
3
3
|
attr_accessor :text_elements
|
4
4
|
attr_reader :index
|
5
5
|
|
6
|
+
SPACE_RUN_MAX_LENGTH = 3
|
7
|
+
|
6
8
|
def initialize(index=nil)
|
7
9
|
@text_elements = []
|
8
10
|
@index = index
|
@@ -16,15 +18,59 @@ module Tabula
|
|
16
18
|
self.width = t.width
|
17
19
|
self.height = t.height
|
18
20
|
else
|
19
|
-
|
20
|
-
|
21
|
-
else
|
22
|
-
self.text_elements << t
|
23
|
-
self.merge!(t)
|
24
|
-
end
|
21
|
+
self.text_elements << t
|
22
|
+
self.merge!(t)
|
25
23
|
end
|
26
24
|
end
|
27
25
|
|
26
|
+
##
|
27
|
+
# remove runs of the space char longer than SPACE_RUN_MAX_LENGTH
|
28
|
+
# should not change dimensions of the container +Line+
|
29
|
+
def remove_sequential_spaces!(seq_spaces_count=SPACE_RUN_MAX_LENGTH)
|
30
|
+
self.text_elements = self.text_elements.reduce([]) do |memo, text_chunk|
|
31
|
+
long_space_runs = text_chunk
|
32
|
+
.text_elements
|
33
|
+
.chunk { |te| te.text == ' '} # detect runs of spaces...
|
34
|
+
.select { |is_space, text_elements| # ...longer than SPACE_RUN_MAX_LENGTH
|
35
|
+
is_space && !text_elements.nil? && text_elements.size >= SPACE_RUN_MAX_LENGTH
|
36
|
+
}
|
37
|
+
.map { |_, text_elements| text_elements }
|
38
|
+
|
39
|
+
# no long runs of spaces
|
40
|
+
# keep as it was and end iteration
|
41
|
+
if long_space_runs.empty?
|
42
|
+
memo << text_chunk
|
43
|
+
next memo
|
44
|
+
end
|
45
|
+
|
46
|
+
ranges = long_space_runs.map { |lsr|
|
47
|
+
idx = text_chunk
|
48
|
+
.text_elements
|
49
|
+
.index { |te| te.equal?(lsr.first) } # we need pointer comparison here
|
50
|
+
(idx)..(idx+lsr.size-1)
|
51
|
+
}
|
52
|
+
|
53
|
+
in_run = false
|
54
|
+
new_chunk = true
|
55
|
+
text_chunk
|
56
|
+
.text_elements
|
57
|
+
.each_with_index do |te, i|
|
58
|
+
if ranges.any? { |r| r.include?(i) } # te belongs to a run of spaces, skip
|
59
|
+
in_run = true
|
60
|
+
else
|
61
|
+
if in_run || new_chunk
|
62
|
+
memo << TextChunk.create_from_text_element(te)
|
63
|
+
else
|
64
|
+
memo.last << te
|
65
|
+
end
|
66
|
+
in_run = new_chunk = false
|
67
|
+
end
|
68
|
+
end
|
69
|
+
memo
|
70
|
+
end # reduce
|
71
|
+
self
|
72
|
+
end
|
73
|
+
|
28
74
|
#used for testing, ignores text element stuff besides stripped text.
|
29
75
|
def ==(other)
|
30
76
|
return false if other.nil?
|
data/lib/tabula/entities/page.rb
CHANGED
@@ -6,7 +6,7 @@ module Tabula
|
|
6
6
|
attr_writer :min_char_width, :min_char_height
|
7
7
|
attr_accessor :cells
|
8
8
|
|
9
|
-
def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil)
|
9
|
+
def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil, spatial_index=nil)
|
10
10
|
super(0, 0, width, height)
|
11
11
|
@rotation = rotation
|
12
12
|
if number < 1
|
@@ -19,10 +19,16 @@ module Tabula
|
|
19
19
|
@spreadsheets = nil
|
20
20
|
@min_char_width = min_char_width
|
21
21
|
@min_char_height = min_char_height
|
22
|
-
@spatial_index = TextElementIndex.new
|
23
22
|
|
24
23
|
self.texts = texts
|
25
|
-
|
24
|
+
|
25
|
+
if spatial_index.nil?
|
26
|
+
@spatial_index = TextElementIndex.new
|
27
|
+
self.texts.each { |te| @spatial_index << te }
|
28
|
+
else
|
29
|
+
@spatial_index = spatial_index
|
30
|
+
end
|
31
|
+
|
26
32
|
end
|
27
33
|
|
28
34
|
def min_char_width
|
@@ -49,7 +55,8 @@ module Tabula
|
|
49
55
|
texts,
|
50
56
|
Ruling.crop_rulings_to_area(@ruling_lines, area),
|
51
57
|
texts.map(&:width).min,
|
52
|
-
texts.map(&:height).min
|
58
|
+
texts.map(&:height).min,
|
59
|
+
@spatial_index)
|
53
60
|
return page_area
|
54
61
|
end
|
55
62
|
|
@@ -60,28 +67,33 @@ module Tabula
|
|
60
67
|
return Tabula::Table.new(0, [])
|
61
68
|
end
|
62
69
|
|
63
|
-
|
70
|
+
texts = self.texts.sort
|
71
|
+
text_chunks = TextElement.merge_words(texts, options)
|
64
72
|
|
65
|
-
lines = TextChunk.group_by_lines(text_chunks)
|
73
|
+
lines = TextChunk.group_by_lines(text_chunks.sort).sort_by(&:top)
|
66
74
|
|
67
|
-
unless options[:vertical_rulings].empty?
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
text_chunks)
|
73
|
-
separators = columns[1..-1].sort.reverse
|
74
|
-
end
|
75
|
+
columns = unless options[:vertical_rulings].empty?
|
76
|
+
options[:vertical_rulings].map(&:left).sort #pixel locations, not entities
|
77
|
+
else
|
78
|
+
TextChunk.column_positions(lines).sort
|
79
|
+
end
|
75
80
|
|
76
|
-
table = Table.new(lines.count,
|
81
|
+
table = Table.new(lines.count, columns)
|
77
82
|
lines.each_with_index do |line, i|
|
78
|
-
line.text_elements.each do |te|
|
79
|
-
j =
|
80
|
-
table.add_text_element(te, i,
|
83
|
+
line.text_elements.select { |te| te.text !~ ONLY_SPACES_RE }.each do |te|
|
84
|
+
j = columns.find_index { |s| te.left <= s } || columns.count
|
85
|
+
table.add_text_element(te, i, j)
|
81
86
|
end
|
82
87
|
end
|
83
88
|
|
84
|
-
table
|
89
|
+
# fixes up the table a little bit, replacing nils with empty TextElements
|
90
|
+
# and sorting the lines.
|
91
|
+
# table.rows.each do |l|
|
92
|
+
# l.text_elements = l.text_elements.map do |te|
|
93
|
+
# te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
|
94
|
+
# end
|
95
|
+
# end
|
96
|
+
# table.rows.sort_by!(&:top)
|
85
97
|
table
|
86
98
|
end
|
87
99
|
|
@@ -96,7 +108,7 @@ module Tabula
|
|
96
108
|
return @spreadsheets
|
97
109
|
end
|
98
110
|
get_ruling_lines!(options)
|
99
|
-
self.find_cells!(options)
|
111
|
+
self.find_cells!(self.horizontal_ruling_lines, self.vertical_ruling_lines, options)
|
100
112
|
|
101
113
|
spreadsheet_areas = find_spreadsheets_from_cells #literally, java.awt.geom.Area objects. lol sorry. polygons.
|
102
114
|
|
@@ -157,14 +169,18 @@ module Tabula
|
|
157
169
|
|
158
170
|
#returns ruling lines, memoizes them in
|
159
171
|
def get_ruling_lines!(options={})
|
160
|
-
if
|
161
|
-
|
162
|
-
@vertical_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
|
163
|
-
@horizontal_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
|
164
|
-
@vertical_ruling_lines + @horizontal_ruling_lines
|
165
|
-
else
|
166
|
-
[]
|
172
|
+
if @ruling_lines.nil? || @ruling_lines.empty?
|
173
|
+
return []
|
167
174
|
end
|
175
|
+
self.snap_points!
|
176
|
+
|
177
|
+
@ruling_lines.select! { |l| !(l.width == 0 && l.height == 0) }
|
178
|
+
|
179
|
+
@vertical_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
|
180
|
+
@horizontal_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
|
181
|
+
|
182
|
+
@vertical_ruling_lines + @horizontal_ruling_lines
|
183
|
+
|
168
184
|
end
|
169
185
|
|
170
186
|
##
|
@@ -252,29 +268,6 @@ module Tabula
|
|
252
268
|
l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0], p1_p2[1]
|
253
269
|
end
|
254
270
|
end
|
255
|
-
|
256
|
-
def collapse_oriented_rulings(lines)
|
257
|
-
# lines must all be of one orientation (i.e. horizontal, vertical)
|
258
|
-
|
259
|
-
if lines.empty?
|
260
|
-
return []
|
261
|
-
end
|
262
|
-
|
263
|
-
lines.sort! {|a, b| a.position != b.position ? a.position <=> b.position : a.start <=> b.start }
|
264
|
-
|
265
|
-
lines = lines.inject([lines.shift]) do |memo, next_line|
|
266
|
-
last = memo.last
|
267
|
-
if next_line.position == last.position && last.nearlyIntersects?(next_line)
|
268
|
-
memo.last.start = next_line.start < last.start ? next_line.start : last.start
|
269
|
-
memo.last.end = next_line.end < last.end ? last.end : next_line.end
|
270
|
-
memo
|
271
|
-
elsif next_line.length == 0
|
272
|
-
memo
|
273
|
-
else
|
274
|
-
memo << next_line
|
275
|
-
end
|
276
|
-
end
|
277
|
-
end
|
278
271
|
end
|
279
272
|
|
280
273
|
end
|