tabula-extractor 0.7.2-java → 0.7.4-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +4 -8
- data/bin/tabula +3 -3
- data/lib/tabula.rb +9 -5
- data/lib/tabula/entities.rb +1 -0
- data/lib/tabula/entities/cell.rb +6 -4
- data/lib/tabula/entities/has_cells.rb +22 -78
- data/lib/tabula/entities/line.rb +52 -6
- data/lib/tabula/entities/page.rb +43 -50
- data/lib/tabula/entities/ruling.rb +83 -105
- data/lib/tabula/entities/spreadsheet.rb +74 -11
- data/lib/tabula/entities/table.rb +55 -37
- data/lib/tabula/entities/tabular.rb +42 -0
- data/lib/tabula/entities/text_chunk.rb +55 -52
- data/lib/tabula/entities/text_element.rb +129 -62
- data/lib/tabula/entities/zone_entity.rb +15 -6
- data/lib/tabula/extraction.rb +114 -49
- data/lib/tabula/line_segment_detector.rb +0 -5
- data/lib/tabula/table_extractor.rb +32 -37
- data/lib/tabula/version.rb +1 -1
- data/tabula-extractor.gemspec +2 -5
- metadata +13 -95
- data/ext/COPYING +0 -661
- data/ext/Makefile.OSX +0 -18
- data/ext/Makefile.defaults +0 -9
- data/ext/Makefile.linux32 +0 -11
- data/ext/Makefile.linux64 +0 -12
- data/ext/Makefile.mingw +0 -10
- data/ext/Makefile.mingw64 +0 -10
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.def +0 -3
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +0 -2270
- data/ext/lsd.h +0 -283
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/ClinicalResearchDisclosureReport2012Q2.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/argentina_diputados_voting_record.pdf +0 -0
- data/test/data/bo_page24.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +0 -88
- data/test/data/gre.pdf +0 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/nyc_2013fiscalreporttables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +0 -21
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/test/data/sydney_disclosure_contract.pdf +0 -0
- data/test/data/tabla_subsidios.pdf +0 -0
- data/test/data/vertical_rulings_bug.pdf +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/data/wc2012.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/original/cs076pct.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +0 -50
- data/test/test_bin_tabula.sh +0 -7
- data/test/tests.rb +0 -603
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 935de0f0dc43fa388a86cc091dc540b74b6ce31f
|
4
|
+
data.tar.gz: 67fa5fda6450c3b1659af3c61c8027843be5c082
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 191054f79148535bf359c81c72d35b717f71f97ee3c3bedd4c2af66e4332afb98f3071afe4c9ed9e894586e3a20722769742f17fc02b9a5d5d954a4fae50803d
|
7
|
+
data.tar.gz: 711f993194c402d1bca016f0fe13ccaeb8e4eafc6b67c2de0fa8b3cef1e7e3ae5b4cdefc2b251b64467747e7af26f80bb54bf57d4424ea50bb2dd26db7e27570
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -7,7 +7,7 @@ Extract tables from PDF files. `tabula-extractor` is the table extraction engine
|
|
7
7
|
|
8
8
|
## Installation
|
9
9
|
|
10
|
-
|
10
|
+
`tabula-extractor` only works with JRuby 1.7 or newer. [Install JRuby](http://jruby.org/getting-started) and run
|
11
11
|
|
12
12
|
``
|
13
13
|
jruby -S gem install tabula-extractor
|
@@ -57,12 +57,12 @@ Here's a very basic example:
|
|
57
57
|
|
58
58
|
````ruby
|
59
59
|
require 'tabula'
|
60
|
-
|
60
|
+
|
61
61
|
pdf_file_path = "whatever.pdf"
|
62
62
|
outfilename = "whatever.csv"
|
63
|
-
|
63
|
+
|
64
64
|
out = open(outfilename, 'w')
|
65
|
-
|
65
|
+
|
66
66
|
extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
|
67
67
|
extractor.extract.each do |pdf_page|
|
68
68
|
pdf_page.spreadsheets.each do |spreadsheet|
|
@@ -73,7 +73,3 @@ end
|
|
73
73
|
out.close
|
74
74
|
|
75
75
|
````
|
76
|
-
|
77
|
-
## Notes
|
78
|
-
|
79
|
-
`tabula-extractor` uses [LSD: a Line Segment Detector](http://www.ipol.im/pub/art/2012/gjmr-lsd/) by Rafael Grompone von Gioi, Jérémie Jakubowicz, Jean-Michel Morel and Gregory Randall.
|
data/bin/tabula
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/usr/bin/env jruby
|
1
|
+
#!/usr/bin/env jruby -J-Djava.awt.headless=true
|
2
2
|
# encoding: utf-8
|
3
3
|
require 'trollop'
|
4
4
|
require_relative '../lib/tabula'
|
@@ -9,7 +9,7 @@ def parse_pages_arg(pages_arg)
|
|
9
9
|
if(pages_arg == 'all')
|
10
10
|
return :all
|
11
11
|
end
|
12
|
-
|
12
|
+
|
13
13
|
ranges = pages_arg.split(',').map(&:strip)
|
14
14
|
pages = []
|
15
15
|
ranges.each do |range|
|
@@ -100,7 +100,7 @@ def main
|
|
100
100
|
else
|
101
101
|
false
|
102
102
|
end
|
103
|
-
|
103
|
+
|
104
104
|
extractor = Tabula::Extraction::ObjectExtractor.new(filename, parse_pages_arg(opts[:pages]), opts[:password])
|
105
105
|
extractor.extract.each_with_index do |pdf_page, page_index|
|
106
106
|
|
data/lib/tabula.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
module Tabula
|
2
2
|
PDFBOX = 'pdfbox-app-2.0.0-SNAPSHOT.jar'
|
3
3
|
ONLY_SPACES_RE = Regexp.new('^\s+$')
|
4
|
+
SAME_CHAR_RE = Regexp.new('^(.)\1+$')
|
4
5
|
end
|
5
6
|
|
6
7
|
require File.join(File.dirname(__FILE__), '../target/', Tabula::PDFBOX)
|
@@ -8,7 +9,6 @@ require File.join(File.dirname(__FILE__), '../target/', 'slf4j-api-1.6.3.jar')
|
|
8
9
|
require File.join(File.dirname(__FILE__), '../target/', 'trove4j-3.0.3.jar')
|
9
10
|
require File.join(File.dirname(__FILE__), '../target/', 'jsi-1.1.0-SNAPSHOT.jar')
|
10
11
|
|
11
|
-
|
12
12
|
import 'java.util.logging.LogManager'
|
13
13
|
import 'java.util.logging.Level'
|
14
14
|
|
@@ -22,13 +22,17 @@ lm.logger_names.each do |name|
|
|
22
22
|
end
|
23
23
|
end
|
24
24
|
end
|
25
|
-
|
26
|
-
|
27
25
|
require_relative './tabula/version'
|
28
26
|
require_relative './tabula/core_ext'
|
27
|
+
|
29
28
|
require_relative './tabula/entities'
|
30
29
|
require_relative './tabula/extraction'
|
31
30
|
require_relative './tabula/table_extractor'
|
32
31
|
require_relative './tabula/writers'
|
33
|
-
|
34
|
-
|
32
|
+
|
33
|
+
module Tabula
|
34
|
+
autoload :LSD , File.expand_path('tabula/line_segment_detector.rb', File.dirname(__FILE__))
|
35
|
+
autoload :Render , File.expand_path('tabula/pdf_render.rb', File.dirname(__FILE__))
|
36
|
+
end
|
37
|
+
|
38
|
+
require_relative './tabula/table_extractor'
|
data/lib/tabula/entities.rb
CHANGED
data/lib/tabula/entities/cell.rb
CHANGED
@@ -15,7 +15,7 @@ module Tabula
|
|
15
15
|
@placeholder = false
|
16
16
|
@spanning = false
|
17
17
|
@text_elements = []
|
18
|
-
@options = ({:use_line_returns =>
|
18
|
+
@options = ({:use_line_returns => true, :cell_debug => NORMAL}).merge options
|
19
19
|
end
|
20
20
|
|
21
21
|
def self.new_from_points(topleft, bottomright, options={})
|
@@ -29,11 +29,13 @@ module Tabula
|
|
29
29
|
output = ""
|
30
30
|
text_elements.sort #use the default sort for ZoneEntity
|
31
31
|
text_elements.group_by(&:top).values.each do |row|
|
32
|
-
output << row.map{|el| el.text}.join('') + (@options[:use_line_returns] ? "\
|
33
|
-
|
32
|
+
output << row.map{|el| el.text}.join('') + (@options[:use_line_returns] ? "\r" : '')
|
33
|
+
# per @bchartoff, https://github.com/jazzido/tabula-extractor/pull/65#issuecomment-32899336
|
34
|
+
# line returns as \r behave better in Excel.
|
35
|
+
end
|
34
36
|
if (output.empty? && @options[:cell_debug] >= DEBUG) || @options[:cell_debug] >= SUPERDEBUG
|
35
37
|
text_output = output.dup
|
36
|
-
output = "top: #{top} left: #{left} \n w: #{width} h: #{height}"
|
38
|
+
output = "top: #{top} left: #{left} \n w: #{width} h: #{height}"
|
37
39
|
output += " \n #{text_output}"
|
38
40
|
end
|
39
41
|
output.strip
|
@@ -6,27 +6,30 @@ module Tabula
|
|
6
6
|
# subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors; ruling_lines reader
|
7
7
|
module HasCells
|
8
8
|
|
9
|
-
|
9
|
+
ARBITRARY_MAGIC_HEURISTIC_NUMBER = 0.65
|
10
10
|
|
11
11
|
def is_tabular?
|
12
|
+
ratio = heuristic_ratio
|
13
|
+
return ratio > ARBITRARY_MAGIC_HEURISTIC_NUMBER && ratio < (1 / ARBITRARY_MAGIC_HEURISTIC_NUMBER)
|
14
|
+
end
|
15
|
+
|
16
|
+
def heuristic_ratio
|
12
17
|
#spreadsheet extraction
|
13
18
|
spreadsheet = spreadsheets.first
|
14
|
-
return
|
19
|
+
return Float::NAN if spreadsheet.nil?
|
15
20
|
rows_defined_by_lines = spreadsheet.rows.size #rows filled in automatically
|
16
21
|
columns_defined_by_lines = spreadsheet.cols.size
|
17
22
|
|
18
23
|
table = self.get_table
|
19
24
|
columns_defined_without_lines = table.cols.size
|
20
25
|
rows_defined_without_lines = table.rows.size
|
21
|
-
|
22
|
-
|
23
|
-
return ratio > ANOTHER_MAGIC_NUMBER && ratio < (1 / ANOTHER_MAGIC_NUMBER)
|
26
|
+
((columns_defined_by_lines.to_f / columns_defined_without_lines) + (rows_defined_by_lines.to_f / rows_defined_without_lines)) / 2
|
24
27
|
end
|
25
28
|
|
26
29
|
# finds cells from the ruling lines on the page.
|
27
30
|
# implements Nurminen thesis algorithm cf. https://github.com/jazzido/tabula-extractor/issues/16
|
28
31
|
# subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors
|
29
|
-
def find_cells!(options={})
|
32
|
+
def find_cells!(horizontal_ruling_lines, vertical_ruling_lines, options={})
|
30
33
|
# All lines need to been sorted from up to down,
|
31
34
|
# and left to right in ascending order
|
32
35
|
|
@@ -39,9 +42,10 @@ module Tabula
|
|
39
42
|
# depending on the Point2D default sort here.
|
40
43
|
intersection_points_array = intersection_points.keys.sort
|
41
44
|
|
42
|
-
|
45
|
+
intersection_points_array.each_with_index do |topLeft, i|
|
43
46
|
# Fetch all points on the same vertical and horizontal
|
44
47
|
# line with current crossing point
|
48
|
+
horizontal, vertical = intersection_points[topLeft]
|
45
49
|
|
46
50
|
# this lets us go to the next intersection_point in intersection_points_array
|
47
51
|
# it is bad and I feel bad.
|
@@ -64,19 +68,19 @@ module Tabula
|
|
64
68
|
# point;
|
65
69
|
next unless horizontal.colinear?(y_point)
|
66
70
|
#Hypothetical bottom right point of rectangle
|
67
|
-
btmRight = Point2D::Float.new(
|
71
|
+
btmRight = Point2D::Float.new(y_point.x, x_point.y)
|
68
72
|
if intersection_points.include?(btmRight)
|
69
|
-
intersection_points[btmRight]
|
70
|
-
|
73
|
+
btmRightHorizontal, btmRightVertical = intersection_points[btmRight]
|
74
|
+
|
75
|
+
if btmRightHorizontal.colinear?( x_point ) &&
|
71
76
|
btmRightVertical.colinear?( y_point )
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
end
|
77
|
+
# Rectangle is confirmed to have 4 sides
|
78
|
+
cellsFound << Cell.new_from_points( topLeft, btmRight, options)
|
79
|
+
# Each crossing point can be the top left corner
|
80
|
+
# of only a single rectangle
|
81
|
+
#next crossing-point; we need to "next" out of the outer loop here
|
82
|
+
# to avoid creating non-minimal cells, I htink.
|
83
|
+
throw :cellCreated
|
80
84
|
end
|
81
85
|
end
|
82
86
|
end
|
@@ -87,66 +91,6 @@ module Tabula
|
|
87
91
|
cellsFound
|
88
92
|
end
|
89
93
|
|
90
|
-
#############################
|
91
|
-
# Chapter 2, Spanning Cells #
|
92
|
-
#############################
|
93
|
-
#if c is a "spanning cell", that is
|
94
|
-
# if there are N>0 vertical lines strictly between this cell's left and right
|
95
|
-
#insert N placeholder cells after it with zero size (but same top)
|
96
|
-
|
97
|
-
# subclasses must define cells, vertical_ruling_lines, horizontal_ruling_lines accessors
|
98
|
-
def add_spanning_cells!
|
99
|
-
#rounding: because Cell.new_from_points, using in #find_cells above, has
|
100
|
-
# a float precision error where, for instance, a cell whose x2 coord is
|
101
|
-
# supposed to be 160.137451171875 comes out as 160.13745498657227 because
|
102
|
-
# of minus. :(
|
103
|
-
vertical_uniq_locs = vertical_ruling_lines.map{|l| l.left.round(5)}.uniq #already sorted
|
104
|
-
horizontal_uniq_locs = horizontal_ruling_lines.map{|l| l.top.round(5)}.uniq #already sorted
|
105
|
-
|
106
|
-
cells.each do |c|
|
107
|
-
vertical_rulings_spanned_over = vertical_uniq_locs.select{|l| l > c.left.round(5) && l < c.right.round(5) }
|
108
|
-
horizontal_rulings_spanned_over = horizontal_uniq_locs.select{|t| t > c.top.round(5) && t < c.bottom.round(5) }
|
109
|
-
|
110
|
-
unless vertical_rulings_spanned_over.empty?
|
111
|
-
c.spanning = true
|
112
|
-
vertical_rulings_spanned_over.each do |spanned_over_line_loc|
|
113
|
-
placeholder = Cell.new(c.top, spanned_over_line_loc, 0, c.height)
|
114
|
-
placeholder.placeholder = true
|
115
|
-
cells << placeholder
|
116
|
-
end
|
117
|
-
end
|
118
|
-
unless horizontal_rulings_spanned_over.empty?
|
119
|
-
c.spanning = true
|
120
|
-
horizontal_rulings_spanned_over.each do |spanned_over_line_loc|
|
121
|
-
placeholder = Cell.new(spanned_over_line_loc, c.left, c.width, 0)
|
122
|
-
placeholder.placeholder = true
|
123
|
-
cells << placeholder
|
124
|
-
end
|
125
|
-
end
|
126
|
-
|
127
|
-
#if there's a spanning cell that's spans over both rows and columns, then it has "double placeholder" cells
|
128
|
-
# e.g. -------------------
|
129
|
-
# | C | C | C | C | (this is some pretty sweet ASCII art, eh?)
|
130
|
-
# |-----------------|
|
131
|
-
# | C | C | C | C |
|
132
|
-
# |-----------------|
|
133
|
-
# | C | SC P | C | where MC is the "spanning cell" that holds all the text within its bounds
|
134
|
-
# |---- + ----| P is a "placeholder" cell with either zero width or zero height
|
135
|
-
# | C | P DP | C | DP is a "double placeholder" cell with zero width and zero height
|
136
|
-
# |---- + ----| C is an ordinary cell.
|
137
|
-
# | C | P DP | C |
|
138
|
-
# |-----------------|
|
139
|
-
|
140
|
-
unless (double_placeholders = vertical_rulings_spanned_over.product(horizontal_rulings_spanned_over)).empty?
|
141
|
-
double_placeholders.each do |vert_spanned_over, horiz_spanned_over|
|
142
|
-
placeholder = Cell.new(horiz_spanned_over, vert_spanned_over, 0, 0)
|
143
|
-
placeholder.placeholder = true
|
144
|
-
cells << placeholder
|
145
|
-
end
|
146
|
-
end
|
147
|
-
end
|
148
|
-
end
|
149
|
-
|
150
94
|
#TODO:
|
151
95
|
#returns array of Spreadsheet objects constructed (or spreadsheet_areas => cells)
|
152
96
|
#maybe placeholders should be added after cells is split into spreadsheets
|
data/lib/tabula/entities/line.rb
CHANGED
@@ -3,6 +3,8 @@ module Tabula
|
|
3
3
|
attr_accessor :text_elements
|
4
4
|
attr_reader :index
|
5
5
|
|
6
|
+
SPACE_RUN_MAX_LENGTH = 3
|
7
|
+
|
6
8
|
def initialize(index=nil)
|
7
9
|
@text_elements = []
|
8
10
|
@index = index
|
@@ -16,15 +18,59 @@ module Tabula
|
|
16
18
|
self.width = t.width
|
17
19
|
self.height = t.height
|
18
20
|
else
|
19
|
-
|
20
|
-
|
21
|
-
else
|
22
|
-
self.text_elements << t
|
23
|
-
self.merge!(t)
|
24
|
-
end
|
21
|
+
self.text_elements << t
|
22
|
+
self.merge!(t)
|
25
23
|
end
|
26
24
|
end
|
27
25
|
|
26
|
+
##
|
27
|
+
# remove runs of the space char longer than SPACE_RUN_MAX_LENGTH
|
28
|
+
# should not change dimensions of the container +Line+
|
29
|
+
def remove_sequential_spaces!(seq_spaces_count=SPACE_RUN_MAX_LENGTH)
|
30
|
+
self.text_elements = self.text_elements.reduce([]) do |memo, text_chunk|
|
31
|
+
long_space_runs = text_chunk
|
32
|
+
.text_elements
|
33
|
+
.chunk { |te| te.text == ' '} # detect runs of spaces...
|
34
|
+
.select { |is_space, text_elements| # ...longer than SPACE_RUN_MAX_LENGTH
|
35
|
+
is_space && !text_elements.nil? && text_elements.size >= SPACE_RUN_MAX_LENGTH
|
36
|
+
}
|
37
|
+
.map { |_, text_elements| text_elements }
|
38
|
+
|
39
|
+
# no long runs of spaces
|
40
|
+
# keep as it was and end iteration
|
41
|
+
if long_space_runs.empty?
|
42
|
+
memo << text_chunk
|
43
|
+
next memo
|
44
|
+
end
|
45
|
+
|
46
|
+
ranges = long_space_runs.map { |lsr|
|
47
|
+
idx = text_chunk
|
48
|
+
.text_elements
|
49
|
+
.index { |te| te.equal?(lsr.first) } # we need pointer comparison here
|
50
|
+
(idx)..(idx+lsr.size-1)
|
51
|
+
}
|
52
|
+
|
53
|
+
in_run = false
|
54
|
+
new_chunk = true
|
55
|
+
text_chunk
|
56
|
+
.text_elements
|
57
|
+
.each_with_index do |te, i|
|
58
|
+
if ranges.any? { |r| r.include?(i) } # te belongs to a run of spaces, skip
|
59
|
+
in_run = true
|
60
|
+
else
|
61
|
+
if in_run || new_chunk
|
62
|
+
memo << TextChunk.create_from_text_element(te)
|
63
|
+
else
|
64
|
+
memo.last << te
|
65
|
+
end
|
66
|
+
in_run = new_chunk = false
|
67
|
+
end
|
68
|
+
end
|
69
|
+
memo
|
70
|
+
end # reduce
|
71
|
+
self
|
72
|
+
end
|
73
|
+
|
28
74
|
#used for testing, ignores text element stuff besides stripped text.
|
29
75
|
def ==(other)
|
30
76
|
return false if other.nil?
|
data/lib/tabula/entities/page.rb
CHANGED
@@ -6,7 +6,7 @@ module Tabula
|
|
6
6
|
attr_writer :min_char_width, :min_char_height
|
7
7
|
attr_accessor :cells
|
8
8
|
|
9
|
-
def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil)
|
9
|
+
def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil, spatial_index=nil)
|
10
10
|
super(0, 0, width, height)
|
11
11
|
@rotation = rotation
|
12
12
|
if number < 1
|
@@ -19,10 +19,16 @@ module Tabula
|
|
19
19
|
@spreadsheets = nil
|
20
20
|
@min_char_width = min_char_width
|
21
21
|
@min_char_height = min_char_height
|
22
|
-
@spatial_index = TextElementIndex.new
|
23
22
|
|
24
23
|
self.texts = texts
|
25
|
-
|
24
|
+
|
25
|
+
if spatial_index.nil?
|
26
|
+
@spatial_index = TextElementIndex.new
|
27
|
+
self.texts.each { |te| @spatial_index << te }
|
28
|
+
else
|
29
|
+
@spatial_index = spatial_index
|
30
|
+
end
|
31
|
+
|
26
32
|
end
|
27
33
|
|
28
34
|
def min_char_width
|
@@ -49,7 +55,8 @@ module Tabula
|
|
49
55
|
texts,
|
50
56
|
Ruling.crop_rulings_to_area(@ruling_lines, area),
|
51
57
|
texts.map(&:width).min,
|
52
|
-
texts.map(&:height).min
|
58
|
+
texts.map(&:height).min,
|
59
|
+
@spatial_index)
|
53
60
|
return page_area
|
54
61
|
end
|
55
62
|
|
@@ -60,28 +67,33 @@ module Tabula
|
|
60
67
|
return Tabula::Table.new(0, [])
|
61
68
|
end
|
62
69
|
|
63
|
-
|
70
|
+
texts = self.texts.sort
|
71
|
+
text_chunks = TextElement.merge_words(texts, options)
|
64
72
|
|
65
|
-
lines = TextChunk.group_by_lines(text_chunks)
|
73
|
+
lines = TextChunk.group_by_lines(text_chunks.sort).sort_by(&:top)
|
66
74
|
|
67
|
-
unless options[:vertical_rulings].empty?
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
text_chunks)
|
73
|
-
separators = columns[1..-1].sort.reverse
|
74
|
-
end
|
75
|
+
columns = unless options[:vertical_rulings].empty?
|
76
|
+
options[:vertical_rulings].map(&:left).sort #pixel locations, not entities
|
77
|
+
else
|
78
|
+
TextChunk.column_positions(lines).sort
|
79
|
+
end
|
75
80
|
|
76
|
-
table = Table.new(lines.count,
|
81
|
+
table = Table.new(lines.count, columns)
|
77
82
|
lines.each_with_index do |line, i|
|
78
|
-
line.text_elements.each do |te|
|
79
|
-
j =
|
80
|
-
table.add_text_element(te, i,
|
83
|
+
line.text_elements.select { |te| te.text !~ ONLY_SPACES_RE }.each do |te|
|
84
|
+
j = columns.find_index { |s| te.left <= s } || columns.count
|
85
|
+
table.add_text_element(te, i, j)
|
81
86
|
end
|
82
87
|
end
|
83
88
|
|
84
|
-
table
|
89
|
+
# fixes up the table a little bit, replacing nils with empty TextElements
|
90
|
+
# and sorting the lines.
|
91
|
+
# table.rows.each do |l|
|
92
|
+
# l.text_elements = l.text_elements.map do |te|
|
93
|
+
# te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
|
94
|
+
# end
|
95
|
+
# end
|
96
|
+
# table.rows.sort_by!(&:top)
|
85
97
|
table
|
86
98
|
end
|
87
99
|
|
@@ -96,7 +108,7 @@ module Tabula
|
|
96
108
|
return @spreadsheets
|
97
109
|
end
|
98
110
|
get_ruling_lines!(options)
|
99
|
-
self.find_cells!(options)
|
111
|
+
self.find_cells!(self.horizontal_ruling_lines, self.vertical_ruling_lines, options)
|
100
112
|
|
101
113
|
spreadsheet_areas = find_spreadsheets_from_cells #literally, java.awt.geom.Area objects. lol sorry. polygons.
|
102
114
|
|
@@ -157,14 +169,18 @@ module Tabula
|
|
157
169
|
|
158
170
|
#returns ruling lines, memoizes them in
|
159
171
|
def get_ruling_lines!(options={})
|
160
|
-
if
|
161
|
-
|
162
|
-
@vertical_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
|
163
|
-
@horizontal_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
|
164
|
-
@vertical_ruling_lines + @horizontal_ruling_lines
|
165
|
-
else
|
166
|
-
[]
|
172
|
+
if @ruling_lines.nil? || @ruling_lines.empty?
|
173
|
+
return []
|
167
174
|
end
|
175
|
+
self.snap_points!
|
176
|
+
|
177
|
+
@ruling_lines.select! { |l| !(l.width == 0 && l.height == 0) }
|
178
|
+
|
179
|
+
@vertical_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
|
180
|
+
@horizontal_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
|
181
|
+
|
182
|
+
@vertical_ruling_lines + @horizontal_ruling_lines
|
183
|
+
|
168
184
|
end
|
169
185
|
|
170
186
|
##
|
@@ -252,29 +268,6 @@ module Tabula
|
|
252
268
|
l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0], p1_p2[1]
|
253
269
|
end
|
254
270
|
end
|
255
|
-
|
256
|
-
def collapse_oriented_rulings(lines)
|
257
|
-
# lines must all be of one orientation (i.e. horizontal, vertical)
|
258
|
-
|
259
|
-
if lines.empty?
|
260
|
-
return []
|
261
|
-
end
|
262
|
-
|
263
|
-
lines.sort! {|a, b| a.position != b.position ? a.position <=> b.position : a.start <=> b.start }
|
264
|
-
|
265
|
-
lines = lines.inject([lines.shift]) do |memo, next_line|
|
266
|
-
last = memo.last
|
267
|
-
if next_line.position == last.position && last.nearlyIntersects?(next_line)
|
268
|
-
memo.last.start = next_line.start < last.start ? next_line.start : last.start
|
269
|
-
memo.last.end = next_line.end < last.end ? last.end : next_line.end
|
270
|
-
memo
|
271
|
-
elsif next_line.length == 0
|
272
|
-
memo
|
273
|
-
else
|
274
|
-
memo << next_line
|
275
|
-
end
|
276
|
-
end
|
277
|
-
end
|
278
271
|
end
|
279
272
|
|
280
273
|
end
|