tabula-extractor 0.6.6-java → 0.7.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/AUTHORS.md +1 -0
- data/README.md +27 -11
- data/bin/tabula +61 -19
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +137 -137
- data/ext/lsd.h +9 -9
- data/lib/tabula.rb +20 -3
- data/lib/tabula/core_ext.rb +261 -0
- data/lib/tabula/entities.rb +11 -456
- data/lib/tabula/entities/cell.rb +42 -0
- data/lib/tabula/entities/has_cells.rb +244 -0
- data/lib/tabula/entities/line.rb +39 -0
- data/lib/tabula/entities/page.rb +269 -0
- data/lib/tabula/entities/page_area.rb +7 -0
- data/lib/tabula/entities/ruling.rb +300 -0
- data/lib/tabula/entities/spreadsheet.rb +92 -0
- data/lib/tabula/entities/table.rb +81 -0
- data/lib/tabula/entities/text_chunk.rb +114 -0
- data/lib/tabula/entities/text_element.rb +112 -0
- data/lib/tabula/entities/zone_entity.rb +57 -0
- data/lib/tabula/extraction.rb +327 -0
- data/lib/tabula/line_segment_detector.rb +9 -7
- data/lib/tabula/pdf_line_extractor.rb +319 -0
- data/lib/tabula/pdf_render.rb +1 -5
- data/lib/tabula/spreadsheet_extractor.rb +52 -0
- data/lib/tabula/table_extractor.rb +50 -348
- data/lib/tabula/table_guesser.rb +21 -23
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +5 -6
- data/tabula-extractor.gemspec +1 -0
- data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +88 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +21 -0
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +50 -0
- data/test/test_bin_tabula.sh +7 -0
- data/test/tests.rb +476 -63
- metadata +79 -28
- data/lib/geom/point.rb +0 -21
- data/lib/geom/rectangle.rb +0 -101
- data/lib/geom/segment.rb +0 -82
- data/lib/tabula/pdf_dump.rb +0 -132
- data/lib/tabula/whitespace.rb +0 -50
- data/vertical_rulings_bug.rb +0 -29
@@ -0,0 +1,42 @@
|
|
1
|
+
module Tabula

  # Cells are the components of spreadsheets: rectangular zones that collect
  # the text elements found inside them.
  class Cell < ZoneEntity

    # Values for the :cell_debug option, in increasing verbosity.
    NORMAL = 0
    DEBUG = 1
    SUPERDEBUG = 2

    attr_accessor :text_elements, :placeholder, :spanning, :options

    # top, left, width, height are forwarded unchanged to ZoneEntity.
    # options may override :use_line_returns (default false) and
    # :cell_debug (default NORMAL).
    def initialize(top, left, width, height, options={})
      super(top, left, width, height)
      @placeholder = false
      @spanning = false
      @text_elements = []
      @options = ({:use_line_returns => false, :cell_debug => NORMAL}).merge options
    end

    # Builds a Cell from its top-left and bottom-right corners; both points
    # must respond to #x and #y.
    def self.new_from_points(topleft, bottomright, options={})
      width = bottomright.x - topleft.x
      height = bottomright.y - topleft.y
      Cell.new(topleft.y, topleft.x, width, height, options)
    end

    # Returns the cell's text with surrounding whitespace stripped.
    #
    # Text elements are grouped into rows by identical #top; the texts of each
    # row are concatenated, and each row is followed by "\n" when
    # :use_line_returns is set.
    #
    # Debug behaviour, driven by :cell_debug:
    # * DEBUG or higher: placeholder cells return the literal "placeholder",
    #   and cells with no text return their geometry instead.
    # * SUPERDEBUG: every cell returns its geometry followed by its text.
    def text
      return "placeholder" if @placeholder && @options[:cell_debug] >= DEBUG

      row_suffix = @options[:use_line_returns] ? "\n" : ''
      output = ""
      # Array#sort returns a new array, so the sorted copy has to be the one
      # that is grouped; calling sort on its own line discards the ordering.
      # Uses the elements' own <=> (the default sort for ZoneEntity).
      text_elements.sort.group_by(&:top).values.each do |row|
        output << row.map { |el| el.text }.join('') + row_suffix
      end

      if (output.empty? && @options[:cell_debug] >= DEBUG) || @options[:cell_debug] >= SUPERDEBUG
        text_output = output.dup
        output = "top: #{top} left: #{left} \n w: #{width} h: #{height}"
        output += " \n #{text_output}"
      end
      output.strip
    end
  end
end
|
@@ -0,0 +1,244 @@
|
|
1
|
+
require 'set'
|
2
|
+
java_import java.awt.Polygon
|
3
|
+
java_import java.awt.geom.Area
|
4
|
+
|
5
|
+
module Tabula
  # Mixin for entities that are made of cells delimited by ruling lines.
  #
  # Including classes must define `cells`, `vertical_ruling_lines` and
  # `horizontal_ruling_lines` accessors and a `ruling_lines` reader.
  # `is_tabular?` additionally calls `spreadsheets` and `get_table` on the
  # including object.
  module HasCells

    IS_TABULAR_HEURISTIC_RATIO = 0.8
    ANOTHER_MAGIC_NUMBER = 0.75

    # Heuristic: compares the row/column counts of the first spreadsheet
    # (defined by ruling lines) with the row/column counts of the table found
    # without ruling lines. Returns true when the average of the two ratios
    # lies strictly between ANOTHER_MAGIC_NUMBER and 1 / ANOTHER_MAGIC_NUMBER.
    #
    # Returns false when there is no spreadsheet, when get_table does not
    # return something table-like, or when the table has no rows or columns.
    def is_tabular?
      #spreadsheet extraction
      spreadsheet = spreadsheets.first
      return false if spreadsheet.nil?
      rows_defined_by_lines = spreadsheet.rows.size #rows filled in automatically
      columns_defined_by_lines = spreadsheet.cols.size

      table = self.get_table
      # A get_table implementation may hand back a plain Array instead of a
      # Table when there is no text; that cannot be tabular.
      return false unless table.respond_to?(:cols) && table.respond_to?(:rows)

      columns_defined_without_lines = table.cols.size
      rows_defined_without_lines = table.rows.size
      # Dividing by zero would give Infinity/NaN, which never satisfies the
      # range check below; say so explicitly.
      return false if columns_defined_without_lines.zero? || rows_defined_without_lines.zero?

      ratio = ((columns_defined_by_lines.to_f / columns_defined_without_lines) +
               (rows_defined_by_lines.to_f / rows_defined_without_lines)) / 2

      ratio > ANOTHER_MAGIC_NUMBER && ratio < (1 / ANOTHER_MAGIC_NUMBER)
    end

    # Finds cells from the ruling lines on the page.
    # Implements the Nurminen thesis algorithm,
    # cf. https://github.com/jazzido/tabula-extractor/issues/16
    #
    # Every intersection of a horizontal and a vertical ruling is tried, in
    # sorted order, as the top-left corner of at most one cell. The found
    # cells are assigned to `self.cells` and returned. `options` is passed
    # through to Cell.new_from_points.
    def find_cells!(options={})
      cellsFound = []

      # Hash of crossing point => list of [horizontal, vertical] ruling pairs
      # (shape inferred from how the values are destructured below).
      intersection_points = Ruling.find_intersections(horizontal_ruling_lines, vertical_ruling_lines)

      # All crossing-points sorted from up to down, and left to right in
      # ascending order, depending on the points' default sort.
      intersection_points_array = intersection_points.keys.sort

      # Walk the *sorted* array so that index i and the slice [i..-1] refer to
      # the same ordering. Iterating the Hash itself would pair a Hash
      # insertion index with a slice of the sorted array and silently discard
      # valid candidate corners whenever the two orders differ.
      intersection_points_array.each_with_index do |topLeft, i|
        (horizontal, vertical), _rest = intersection_points[topLeft]
        candidates = intersection_points_array[i..-1]
        cell = find_cell_with_top_left(topLeft, horizontal, vertical,
                                       candidates, intersection_points, options)
        cellsFound << cell unless cell.nil?
      end

      self.cells = cellsFound
      cellsFound
    end

    #############################
    # Chapter 2, Spanning Cells #
    #############################
    # If c is a "spanning cell", that is, if there are N>0 ruling lines
    # strictly between this cell's left and right (or top and bottom), mark it
    # as spanning and append N placeholder cells with zero width (or zero
    # height) at the positions of the spanned rulings.
    #
    # If a spanning cell spans over both rows and columns, it also gets
    # "double placeholder" cells:
    #
    #      -------------------
    #      | C | C | C | C |
    #      |-----------------|
    #      | C | C | C | C |
    #      |-----------------|
    #      | C | SC    P | C |   SC is the "spanning cell" that holds all the text within its bounds
    #      |----    +    ----|   P is a "placeholder" cell with either zero width or zero height
    #      | C | P    DP | C |   DP is a "double placeholder" cell with zero width and zero height
    #      |----    +    ----|   C is an ordinary cell.
    #      | C | P    DP | C |
    #      |-----------------|
    #
    # Coordinates are rounded to 5 decimals before comparison because
    # Cell.new_from_points, used in #find_cells!, has a float precision error:
    # a cell whose x2 coord is supposed to be 160.137451171875 can come out as
    # 160.13745498657227 because of the subtraction.
    #
    # NOTE(review): placeholders are appended to `cells` while `cells.each` is
    # running, so (assuming `cells` is an Array) the new placeholders are
    # visited too and may themselves spawn zero-size placeholders. Behaviour
    # is kept as is; confirm whether the resulting duplicates are intended.
    def add_spanning_cells!
      vertical_uniq_locs = vertical_ruling_lines.map{|l| l.left.round(5)}.uniq #already sorted
      horizontal_uniq_locs = horizontal_ruling_lines.map{|l| l.top.round(5)}.uniq #already sorted

      cells.each do |c|
        vertical_rulings_spanned_over = vertical_uniq_locs.select{|l| l > c.left.round(5) && l < c.right.round(5) }
        horizontal_rulings_spanned_over = horizontal_uniq_locs.select{|t| t > c.top.round(5) && t < c.bottom.round(5) }

        unless vertical_rulings_spanned_over.empty?
          c.spanning = true
          vertical_rulings_spanned_over.each do |spanned_over_line_loc|
            placeholder = Cell.new(c.top, spanned_over_line_loc, 0, c.height)
            placeholder.placeholder = true
            cells << placeholder
          end
        end

        unless horizontal_rulings_spanned_over.empty?
          c.spanning = true
          horizontal_rulings_spanned_over.each do |spanned_over_line_loc|
            placeholder = Cell.new(spanned_over_line_loc, c.left, c.width, 0)
            placeholder.placeholder = true
            cells << placeholder
          end
        end

        # Double placeholders: one per (vertical, horizontal) pair; the
        # product is empty unless the cell spans in both directions.
        vertical_rulings_spanned_over.product(horizontal_rulings_spanned_over).each do |vert_spanned_over, horiz_spanned_over|
          placeholder = Cell.new(horiz_spanned_over, vert_spanned_over, 0, 0)
          placeholder.placeholder = true
          cells << placeholder
        end
      end
    end

    #TODO:
    # returns array of Spreadsheet objects constructed (or spreadsheet_areas => cells)
    # maybe placeholders should be added after cells is split into spreadsheets
    #
    # Merges adjacent cells into polygons and returns one java.awt.geom.Area
    # per polygon. Sorts `cells` in place as a side effect.
    #
    # via http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
    def find_spreadsheets_from_cells
      cells.sort!

      # A corner that is toggled an even number of times cancels out; the
      # corners left in the set are the vertices of the merged outlines.
      points = Set.new
      cells.each do |cell|
        #TODO: keep track of cells for each point here for more efficiently keeping track of cells inside a polygon
        cell.points.each do |pt|
          if points.include?(pt) # Shared vertex, remove it.
            points.delete(pt)
          else
            points << pt
          end
        end
      end
      points = points.to_a

      #x first sort
      points_sort_x = points.sort{ |s, other| s.x_first_cmp(other) }
      points_sort_y = points.sort

      edges_h = {}
      edges_v = {}

      # Pair up consecutive vertices that share a y coordinate: each pair is a
      # horizontal edge, stored in both directions.
      # NOTE(review): this presumes every row/column holds an even number of
      # vertices; an unpaired vertex is not handled here - confirm upstream.
      i = 0
      while i < points.size do
        curr_y = points_sort_y[i].y
        while i < points.size && points_sort_y[i].y == curr_y do
          edges_h[points_sort_y[i]] = points_sort_y[i + 1]
          edges_h[points_sort_y[i + 1]] = points_sort_y[i]
          i += 2
        end
      end

      # Same pairing for vertices sharing an x coordinate: vertical edges.
      i = 0
      while i < points.size do
        curr_x = points_sort_x[i].x
        while i < points.size && points_sort_x[i].x == curr_x do
          edges_v[points_sort_x[i]] = points_sort_x[i + 1]
          edges_v[points_sort_x[i + 1]] = points_sort_x[i]
          i += 2
        end
      end

      # Get all the polygons.
      polygons = []
      until edges_h.empty?
        # We can start with any point; Hash#shift removes and returns one
        # [key, value] pair, of which only the key is used.
        #TODO: should the polygon be represented just by an ordered array of points?
        polygon = [[edges_h.shift[0], :horiz]]
        loop do
          curr, e = polygon.last
          # Alternate between vertical and horizontal edges; Hash#delete
          # returns the vertex at the other end of the edge leaving `curr`.
          if e == :horiz
            next_vertex = edges_v.delete(curr)
            polygon << [next_vertex, :vert]
          else
            next_vertex = edges_h.delete(curr)
            polygon << [next_vertex, :horiz]
          end
          if polygon[-1] == polygon[0]
            # Closed polygon
            polygon.pop()
            break
          end
        end

        # Remove implementation-markers (:horiz and :vert) from the polygon.
        polygon.map!{|point, _| point}
        # Drop the reverse-direction edges of the vertices just consumed.
        polygon.each do |vertex|
          edges_h.delete(vertex)
          edges_v.delete(vertex)
        end
        polygons << polygon
      end

      # for efficiency's sake, we maybe ought to use java Polygon objects internally
      # for flexibility, we don't.
      polygons.map do |polygon|
        xpoints = polygon.map { |pt| pt.x }
        ypoints = polygon.map { |pt| pt.y }
        # java.awt.Polygon takes int[] coordinate arrays.
        Area.new(Polygon.new(xpoints.to_java(Java::int), ypoints.to_java(Java::int), xpoints.size)) #lol jruby
      end
    end

    private

    # Returns the first cell confirmed to have four sides whose top-left
    # corner is topLeft, or nil when there is none. Each crossing point can be
    # the top-left corner of only a single rectangle, so the search stops at
    # the first match to avoid creating non-minimal cells.
    #
    # horizontal/vertical are the rulings crossing at topLeft; candidates are
    # the crossing points still eligible as other corners.
    def find_cell_with_top_left(topLeft, horizontal, vertical, candidates, intersection_points, options)
      # CrossingPointsDirectlyBelow( topLeft );
      x_points = candidates.select{|pt| pt.x == topLeft.x && pt.y > topLeft.y }
      # CrossingPointsDirectlyToTheRight( topLeft );
      y_points = candidates.select{|pt| pt.y == topLeft.y && pt.x > topLeft.x }

      x_points.each do |x_point|
        # if( NOT EdgeExistsBetween( topLeft, x_point)) next crossing-point;
        next unless vertical.colinear?(x_point)
        y_points.each do |y_point|
          # if( NOT EdgeExistsBetween( topLeft, y_point)) next crossing-point;
          next unless horizontal.colinear?(y_point)
          #Hypothetical bottom right point of rectangle
          btmRight = Point2D::Float.new( y_point.x, x_point.y )
          next unless intersection_points.include?(btmRight)
          intersection_points[btmRight].each do |btmRightHorizontal, btmRightVertical|
            if btmRightHorizontal.colinear?( x_point ) &&
                btmRightVertical.colinear?( y_point )
              # Rectangle is confirmed to have 4 sides
              return Cell.new_from_points( topLeft, btmRight, options)
            end
          end
        end
      end
      nil
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Tabula
  # A line of text: a zone that grows as text elements are appended to it.
  class Line < ZoneEntity
    attr_accessor :text_elements
    attr_reader :index

    # index is stored as given and exposed through the #index reader.
    def initialize(index=nil)
      @text_elements = []
      @index = index
    end

    # Appends t to the line.
    #
    # The first element sets the line's top/left/width/height. A later element
    # is merged into the first existing element it horizontally overlaps;
    # when there is none it is added as a new element and the line itself is
    # merged with it.
    def <<(t)
      if @text_elements.empty?
        @text_elements << t
        self.top = t.top
        self.left = t.left
        self.width = t.width
        self.height = t.height
      else
        column_mate = @text_elements.find { |te| te.horizontally_overlaps?(t) }
        if column_mate
          column_mate.merge!(t)
        else
          self.text_elements << t
          self.merge!(t)
        end
      end
    end

    # Used for testing; compares the two lines' text elements pairwise.
    #
    # Side effect: both self and other have their text_elements reassigned to
    # the result of rpad(TextElement::EMPTY, <other side's size>) before the
    # comparison.
    def ==(other)
      return false if other.nil?

      self.text_elements = text_elements.rpad(TextElement::EMPTY, other.text_elements.size)
      other.text_elements = other.text_elements.rpad(TextElement::EMPTY, text_elements.size)

      text_elements.zip(other.text_elements).inject(true) do |all_equal, (mine, theirs)|
        all_equal && mine == theirs
      end
    end
  end
end
|
@@ -0,0 +1,269 @@
|
|
1
|
+
module Tabula
  # A page of a document: its text elements, its ruling lines, and the
  # tables/spreadsheets that can be derived from them.
  class Page < ZoneEntity
    include Tabula::HasCells

    attr_reader :rotation, :number_one_indexed, :file_path
    attr_writer :min_char_width, :min_char_height
    attr_accessor :cells

    # number is one-indexed; raises ArgumentError when it is < 1.
    # min_char_width / min_char_height are optional; when nil they are
    # computed lazily from the page's texts.
    def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil)
      super(0, 0, width, height)
      @rotation = rotation
      if number < 1
        raise ArgumentError, "Tabula::Page numbers are one-indexed; numbers < 1 are invalid."
      end
      @ruling_lines = ruling_lines
      @file_path = file_path
      @number_one_indexed = number
      self.texts = texts
      @cells = []
      @spreadsheets = nil
      @min_char_width = min_char_width
      @min_char_height = min_char_height
    end

    # Smallest text width on the page (memoized); nil when there is no text.
    def min_char_width
      @min_char_width ||= texts.map(&:width).min
    end

    # Smallest text height on the page (memoized); nil when there is no text.
    def min_char_height
      @min_char_height ||= texts.map(&:height).min
    end

    # Returns a PageArea holding the texts and the cropped ruling lines
    # inside `area`.
    # area can be an Array ([top, left, bottom, right]) or an object
    # responding to #width, #height and #contains.
    def get_area(area)
      if area.is_a?(Array)
        top, left, bottom, right = area
        area = Tabula::ZoneEntity.new(top, left,
                                      right - left, bottom - top)
      end

      area_texts = self.get_text(area)
      PageArea.new(file_path,
                   area.width,
                   area.height,
                   rotation,
                   number,
                   area_texts,
                   Ruling.crop_rulings_to_area(@ruling_lines, area),
                   area_texts.map(&:width).min,
                   area_texts.map(&:height).min)
    end

    # Returns a Table object built from the page's text.
    #
    # options[:vertical_rulings], when non-empty, supplies the column
    # separators (their #left positions); otherwise column positions are
    # derived from the text chunks.
    #
    # A page without text yields an empty Table (zero lines, no separators)
    # rather than a bare Array, so that callers such as #make_table can
    # always use the Table interface.
    def get_table(options={})
      options = {:vertical_rulings => []}.merge(options)
      return Table.new(0, []) if texts.empty?

      text_chunks = TextElement.merge_words(self.texts, options).sort

      lines = TextChunk.group_by_lines(text_chunks)

      unless options[:vertical_rulings].empty?
        columns = options[:vertical_rulings].map(&:left) #pixel locations, not entities
        separators = columns.sort.reverse
      else
        columns = TextChunk.column_positions(text_chunks)
        # [][1..-1] is nil; fall back to "no separators" in that case.
        separators = (columns[1..-1] || []).sort.reverse
      end

      table = Table.new(lines.count, separators)
      lines.each_with_index do |line, i|
        line.text_elements.each do |te|
          # separators are in descending order: j is the index of the first
          # separator to the left of the element.
          j = separators.find_index { |s| te.left > s } || separators.count
          table.add_text_element(te, i, separators.count - j)
        end
      end

      table.lstrip_lines!
      table
    end

    #for API backwards-compatibility reasons, this returns an array of arrays.
    # Missing (nil) elements are replaced by empty TextElements, and the rows
    # are ordered by the largest #top found in each row.
    def make_table(options={})
      get_table(options).lines.map do |l|
        l.text_elements.map! do |te|
          te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
        end
      end.sort_by { |l| l.map { |te| te.top || 0 }.max }
    end

    # returns the Spreadsheets; creating them if they're not memoized
    # When options[:fill_in_cells] is truthy the cells' text is resolved too.
    def spreadsheets(options={})
      return @spreadsheets unless @spreadsheets.nil?

      get_ruling_lines!(options)
      self.find_cells!(options)

      spreadsheet_areas = find_spreadsheets_from_cells #literally, java.awt.geom.Area objects. lol sorry. polygons.

      #transform each spreadsheet area into a rectangle
      # and get the cells contained within it.
      #getBounds2D is theoretically better, but returns a Rectangle2D.Double, which doesn't have our Ruby sugar on it.
      spreadsheet_rectangle_areas = spreadsheet_areas.map{|a| a.getBounds }

      @spreadsheets = spreadsheet_rectangle_areas.map do |rect|
        spr = Spreadsheet.new(rect.y, rect.x,
                              rect.width, rect.height,
                              self,
                              #TODO: keep track of the cells, instead of getting them again inefficiently.
                              [],
                              vertical_ruling_lines.select{|vl| rect.intersectsLine(vl) },
                              horizontal_ruling_lines.select{|hl| rect.intersectsLine(hl) }
                              )
        spr.cells = @cells.select{|c| spr.overlaps?(c) }
        spr.add_spanning_cells!
        spr
      end
      fill_in_cells! if options[:fill_in_cells]
      @spreadsheets
    end

    # Resolves the text of every cell of every spreadsheet on the page and
    # marks each spreadsheet that has cells as resolved.
    def fill_in_cells!(options={})
      # Nothing visible in this class defines `page`; use it only when some
      # ancestor or subclass provides it, otherwise read the text from this
      # page itself.
      text_source = respond_to?(:page, true) ? page : self
      spreadsheets(options).each do |spreadsheet|
        spreadsheet.cells.each do |cell|
          cell.text_elements = text_source.get_cell_text(cell)
        end
        spreadsheet.cells_resolved = true unless spreadsheet.cells.empty?
      end
    end

    # Page number; one-indexed unless indexing_base is :zero_indexed.
    def number(indexing_base=:one_indexed)
      if indexing_base == :zero_indexed
        @number_one_indexed - 1
      else
        @number_one_indexed
      end
    end

    # TODO no need for this, let's choose one name
    def ruling_lines
      get_ruling_lines!
    end

    # Collapsed horizontal ruling lines, or [] when the page has none.
    def horizontal_ruling_lines
      get_ruling_lines!
      @horizontal_ruling_lines.nil? ? [] : @horizontal_ruling_lines
    end

    # Collapsed vertical ruling lines, or [] when the page has none.
    def vertical_ruling_lines
      get_ruling_lines!
      @vertical_ruling_lines.nil? ? [] : @vertical_ruling_lines
    end

    # Returns the ruling lines (vertical ones first, then horizontal ones),
    # memoizing the collapsed lines per orientation. Returns [] when the page
    # has no ruling lines.
    #
    # NOTE(review): snap_points! runs on every call, not only the first one,
    # although the collapsed lines are memoized - confirm whether repeated
    # snapping is intended.
    def get_ruling_lines!(options={})
      if !@ruling_lines.nil? && !@ruling_lines.empty?
        self.snap_points!
        @vertical_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
        @horizontal_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
        @vertical_ruling_lines + @horizontal_ruling_lines
      else
        []
      end
    end

    ##
    # get text inside an area
    # area can be an Array ([top, left, bottom, right])
    # or an object responding to #contains (e.g. a Rectangle2D);
    # with no area, all the page's texts are returned.
    def get_text(area=nil)
      if area.is_a?(Array)
        top, left, bottom, right = area
        area = Tabula::ZoneEntity.new(top, left,
                                      right - left, bottom - top)
      end
      return texts if area.nil?

      texts.select { |t| area.contains(t) }
    end

    # Text inside `area`, with its elements merged into words.
    def get_cell_text(area=nil)
      TextElement.merge_words(self.get_text(area))
    end

    def to_json(options={})
      { :width => self.width,
        :height => self.height,
        :number => self.number,
        :rotation => self.rotation,
        :texts => self.texts
      }.to_json(options)
    end

    # Snaps the end points of the ruling lines together: along each axis,
    # points lying closer than the minimum character size to the first point
    # of their group are all moved to the average of the group's distinct
    # coordinates. The ruling lines are modified in place.
    def snap_points!
      lines_to_points = {}
      points = []
      @ruling_lines.each do |line|
        # for a given line, each call to #p1 and #p2 creates a new
        # Point2D::Float object, rather than returning the same one over and
        # over again.
        # so we have to get it, keep hold of it as `point1` and `point2`
        # and then store those same objects in both collections (modifying a
        # point through one collection is then visible through the other)
        point1 = line.p1
        point2 = line.p2
        lines_to_points[line] = [point1, point2]
        points += [point1, point2]
      end

      # lines are stored separately from their constituent points
      # so the points are modified first and written back to the lines below.
      [[:x, :x=, self.min_char_width], [:y, :y=, self.min_char_height]].each do |getter, setter, cell_size|
        # A page with rulings but no text has no minimum character size;
        # comparing against nil would raise, so use 0 (nothing gets snapped).
        cell_size ||= 0
        sorted_points = points.sort_by(&getter)
        first_point = sorted_points.shift
        grouped_points = sorted_points.inject([[first_point]]) do |memo, next_point|
          last = memo.last

          if (next_point.send(getter) - last.first.send(getter)).abs < cell_size
            memo[-1] << next_point
          else
            memo << [next_point]
          end
          memo
        end
        grouped_points.each do |group|
          uniq_locs = group.map(&getter).uniq
          avg_loc = uniq_locs.sum / uniq_locs.size
          group.each{|p| p.send(setter, avg_loc) }
        end
      end

      lines_to_points.each do |l, p1_p2|
        l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0],
                                                                                p1_p2[1]
      end
    end

    # Merges rulings of a single orientation that share a position and nearly
    # intersect, and drops zero-length rulings (other than the first one after
    # sorting). Returns the collapsed lines; [] for empty input.
    #
    # lines must all be of one orientation (i.e. horizontal, vertical).
    # The given array is sorted in place and loses its first element, and the
    # surviving line objects are extended in place.
    def collapse_oriented_rulings(lines)
      # Without this guard the seed below would be [nil] and that nil would
      # be returned as if it were a ruling.
      return [] if lines.empty?

      lines.sort! {|a, b| a.position != b.position ? a.position <=> b.position : a.start <=> b.start }

      first_line = lines.shift
      lines.each_with_object([first_line]) do |next_line, memo|
        last = memo.last
        if next_line.position == last.position && last.nearlyIntersects?(next_line)
          last.start = next_line.start < last.start ? next_line.start : last.start
          last.end = next_line.end < last.end ? last.end : next_line.end
        elsif next_line.length == 0
          # zero-length ruling: skip it
        else
          memo << next_line
        end
      end
    end
  end

end
|