tabula-extractor 0.6.6-java → 0.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/AUTHORS.md +1 -0
  3. data/README.md +27 -11
  4. data/bin/tabula +61 -19
  5. data/ext/liblsd-linux32.so +0 -0
  6. data/ext/liblsd-linux64.so +0 -0
  7. data/ext/liblsd.dll +0 -0
  8. data/ext/liblsd.dylib +0 -0
  9. data/ext/liblsd64.dll +0 -0
  10. data/ext/lsd.c +137 -137
  11. data/ext/lsd.h +9 -9
  12. data/lib/tabula.rb +20 -3
  13. data/lib/tabula/core_ext.rb +261 -0
  14. data/lib/tabula/entities.rb +11 -456
  15. data/lib/tabula/entities/cell.rb +42 -0
  16. data/lib/tabula/entities/has_cells.rb +244 -0
  17. data/lib/tabula/entities/line.rb +39 -0
  18. data/lib/tabula/entities/page.rb +269 -0
  19. data/lib/tabula/entities/page_area.rb +7 -0
  20. data/lib/tabula/entities/ruling.rb +300 -0
  21. data/lib/tabula/entities/spreadsheet.rb +92 -0
  22. data/lib/tabula/entities/table.rb +81 -0
  23. data/lib/tabula/entities/text_chunk.rb +114 -0
  24. data/lib/tabula/entities/text_element.rb +112 -0
  25. data/lib/tabula/entities/zone_entity.rb +57 -0
  26. data/lib/tabula/extraction.rb +327 -0
  27. data/lib/tabula/line_segment_detector.rb +9 -7
  28. data/lib/tabula/pdf_line_extractor.rb +319 -0
  29. data/lib/tabula/pdf_render.rb +1 -5
  30. data/lib/tabula/spreadsheet_extractor.rb +52 -0
  31. data/lib/tabula/table_extractor.rb +50 -348
  32. data/lib/tabula/table_guesser.rb +21 -23
  33. data/lib/tabula/version.rb +1 -1
  34. data/lib/tabula/writers.rb +5 -6
  35. data/tabula-extractor.gemspec +1 -0
  36. data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
  37. data/test/data/47008204D_USA.page4.pdf +0 -0
  38. data/test/data/560015757GV_China.page1.pdf +0 -0
  39. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  40. data/test/data/S2MNCEbirdisland.pdf +0 -0
  41. data/test/data/campaign_donors.pdf +0 -0
  42. data/test/data/frx_2012_disclosure.tsv +88 -0
  43. data/test/data/no_tables.pdf +0 -0
  44. data/test/data/puertos1.pdf +0 -0
  45. data/test/data/spanning_cells.csv +21 -0
  46. data/test/data/spanning_cells.pdf +0 -0
  47. data/test/data/strongschools.pdf +0 -0
  48. data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
  49. data/test/data/vietnam3.pdf +0 -0
  50. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  51. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  52. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  53. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  54. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  55. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  56. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  57. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  58. data/test/heuristic.rb +50 -0
  59. data/test/test_bin_tabula.sh +7 -0
  60. data/test/tests.rb +476 -63
  61. metadata +79 -28
  62. data/lib/geom/point.rb +0 -21
  63. data/lib/geom/rectangle.rb +0 -101
  64. data/lib/geom/segment.rb +0 -82
  65. data/lib/tabula/pdf_dump.rb +0 -132
  66. data/lib/tabula/whitespace.rb +0 -50
  67. data/vertical_rulings_bug.rb +0 -29
@@ -0,0 +1,42 @@
1
module Tabula

  # Cells are the components of spreadsheets: a rectangular zone that collects
  # the text elements found inside it.
  class Cell < ZoneEntity

    # Values for the :cell_debug option; higher values make #text more verbose.
    NORMAL = 0
    DEBUG = 1
    SUPERDEBUG = 2

    attr_accessor :text_elements, :placeholder, :spanning, :options

    # top, left, width, height:: geometry, passed straight to ZoneEntity.
    # options:: merged over the defaults
    #           {:use_line_returns => false, :cell_debug => NORMAL}.
    def initialize(top, left, width, height, options={})
      super(top, left, width, height)
      @placeholder = false
      @spanning = false
      @text_elements = []
      @options = {:use_line_returns => false, :cell_debug => NORMAL}.merge(options)
    end

    # Builds a Cell from its top-left and bottom-right corner points
    # (objects responding to #x and #y).
    def self.new_from_points(topleft, bottomright, options={})
      width = bottomright.x - topleft.x
      height = bottomright.y - topleft.y
      Cell.new(topleft.y, topleft.x, width, height, options)
    end

    # Returns the cell's text, stripped of surrounding whitespace.
    #
    # Text elements are put in the default ZoneEntity sort order and grouped
    # by their +top+; each group is joined without a separator and, when
    # :use_line_returns is set, followed by a newline.
    #
    # With :cell_debug >= DEBUG a placeholder cell reports "placeholder" and
    # an empty cell reports its geometry; with :cell_debug >= SUPERDEBUG every
    # cell reports its geometry followed by its text.
    def text
      return "placeholder" if @placeholder && @options[:cell_debug] >= DEBUG
      output = ""
      # Array#sort returns a new array; the sorted copy has to be the one that
      # is grouped, otherwise the sort has no effect on the output order.
      text_elements.sort.group_by(&:top).values.each do |row|
        output << row.map{|el| el.text}.join('') + (@options[:use_line_returns] ? "\n" : '')
      end
      if (output.empty? && @options[:cell_debug] >= DEBUG) || @options[:cell_debug] >= SUPERDEBUG
        text_output = output.dup
        output = "top: #{top} left: #{left} \n w: #{width} h: #{height}"
        output += " \n #{text_output}"
      end
      output.strip
    end
  end
end
@@ -0,0 +1,244 @@
1
+ require 'set'
2
+ java_import java.awt.Polygon
3
+ java_import java.awt.geom.Area
4
+
5
module Tabula
  # Mixin that derives cells and spreadsheet outlines from ruling lines.
  #
  # Including classes must define the `cells`, `vertical_ruling_lines` and
  # `horizontal_ruling_lines` accessors. #is_tabular? additionally calls
  # `spreadsheets` and `get_table` on the including object.
  module HasCells

    IS_TABULAR_HEURISTIC_RATIO = 0.8
    ANOTHER_MAGIC_NUMBER = 0.75

    # Heuristic: compares the row/column counts of the first ruling-derived
    # spreadsheet with the counts of the table built without rulings
    # (`get_table`). Returns true when the average of the two ratios lies
    # strictly between ANOTHER_MAGIC_NUMBER and its reciprocal; returns false
    # when there is no spreadsheet at all.
    def is_tabular?
      #spreadsheet extraction
      spreadsheet = spreadsheets.first
      return false if spreadsheet.nil?
      rows_defined_by_lines = spreadsheet.rows.size #rows filled in automatically
      columns_defined_by_lines = spreadsheet.cols.size

      table = self.get_table
      columns_defined_without_lines = table.cols.size
      rows_defined_without_lines = table.rows.size
      # Float division: a zero denominator yields Infinity/NaN, which fails
      # the range check below instead of raising.
      ratio = ((columns_defined_by_lines.to_f / columns_defined_without_lines) + (rows_defined_by_lines.to_f / rows_defined_without_lines)) / 2

      ratio > ANOTHER_MAGIC_NUMBER && ratio < (1 / ANOTHER_MAGIC_NUMBER)
    end

    # Finds cells from the ruling lines on the page.
    # Implements the Nurminen thesis algorithm,
    # cf. https://github.com/jazzido/tabula-extractor/issues/16
    #
    # options:: forwarded to Cell.new_from_points.
    #
    # Stores the cells through `cells=` and returns them.
    def find_cells!(options={})
      cells_found = []

      intersection_points = Ruling.find_intersections(horizontal_ruling_lines, vertical_ruling_lines)

      # Crossing points sorted with the points' default sort order.
      intersection_points_array = intersection_points.keys.sort

      # Walk the SORTED points: the index `i` is used to slice the sorted
      # array below, so it must refer to a position in that array and not to
      # the hash's insertion order.
      intersection_points_array.each_with_index do |top_left, i|
        # Rulings of the first (horizontal, vertical) pair crossing here.
        horizontal, vertical = intersection_points[top_left].first

        later_points = intersection_points_array[i..-1]
        # CrossingPointsDirectlyBelow( topLeft );
        x_points = later_points.select{|pt| pt.x == top_left.x && pt.y > top_left.y }
        # CrossingPointsDirectlyToTheRight( topLeft );
        y_points = later_points.select{|pt| pt.y == top_left.y && pt.x > top_left.x }

        cell = first_cell_with_top_left(top_left, horizontal, vertical,
                                        x_points, y_points,
                                        intersection_points, options)
        cells_found << cell unless cell.nil?
      end
      self.cells = cells_found
      cells_found
    end

    #############################
    # Chapter 2, Spanning Cells #
    #############################
    # If c is a "spanning cell", that is, if there are N>0 ruling locations
    # strictly between its left and right (or top and bottom), append N
    # zero-width (or zero-height) placeholder cells for it and flag c as
    # spanning.
    #
    # If a spanning cell spans over both rows and columns, it also gets
    # "double placeholder" cells, e.g.
    #
    #   -------------------
    #   | C | C | C | C |
    #   |-----------------|
    #   | C | C | C | C |
    #   |-----------------|
    #   | C | SC    P | C |   SC is the "spanning cell" that holds all the text within its bounds
    #   |----    +    ----|   P is a "placeholder" cell with either zero width or zero height
    #   | C | P    DP | C |   DP is a "double placeholder" cell with zero width and zero height
    #   |----    +    ----|   C is an ordinary cell.
    #   | C | P    DP | C |
    #   |-----------------|
    def add_spanning_cells!
      # Rounding: Cell.new_from_points, used in #find_cells! above, has a
      # float precision error where, for instance, a cell whose x2 coord is
      # supposed to be 160.137451171875 comes out as 160.13745498657227
      # because of the subtraction.
      vertical_uniq_locs = vertical_ruling_lines.map{|l| l.left.round(5)}.uniq #already sorted
      horizontal_uniq_locs = horizontal_ruling_lines.map{|l| l.top.round(5)}.uniq #already sorted

      # Iterate over a snapshot: placeholders are appended to `cells` inside
      # the loop, and Array#each would otherwise visit them as well, creating
      # duplicate double placeholders and flagging placeholders as spanning.
      cells.dup.each do |c|
        vertical_rulings_spanned_over = vertical_uniq_locs.select{|l| l > c.left.round(5) && l < c.right.round(5) }
        horizontal_rulings_spanned_over = horizontal_uniq_locs.select{|t| t > c.top.round(5) && t < c.bottom.round(5) }

        unless vertical_rulings_spanned_over.empty?
          c.spanning = true
          vertical_rulings_spanned_over.each do |spanned_over_line_loc|
            placeholder = Cell.new(c.top, spanned_over_line_loc, 0, c.height)
            placeholder.placeholder = true
            cells << placeholder
          end
        end
        unless horizontal_rulings_spanned_over.empty?
          c.spanning = true
          horizontal_rulings_spanned_over.each do |spanned_over_line_loc|
            placeholder = Cell.new(spanned_over_line_loc, c.left, c.width, 0)
            placeholder.placeholder = true
            cells << placeholder
          end
        end

        # One zero-by-zero placeholder per spanned (vertical, horizontal) crossing.
        vertical_rulings_spanned_over.product(horizontal_rulings_spanned_over).each do |vert_spanned_over, horiz_spanned_over|
          placeholder = Cell.new(horiz_spanned_over, vert_spanned_over, 0, 0)
          placeholder.placeholder = true
          cells << placeholder
        end
      end
    end

    # Merges adjacent cells into polygons and returns one java.awt.geom.Area
    # per polygon. Sorts `cells` in place as a side effect.
    #
    # TODO: return Spreadsheet objects (or spreadsheet_areas => cells);
    # maybe placeholders should be added after cells is split into spreadsheets.
    def find_spreadsheets_from_cells
      cells.sort!

      # via http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon

      # Toggle every cell corner in and out of the set: a corner seen an even
      # number of times is dropped, one seen an odd number of times is kept.
      points = Set.new
      cells.each do |cell|
        #TODO: keep track of cells for each point here for more efficiently keeping track of cells inside a polygon
        cell.points.each do |pt|
          if points.include?(pt) # Shared vertex, remove it.
            points.delete(pt)
          else
            points << pt
          end
        end
      end
      points = points.to_a

      #x first sort
      points_sort_x = points.sort{ |s, other| s.x_first_cmp(other) }
      points_sort_y = points.sort

      # Consecutive points (0,1), (2,3), ... of each ordering are linked in
      # both directions.
      edges_h = {}
      points_sort_y.each_slice(2) do |a, b|
        edges_h[a] = b
        edges_h[b] = a
      end

      edges_v = {}
      points_sort_x.each_slice(2) do |a, b|
        edges_v[a] = b
        edges_v[b] = a
      end

      # Get all the polygons.
      polygons = []
      until edges_h.empty?
        # We can start with any point; Hash#shift removes and returns the
        # first [key, value] pair.
        #TODO: should the polygon be represented just by an ordered array of points?
        polygon = [[edges_h.shift[0], :horiz]]
        loop do
          curr, e = polygon.last
          # Alternate between vertical and horizontal edges; Hash#delete
          # removes the entry for `curr` and returns its value.
          if e == :horiz
            next_vertex = edges_v.delete(curr)
            polygon << [next_vertex, :vert]
          else
            next_vertex = edges_h.delete(curr)
            polygon << [next_vertex, :horiz]
          end
          if polygon[-1] == polygon[0]
            # Closed polygon
            polygon.pop
            break
          end
        end

        # Remove implementation-markers (:horiz and :vert) from the polygon.
        polygon.map!{|point, _| point}
        polygon.each do |vertex|
          edges_h.delete(vertex) if edges_h.include?(vertex)
          edges_v.delete(vertex) if edges_v.include?(vertex)
        end
        polygons << polygon
      end

      # for efficiency's sake, we maybe ought to use java Polygon objects internally
      # for flexibility, we don't.

      polygons.map do |polygon|
        xpoints = []
        ypoints = []
        polygon.each do |pt|
          xpoints << pt.x
          ypoints << pt.y
        end
        # NOTE(review): coordinates are converted to Java int arrays here.
        Area.new(Polygon.new(xpoints.to_java(Java::int), ypoints.to_java(Java::int), xpoints.size))
      end
    end

    private

    # Returns the first Cell whose top-left corner is `top_left` and whose
    # four sides are confirmed by rulings, or nil when there is none.
    # Each crossing point can be the top left corner of only a single
    # rectangle, so the search stops at the first match to avoid creating
    # non-minimal cells.
    def first_cell_with_top_left(top_left, horizontal, vertical, x_points, y_points, intersection_points, options)
      x_points.each do |x_point|
        # if( NOT EdgeExistsBetween( topLeft, x_point)) next crossing-point;
        next unless vertical.colinear?(x_point)
        y_points.each do |y_point|
          # if( NOT EdgeExistsBetween( topLeft, y_point)) next crossing-point;
          next unless horizontal.colinear?(y_point)
          # Hypothetical bottom right point of rectangle
          btm_right = Point2D::Float.new(y_point.x, x_point.y)
          next unless intersection_points.include?(btm_right)
          intersection_points[btm_right].each do |btm_right_horizontal, btm_right_vertical|
            if btm_right_horizontal.colinear?(x_point) &&
               btm_right_vertical.colinear?(y_point)
              # Rectangle is confirmed to have 4 sides
              return Cell.new_from_points(top_left, btm_right, options)
            end
          end
        end
      end
      nil
    end
  end
end
@@ -0,0 +1,39 @@
1
module Tabula
  # A zone that accumulates text elements; elements that horizontally overlap
  # an already stored element are merged into it instead of being appended.
  class Line < ZoneEntity
    attr_accessor :text_elements
    attr_reader :index

    # index:: optional value stored as-is and exposed through #index.
    def initialize(index=nil)
      @text_elements = []
      @index = index
    end

    # Adds the text element +t+.
    # * The first element is stored and its top/left/width/height are copied
    #   onto this line.
    # * A later element that horizontally overlaps a stored element is merged
    #   into that element.
    # * Any other element is appended and merged into this line's own zone.
    def <<(t)
      if @text_elements.empty?
        @text_elements << t
        self.top = t.top
        self.left = t.left
        self.width = t.width
        self.height = t.height
      elsif (same_column = @text_elements.find { |te| te.horizontally_overlaps?(t) })
        same_column.merge!(t)
      else
        self.text_elements << t
        self.merge!(t)
      end
    end

    # Used for testing; compares the two lines element by element.
    # Both element lists are first reassigned to their `rpad`-ed versions
    # (padding value TextElement::EMPTY, target size taken from the other
    # line), so this comparison reassigns text_elements on both operands.
    def ==(other)
      return false if other.nil?
      self.text_elements = text_elements.rpad(TextElement::EMPTY, other.text_elements.size)
      other.text_elements = other.text_elements.rpad(TextElement::EMPTY, text_elements.size)
      text_elements.zip(other.text_elements).inject(true) do |still_equal, (mine, theirs)|
        still_equal && mine == theirs
      end
    end
  end
end
@@ -0,0 +1,269 @@
1
module Tabula
  # A page (or page area) holding its text elements and ruling lines, from
  # which tables and spreadsheets are derived.
  class Page < ZoneEntity
    include Tabula::HasCells

    attr_reader :rotation, :number_one_indexed, :file_path
    attr_writer :min_char_width, :min_char_height
    attr_accessor :cells

    # number:: one-indexed page number; raises ArgumentError when < 1.
    # min_char_width, min_char_height:: optional; when nil they are computed
    #                                   lazily from the text elements.
    def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil)
      super(0, 0, width, height)
      @rotation = rotation
      if number < 1
        raise ArgumentError, "Tabula::Page numbers are one-indexed; numbers < 1 are invalid."
      end
      @ruling_lines = ruling_lines
      @file_path = file_path
      @number_one_indexed = number
      self.texts = texts
      @cells = []
      @spreadsheets = nil
      @min_char_width = min_char_width
      @min_char_height = min_char_height
    end

    # Smallest text-element width (memoized); nil when there are no texts.
    def min_char_width
      @min_char_width ||= texts.map(&:width).min
    end

    # Smallest text-element height (memoized); nil when there are no texts.
    def min_char_height
      @min_char_height ||= texts.map(&:height).min
    end

    # Returns a PageArea for +area+, which is either an Array
    # ([top, left, bottom, right]) or an object with #width and #height that
    # #get_text accepts. The PageArea gets the texts inside the area and the
    # rulings cropped to it.
    def get_area(area)
      if area.is_a?(Array)
        top, left, bottom, right = area
        area = Tabula::ZoneEntity.new(top, left,
                                      right - left, bottom - top)
      end

      texts = self.get_text(area)
      PageArea.new(file_path,
                   area.width,
                   area.height,
                   rotation,
                   number,
                   texts,
                   Ruling.crop_rulings_to_area(@ruling_lines, area),
                   texts.map(&:width).min,
                   texts.map(&:height).min)
    end

    # Returns a Table object built from the page's text.
    #
    # options[:vertical_rulings]:: when non-empty, their +left+ positions are
    #                              used as column separators; otherwise the
    #                              separators come from
    #                              TextChunk.column_positions.
    #
    # A page without text yields an empty Table (zero lines, no separators)
    # so that callers such as #make_table can always call Table methods on
    # the result.
    def get_table(options={})
      options = {:vertical_rulings => []}.merge(options)
      return Table.new(0, []) if texts.empty?

      text_chunks = TextElement.merge_words(self.texts, options).sort

      lines = TextChunk.group_by_lines(text_chunks)

      unless options[:vertical_rulings].empty?
        columns = options[:vertical_rulings].map(&:left) #pixel locations, not entities
        separators = columns.sort.reverse
      else
        columns = TextChunk.column_positions(text_chunks)
        # columns[1..-1] is nil for an empty array; treat that as "no separators".
        separators = (columns[1..-1] || []).sort.reverse
      end

      table = Table.new(lines.count, separators)
      lines.each_with_index do |line, i|
        line.text_elements.each do |te|
          # separators are in descending order: find the first one to the
          # left of the element to derive its column.
          j = separators.find_index { |s| te.left > s } || separators.count
          table.add_text_element(te, i, separators.count - j)
        end
      end

      table.lstrip_lines!
      table
    end

    # For API backwards-compatibility reasons, this returns an array of
    # arrays of text elements; missing elements are replaced by empty
    # TextElements and the rows are ordered by their largest +top+.
    def make_table(options={})
      get_table(options).lines.map do |l|
        l.text_elements.map! do |te|
          te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
        end
      end.sort_by { |l| l.map { |te| te.top or 0 }.max }
    end

    # Returns the Spreadsheets, creating them if they're not memoized.
    # With options[:fill_in_cells] the cells' text is resolved as well.
    def spreadsheets(options={})
      return @spreadsheets unless @spreadsheets.nil?

      get_ruling_lines!(options)
      self.find_cells!(options)

      spreadsheet_areas = find_spreadsheets_from_cells #literally, java.awt.geom.Area objects. polygons.

      #transform each spreadsheet area into a rectangle
      # and get the cells contained within it.
      spreadsheet_rectangle_areas = spreadsheet_areas.map{|a| a.getBounds } #getBounds2D is theoretically better, but returns a Rectangle2D.Double, which doesn't have our Ruby sugar on it.

      @spreadsheets = spreadsheet_rectangle_areas.map do |rect|
        spr = Spreadsheet.new(rect.y, rect.x,
                              rect.width, rect.height,
                              self,
                              #TODO: keep track of the cells, instead of getting them again inefficiently.
                              [],
                              vertical_ruling_lines.select{|vl| rect.intersectsLine(vl) },
                              horizontal_ruling_lines.select{|hl| rect.intersectsLine(hl) }
                              )
        spr.cells = @cells.select{|c| spr.overlaps?(c) }
        spr.add_spanning_cells!
        spr
      end
      fill_in_cells! if options[:fill_in_cells]
      @spreadsheets
    end

    # Assigns to every cell of every spreadsheet the merged words found
    # inside it on this page, then marks the spreadsheet's cells as resolved.
    def fill_in_cells!(options={})
      spreadsheets(options).each do |spreadsheet|
        spreadsheet.cells.each do |cell|
          # The text lives on this page, so query self.
          cell.text_elements = get_cell_text(cell)
        end
        spreadsheet.cells_resolved = true
      end
    end

    # Page number; zero-based when +indexing_base+ is :zero_indexed,
    # one-based otherwise.
    def number(indexing_base=:one_indexed)
      if indexing_base == :zero_indexed
        @number_one_indexed - 1
      else
        @number_one_indexed
      end
    end

    # TODO no need for this, let's choose one name
    def ruling_lines
      get_ruling_lines!
    end

    # Collapsed horizontal rulings, or [] when the page has none.
    def horizontal_ruling_lines
      get_ruling_lines!
      @horizontal_ruling_lines || []
    end

    # Collapsed vertical rulings, or [] when the page has none.
    def vertical_ruling_lines
      get_ruling_lines!
      @vertical_ruling_lines || []
    end

    # Returns the collapsed vertical rulings followed by the collapsed
    # horizontal ones, memoizing both lists; [] when the page has no rulings.
    # NOTE(review): snap_points! runs on every call, not only the first one.
    def get_ruling_lines!(options={})
      return [] if @ruling_lines.nil? || @ruling_lines.empty?

      self.snap_points!
      @vertical_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
      @horizontal_ruling_lines ||= self.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
      @vertical_ruling_lines + @horizontal_ruling_lines
    end

    ##
    # Get the text inside an area.
    # area can be an Array ([top, left, bottom, right]),
    # an object responding to #contains (e.g. a Rectangle2D),
    # or nil, in which case all texts are returned.
    def get_text(area=nil)
      if area.instance_of?(Array)
        top, left, bottom, right = area
        area = Tabula::ZoneEntity.new(top, left,
                                      right - left, bottom - top)
      end
      if area.nil?
        texts
      else
        texts.select do |t|
          area.contains(t)
        end
      end
    end

    # The text inside +area+ (see #get_text), merged into words.
    def get_cell_text(area=nil)
      TextElement.merge_words(self.get_text(area))
    end

    def to_json(options={})
      { :width => self.width,
        :height => self.height,
        :number => self.number,
        :rotation => self.rotation,
        :texts => self.texts
      }.to_json(options)
    end

    # Snaps ruling end points that are closer together than the smallest
    # character size onto a common coordinate, independently for x and y.
    def snap_points!
      lines_to_points = {}
      points = []
      @ruling_lines.each do |line|
        # For a given line, each call to #p1 and #p2 creates a new
        # Point2D::Float object, rather than returning the same one over and
        # over again. So we get each point once and store the same object
        # both per line and in the flat list: modifying it through the list
        # also changes what is written back to the line below.
        point1 = line.p1
        point2 = line.p2
        lines_to_points[line] = [point1, point2]
        points += [point1, point2]
      end

      [[:x, :x=, self.min_char_width], [:y, :y=, self.min_char_height]].each do |getter, setter, cell_size|
        # Without any text there is no character size to use as tolerance;
        # leave this axis untouched instead of comparing against nil.
        next if cell_size.nil?

        sorted_points = points.sort_by(&getter)
        first_point = sorted_points.shift
        # Group points whose coordinate is within cell_size of the first
        # point of the current group.
        grouped_points = sorted_points.inject([[first_point]]) do |memo, next_point|
          last = memo.last

          if (next_point.send(getter) - last.first.send(getter)).abs < cell_size
            memo[-1] << next_point
          else
            memo << [next_point]
          end
          memo
        end
        # Move every point of a group to the average of the group's distinct
        # coordinates.
        grouped_points.each do |group|
          uniq_locs = group.map(&getter).uniq
          avg_loc = uniq_locs.sum / uniq_locs.size
          group.each{|p| p.send(setter, avg_loc) }
        end
      end

      # Write the (possibly moved) points back onto their lines.
      lines_to_points.each do |l, p1_p2|
        l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0],
                    p1_p2[1]
      end
    end

    # Merges rulings that share a position and nearly intersect, and drops
    # zero-length rulings that start a new group.
    # +lines+ must all be of one orientation (i.e. horizontal, vertical);
    # the array is sorted in place and its first element is removed.
    # Returns [] for an empty input.
    def collapse_oriented_rulings(lines)
      # Seeding the fold with lines.shift would otherwise produce [nil].
      return [] if lines.empty?

      lines.sort! {|a, b| a.position != b.position ? a.position <=> b.position : a.start <=> b.start }

      lines.inject([lines.shift]) do |memo, next_line|
        last = memo.last
        if next_line.position == last.position && last.nearlyIntersects?(next_line)
          # Extend the previous ruling so it covers both.
          memo.last.start = next_line.start < last.start ? next_line.start : last.start
          memo.last.end = next_line.end < last.end ? last.end : next_line.end
          memo
        elsif next_line.length == 0
          memo
        else
          memo << next_line
        end
      end
    end
  end

end