tabula-extractor 0.6.6-java → 0.7.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/AUTHORS.md +1 -0
  3. data/README.md +27 -11
  4. data/bin/tabula +61 -19
  5. data/ext/liblsd-linux32.so +0 -0
  6. data/ext/liblsd-linux64.so +0 -0
  7. data/ext/liblsd.dll +0 -0
  8. data/ext/liblsd.dylib +0 -0
  9. data/ext/liblsd64.dll +0 -0
  10. data/ext/lsd.c +137 -137
  11. data/ext/lsd.h +9 -9
  12. data/lib/tabula.rb +20 -3
  13. data/lib/tabula/core_ext.rb +261 -0
  14. data/lib/tabula/entities.rb +11 -456
  15. data/lib/tabula/entities/cell.rb +42 -0
  16. data/lib/tabula/entities/has_cells.rb +244 -0
  17. data/lib/tabula/entities/line.rb +39 -0
  18. data/lib/tabula/entities/page.rb +269 -0
  19. data/lib/tabula/entities/page_area.rb +7 -0
  20. data/lib/tabula/entities/ruling.rb +300 -0
  21. data/lib/tabula/entities/spreadsheet.rb +92 -0
  22. data/lib/tabula/entities/table.rb +81 -0
  23. data/lib/tabula/entities/text_chunk.rb +114 -0
  24. data/lib/tabula/entities/text_element.rb +112 -0
  25. data/lib/tabula/entities/zone_entity.rb +57 -0
  26. data/lib/tabula/extraction.rb +327 -0
  27. data/lib/tabula/line_segment_detector.rb +9 -7
  28. data/lib/tabula/pdf_line_extractor.rb +319 -0
  29. data/lib/tabula/pdf_render.rb +1 -5
  30. data/lib/tabula/spreadsheet_extractor.rb +52 -0
  31. data/lib/tabula/table_extractor.rb +50 -348
  32. data/lib/tabula/table_guesser.rb +21 -23
  33. data/lib/tabula/version.rb +1 -1
  34. data/lib/tabula/writers.rb +5 -6
  35. data/tabula-extractor.gemspec +1 -0
  36. data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
  37. data/test/data/47008204D_USA.page4.pdf +0 -0
  38. data/test/data/560015757GV_China.page1.pdf +0 -0
  39. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  40. data/test/data/S2MNCEbirdisland.pdf +0 -0
  41. data/test/data/campaign_donors.pdf +0 -0
  42. data/test/data/frx_2012_disclosure.tsv +88 -0
  43. data/test/data/no_tables.pdf +0 -0
  44. data/test/data/puertos1.pdf +0 -0
  45. data/test/data/spanning_cells.csv +21 -0
  46. data/test/data/spanning_cells.pdf +0 -0
  47. data/test/data/strongschools.pdf +0 -0
  48. data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
  49. data/test/data/vietnam3.pdf +0 -0
  50. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  51. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  52. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  53. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  54. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  55. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  56. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  57. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  58. data/test/heuristic.rb +50 -0
  59. data/test/test_bin_tabula.sh +7 -0
  60. data/test/tests.rb +476 -63
  61. metadata +79 -28
  62. data/lib/geom/point.rb +0 -21
  63. data/lib/geom/rectangle.rb +0 -101
  64. data/lib/geom/segment.rb +0 -82
  65. data/lib/tabula/pdf_dump.rb +0 -132
  66. data/lib/tabula/whitespace.rb +0 -50
  67. data/vertical_rulings_bug.rb +0 -29
@@ -5,8 +5,7 @@ require 'ffi'
5
5
 
6
6
  require_relative './entities'
7
7
  require_relative './pdf_render'
8
- require_relative './pdf_dump'
9
- require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
8
+ require_relative './extraction'
10
9
 
11
10
  java_import javax.imageio.ImageIO
12
11
  java_import java.awt.image.BufferedImage
@@ -55,6 +54,7 @@ module Tabula
55
54
  lines
56
55
  end
57
56
 
57
+ #zero-indexed page_number
58
58
  def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
59
59
  options = DETECT_LINES_DEFAULTS.merge(options)
60
60
 
@@ -79,7 +79,7 @@ module Tabula
79
79
  raise ArgumentError, 'image must be a string or a BufferedImage'
80
80
  end
81
81
 
82
- image = LSD.image_to_image_double(bimage)
82
+ image = LSD.image_to_image_float(bimage)
83
83
 
84
84
  lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
85
85
 
@@ -89,7 +89,7 @@ module Tabula
89
89
 
90
90
  rv = []
91
91
  lines_found.times do |i|
92
- a = out[7*8*i].read_array_of_type(:double, 7)
92
+ a = out[7*4*i].read_array_of_type(:float, 7)
93
93
 
94
94
  a_round = a[0..3].map(&:round)
95
95
  p1, p2 = [[a_round[0], a_round[1]], [a_round[2], a_round[3]]]
@@ -109,17 +109,19 @@ module Tabula
109
109
  end
110
110
 
111
111
  private
112
- def LSD.image_to_image_double(buffered_image)
112
+
113
+ def LSD.image_to_image_float(buffered_image)
113
114
  width = buffered_image.getWidth; height = buffered_image.getHeight
114
115
  raster_size = width * height
115
116
 
116
- image_double = FFI::MemoryPointer.new(:double, raster_size)
117
+ image_float = FFI::MemoryPointer.new(:float, raster_size)
117
118
  pixels = Java::int[width * height].new
118
119
  buffered_image.getRGB(0, 0, width, height, pixels, 0, width)
119
120
 
120
- image_double.put_array_of_double 0, pixels.to_a
121
+ image_float.put_array_of_float 0, pixels.to_a
121
122
  end
122
123
 
124
+
123
125
  end
124
126
  end
125
127
 
@@ -0,0 +1,319 @@
1
+ java_import org.apache.pdfbox.util.operator.OperatorProcessor
2
+ java_import org.apache.pdfbox.pdfparser.PDFParser
3
+ java_import org.apache.pdfbox.util.PDFStreamEngine
4
+ java_import org.apache.pdfbox.util.ResourceLoader
5
+
6
+ java_import java.awt.geom.PathIterator
7
+ java_import java.awt.geom.Point2D
8
+ java_import java.awt.geom.GeneralPath
9
+ java_import java.awt.geom.AffineTransform
10
+ java_import java.awt.Color
11
+
12
+ warn 'Tabula::Extraction::LineExtractor is DEPRECATED and will be removed'
13
+
14
+ class Tabula::Extraction::LineExtractor < org.apache.pdfbox.util.PDFStreamEngine
15
+
16
+ attr_accessor :currentX, :currentY
17
+ attr_accessor :currentPath
18
+ attr_accessor :rulings
19
+ attr_accessor :options
20
+ field_accessor :page
21
+
22
+ DETECT_LINES_DEFAULTS = {
23
+ :snapping_grid_cell_size => 2
24
+ }
25
+
26
+ def self.collapse_vertical_rulings(lines) #lines should all be of one orientation (i.e. horizontal, vertical)
27
+ lines.sort!{|a, b| a.left != b.left ? a.left <=> b.left : a.top <=> b.top }
28
+ lines.inject([]) do |memo, next_line|
29
+ if memo.last && next_line.left == memo.last.left && memo.last.nearlyIntersects?(next_line)
30
+ memo.last.top = [next_line.top, memo.last.top].min
31
+ memo.last.bottom = [next_line.bottom, memo.last.bottom].max
32
+ memo
33
+ else
34
+ memo << next_line
35
+ end
36
+ end
37
+ end
38
+
39
+ def self.collapse_horizontal_rulings(lines) #lines should all be of one orientation (i.e. horizontal, vertical)
40
+ lines.sort!{|a, b| a.top != b.top ? a.top <=> b.top : a.left <=> b.left }
41
+ lines.inject([]) do |memo, next_line|
42
+ if memo.last && next_line.top == memo.last.top && memo.last.nearlyIntersects?(next_line)
43
+ memo.last.left = [next_line.left, memo.last.left].min
44
+ memo.last.right = [next_line.right, memo.last.right].max
45
+ memo
46
+ else
47
+ memo << next_line
48
+ end
49
+ end
50
+ end
51
+
52
+ #N.B. for merge `spreadsheets` into `text-extractor-refactor` --
53
+ # only substantive change here is calling Tabula::Ruling::clean_rulings on LSD output in this method
54
+ # the rest is readability changes.
55
+ #page_number here is zero-indexed
56
+ def self.lines_in_pdf_page(pdf_path, page_number, options={})
57
+ options = options.merge!(DETECT_LINES_DEFAULTS)
58
+ if options[:render_pdf]
59
+ # only LSD rulings need to be "cleaned" with clean_rulings; might as well do this here
60
+ # since there's no good reason want unclean lines
61
+ Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(pdf_path, page_number, options))
62
+ else
63
+ pdf_file = ::Tabula::Extraction.openPDF(pdf_path)
64
+ page = pdf_file.getDocumentCatalog.getAllPages[page_number]
65
+ le = self.new(options)
66
+ le.processStream(page, page.findResources, page.getContents.getStream)
67
+ pdf_file.close
68
+ rulings = le.rulings.map do |l, color|
69
+ ::Tabula::Ruling.new(l.getP1.getY,
70
+ l.getP1.getX,
71
+ l.getP2.getX - l.getP1.getX,
72
+ l.getP2.getY - l.getP1.getY,
73
+ color)
74
+ end
75
+ rulings.reject! { |l| (l.left == l.right && l.top == l.bottom) || [l.top, l.left, l.bottom, l.right].any? { |p| p < 0 } }
76
+ collapse_vertical_rulings(rulings.select(&:vertical?)) + collapse_horizontal_rulings(rulings.select(&:horizontal?))
77
+ end
78
+ end
79
+
80
+ class LineToOperator < OperatorProcessor
81
+ def process(operator, arguments)
82
+ drawer = self.context
83
+ x, y = arguments[0], arguments[1]
84
+ ppos = drawer.TransformedPoint(x.floatValue, y.floatValue)
85
+
86
+ l = java.awt.geom.Line2D::Float.new(drawer.currentX, drawer.currentY, ppos.getX, ppos.getY)
87
+
88
+ drawer.currentPath << l if l.horizontal? or l.vertical?
89
+
90
+ drawer.currentX, drawer.currentY = ppos.getX, ppos.getY
91
+ end
92
+ end
93
+
94
+ class MoveToOperator < OperatorProcessor
95
+ def process(operator, arguments)
96
+ drawer = self.context
97
+ x, y = arguments[0], arguments[1]
98
+
99
+ ppos = drawer.TransformedPoint(x.floatValue, y.floatValue)
100
+
101
+ drawer.currentX, drawer.currentY = ppos.getX, ppos.getY
102
+ end
103
+ end
104
+
105
+ class AppendRectangleToPathOperator < OperatorProcessor
106
+ def process(operator, arguments)
107
+
108
+ drawer = self.context
109
+ finalX, finalY, finalW, finalH = arguments.to_array.map(&:floatValue)
110
+
111
+ ppos = drawer.TransformedPoint(finalX, finalY)
112
+ psize = drawer.ScaledPoint(finalW, finalH)
113
+
114
+ finalY = ppos.getY - psize.getY
115
+ if finalY < 0
116
+ finalY = 0
117
+ end
118
+
119
+ width = psize.getX.abs
120
+ height = psize.getY.abs
121
+
122
+ lines = if width > height && height < 2 # horizontal line, "thin" rectangle.
123
+ [java.awt.geom.Line2D::Float.new(ppos.getX, finalY + psize.getY/2, ppos.getX + psize.getX, finalY + psize.getY/2)]
124
+ elsif width < height && width < 2 # vertical line, "thin" rectangle
125
+ [java.awt.geom.Line2D::Float.new(ppos.getX + psize.getX/2, finalY, ppos.getX + psize.getX/2, finalY + psize.getY)]
126
+ else
127
+ # add every edge of the rectangle to drawer.rulings
128
+ [java.awt.geom.Line2D::Float.new(ppos.getX, finalY, ppos.getX + psize.getX, finalY),
129
+ java.awt.geom.Line2D::Float.new(ppos.getX, finalY, ppos.getX, finalY + psize.getY),
130
+ java.awt.geom.Line2D::Float.new(ppos.getX+psize.getX, finalY, ppos.getX + psize.getX, finalY + psize.getY),
131
+ java.awt.geom.Line2D::Float.new(ppos.getX, finalY+psize.getY, ppos.getX + psize.getX, finalY + psize.getY)]
132
+ end
133
+
134
+ drawer.currentPath += lines.select { |l| l.horizontal? or l.vertical? }
135
+
136
+ end
137
+ end
138
+
139
+ class StrokePathOperator < OperatorProcessor
140
+ def process(operator, arguments)
141
+ drawer = self.context
142
+ strokeColorComps = drawer.getGraphicsState.getStrokingColor.getJavaColor.getRGBColorComponents(nil)
143
+ color_filter = drawer.options[:line_color_filter] || lambda{|c| true } #by default, use all lines, regardless of color
144
+ if color_filter.call(strokeColorComps)
145
+ drawer.currentPath.each { |segment| drawer.addRuling(segment, strokeColorComps.to_a) }
146
+ end
147
+
148
+ drawer.currentPath = []
149
+ end
150
+ end
151
+
152
+ class CloseFillNonZeroAndStrokePathOperator < OperatorProcessor
153
+ def process(operator, arguments)
154
+ drawer = self.context
155
+
156
+ fillColorComps = drawer.getGraphicsState.getNonStrokingColor.getJavaColor.getRGBColorComponents(nil)
157
+ color_filter = drawer.options[:line_color_filter] || lambda{|c| true } #by default, use all lines, regardless of color
158
+ if color_filter.call(fillColorComps)
159
+ drawer.currentPath.each { |segment| drawer.addRuling(segment, fillColorComps.to_a) }
160
+ end
161
+
162
+ drawer.currentPath = []
163
+ end
164
+ end
165
+
166
+ class CloseAndStrokePathOperator < OperatorProcessor
167
+ def process(operator, arguments)
168
+ drawer = self.context
169
+ drawer.currentPath.each { |segment| drawer.addRuling(segment) }
170
+ drawer.currentPath = []
171
+ end
172
+ end
173
+
174
+ class EndPathOperator < OperatorProcessor
175
+ def process(operator, arguments)
176
+ drawer = self.context
177
+ # end without stroke, we don't care about it. discard it
178
+ drawer.currentPath = []
179
+ end
180
+ end
181
+
182
+ class FillNonZeroRuleOperator < OperatorProcessor
183
+ def process(operator, arguments)
184
+ drawer = self.context
185
+ # end without stroke, we don't care about it. discard it
186
+ drawer.currentPath = []
187
+ end
188
+ end
189
+
190
+ OPERATOR_PROCESSORS = {
191
+ 'm' => MoveToOperator.new,
192
+ 're' => AppendRectangleToPathOperator.new,
193
+ 'l' => LineToOperator.new,
194
+ 'S' => StrokePathOperator.new,
195
+ 's' => StrokePathOperator.new,
196
+ 'n' => EndPathOperator.new,
197
+ 'b' => CloseFillNonZeroAndStrokePathOperator.new,
198
+ 'b*' => CloseFillNonZeroAndStrokePathOperator.new,
199
+ 'f' => CloseFillNonZeroAndStrokePathOperator.new,
200
+ 'f*' => CloseFillNonZeroAndStrokePathOperator.new,
201
+ 'BT' => org.apache.pdfbox.util.operator.BeginText.new,
202
+ 'cm' => org.apache.pdfbox.util.operator.Concatenate.new,
203
+ 'CS' => org.apache.pdfbox.util.operator.SetStrokingColorSpace.new,
204
+ 'cs' => org.apache.pdfbox.util.operator.SetNonStrokingColorSpace.new,
205
+ 'ET' => org.apache.pdfbox.util.operator.EndText.new,
206
+ 'G' => org.apache.pdfbox.util.operator.SetStrokingGrayColor.new,
207
+ 'g' => org.apache.pdfbox.util.operator.SetNonStrokingGrayColor.new,
208
+ 'gs' => org.apache.pdfbox.util.operator.SetGraphicsStateParameters.new,
209
+ 'K' => org.apache.pdfbox.util.operator.SetStrokingCMYKColor.new,
210
+ 'k' => org.apache.pdfbox.util.operator.SetNonStrokingCMYKColor.new,
211
+ 'q' => org.apache.pdfbox.util.operator.GSave.new,
212
+ 'Q' => org.apache.pdfbox.util.operator.GRestore.new,
213
+ 'RG' => org.apache.pdfbox.util.operator.SetStrokingRGBColor.new,
214
+ 'rg' => org.apache.pdfbox.util.operator.SetNonStrokingRGBColor.new,
215
+ 'SC' => org.apache.pdfbox.util.operator.SetStrokingColor.new,
216
+ 'sc' => org.apache.pdfbox.util.operator.SetNonStrokingColor.new,
217
+ 'SCN' => org.apache.pdfbox.util.operator.SetStrokingColor.new,
218
+ 'scn' => org.apache.pdfbox.util.operator.SetNonStrokingColor.new,
219
+ 'T*' => org.apache.pdfbox.util.operator.NextLine.new,
220
+ 'Tc' => org.apache.pdfbox.util.operator.SetCharSpacing.new,
221
+ 'Td' => org.apache.pdfbox.util.operator.MoveText.new,
222
+ 'TD' => org.apache.pdfbox.util.operator.MoveTextSetLeading.new,
223
+ 'Tf' => org.apache.pdfbox.util.operator.SetTextFont.new,
224
+ 'Tj' => org.apache.pdfbox.util.operator.ShowText.new,
225
+ 'TJ' => org.apache.pdfbox.util.operator.ShowTextGlyph.new,
226
+ 'TL' => org.apache.pdfbox.util.operator.SetTextLeading.new,
227
+ 'Tm' => org.apache.pdfbox.util.operator.SetMatrix.new,
228
+ 'Tr' => org.apache.pdfbox.util.operator.SetTextRenderingMode.new,
229
+ 'Ts' => org.apache.pdfbox.util.operator.SetTextRise.new,
230
+ 'Tw' => org.apache.pdfbox.util.operator.SetWordSpacing.new,
231
+ 'Tz' => org.apache.pdfbox.util.operator.SetHorizontalTextScaling.new,
232
+ "\'" => org.apache.pdfbox.util.operator.MoveAndShow.new,
233
+ '\"' => org.apache.pdfbox.util.operator.SetMoveAndShow.new,
234
+ }
235
+
236
+ def initialize(options={})
237
+ super()
238
+ @options = options.merge!(DETECT_LINES_DEFAULTS)
239
+ self.clear!
240
+ OPERATOR_PROCESSORS.each { |k,v| registerOperatorProcessor(k, v) }
241
+ end
242
+
243
+ def clear!
244
+ self.rulings = []
245
+ self.currentX = -1
246
+ self.currentY = -1
247
+ self.currentPath = []
248
+ @pageSize = nil
249
+ end
250
+
251
+ def addRuling(ruling, color=nil)
252
+ color = color.nil? ? [0,0,0] : color
253
+ if !page.getRotation.nil? && [90, -270, -90, 270].include?(page.getRotation)
254
+
255
+ mb = page.findMediaBox
256
+
257
+ ruling.rotate!(mb.getLowerLeftX, mb.getLowerLeftY, page.getRotation)
258
+
259
+ trans = if page.getRotation == 90 || page.getRotation == -270
260
+ AffineTransform.getTranslateInstance(mb.getHeight, 0)
261
+ else
262
+ AffineTransform.getTranslateInstance(0, mb.getWidth)
263
+ end
264
+ ruling.transform!(trans)
265
+ end
266
+
267
+ # snapping to grid and joining lines that are close together
268
+ ruling.snap!(options[:snapping_grid_cell_size])
269
+
270
+ self.rulings << [ruling, color]
271
+ end
272
+
273
+ ##
274
+ # get current page size
275
+ def pageSize
276
+ @pageSize ||= self.page.findMediaBox.createDimension
277
+ end
278
+
279
+ ##
280
+ # fix the Y coordinate based on page rotation
281
+ def fixY(y)
282
+ pageSize.getHeight - y
283
+ end
284
+
285
+ def ScaledPoint(*args)
286
+ x, y = args[0], args[1]
287
+
288
+ # if scale factor not provided, get it from current transformation matrix
289
+ if args.size == 2
290
+ ctm = getGraphicsState.getCurrentTransformationMatrix
291
+ at = ctm.createAffineTransform
292
+ scaleX = at.getScaleX; scaleY = at.getScaleY
293
+ else
294
+ scaleX = args[2]; scaleY = args[3]
295
+ end
296
+
297
+ finalX = 0.0;
298
+ finalY = 0.0;
299
+
300
+ if scaleX > 0
301
+ finalX = x * scaleX;
302
+ end
303
+ if scaleY > 0
304
+ finalY = y * scaleY;
305
+ end
306
+
307
+ return java.awt.geom.Point2D::Float.new(finalX, finalY);
308
+
309
+ end
310
+
311
+ def TransformedPoint(x, y)
312
+ position = [x,y].to_java(:float)
313
+ at = self.getGraphicsState.getCurrentTransformationMatrix.createAffineTransform
314
+ at.transform(position, 0, position, 0, 1)
315
+ position[1] = fixY(position[1])
316
+ java.awt.geom.Point2D::Float.new(position[0], position[1])
317
+ end
318
+
319
+ end
@@ -1,7 +1,5 @@
1
1
  require 'java'
2
2
 
3
- require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
4
-
5
3
  java_import org.apache.pdfbox.pdmodel.PDDocument
6
4
  java_import org.apache.pdfbox.pdfviewer.PageDrawer
7
5
  java_import java.awt.image.BufferedImage
@@ -31,7 +29,6 @@ module Tabula
31
29
  rotation = java.lang.Math.toRadians(page.findRotation)
32
30
 
33
31
  scaling = width / (rotation == 0 ? widthPt : heightPt)
34
- #widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
35
32
  widthPx, heightPx = (java.lang.Math.java_send :round, [Java::float], widthPt * scaling ), (java.lang.Math.java_send :round, [Java::float], heightPt * scaling)
36
33
 
37
34
 
@@ -44,7 +41,7 @@ module Tabula
44
41
  graphics.setBackground(TRANSPARENT_WHITE)
45
42
  graphics.clearRect(0, 0, retval.getWidth, retval.getHeight)
46
43
  if rotation != 0
47
- graphics.translate(retval.getWidth, 0.0)
44
+ graphics.java_send :translate, [Java::int, Java::int], retval.getWidth, 0.0
48
45
  graphics.rotate(rotation)
49
46
  end
50
47
  graphics.scale(scaling, scaling)
@@ -65,4 +62,3 @@ if __FILE__ == $0
65
62
  ImageIO.write(bi, 'png',
66
63
  java.io.File.new('notext.png'))
67
64
  end
68
-
@@ -0,0 +1,52 @@
1
+ module Tabula
2
+ module Extraction
3
+
4
+ warn 'Tabula::Extraction::SpreadsheetExtractor is DEPRECATED and will be removed. Use ObjectExtractor instead'
5
+
6
+ class SpreadsheetExtractor < ObjectExtractor
7
+
8
+ # yields each spreadsheet and the page it corresponds to
9
+ # because each page can contain an arbitrary number of spreadsheets, each page can be sent
10
+ # to the block an arbitrary number of times.
11
+ # so the extract.each_with_index trick will absolutely not work.
12
+
13
+ # TODO lots of repeated code with parent class
14
+ # REFACTOR
15
+ def extract(options={})
16
+ Enumerator.new do |y|
17
+ begin
18
+ @pages.each do |i|
19
+ pdfbox_page = @all_pages.get(i-1) #TODO: this can error out ungracefully if you try to extract a page that doesn't exist (e.g. page 5 of a 4 page doc). we should catch and handle.
20
+ contents = pdfbox_page.getContents
21
+ next if contents.nil?
22
+ self.clear!
23
+ self.drawPage pdfbox_page
24
+
25
+ page = Tabula::Page.new( @pdf_filename,
26
+ pdfbox_page.findCropBox.width,
27
+ pdfbox_page.findCropBox.height,
28
+ pdfbox_page.getRotation.to_i,
29
+ i, #one-indexed, just like `i` is.
30
+ self.characters,
31
+ self.rulings)
32
+
33
+ page.spreadsheets(options).each do |spreadsheet|
34
+ spreadsheet.cells.each do |cell|
35
+ cell.text_elements = page.get_cell_text(cell)
36
+ end
37
+ y.yield page, spreadsheet
38
+ end
39
+ end
40
+ ensure
41
+ @pdf_file.close
42
+ end # begin
43
+ end
44
+ end
45
+ end
46
+ end
47
+ end
48
+
49
+
50
+ #new plan:
51
+ # find all the cells on the page (lines -> minimal rects)
52
+ # find all the spreadsheets from the cells (minimal rects -> maximal rects)