tabula-extractor 0.6.6-java → 0.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/AUTHORS.md +1 -0
  3. data/README.md +27 -11
  4. data/bin/tabula +61 -19
  5. data/ext/liblsd-linux32.so +0 -0
  6. data/ext/liblsd-linux64.so +0 -0
  7. data/ext/liblsd.dll +0 -0
  8. data/ext/liblsd.dylib +0 -0
  9. data/ext/liblsd64.dll +0 -0
  10. data/ext/lsd.c +137 -137
  11. data/ext/lsd.h +9 -9
  12. data/lib/tabula.rb +20 -3
  13. data/lib/tabula/core_ext.rb +261 -0
  14. data/lib/tabula/entities.rb +11 -456
  15. data/lib/tabula/entities/cell.rb +42 -0
  16. data/lib/tabula/entities/has_cells.rb +244 -0
  17. data/lib/tabula/entities/line.rb +39 -0
  18. data/lib/tabula/entities/page.rb +269 -0
  19. data/lib/tabula/entities/page_area.rb +7 -0
  20. data/lib/tabula/entities/ruling.rb +300 -0
  21. data/lib/tabula/entities/spreadsheet.rb +92 -0
  22. data/lib/tabula/entities/table.rb +81 -0
  23. data/lib/tabula/entities/text_chunk.rb +114 -0
  24. data/lib/tabula/entities/text_element.rb +112 -0
  25. data/lib/tabula/entities/zone_entity.rb +57 -0
  26. data/lib/tabula/extraction.rb +327 -0
  27. data/lib/tabula/line_segment_detector.rb +9 -7
  28. data/lib/tabula/pdf_line_extractor.rb +319 -0
  29. data/lib/tabula/pdf_render.rb +1 -5
  30. data/lib/tabula/spreadsheet_extractor.rb +52 -0
  31. data/lib/tabula/table_extractor.rb +50 -348
  32. data/lib/tabula/table_guesser.rb +21 -23
  33. data/lib/tabula/version.rb +1 -1
  34. data/lib/tabula/writers.rb +5 -6
  35. data/tabula-extractor.gemspec +1 -0
  36. data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
  37. data/test/data/47008204D_USA.page4.pdf +0 -0
  38. data/test/data/560015757GV_China.page1.pdf +0 -0
  39. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  40. data/test/data/S2MNCEbirdisland.pdf +0 -0
  41. data/test/data/campaign_donors.pdf +0 -0
  42. data/test/data/frx_2012_disclosure.tsv +88 -0
  43. data/test/data/no_tables.pdf +0 -0
  44. data/test/data/puertos1.pdf +0 -0
  45. data/test/data/spanning_cells.csv +21 -0
  46. data/test/data/spanning_cells.pdf +0 -0
  47. data/test/data/strongschools.pdf +0 -0
  48. data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
  49. data/test/data/vietnam3.pdf +0 -0
  50. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  51. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  52. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  53. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  54. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  55. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  56. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  57. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  58. data/test/heuristic.rb +50 -0
  59. data/test/test_bin_tabula.sh +7 -0
  60. data/test/tests.rb +476 -63
  61. metadata +79 -28
  62. data/lib/geom/point.rb +0 -21
  63. data/lib/geom/rectangle.rb +0 -101
  64. data/lib/geom/segment.rb +0 -82
  65. data/lib/tabula/pdf_dump.rb +0 -132
  66. data/lib/tabula/whitespace.rb +0 -50
  67. data/vertical_rulings_bug.rb +0 -29
@@ -5,8 +5,7 @@ require 'ffi'
5
5
 
6
6
  require_relative './entities'
7
7
  require_relative './pdf_render'
8
- require_relative './pdf_dump'
9
- require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
8
+ require_relative './extraction'
10
9
 
11
10
  java_import javax.imageio.ImageIO
12
11
  java_import java.awt.image.BufferedImage
@@ -55,6 +54,7 @@ module Tabula
55
54
  lines
56
55
  end
57
56
 
57
+ #zero-indexed page_number
58
58
  def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
59
59
  options = DETECT_LINES_DEFAULTS.merge(options)
60
60
 
@@ -79,7 +79,7 @@ module Tabula
79
79
  raise ArgumentError, 'image must be a string or a BufferedImage'
80
80
  end
81
81
 
82
- image = LSD.image_to_image_double(bimage)
82
+ image = LSD.image_to_image_float(bimage)
83
83
 
84
84
  lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
85
85
 
@@ -89,7 +89,7 @@ module Tabula
89
89
 
90
90
  rv = []
91
91
  lines_found.times do |i|
92
- a = out[7*8*i].read_array_of_type(:double, 7)
92
+ a = out[7*4*i].read_array_of_type(:float, 7)
93
93
 
94
94
  a_round = a[0..3].map(&:round)
95
95
  p1, p2 = [[a_round[0], a_round[1]], [a_round[2], a_round[3]]]
@@ -109,17 +109,19 @@ module Tabula
109
109
  end
110
110
 
111
111
  private
112
- def LSD.image_to_image_double(buffered_image)
112
+
113
+ def LSD.image_to_image_float(buffered_image)
113
114
  width = buffered_image.getWidth; height = buffered_image.getHeight
114
115
  raster_size = width * height
115
116
 
116
- image_double = FFI::MemoryPointer.new(:double, raster_size)
117
+ image_float = FFI::MemoryPointer.new(:float, raster_size)
117
118
  pixels = Java::int[width * height].new
118
119
  buffered_image.getRGB(0, 0, width, height, pixels, 0, width)
119
120
 
120
- image_double.put_array_of_double 0, pixels.to_a
121
+ image_float.put_array_of_float 0, pixels.to_a
121
122
  end
122
123
 
124
+
123
125
  end
124
126
  end
125
127
 
@@ -0,0 +1,319 @@
1
+ java_import org.apache.pdfbox.util.operator.OperatorProcessor
2
+ java_import org.apache.pdfbox.pdfparser.PDFParser
3
+ java_import org.apache.pdfbox.util.PDFStreamEngine
4
+ java_import org.apache.pdfbox.util.ResourceLoader
5
+
6
+ java_import java.awt.geom.PathIterator
7
+ java_import java.awt.geom.Point2D
8
+ java_import java.awt.geom.GeneralPath
9
+ java_import java.awt.geom.AffineTransform
10
+ java_import java.awt.Color
11
+
12
+ warn 'Tabula::Extraction::LineExtractor is DEPRECATED and will be removed'
13
+
14
##
# Deprecated PDFBox stream engine that collects horizontal and vertical line
# segments ("rulings") drawn on a PDF page.
#
# Path-construction operators (+m+, +l+, +re+) accumulate axis-aligned
# segments in +currentPath+; path-painting operators then either promote
# those segments to rulings (via #addRuling) or discard them.
class Tabula::Extraction::LineExtractor < org.apache.pdfbox.util.PDFStreamEngine

  attr_accessor :currentX, :currentY
  attr_accessor :currentPath
  attr_accessor :rulings
  attr_accessor :options
  field_accessor :page

  # Defaults for the options hash; caller-supplied values take precedence.
  DETECT_LINES_DEFAULTS = {
    :snapping_grid_cell_size => 2
  }

  ##
  # Merges vertical rulings that share the same +left+ and nearly intersect.
  #
  # +lines+ should all be of one orientation (here: vertical).
  # NOTE: sorts +lines+ in place and mutates the surviving ruling objects
  # (their +top+/+bottom+ are widened to cover the merged segments).
  #
  # Returns a new array of the collapsed rulings.
  def self.collapse_vertical_rulings(lines)
    lines.sort! { |a, b| a.left != b.left ? a.left <=> b.left : a.top <=> b.top }
    lines.inject([]) do |memo, next_line|
      if memo.last && next_line.left == memo.last.left && memo.last.nearlyIntersects?(next_line)
        memo.last.top = [next_line.top, memo.last.top].min
        memo.last.bottom = [next_line.bottom, memo.last.bottom].max
        memo
      else
        memo << next_line
      end
    end
  end

  ##
  # Merges horizontal rulings that share the same +top+ and nearly intersect.
  #
  # +lines+ should all be of one orientation (here: horizontal).
  # NOTE: sorts +lines+ in place and mutates the surviving ruling objects
  # (their +left+/+right+ are widened to cover the merged segments).
  #
  # Returns a new array of the collapsed rulings.
  def self.collapse_horizontal_rulings(lines)
    lines.sort! { |a, b| a.top != b.top ? a.top <=> b.top : a.left <=> b.left }
    lines.inject([]) do |memo, next_line|
      if memo.last && next_line.top == memo.last.top && memo.last.nearlyIntersects?(next_line)
        memo.last.left = [next_line.left, memo.last.left].min
        memo.last.right = [next_line.right, memo.last.right].max
        memo
      else
        memo << next_line
      end
    end
  end

  ##
  # Returns the rulings found on one page of a PDF.
  #
  # pdf_path::    path of the PDF file
  # page_number:: zero-indexed page number
  # options::     hash of options. When +:render_pdf+ is truthy the lines are
  #               detected with Tabula::LSD and cleaned with
  #               Tabula::Ruling::clean_rulings; otherwise the page's content
  #               stream is processed by an instance of this class.
  #               Missing keys are filled in from DETECT_LINES_DEFAULTS.
  #               The caller's hash is not modified.
  def self.lines_in_pdf_page(pdf_path, page_number, options={})
    # Defaults must not override what the caller passed, and the caller's
    # hash must not be mutated -- hence defaults.merge(options), not merge!.
    options = DETECT_LINES_DEFAULTS.merge(options)

    if options[:render_pdf]
      # only LSD rulings need to be "cleaned" with clean_rulings; might as well
      # do this here since there's no good reason to want unclean lines
      Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(pdf_path, page_number, options))
    else
      pdf_file = ::Tabula::Extraction.openPDF(pdf_path)
      begin
        page = pdf_file.getDocumentCatalog.getAllPages[page_number]
        le = self.new(options)
        le.processStream(page, page.findResources, page.getContents.getStream)
      ensure
        # close the document even if stream processing raises
        pdf_file.close
      end

      rulings = le.rulings.map do |l, color|
        ::Tabula::Ruling.new(l.getP1.getY,
                             l.getP1.getX,
                             l.getP2.getX - l.getP1.getX,
                             l.getP2.getY - l.getP1.getY,
                             color)
      end

      # drop zero-length rulings and rulings with any negative coordinate
      rulings = rulings.reject do |l|
        (l.left == l.right && l.top == l.bottom) ||
          [l.top, l.left, l.bottom, l.right].any? { |p| p < 0 }
      end

      collapse_vertical_rulings(rulings.select(&:vertical?)) +
        collapse_horizontal_rulings(rulings.select(&:horizontal?))
    end
  end

  # 'l': appends a segment from the current point to the given point, if the
  # segment is horizontal or vertical, and moves the current point.
  class LineToOperator < OperatorProcessor
    def process(operator, arguments)
      drawer = self.context
      x, y = arguments[0], arguments[1]
      ppos = drawer.TransformedPoint(x.floatValue, y.floatValue)

      l = java.awt.geom.Line2D::Float.new(drawer.currentX, drawer.currentY, ppos.getX, ppos.getY)

      drawer.currentPath << l if l.horizontal? || l.vertical?

      drawer.currentX, drawer.currentY = ppos.getX, ppos.getY
    end
  end

  # 'm': moves the current point without drawing.
  class MoveToOperator < OperatorProcessor
    def process(operator, arguments)
      drawer = self.context
      x, y = arguments[0], arguments[1]

      ppos = drawer.TransformedPoint(x.floatValue, y.floatValue)

      drawer.currentX, drawer.currentY = ppos.getX, ppos.getY
    end
  end

  # 're': appends a rectangle to the current path. A "thin" rectangle
  # (shorter side < 2) becomes a single centre line; any other rectangle
  # contributes its four edges.
  class AppendRectangleToPathOperator < OperatorProcessor
    def process(operator, arguments)
      drawer = self.context
      finalX, finalY, finalW, finalH = arguments.to_array.map(&:floatValue)

      ppos = drawer.TransformedPoint(finalX, finalY)
      psize = drawer.ScaledPoint(finalW, finalH)

      # top edge of the rectangle, clamped so it never goes above the page
      finalY = ppos.getY - psize.getY
      finalY = 0 if finalY < 0

      width = psize.getX.abs
      height = psize.getY.abs

      lines = if width > height && height < 2 # horizontal line, "thin" rectangle.
                [java.awt.geom.Line2D::Float.new(ppos.getX, finalY + psize.getY/2, ppos.getX + psize.getX, finalY + psize.getY/2)]
              elsif width < height && width < 2 # vertical line, "thin" rectangle
                [java.awt.geom.Line2D::Float.new(ppos.getX + psize.getX/2, finalY, ppos.getX + psize.getX/2, finalY + psize.getY)]
              else
                # every edge of the rectangle
                [java.awt.geom.Line2D::Float.new(ppos.getX, finalY, ppos.getX + psize.getX, finalY),
                 java.awt.geom.Line2D::Float.new(ppos.getX, finalY, ppos.getX, finalY + psize.getY),
                 java.awt.geom.Line2D::Float.new(ppos.getX + psize.getX, finalY, ppos.getX + psize.getX, finalY + psize.getY),
                 java.awt.geom.Line2D::Float.new(ppos.getX, finalY + psize.getY, ppos.getX + psize.getX, finalY + psize.getY)]
              end

      drawer.currentPath += lines.select { |l| l.horizontal? || l.vertical? }
    end
  end

  # Turns the current path into rulings tagged with the stroking colour,
  # provided the colour passes options[:line_color_filter]; then resets the path.
  class StrokePathOperator < OperatorProcessor
    def process(operator, arguments)
      drawer = self.context
      strokeColorComps = drawer.getGraphicsState.getStrokingColor.getJavaColor.getRGBColorComponents(nil)
      color_filter = drawer.options[:line_color_filter] || lambda { |c| true } # by default, use all lines, regardless of color
      if color_filter.call(strokeColorComps)
        drawer.currentPath.each { |segment| drawer.addRuling(segment, strokeColorComps.to_a) }
      end

      drawer.currentPath = []
    end
  end

  # Turns the current path into rulings tagged with the non-stroking (fill)
  # colour, provided the colour passes options[:line_color_filter]; then
  # resets the path.
  class CloseFillNonZeroAndStrokePathOperator < OperatorProcessor
    def process(operator, arguments)
      drawer = self.context

      fillColorComps = drawer.getGraphicsState.getNonStrokingColor.getJavaColor.getRGBColorComponents(nil)
      color_filter = drawer.options[:line_color_filter] || lambda { |c| true } # by default, use all lines, regardless of color
      if color_filter.call(fillColorComps)
        drawer.currentPath.each { |segment| drawer.addRuling(segment, fillColorComps.to_a) }
      end

      drawer.currentPath = []
    end
  end

  # Adds every segment of the current path as a ruling with the default
  # colour, ignoring any colour filter.
  # NOTE(review): not referenced by OPERATOR_PROCESSORS below ('s' is mapped
  # to StrokePathOperator); kept for callers that may reference the constant.
  class CloseAndStrokePathOperator < OperatorProcessor
    def process(operator, arguments)
      drawer = self.context
      drawer.currentPath.each { |segment| drawer.addRuling(segment) }
      drawer.currentPath = []
    end
  end

  # 'n': end path without stroke, we don't care about it. discard it
  class EndPathOperator < OperatorProcessor
    def process(operator, arguments)
      drawer = self.context
      drawer.currentPath = []
    end
  end

  # Discards the current path.
  # NOTE(review): not referenced by OPERATOR_PROCESSORS below ('f' and 'f*'
  # are mapped to CloseFillNonZeroAndStrokePathOperator).
  class FillNonZeroRuleOperator < OperatorProcessor
    def process(operator, arguments)
      drawer = self.context
      drawer.currentPath = []
    end
  end

  # PDF operator => processor instance, registered on every new extractor.
  OPERATOR_PROCESSORS = {
    'm' => MoveToOperator.new,
    're' => AppendRectangleToPathOperator.new,
    'l' => LineToOperator.new,
    'S' => StrokePathOperator.new,
    's' => StrokePathOperator.new,
    'n' => EndPathOperator.new,
    'b' => CloseFillNonZeroAndStrokePathOperator.new,
    'b*' => CloseFillNonZeroAndStrokePathOperator.new,
    'f' => CloseFillNonZeroAndStrokePathOperator.new,
    'f*' => CloseFillNonZeroAndStrokePathOperator.new,
    'BT' => org.apache.pdfbox.util.operator.BeginText.new,
    'cm' => org.apache.pdfbox.util.operator.Concatenate.new,
    'CS' => org.apache.pdfbox.util.operator.SetStrokingColorSpace.new,
    'cs' => org.apache.pdfbox.util.operator.SetNonStrokingColorSpace.new,
    'ET' => org.apache.pdfbox.util.operator.EndText.new,
    'G' => org.apache.pdfbox.util.operator.SetStrokingGrayColor.new,
    'g' => org.apache.pdfbox.util.operator.SetNonStrokingGrayColor.new,
    'gs' => org.apache.pdfbox.util.operator.SetGraphicsStateParameters.new,
    'K' => org.apache.pdfbox.util.operator.SetStrokingCMYKColor.new,
    'k' => org.apache.pdfbox.util.operator.SetNonStrokingCMYKColor.new,
    'q' => org.apache.pdfbox.util.operator.GSave.new,
    'Q' => org.apache.pdfbox.util.operator.GRestore.new,
    'RG' => org.apache.pdfbox.util.operator.SetStrokingRGBColor.new,
    'rg' => org.apache.pdfbox.util.operator.SetNonStrokingRGBColor.new,
    'SC' => org.apache.pdfbox.util.operator.SetStrokingColor.new,
    'sc' => org.apache.pdfbox.util.operator.SetNonStrokingColor.new,
    'SCN' => org.apache.pdfbox.util.operator.SetStrokingColor.new,
    'scn' => org.apache.pdfbox.util.operator.SetNonStrokingColor.new,
    'T*' => org.apache.pdfbox.util.operator.NextLine.new,
    'Tc' => org.apache.pdfbox.util.operator.SetCharSpacing.new,
    'Td' => org.apache.pdfbox.util.operator.MoveText.new,
    'TD' => org.apache.pdfbox.util.operator.MoveTextSetLeading.new,
    'Tf' => org.apache.pdfbox.util.operator.SetTextFont.new,
    'Tj' => org.apache.pdfbox.util.operator.ShowText.new,
    'TJ' => org.apache.pdfbox.util.operator.ShowTextGlyph.new,
    'TL' => org.apache.pdfbox.util.operator.SetTextLeading.new,
    'Tm' => org.apache.pdfbox.util.operator.SetMatrix.new,
    'Tr' => org.apache.pdfbox.util.operator.SetTextRenderingMode.new,
    'Ts' => org.apache.pdfbox.util.operator.SetTextRise.new,
    'Tw' => org.apache.pdfbox.util.operator.SetWordSpacing.new,
    'Tz' => org.apache.pdfbox.util.operator.SetHorizontalTextScaling.new,
    "'" => org.apache.pdfbox.util.operator.MoveAndShow.new,
    # The key must be the one-character operator `"`. Inside single quotes a
    # backslash before a double quote is kept literally, so '\"' would be a
    # two-character key that never matches the operator.
    '"' => org.apache.pdfbox.util.operator.SetMoveAndShow.new,
  }

  ##
  # options:: hash of options; missing keys are filled in from
  #           DETECT_LINES_DEFAULTS. The caller's hash is not modified.
  #           +:line_color_filter+ (a callable) is consulted by the
  #           path-painting operators above.
  def initialize(options={})
    super()
    # caller-supplied values win over the defaults
    @options = DETECT_LINES_DEFAULTS.merge(options)
    self.clear!
    # NOTE(review): the processor instances in OPERATOR_PROCESSORS are shared
    # by all extractors; presumably registering rebinds their context to this
    # instance -- confirm before using several extractors concurrently.
    OPERATOR_PROCESSORS.each { |k, v| registerOperatorProcessor(k, v) }
  end

  # Resets collected rulings, the current point, the current path and the
  # cached page size.
  def clear!
    self.rulings = []
    self.currentX = -1
    self.currentY = -1
    self.currentPath = []
    @pageSize = nil
  end

  ##
  # Records +ruling+ (a line segment) together with +color+ (defaults to
  # [0,0,0]) in +rulings+. When the page rotation is one of 90, -270, -90 or
  # 270 the segment is first rotated around the media box's lower-left corner
  # and translated; it is then snapped to the grid given by
  # options[:snapping_grid_cell_size].
  def addRuling(ruling, color=nil)
    color = [0,0,0] if color.nil?
    rotation = page.getRotation

    if !rotation.nil? && [90, -270, -90, 270].include?(rotation)
      mb = page.findMediaBox

      ruling.rotate!(mb.getLowerLeftX, mb.getLowerLeftY, rotation)

      trans = if rotation == 90 || rotation == -270
                AffineTransform.getTranslateInstance(mb.getHeight, 0)
              else
                AffineTransform.getTranslateInstance(0, mb.getWidth)
              end
      ruling.transform!(trans)
    end

    # snapping to grid and joining lines that are close together
    ruling.snap!(options[:snapping_grid_cell_size])

    self.rulings << [ruling, color]
  end

  ##
  # get current page size (memoized from the page's media box)
  def pageSize
    @pageSize ||= self.page.findMediaBox.createDimension
  end

  ##
  # flip the Y coordinate so that it is measured from the top of the page
  # (page height minus +y+)
  def fixY(y)
    pageSize.getHeight - y
  end

  ##
  # Scales a size. Call as ScaledPoint(x, y) to use the scale factors of the
  # current transformation matrix, or ScaledPoint(x, y, scaleX, scaleY) to
  # supply them. A component whose scale factor is not positive becomes 0.0.
  #
  # Returns a java.awt.geom.Point2D::Float.
  def ScaledPoint(*args)
    x, y = args[0], args[1]

    # if scale factor not provided, get it from current transformation matrix
    if args.size == 2
      at = getGraphicsState.getCurrentTransformationMatrix.createAffineTransform
      scaleX = at.getScaleX
      scaleY = at.getScaleY
    else
      scaleX = args[2]
      scaleY = args[3]
    end

    finalX = scaleX > 0 ? x * scaleX : 0.0
    finalY = scaleY > 0 ? y * scaleY : 0.0

    java.awt.geom.Point2D::Float.new(finalX, finalY)
  end

  ##
  # Applies the current transformation matrix to (x, y) and flips the
  # resulting Y with #fixY.
  #
  # Returns a java.awt.geom.Point2D::Float.
  def TransformedPoint(x, y)
    position = [x, y].to_java(:float)
    at = self.getGraphicsState.getCurrentTransformationMatrix.createAffineTransform
    at.transform(position, 0, position, 0, 1)
    position[1] = fixY(position[1])
    java.awt.geom.Point2D::Float.new(position[0], position[1])
  end

end
@@ -1,7 +1,5 @@
1
1
  require 'java'
2
2
 
3
- require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
4
-
5
3
  java_import org.apache.pdfbox.pdmodel.PDDocument
6
4
  java_import org.apache.pdfbox.pdfviewer.PageDrawer
7
5
  java_import java.awt.image.BufferedImage
@@ -31,7 +29,6 @@ module Tabula
31
29
  rotation = java.lang.Math.toRadians(page.findRotation)
32
30
 
33
31
  scaling = width / (rotation == 0 ? widthPt : heightPt)
34
- #widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
35
32
  widthPx, heightPx = (java.lang.Math.java_send :round, [Java::float], widthPt * scaling ), (java.lang.Math.java_send :round, [Java::float], heightPt * scaling)
36
33
 
37
34
 
@@ -44,7 +41,7 @@ module Tabula
44
41
  graphics.setBackground(TRANSPARENT_WHITE)
45
42
  graphics.clearRect(0, 0, retval.getWidth, retval.getHeight)
46
43
  if rotation != 0
47
- graphics.translate(retval.getWidth, 0.0)
44
+ graphics.java_send :translate, [Java::int, Java::int], retval.getWidth, 0.0
48
45
  graphics.rotate(rotation)
49
46
  end
50
47
  graphics.scale(scaling, scaling)
@@ -65,4 +62,3 @@ if __FILE__ == $0
65
62
  ImageIO.write(bi, 'png',
66
63
  java.io.File.new('notext.png'))
67
64
  end
68
-
@@ -0,0 +1,52 @@
1
module Tabula
  module Extraction

    warn 'Tabula::Extraction::SpreadsheetExtractor is DEPRECATED and will be removed. Use ObjectExtractor instead'

    # Deprecated extractor that walks the selected pages and hands out every
    # spreadsheet found on them.
    class SpreadsheetExtractor < ObjectExtractor

      # Returns an Enumerator that yields +page, spreadsheet+ pairs.
      #
      # A page may hold any number of spreadsheets, so the same page can be
      # yielded any number of times (including zero); indexing the results
      # with each_with_index therefore does NOT give page numbers.
      #
      # +options+ is passed through to Tabula::Page#spreadsheets.
      # The PDF file is closed once enumeration finishes or raises.
      #
      # TODO lots of repeated code with parent class
      # REFACTOR
      def extract(options={})
        Enumerator.new do |yielder|
          begin
            @pages.each do |page_number|
              # @pages is one-indexed, the PDFBox page list is zero-indexed.
              # TODO: this can error out ungracefully if you try to extract a
              # page that doesn't exist (e.g. page 5 of a 4 page doc). we
              # should catch and handle.
              pdfbox_page = @all_pages.get(page_number - 1)

              # pages without a content stream are skipped
              page_contents = pdfbox_page.getContents
              next if page_contents.nil?

              self.clear!
              self.drawPage(pdfbox_page)

              page = Tabula::Page.new(
                @pdf_filename,
                pdfbox_page.findCropBox.width,
                pdfbox_page.findCropBox.height,
                pdfbox_page.getRotation.to_i,
                page_number, # one-indexed
                self.characters,
                self.rulings
              )

              page.spreadsheets(options).each do |sheet|
                # fill each cell with the text elements that fall inside it
                sheet.cells.each { |cell| cell.text_elements = page.get_cell_text(cell) }
                yielder.yield(page, sheet)
              end
            end
          ensure
            @pdf_file.close
          end
        end
      end
    end
  end
end
48
+
49
+
50
+ #new plan:
51
+ # find all the cells on the page (lines -> minimal rects)
52
+ # find all the spreadsheets from the cells (minimal rects -> maximal rects)