tabula-extractor 0.6.6-java → 0.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/AUTHORS.md +1 -0
  3. data/README.md +27 -11
  4. data/bin/tabula +61 -19
  5. data/ext/liblsd-linux32.so +0 -0
  6. data/ext/liblsd-linux64.so +0 -0
  7. data/ext/liblsd.dll +0 -0
  8. data/ext/liblsd.dylib +0 -0
  9. data/ext/liblsd64.dll +0 -0
  10. data/ext/lsd.c +137 -137
  11. data/ext/lsd.h +9 -9
  12. data/lib/tabula.rb +20 -3
  13. data/lib/tabula/core_ext.rb +261 -0
  14. data/lib/tabula/entities.rb +11 -456
  15. data/lib/tabula/entities/cell.rb +42 -0
  16. data/lib/tabula/entities/has_cells.rb +244 -0
  17. data/lib/tabula/entities/line.rb +39 -0
  18. data/lib/tabula/entities/page.rb +269 -0
  19. data/lib/tabula/entities/page_area.rb +7 -0
  20. data/lib/tabula/entities/ruling.rb +300 -0
  21. data/lib/tabula/entities/spreadsheet.rb +92 -0
  22. data/lib/tabula/entities/table.rb +81 -0
  23. data/lib/tabula/entities/text_chunk.rb +114 -0
  24. data/lib/tabula/entities/text_element.rb +112 -0
  25. data/lib/tabula/entities/zone_entity.rb +57 -0
  26. data/lib/tabula/extraction.rb +327 -0
  27. data/lib/tabula/line_segment_detector.rb +9 -7
  28. data/lib/tabula/pdf_line_extractor.rb +319 -0
  29. data/lib/tabula/pdf_render.rb +1 -5
  30. data/lib/tabula/spreadsheet_extractor.rb +52 -0
  31. data/lib/tabula/table_extractor.rb +50 -348
  32. data/lib/tabula/table_guesser.rb +21 -23
  33. data/lib/tabula/version.rb +1 -1
  34. data/lib/tabula/writers.rb +5 -6
  35. data/tabula-extractor.gemspec +1 -0
  36. data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
  37. data/test/data/47008204D_USA.page4.pdf +0 -0
  38. data/test/data/560015757GV_China.page1.pdf +0 -0
  39. data/test/data/GSK_2012_Q4.page437.pdf +0 -0
  40. data/test/data/S2MNCEbirdisland.pdf +0 -0
  41. data/test/data/campaign_donors.pdf +0 -0
  42. data/test/data/frx_2012_disclosure.tsv +88 -0
  43. data/test/data/no_tables.pdf +0 -0
  44. data/test/data/puertos1.pdf +0 -0
  45. data/test/data/spanning_cells.csv +21 -0
  46. data/test/data/spanning_cells.pdf +0 -0
  47. data/test/data/strongschools.pdf +0 -0
  48. data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
  49. data/test/data/vietnam3.pdf +0 -0
  50. data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
  51. data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
  52. data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
  53. data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
  54. data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
  55. data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
  56. data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
  57. data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
  58. data/test/heuristic.rb +50 -0
  59. data/test/test_bin_tabula.sh +7 -0
  60. data/test/tests.rb +476 -63
  61. metadata +79 -28
  62. data/lib/geom/point.rb +0 -21
  63. data/lib/geom/rectangle.rb +0 -101
  64. data/lib/geom/segment.rb +0 -82
  65. data/lib/tabula/pdf_dump.rb +0 -132
  66. data/lib/tabula/whitespace.rb +0 -50
  67. data/vertical_rulings_bug.rb +0 -29
@@ -0,0 +1,114 @@
1
+ module Tabula
2
##
# A "collection" of TextElements that is handled as a single piece of text.
#
# The chunk's rectangle is grown through ZoneEntity#merge! every time an
# element (or another chunk) is added, and its text is the concatenation of
# the text of its elements.
class TextChunk < ZoneEntity
  attr_accessor :font, :font_size, :text_elements, :width_of_space

  ##
  # Initialize a new TextChunk from a TextElement.
  #
  # The chunk takes the element's top/left/width/height and holds it as its
  # only element.
  #
  # Raises TypeError unless +text_element+ is exactly a TextElement.
  def self.create_from_text_element(text_element)
    raise TypeError, "argument is not a TextElement" unless text_element.instance_of?(TextElement)
    chunk = new(text_element.top, text_element.left, text_element.width, text_element.height)
    chunk.text_elements = [text_element]
    chunk
  end

  ##
  # Group an iterable of TextChunk into a list of Line.
  #
  # Chunks whose text matches ONLY_SPACES_RE are ignored. Every other chunk
  # is appended to the first existing line whose
  # +horizontal_overlap_ratio+ with the chunk is at least 0.01; when there
  # is no such line a new Line is started.
  def self.group_by_lines(text_chunks)
    lines = []
    text_chunks.each do |chunk|
      next if chunk.text =~ ONLY_SPACES_RE
      line = lines.find { |candidate| candidate.horizontal_overlap_ratio(chunk) >= 0.01 }
      if line.nil?
        line = Line.new
        lines << line
      end
      line << chunk
    end
    lines
  end

  ##
  # Calculate estimated column boundaries from an iterable of TextChunk.
  #
  # The chunks are scanned in the order given. Whenever a chunk starts to
  # the right of the right-most edge seen so far, that edge is recorded as
  # a boundary. Chunks that are blank or that lie above the top of the
  # first line are skipped.
  #
  # Returns an array of x positions; empty when there is no non-blank chunk.
  def self.column_positions(text_chunks)
    lines = TextChunk.group_by_lines(text_chunks)
    # Without this guard `lines.first` is nil and the next line raises.
    return [] if lines.empty?

    top = lines.first.text_elements.map(&:top).min
    right = 0
    columns = []

    text_chunks.each do |chunk|
      next if chunk.text =~ ONLY_SPACES_RE
      next unless chunk.top >= top

      if chunk.left > right
        columns << right
        right = chunk.right
      elsif chunk.right > right
        right = chunk.right
      end
    end
    columns
  end

  ##
  # Add a TextElement to this TextChunk and grow the rectangle to include it.
  def <<(text_element)
    self.text_elements << text_element
    self.merge!(text_element)
  end

  ##
  # Grow this chunk's rectangle so that it also covers +other+.
  #
  # When +other+ is exactly a TextChunk its elements are absorbed too: they
  # are placed before this chunk's elements if +other+ horizontally overlaps
  # this chunk and starts above it, and after them otherwise.
  def merge!(other)
    if other.instance_of?(TextChunk)
      if self.horizontally_overlaps?(other) && other.top < self.top
        self.text_elements = other.text_elements + self.text_elements
      else
        self.text_elements = self.text_elements + other.text_elements
      end
    end
    super(other)
  end

  ##
  # Split this TextChunk vertically
  # (in place, returns the remaining chunk).
  #
  # Not implemented yet: always raises.
  def split_vertically!(y)
    raise "Not Implemented"
  end

  ##
  # Remove leading and trailing whitespace
  # (changes geometry accordingly).
  #
  # Only elements whose text is exactly a single space count as whitespace.
  # The left edge is moved right by the summed width of the leading spaces
  # and the right edge is moved left by the summed width of the trailing
  # ones. Returns self.
  def strip!
    is_space = lambda { |te| te.text == ' ' }

    leading = self.text_elements.take_while(&is_space)
    self.left += leading.inject(0) { |sum, te| sum + te.width }
    remaining = self.text_elements.drop(leading.size)

    # The trailing run is looked up on what is left after the leading run
    # was removed, so an all-space chunk is only accounted for once.
    trailing = remaining.reverse.take_while(&is_space)
    self.right -= trailing.inject(0) { |sum, te| sum + te.width }

    self.text_elements = remaining.take(remaining.size - trailing.size)
    self
  end

  ##
  # The text of all the elements, concatenated in order.
  def text
    self.text_elements.map(&:text).join
  end

  def inspect
    "#<TextChunk: #{self.top.round(2)},#{self.left.round(2)},#{self.bottom.round(2)},#{right.round(2)} '#{self.text}'>"
  end

  ##
  # The inherited hash representation plus this chunk's :text.
  def to_h
    super.merge(:text => self.text)
  end
end
114
+ end
@@ -0,0 +1,112 @@
1
+ module Tabula
2
##
# A Glyph: one piece of text with its bounding box, font, font size and the
# width a space has in that font.
class TextElement < ZoneEntity
  attr_accessor :font, :font_size, :text, :width_of_space

  # Fraction of the width of a space used as slack when deciding whether
  # two glyphs belong together or are separated by a space.
  TOLERANCE_FACTOR = 0.25

  def initialize(top, left, width, height, font, font_size, text, width_of_space)
    super(top, left, width, height)
    self.font = font
    self.font_size = font_size
    self.text = text
    self.width_of_space = width_of_space
  end

  # A zero-sized element with no font and empty text.
  EMPTY = TextElement.new(0, 0, 0, 0, nil, 0, '', 0)

  ##
  # Heuristically merge an iterable of TextElement into a list of TextChunk.
  #
  # options:
  # [:vertical_rulings] rulings that text must never be merged across;
  #                     only their +left+ is used. Defaults to none; an
  #                     explicit nil is treated as none as well.
  #
  # The elements are processed in the order given and +text_elements+ itself
  # is left untouched. Returns [] for an empty input.
  def self.merge_words(text_elements, options={})
    default_options = {:vertical_rulings => []}
    options = default_options.merge(options)
    vertical_ruling_locations = (options[:vertical_rulings] || []).map(&:left)

    return [] if text_elements.empty?

    # Read the first element instead of shifting it off, so that the
    # caller's collection is not modified.
    chunks = [TextChunk.create_from_text_element(text_elements.first)]

    text_elements.drop(1).each do |char|
      current_chunk = chunks.last
      prev_char = current_chunk.text_elements.last

      # any vertical ruling goes across prev_char and char?
      across_vertical_ruling = vertical_ruling_locations.any? { |loc|
        prev_char.left < loc && char.left > loc
      }

      # should we add a space?
      if prev_char.text != " " && char.text != " " \
         && !across_vertical_ruling \
         && prev_char.should_add_space?(char)

        space = new(prev_char.top,
                    prev_char.right,
                    prev_char.width_of_space,
                    prev_char.width_of_space, # width == height for spaces
                    prev_char.font,
                    prev_char.font_size,
                    ' ',
                    prev_char.width_of_space)
        current_chunk << space
        prev_char = space
      end

      # should_merge? isn't aware of vertical rulings. So even when two
      # text elements are close enough to be merged on that account, we
      # still must not merge them if they are on opposite sides of a
      # vertical ruling.
      # Why does the check above compare both `.left`? The intuition is that
      # a letter that starts on the left of a vertical ruling ought to
      # remain on the left of it.
      if !across_vertical_ruling && prev_char.should_merge?(char)
        current_chunk << char
      else
        # create a new chunk
        chunks << TextChunk.create_from_text_element(char)
      end
    end

    chunks
  end

  # more or less returns True if distance < tolerance
  #
  # Raises TypeError unless +other+ is exactly a TextElement.
  def should_merge?(other)
    raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
    self.vertically_overlaps?(other) &&
      self.horizontal_distance(other) < width_of_space * (1 + TOLERANCE_FACTOR) &&
      !self.should_add_space?(other)
  end

  # more or less returns True if (tolerance <= distance < CHARACTER_DISTANCE_THRESHOLD*tolerance)
  #
  # Also true when +other+ is further away vertically than this element's
  # height. Always false when the width of a space is NaN.
  #
  # Raises TypeError unless +other+ is exactly a TextElement.
  def should_add_space?(other)
    raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)

    # width_of_space may be an Integer (EMPTY uses 0), which has no #nan?
    space_width = self.width_of_space
    return false if space_width.respond_to?(:nan?) && space_width.nan?

    min_gap = space_width * (1 - TOLERANCE_FACTOR)
    max_gap = space_width * (1 + TOLERANCE_FACTOR)

    (self.vertically_overlaps?(other) &&
     self.horizontal_distance(other).abs.between?(min_gap, max_gap)) ||
      (self.vertical_distance(other) > self.height)
  end

  ##
  # Merge this TextElement with another (adjust size and text content
  # accordingly).
  #
  # The other element's text goes first when it horizontally overlaps this
  # one and starts above it; otherwise it is appended. A new string is
  # assigned in both cases, so a string shared with another object is never
  # modified in place.
  #
  # Raises TypeError unless +other+ is exactly a TextElement.
  def merge!(other)
    raise TypeError, "argument is not a TextElement" unless other.instance_of?(TextElement)
    if self.horizontally_overlaps?(other) && other.top < self.top
      self.text = other.text + self.text
    else
      self.text = self.text + other.text
    end
    super(other)
  end

  ##
  # The inherited hash representation plus :font and :text.
  def to_h
    super.merge({:font => self.font, :text => self.text })
  end

  def inspect
    "#<TextElement: #{self.top.round(2)},#{self.left.round(2)},#{self.bottom.round(2)},#{right.round(2)} '#{self.text}'>"
  end

  ##
  # Two elements are equal when their text, ignoring surrounding
  # whitespace, is the same. Geometry and font are not compared.
  # Anything that has no +text+ is never equal.
  def ==(other)
    other.respond_to?(:text) && self.text.strip == other.text.strip
  end
end
112
+ end
@@ -0,0 +1,57 @@
1
+ java_import java.awt.geom.Point2D
2
+
3
+ module Tabula
4
+
5
##
# A rectangular zone, backed by a java.awt.geom.Rectangle2D::Float, that
# can additionally hold a list of texts.
#
# NOTE(review): top/left/bottom/right, their setters, dims, to_h and
# vertically_overlaps? are not defined in this class; presumably they are
# added to Rectangle2D elsewhere in the project -- confirm.
class ZoneEntity < java.awt.geom.Rectangle2D::Float

  attr_accessor :texts

  ##
  # Note the argument order: top (y) comes before left (x).
  # The rectangle is left as created by the Java default constructor when
  # any of the four values is nil.
  def initialize(top, left, width, height)
    super()
    if left && top && width && height
      self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float], left, top, width, height
    end
    self.texts = []
  end

  ##
  # Grow this zone, in place, to the smallest rectangle that contains both
  # itself and +other+.
  def merge!(other)
    # All four edges are read before anything is written. Assigning top or
    # left first and reading bottom or right afterwards would make the
    # result depend on whether those setters keep the opposite edge or the
    # size of the rectangle.
    new_top    = [self.top, other.top].min
    new_left   = [self.left, other.left].min
    new_right  = [self.right, other.right].max
    new_bottom = [self.bottom, other.bottom].max

    self.java_send :setRect, [Java::float, Java::float, Java::float, Java::float],
                   new_left, new_top, new_right - new_left, new_bottom - new_top
  end

  ##
  # default sorting order for ZoneEntity objects
  # is lexicographical (left to right, top to bottom)
  #
  # Zones with the same +left+ that vertically overlap compare as equal.
  def <=>(other)
    return 1 if self.left > other.left
    return -1 if self.left < other.left
    return 0 if self.vertically_overlaps?(other)
    return 1 if self.top > other.top
    return -1 if self.top < other.top
    0
  end

  ##
  # JSON for the hash representation of this zone. +options+ is handed on
  # to the hash's +to_json+ so that generator settings are not lost.
  def to_json(options={})
    self.to_h.to_json(options)
  end

  def inspect
    "#<#{self.class} dims: #{self.dims(:top, :left, :width, :height)}>"
  end

  ##
  # The four edges as [top, left, bottom, right].
  def tlbr
    [top, left, bottom, right]
  end

  ##
  # The four corners as Point2D::Float, in the order top-left, top-right,
  # bottom-right, bottom-left.
  def points
    [ Point2D::Float.new(left, top),
      Point2D::Float.new(right, top),
      Point2D::Float.new(right, bottom),
      Point2D::Float.new(left, bottom) ]
  end
end
57
+ end
@@ -0,0 +1,327 @@
1
+ java_import org.apache.pdfbox.pdfparser.PDFParser
2
+ java_import org.apache.pdfbox.util.TextPosition
3
+ java_import org.apache.pdfbox.pdmodel.PDDocument
4
+ java_import org.apache.pdfbox.util.PDFTextStripper
5
+ java_import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial
6
+ java_import java.awt.geom.AffineTransform
7
+
8
+ module Tabula
9
+
10
+ module Extraction
11
+
12
##
# Open a PDF file and return the loaded PDDocument.
#
# pdf_filename:: path of the file to open
# password::     used to open the document when it is encrypted
#
# The caller is responsible for closing the returned document.
#
# Raises Errno::ENOENT (naming the path) when the file does not exist.
# Whatever PDDocument.load or openProtection raise is passed on; in the
# latter case the document is closed first so that it does not leak.
def Extraction.openPDF(pdf_filename, password='')
  raise Errno::ENOENT, pdf_filename.to_s unless File.exist?(pdf_filename)

  document = PDDocument.load(pdf_filename)
  ready = false
  begin
    if document.isEncrypted
      document.openProtection(StandardDecryptionMaterial.new(password))
    end
    ready = true
  ensure
    # A flag plus `ensure` is used instead of `rescue` so that the clean-up
    # happens for every kind of exception, whatever its class.
    document.close unless ready
  end
  document
end
21
+
22
##
# Collects the characters and the ruling lines of the pages of a PDF file
# and wraps them in Tabula::Page objects (see #extract).
#
# It subclasses PDFBox's PageDrawer and replaces its drawing operations
# (strokePath, fillPath, drawImage, processTextPosition) with ones that
# record what would have been drawn.
# NOTE(review): those methods are presumably called back by PDFBox while
# +processStream+ runs -- confirm against the PDFBox version in use.
class ObjectExtractor < org.apache.pdfbox.pdfviewer.PageDrawer

  attr_accessor :characters, :debug_text, :debug_clipping_paths, :clipping_paths, :options
  field_accessor :pageSize, :page

  PRINTABLE_RE = /[[:print:]]/
  DEFAULT_OPTIONS = {
    :line_color_filter => nil,
    :extract_ruling_lines => true
  }

  # Start value for the running minimum of character width and height.
  INITIAL_MIN_CHAR_SIZE = 1000000

  ##
  # pdf_filename:: path of the PDF file
  # pages::        one-indexed page numbers to extract, or :all
  # password::     used when the document is encrypted
  # options::      merged over DEFAULT_OPTIONS
  #                [:line_color_filter]    nil, or something that answers
  #                                        +call+ with the colour components
  #                                        of a path and tells whether the
  #                                        path may produce rulings
  #                [:extract_ruling_lines] false to collect no rulings
  #
  # Raises Errno::ENOENT when the file does not exist.
  def initialize(pdf_filename, pages=[1], password='', options={})
    raise Errno::ENOENT, pdf_filename.to_s unless File.exist?(pdf_filename)
    @pdf_filename = pdf_filename
    @pdf_file = Extraction.openPDF(pdf_filename, password)
    @all_pages = @pdf_file.getDocumentCatalog.getAllPages
    @pages = pages == :all ? (1..@all_pages.size) : pages

    super()

    self.options = DEFAULT_OPTIONS.merge(options)
    self.characters = []
    @debug_clipping_paths = false
    @clipping_path = nil
    @transformed_clipping_path = nil
    @transformed_clipping_path_bounds = nil
    self.clipping_paths = []
    @rulings = []
    @min_char_width = @min_char_height = INITIAL_MIN_CHAR_SIZE
  end

  ##
  # Returns an Enumerator that yields one Tabula::Page per requested page,
  # skipping the pages that have no contents.
  #
  # The PDF document is closed when the enumeration ends, also when it ends
  # because of an exception.
  def extract
    Enumerator.new do |y|
      begin
        @pages.each do |i|
          page = @all_pages.get(i-1)
          next if page.getContents.nil?

          self.clear!
          self.drawPage(page)

          crop_box = page.findCropBox
          y.yield Tabula::Page.new(@pdf_filename,
                                   crop_box.width,
                                   crop_box.height,
                                   page.getRotation.to_i,
                                   i, #one-indexed, just like `i` is.
                                   self.characters,
                                   self.rulings,
                                   @min_char_width,
                                   @min_char_height)
        end
      ensure
        @pdf_file.close
      end
    end
  end

  ##
  # Forget everything that was collected for the previous page.
  def clear!
    # A new array is used rather than emptying the old one: the old one was
    # handed to the Tabula::Page that was yielded for the previous page.
    self.characters = []
    self.clipping_paths.clear
    @page_transform = nil
    # The cached clipping bounds were computed with the previous page's
    # transform.
    @clipping_path = nil
    @transformed_clipping_path = nil
    @transformed_clipping_path_bounds = nil
    @rulings.clear
    # The minimum character size is reported per page, so it must not
    # carry over from the pages extracted before.
    @min_char_width = @min_char_height = INITIAL_MIN_CHAR_SIZE
  end

  ##
  # Set +pageSize+ from the media box of the current page, unless it is
  # already set or there is no current page.
  def ensurePageSize!
    return unless self.pageSize.nil? && !self.page.nil?

    media_box = self.page.findMediaBox
    self.pageSize = media_box.nil? ? nil : media_box.createDimension
  end

  ##
  # Process the content stream of +page+; does nothing for a page without
  # contents.
  def drawPage(page)
    self.page = page
    return if self.page.getContents.nil?

    ensurePageSize!
    self.processStream(self.page,
                       self.page.findResources,
                       self.page.getContents.getStream)
  end

  def setStroke(stroke)
    @basicStroke = stroke
  end

  def getStroke
    @basicStroke
  end

  ##
  # Turn the straight segments of the current line path into rulings.
  #
  # filter_by_color:: colour components to use for the path instead of the
  #                   current stroking colour (fillPath passes the
  #                   non-stroking colour here)
  #
  # A path is only considered when it starts with a MOVETO and contains
  # nothing but MOVETO, LINETO and CLOSE segments. Every LINETO segment that
  # passes the :line_color_filter option and intersects the current clipping
  # path is clipped to it and stored as a Tabula::Ruling.
  # The line path is reset in every case.
  def strokePath(filter_by_color=nil)
    unless self.options[:extract_ruling_lines]
      self.getLinePath.reset
      return
    end

    seg = java.awt.geom.PathIterator
    straight_segments = [seg::SEG_LINETO, seg::SEG_MOVETO, seg::SEG_CLOSE]

    path = self.pathToList(self.getLinePath)

    # An empty path has no first segment to look at; without the first
    # check `path[0][0]` raises on nil.
    if path.empty? \
       || path[0][0] != seg::SEG_MOVETO \
       || path[1..-1].any? { |segment| !straight_segments.include?(segment.first) }
      self.getLinePath.reset
      return
    end

    ccp_bounds = self.currentClippingPath

    strokeColorComps = filter_by_color || self.getGraphicsState.getStrokingColor.getJavaColor.getRGBColorComponents(nil)
    color_filter = self.options[:line_color_filter]

    first = path.shift
    start_pos = java.awt.geom.Point2D::Float.new(first[1][0], first[1][1])

    path.each do |segment|
      end_pos = java.awt.geom.Point2D::Float.new(segment[1][0], segment[1][1])
      # the line always goes from the smaller point to the larger one
      line = (start_pos <=> end_pos) == -1 \
             ? java.awt.geom.Line2D::Float.new(start_pos, end_pos) \
             : java.awt.geom.Line2D::Float.new(end_pos, start_pos)

      if segment[0] == seg::SEG_LINETO \
         && (color_filter.nil? || color_filter.call(strokeColorComps)) \
         && line.intersects(ccp_bounds)
        # convert line to rectangle for clipping it to the current clippath
        # sucks, but awt doesn't have methods for this
        clipped = line.getBounds2D.createIntersection(ccp_bounds).getBounds2D
        # NOTE(review): the ruling gets +filter_by_color+, which is empty
        # for stroked paths, and not strokeColorComps -- confirm this is
        # what is wanted.
        @rulings << ::Tabula::Ruling.new(clipped.getY,
                                         clipped.getX,
                                         clipped.getWidth,
                                         clipped.getHeight,
                                         filter_by_color.to_a)
      end
      start_pos = end_pos
    end
    self.getLinePath.reset
  end

  ##
  # Filled paths are handled like stroked ones, with the non-stroking
  # colour. +windingRule+ is not used.
  def fillPath(windingRule)
    self.strokePath(self.getGraphicsState.getNonStrokingColor.getJavaColor.getRGBColorComponents(nil))
  end

  ##
  # Images are ignored.
  def drawImage(image, at)
  end

  ##
  # +path+ transformed with #pageTransform.
  def transformPath(path)
    self.pageTransform.createTransformedShape(path)
  end

  ##
  # The AffineTransform used for the current page. It is built on first use
  # and kept until #clear! is called.
  def pageTransform
    return @page_transform unless @page_transform.nil?

    cb = page.findCropBox
    if [90, -270, -90, 270].include?(page.getRotation)
      @page_transform = AffineTransform.getScaleInstance(-1, 1)
      @page_transform.rotate(page.getRotation * (Math::PI/180.0),
                             cb.getLowerLeftX, cb.getLowerLeftY)
    else
      @page_transform = AffineTransform.getScaleInstance(1, -1)
      @page_transform.translate(0, -cb.getHeight)
    end
    @page_transform
  end

  ##
  # Bounds of the current clipping path after #transformPath. The result is
  # cached for as long as the graphics state reports an equal clipping path.
  def currentClippingPath
    cp = self.getGraphicsState.getCurrentClippingPath
    return @transformed_clipping_path_bounds if cp == @clipping_path

    @clipping_path = cp
    @transformed_clipping_path = self.transformPath(cp)
    @transformed_clipping_path_bounds = @transformed_clipping_path.getBounds
  end

  ##
  # Record one positioned piece of text as a Tabula::TextElement.
  #
  # The element is kept only when its character is printable and it
  # intersects the current clipping path. The minimum character width and
  # height are updated for every element, kept or not.
  def processTextPosition(text)
    c = text.getCharacter
    # ugly hack follows: we need spaces to have a height, so we can
    # test for vertical overlap. height == width seems a safe bet.
    h = c == ' ' ? text.getWidthDirAdj.round(2) : text.getHeightDir.round(2)

    # workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
    width_of_space = text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace

    te = Tabula::TextElement.new(text.getYDirAdj.round(2) - h,
                                 text.getXDirAdj.round(2),
                                 text.getWidthDirAdj.round(2),
                                 h,
                                 text.getFont,
                                 text.getFontSize.round(2),
                                 c,
                                 width_of_space)

    ccp_bounds = self.currentClippingPath

    if self.debug_clipping_paths && !self.clipping_paths.include?(ccp_bounds)
      self.clipping_paths << ::Tabula::ZoneEntity.new(ccp_bounds.getMinY,
                                                      ccp_bounds.getMinX,
                                                      ccp_bounds.getWidth,
                                                      ccp_bounds.getHeight)
    end

    @min_char_width = te.width if te.width < @min_char_width
    @min_char_height = te.height if te.height < @min_char_height

    self.characters << te if c =~ PRINTABLE_RE && ccp_bounds.intersects(te)
  end

  ##
  # Number of pages in the document.
  def page_count
    @all_pages.size
  end

  ##
  # The rulings collected for the current page, without those that are a
  # single point and those with a negative coordinate. Always a new array.
  def rulings
    @rulings.reject do |l|
      (l.left == l.right && l.top == l.bottom) ||
        [l.top, l.left, l.bottom, l.right].any? { |coordinate| coordinate < 0 }
    end
  end

  protected

  # workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
  #
  # Estimates the width of a space from the current text state. Nothing is
  # printed here: standard output may be carrying the extracted data.
  def currentSpaceWidth
    gs = self.getGraphicsState
    text_state = gs.getTextState
    font = text_state.getFont

    fontSizeText = text_state.getFontSize
    horizontalScalingText = text_state.getHorizontalScalingPercent / 100.0

    # idea from pdf.js
    # https://github.com/mozilla/pdf.js/blob/master/src/core/fonts.js#L4418
    # first of these characters that has a width in the font; 1000 if none has
    spaceWidthText = [' ', '-', '1', 'i'] \
                     .map { |c| font.getFontWidth(c.ord) } \
                     .find { |w| w > 0 } || 1000

    ctm00 = gs.getCurrentTransformationMatrix.getValue(0, 0)

    (spaceWidthText/1000.0) * fontSizeText * horizontalScalingText * (ctm00 == 0 ? 1 : ctm00)
  end

  ##
  # The segments of +path+, transformed with #pageTransform, as a list of
  # [segment type, coordinates] pairs. Every pair has its own array of six
  # doubles.
  def pathToList(path)
    iterator = path.getPathIterator(self.pageTransform)
    segments = []
    until iterator.isDone
      coords = Java::double[6].new
      segType = iterator.currentSegment(coords)
      segments << [segType, coords]
      iterator.next
    end
    segments
  end

  ##
  # A readable dump of the MOVETO, LINETO and CLOSE segments of +path+;
  # other segment types are left out.
  def debugPath(path)
    pathToList(path).map do |segType, coords|
      case segType
      when java.awt.geom.PathIterator::SEG_MOVETO
        "MOVE: #{coords[0]} #{coords[1]}\n"
      when java.awt.geom.PathIterator::SEG_LINETO
        "LINE: #{coords[0]} #{coords[1]}\n"
      when java.awt.geom.PathIterator::SEG_CLOSE
        "CLOSE\n\n"
      end
    end.join
  end

end
299
+
300
+
301
##
# Reports the size and rotation of every page of a PDF file, without
# extracting any of its contents.
class PagesInfoExtractor
  ##
  # pdf_filename:: path of the PDF file
  # password::     used when the document is encrypted
  def initialize(pdf_filename, password='')
    @pdf_filename = pdf_filename
    @pdf_file = Extraction.openPDF(pdf_filename, password)
    @all_pages = @pdf_file.getDocumentCatalog.getAllPages
  end

  ##
  # Returns an Enumerator that yields one Tabula::Page per page of the
  # document, built from the file name, the width and height of the page's
  # crop box, its rotation and its number.
  #
  # The PDF document is closed when the enumeration ends, also when it ends
  # because of an exception.
  def pages
    Enumerator.new do |y|
      begin
        @all_pages.each_with_index do |page, i|
          crop_box = page.findCropBox

          y.yield Tabula::Page.new(@pdf_filename,
                                   crop_box.width,
                                   crop_box.height,
                                   page.getRotation.to_i,
                                   i+1) #remember, these are one-indexed
        end
      ensure
        @pdf_file.close
      end
    end
  end
end
326
+ end
327
+ end