tabula-extractor 0.6.6-java → 0.7.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/AUTHORS.md +1 -0
- data/README.md +27 -11
- data/bin/tabula +61 -19
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +137 -137
- data/ext/lsd.h +9 -9
- data/lib/tabula.rb +20 -3
- data/lib/tabula/core_ext.rb +261 -0
- data/lib/tabula/entities.rb +11 -456
- data/lib/tabula/entities/cell.rb +42 -0
- data/lib/tabula/entities/has_cells.rb +244 -0
- data/lib/tabula/entities/line.rb +39 -0
- data/lib/tabula/entities/page.rb +269 -0
- data/lib/tabula/entities/page_area.rb +7 -0
- data/lib/tabula/entities/ruling.rb +300 -0
- data/lib/tabula/entities/spreadsheet.rb +92 -0
- data/lib/tabula/entities/table.rb +81 -0
- data/lib/tabula/entities/text_chunk.rb +114 -0
- data/lib/tabula/entities/text_element.rb +112 -0
- data/lib/tabula/entities/zone_entity.rb +57 -0
- data/lib/tabula/extraction.rb +327 -0
- data/lib/tabula/line_segment_detector.rb +9 -7
- data/lib/tabula/pdf_line_extractor.rb +319 -0
- data/lib/tabula/pdf_render.rb +1 -5
- data/lib/tabula/spreadsheet_extractor.rb +52 -0
- data/lib/tabula/table_extractor.rb +50 -348
- data/lib/tabula/table_guesser.rb +21 -23
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +5 -6
- data/tabula-extractor.gemspec +1 -0
- data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +88 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +21 -0
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +50 -0
- data/test/test_bin_tabula.sh +7 -0
- data/test/tests.rb +476 -63
- metadata +79 -28
- data/lib/geom/point.rb +0 -21
- data/lib/geom/rectangle.rb +0 -101
- data/lib/geom/segment.rb +0 -82
- data/lib/tabula/pdf_dump.rb +0 -132
- data/lib/tabula/whitespace.rb +0 -50
- data/vertical_rulings_bug.rb +0 -29
@@ -5,8 +5,7 @@ require 'ffi'
|
|
5
5
|
|
6
6
|
require_relative './entities'
|
7
7
|
require_relative './pdf_render'
|
8
|
-
require_relative './
|
9
|
-
require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
8
|
+
require_relative './extraction'
|
10
9
|
|
11
10
|
java_import javax.imageio.ImageIO
|
12
11
|
java_import java.awt.image.BufferedImage
|
@@ -55,6 +54,7 @@ module Tabula
|
|
55
54
|
lines
|
56
55
|
end
|
57
56
|
|
57
|
+
#zero-indexed page_number
|
58
58
|
def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
|
59
59
|
options = DETECT_LINES_DEFAULTS.merge(options)
|
60
60
|
|
@@ -79,7 +79,7 @@ module Tabula
|
|
79
79
|
raise ArgumentError, 'image must be a string or a BufferedImage'
|
80
80
|
end
|
81
81
|
|
82
|
-
image = LSD.
|
82
|
+
image = LSD.image_to_image_float(bimage)
|
83
83
|
|
84
84
|
lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
|
85
85
|
|
@@ -89,7 +89,7 @@ module Tabula
|
|
89
89
|
|
90
90
|
rv = []
|
91
91
|
lines_found.times do |i|
|
92
|
-
a = out[7*
|
92
|
+
a = out[7*4*i].read_array_of_type(:float, 7)
|
93
93
|
|
94
94
|
a_round = a[0..3].map(&:round)
|
95
95
|
p1, p2 = [[a_round[0], a_round[1]], [a_round[2], a_round[3]]]
|
@@ -109,17 +109,19 @@ module Tabula
|
|
109
109
|
end
|
110
110
|
|
111
111
|
private
|
112
|
-
|
112
|
+
|
113
|
+
def LSD.image_to_image_float(buffered_image)
|
113
114
|
width = buffered_image.getWidth; height = buffered_image.getHeight
|
114
115
|
raster_size = width * height
|
115
116
|
|
116
|
-
|
117
|
+
image_float = FFI::MemoryPointer.new(:float, raster_size)
|
117
118
|
pixels = Java::int[width * height].new
|
118
119
|
buffered_image.getRGB(0, 0, width, height, pixels, 0, width)
|
119
120
|
|
120
|
-
|
121
|
+
image_float.put_array_of_float 0, pixels.to_a
|
121
122
|
end
|
122
123
|
|
124
|
+
|
123
125
|
end
|
124
126
|
end
|
125
127
|
|
@@ -0,0 +1,319 @@
|
|
1
|
+
java_import org.apache.pdfbox.util.operator.OperatorProcessor
|
2
|
+
java_import org.apache.pdfbox.pdfparser.PDFParser
|
3
|
+
java_import org.apache.pdfbox.util.PDFStreamEngine
|
4
|
+
java_import org.apache.pdfbox.util.ResourceLoader
|
5
|
+
|
6
|
+
java_import java.awt.geom.PathIterator
|
7
|
+
java_import java.awt.geom.Point2D
|
8
|
+
java_import java.awt.geom.GeneralPath
|
9
|
+
java_import java.awt.geom.AffineTransform
|
10
|
+
java_import java.awt.Color
|
11
|
+
|
12
|
+
warn 'Tabula::Extraction::LineExtractor is DEPRECATED and will be removed'
|
13
|
+
|
14
|
+
class Tabula::Extraction::LineExtractor < org.apache.pdfbox.util.PDFStreamEngine
|
15
|
+
|
16
|
+
attr_accessor :currentX, :currentY
|
17
|
+
attr_accessor :currentPath
|
18
|
+
attr_accessor :rulings
|
19
|
+
attr_accessor :options
|
20
|
+
field_accessor :page
|
21
|
+
|
22
|
+
DETECT_LINES_DEFAULTS = {
|
23
|
+
:snapping_grid_cell_size => 2
|
24
|
+
}
|
25
|
+
|
26
|
+
# Merges vertical rulings that sit at the same `left` position and for which
# `nearlyIntersects?` holds into single, longer rulings.
#
# lines - Array of rulings, all of them vertical. Each element must respond to
#         left, top, bottom, top=, bottom= and nearlyIntersects?.
#
# The caller's array is not reordered (a sorted copy is walked). The ruling
# objects that survive are stretched in place so that they cover every ruling
# that was merged into them.
#
# Returns a new Array of rulings ordered by left, then top.
def self.collapse_vertical_rulings(lines)
  # sort a copy: the previous in-place sort! silently reordered the caller's array
  ordered = lines.sort { |a, b| [a.left, a.top] <=> [b.left, b.top] }

  ordered.each_with_object([]) do |candidate, merged|
    previous = merged.last
    if previous && candidate.left == previous.left && previous.nearlyIntersects?(candidate)
      # same column and touching: grow the ruling we already kept
      previous.top = [candidate.top, previous.top].min
      previous.bottom = [candidate.bottom, previous.bottom].max
    else
      merged << candidate
    end
  end
end
|
38
|
+
|
39
|
+
# Merges horizontal rulings that sit at the same `top` position and for which
# `nearlyIntersects?` holds into single, longer rulings.
#
# lines - Array of rulings, all of them horizontal. Each element must respond
#         to top, left, right, left=, right= and nearlyIntersects?.
#
# The caller's array is not reordered (a sorted copy is walked). The ruling
# objects that survive are stretched in place so that they cover every ruling
# that was merged into them.
#
# Returns a new Array of rulings ordered by top, then left.
def self.collapse_horizontal_rulings(lines)
  # sort a copy: the previous in-place sort! silently reordered the caller's array
  ordered = lines.sort { |a, b| [a.top, a.left] <=> [b.top, b.left] }

  ordered.each_with_object([]) do |candidate, merged|
    previous = merged.last
    if previous && candidate.top == previous.top && previous.nearlyIntersects?(candidate)
      # same row and touching: grow the ruling we already kept
      previous.left = [candidate.left, previous.left].min
      previous.right = [candidate.right, previous.right].max
    else
      merged << candidate
    end
  end
end
|
51
|
+
|
52
|
+
# N.B. for merge `spreadsheets` into `text-extractor-refactor` --
# only substantive change here is calling Tabula::Ruling::clean_rulings on LSD output in this method
# the rest is readability changes.
#
# Finds the ruling lines on one page of a PDF.
#
# pdf_path    - path of the PDF file.
# page_number - zero-indexed page number.
# options     - Hash of options, merged over DETECT_LINES_DEFAULTS without
#               modifying the caller's hash. With a truthy :render_pdf the page
#               is handed to Tabula::LSD and the result is passed through
#               Tabula::Ruling::clean_rulings; otherwise the page's content
#               stream is processed by an instance of this class.
#
# Returns the rulings found on the page. In the non-rendered path this is the
# collapsed vertical rulings followed by the collapsed horizontal rulings.
def self.lines_in_pdf_page(pdf_path, page_number, options={})
  # Defaults go first so that they do not overwrite values supplied by the
  # caller here (merge! with the defaults as argument did exactly that).
  options = DETECT_LINES_DEFAULTS.merge(options)

  if options[:render_pdf]
    # only LSD rulings need to be "cleaned" with clean_rulings; might as well do this here
    # since there's no good reason to want unclean lines
    return Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(pdf_path, page_number, options))
  end

  pdf_file = ::Tabula::Extraction.openPDF(pdf_path)
  begin
    page = pdf_file.getDocumentCatalog.getAllPages[page_number]
    le = self.new(options)
    le.processStream(page, page.findResources, page.getContents.getStream)
  ensure
    # close the document even when the page lookup or stream processing raises
    pdf_file.close
  end

  rulings = le.rulings.map do |l, color|
    ::Tabula::Ruling.new(l.getP1.getY,
                         l.getP1.getX,
                         l.getP2.getX - l.getP1.getX,
                         l.getP2.getY - l.getP1.getY,
                         color)
  end

  # drop zero-length rulings and rulings with any negative coordinate
  rulings.reject! { |l| (l.left == l.right && l.top == l.bottom) || [l.top, l.left, l.bottom, l.right].any? { |p| p < 0 } }

  collapse_vertical_rulings(rulings.select(&:vertical?)) + collapse_horizontal_rulings(rulings.select(&:horizontal?))
end
|
79
|
+
|
80
|
+
# Handler registered for the `l` operator in OPERATOR_PROCESSORS.
# Builds a segment from the extractor's current point to the transformed
# operand point, keeps it in the current path only when it is horizontal or
# vertical, and then moves the current point to the operand point.
class LineToOperator < OperatorProcessor
  def process(operator, arguments)
    extractor = self.context
    target = extractor.TransformedPoint(arguments[0].floatValue, arguments[1].floatValue)

    segment = java.awt.geom.Line2D::Float.new(extractor.currentX, extractor.currentY,
                                              target.getX, target.getY)
    # diagonal segments are ignored
    if segment.horizontal? || segment.vertical?
      extractor.currentPath << segment
    end

    extractor.currentX, extractor.currentY = target.getX, target.getY
  end
end
|
93
|
+
|
94
|
+
# Handler registered for the `m` operator in OPERATOR_PROCESSORS.
# Moves the extractor's current point to the transformed operand point
# without adding anything to the current path.
class MoveToOperator < OperatorProcessor
  def process(operator, arguments)
    extractor = self.context
    target = extractor.TransformedPoint(arguments[0].floatValue, arguments[1].floatValue)
    extractor.currentX, extractor.currentY = target.getX, target.getY
  end
end
|
104
|
+
|
105
|
+
# Handler registered for the `re` operator in OPERATOR_PROCESSORS.
# Turns a rectangle into line segments and appends the horizontal/vertical
# ones to the extractor's current path:
#   * a "thin" rectangle (smaller side under 2) becomes one segment through
#     its middle;
#   * any other rectangle contributes its four edges.
class AppendRectangleToPathOperator < OperatorProcessor
  def process(operator, arguments)
    drawer = self.context
    rect_x, rect_y, rect_w, rect_h = arguments.to_array.map(&:floatValue)

    origin = drawer.TransformedPoint(rect_x, rect_y)
    extent = drawer.ScaledPoint(rect_w, rect_h)

    # upper edge of the rectangle, never allowed above the page origin
    top = origin.getY - extent.getY
    top = 0 if top < 0

    left   = origin.getX
    right  = origin.getX + extent.getX
    bottom = top + extent.getY
    width  = extent.getX.abs
    height = extent.getY.abs

    edges =
      if width > height && height < 2 # horizontal line, "thin" rectangle.
        middle_y = top + extent.getY/2
        [java.awt.geom.Line2D::Float.new(left, middle_y, right, middle_y)]
      elsif width < height && width < 2 # vertical line, "thin" rectangle
        middle_x = origin.getX + extent.getX/2
        [java.awt.geom.Line2D::Float.new(middle_x, top, middle_x, bottom)]
      else
        # every edge of the rectangle: top, left, right, bottom
        [java.awt.geom.Line2D::Float.new(left, top, right, top),
         java.awt.geom.Line2D::Float.new(left, top, left, bottom),
         java.awt.geom.Line2D::Float.new(right, top, right, bottom),
         java.awt.geom.Line2D::Float.new(left, bottom, right, bottom)]
      end

    drawer.currentPath += edges.select { |edge| edge.horizontal? || edge.vertical? }
  end
end
|
138
|
+
|
139
|
+
# Handler registered for the `S` and `s` operators in OPERATOR_PROCESSORS.
# Hands every segment of the current path to addRuling together with the
# current stroking color, provided that color passes the optional
# :line_color_filter, and then empties the current path.
class StrokePathOperator < OperatorProcessor
  def process(operator, arguments)
    drawer = self.context
    stroke_rgb = drawer.getGraphicsState.getStrokingColor.getJavaColor.getRGBColorComponents(nil)

    # without a configured filter every line is used, regardless of color
    accepts_color = drawer.options[:line_color_filter] || lambda { |c| true }
    wanted = accepts_color.call(stroke_rgb)

    drawer.currentPath.each { |segment| drawer.addRuling(segment, stroke_rgb.to_a) } if wanted

    drawer.currentPath = []
  end
end
|
151
|
+
|
152
|
+
# Handler registered for the `b`, `b*`, `f` and `f*` operators in
# OPERATOR_PROCESSORS.
# Hands every segment of the current path to addRuling together with the
# current non-stroking (fill) color, provided that color passes the optional
# :line_color_filter, and then empties the current path.
class CloseFillNonZeroAndStrokePathOperator < OperatorProcessor
  def process(operator, arguments)
    drawer = self.context
    fill_rgb = drawer.getGraphicsState.getNonStrokingColor.getJavaColor.getRGBColorComponents(nil)

    # without a configured filter every line is used, regardless of color
    accepts_color = drawer.options[:line_color_filter] || lambda { |c| true }
    wanted = accepts_color.call(fill_rgb)

    drawer.currentPath.each { |segment| drawer.addRuling(segment, fill_rgb.to_a) } if wanted

    drawer.currentPath = []
  end
end
|
165
|
+
|
166
|
+
# Hands every segment of the current path to addRuling without a color
# (addRuling then applies its own default) and empties the current path.
# No color filter is consulted here.
class CloseAndStrokePathOperator < OperatorProcessor
  def process(operator, arguments)
    extractor = self.context
    extractor.currentPath.each do |segment|
      extractor.addRuling(segment)
    end
    extractor.currentPath = []
  end
end
|
173
|
+
|
174
|
+
# Handler registered for the `n` operator in OPERATOR_PROCESSORS.
# The path ends without being stroked, so whatever was collected is dropped.
class EndPathOperator < OperatorProcessor
  def process(operator, arguments)
    self.context.currentPath = []
  end
end
|
181
|
+
|
182
|
+
# Drops the segments collected in the current path without recording any
# ruling for them.
class FillNonZeroRuleOperator < OperatorProcessor
  def process(operator, arguments)
    self.context.currentPath = []
  end
end
|
189
|
+
|
190
|
+
# Maps PDF content-stream operator names to the processor instance that
# handles them. Path operators use the handlers defined in this class; the
# remaining entries use the stock PDFBox processors.
OPERATOR_PROCESSORS = {
  'm' => MoveToOperator.new,
  're' => AppendRectangleToPathOperator.new,
  'l' => LineToOperator.new,
  'S' => StrokePathOperator.new,
  's' => StrokePathOperator.new,
  'n' => EndPathOperator.new,
  'b' => CloseFillNonZeroAndStrokePathOperator.new,
  'b*' => CloseFillNonZeroAndStrokePathOperator.new,
  'f' => CloseFillNonZeroAndStrokePathOperator.new,
  'f*' => CloseFillNonZeroAndStrokePathOperator.new,
  'BT' => org.apache.pdfbox.util.operator.BeginText.new,
  'cm' => org.apache.pdfbox.util.operator.Concatenate.new,
  'CS' => org.apache.pdfbox.util.operator.SetStrokingColorSpace.new,
  'cs' => org.apache.pdfbox.util.operator.SetNonStrokingColorSpace.new,
  'ET' => org.apache.pdfbox.util.operator.EndText.new,
  'G' => org.apache.pdfbox.util.operator.SetStrokingGrayColor.new,
  'g' => org.apache.pdfbox.util.operator.SetNonStrokingGrayColor.new,
  'gs' => org.apache.pdfbox.util.operator.SetGraphicsStateParameters.new,
  'K' => org.apache.pdfbox.util.operator.SetStrokingCMYKColor.new,
  'k' => org.apache.pdfbox.util.operator.SetNonStrokingCMYKColor.new,
  'q' => org.apache.pdfbox.util.operator.GSave.new,
  'Q' => org.apache.pdfbox.util.operator.GRestore.new,
  'RG' => org.apache.pdfbox.util.operator.SetStrokingRGBColor.new,
  'rg' => org.apache.pdfbox.util.operator.SetNonStrokingRGBColor.new,
  'SC' => org.apache.pdfbox.util.operator.SetStrokingColor.new,
  'sc' => org.apache.pdfbox.util.operator.SetNonStrokingColor.new,
  'SCN' => org.apache.pdfbox.util.operator.SetStrokingColor.new,
  'scn' => org.apache.pdfbox.util.operator.SetNonStrokingColor.new,
  'T*' => org.apache.pdfbox.util.operator.NextLine.new,
  'Tc' => org.apache.pdfbox.util.operator.SetCharSpacing.new,
  'Td' => org.apache.pdfbox.util.operator.MoveText.new,
  'TD' => org.apache.pdfbox.util.operator.MoveTextSetLeading.new,
  'Tf' => org.apache.pdfbox.util.operator.SetTextFont.new,
  'Tj' => org.apache.pdfbox.util.operator.ShowText.new,
  'TJ' => org.apache.pdfbox.util.operator.ShowTextGlyph.new,
  'TL' => org.apache.pdfbox.util.operator.SetTextLeading.new,
  'Tm' => org.apache.pdfbox.util.operator.SetMatrix.new,
  'Tr' => org.apache.pdfbox.util.operator.SetTextRenderingMode.new,
  'Ts' => org.apache.pdfbox.util.operator.SetTextRise.new,
  'Tw' => org.apache.pdfbox.util.operator.SetWordSpacing.new,
  'Tz' => org.apache.pdfbox.util.operator.SetHorizontalTextScaling.new,
  "'" => org.apache.pdfbox.util.operator.MoveAndShow.new,
  # The key must be the single character `"`. The previous '\"' was a
  # two-character string (backslash + quote) because single-quoted literals
  # do not process \" as an escape, so this operator was never matched.
  '"' => org.apache.pdfbox.util.operator.SetMoveAndShow.new,
}
|
235
|
+
|
236
|
+
# Builds a line extractor with empty state and registers every handler from
# OPERATOR_PROCESSORS on it.
#
# options - Hash of options. Values given here take precedence over
#           DETECT_LINES_DEFAULTS; the caller's hash is not modified.
def initialize(options={})
  super()
  # Defaults go first: merge!-ing the defaults into the caller's hash both
  # mutated that hash and replaced any caller-supplied value with the default.
  @options = DETECT_LINES_DEFAULTS.merge(options)
  self.clear!
  OPERATOR_PROCESSORS.each { |k,v| registerOperatorProcessor(k, v) }
end
|
242
|
+
|
243
|
+
# Resets the per-page state: no rulings, no pending path, current point at
# (-1, -1) and no cached page size.
def clear!
  self.currentX = self.currentY = -1
  self.currentPath = []
  self.rulings = []
  @pageSize = nil
end
|
250
|
+
|
251
|
+
# Stores a segment, paired with its color, in `rulings`.
#
# ruling - the segment to store; it is modified in place (rotated and
#          translated when the page rotation is 90/270 degrees in either
#          direction, then snapped to the grid).
# color  - color components for the ruling; [0,0,0] is used when nil.
def addRuling(ruling, color=nil)
  color = [0,0,0] if color.nil?

  rotation = page.getRotation
  if !rotation.nil? && [90, -270, -90, 270].include?(rotation)
    media_box = page.findMediaBox

    ruling.rotate!(media_box.getLowerLeftX, media_box.getLowerLeftY, rotation)

    # after rotating, shift the ruling back onto the page
    shift = if [90, -270].include?(rotation)
      AffineTransform.getTranslateInstance(media_box.getHeight, 0)
    else
      AffineTransform.getTranslateInstance(0, media_box.getWidth)
    end
    ruling.transform!(shift)
  end

  # snapping to grid and joining lines that are close together
  ruling.snap!(options[:snapping_grid_cell_size])

  self.rulings << [ruling, color]
end
|
272
|
+
|
273
|
+
##
# Size of the current page, taken from the page's media box.
# The value is computed on first use and cached in @pageSize.
def pageSize
  return @pageSize if @pageSize
  @pageSize = self.page.findMediaBox.createDimension
end
|
278
|
+
|
279
|
+
##
# Flips a Y coordinate against the page height: returns the page height
# minus +y+.
def fixY(y)
  page_height = pageSize.getHeight
  page_height - y
end
|
284
|
+
|
285
|
+
# Scales an (x, y) pair.
#
# args - x, y and optionally scaleX, scaleY. When only x and y are given the
#        scale factors are read from the current transformation matrix.
#
# A component whose scale factor is not positive comes out as 0.0.
#
# Returns a java.awt.geom.Point2D::Float.
def ScaledPoint(*args)
  x, y, scaleX, scaleY = args

  # if scale factor not provided, get it from current transformation matrix
  if args.size == 2
    at = getGraphicsState.getCurrentTransformationMatrix.createAffineTransform
    scaleX, scaleY = at.getScaleX, at.getScaleY
  end

  finalX = scaleX > 0 ? x * scaleX : 0.0
  finalY = scaleY > 0 ? y * scaleY : 0.0

  java.awt.geom.Point2D::Float.new(finalX, finalY)
end
|
310
|
+
|
311
|
+
# Applies the current transformation matrix to (x, y) and then flips the
# resulting Y coordinate with fixY.
#
# Returns a java.awt.geom.Point2D::Float.
def TransformedPoint(x, y)
  coords = [x,y].to_java(:float)
  ctm = self.getGraphicsState.getCurrentTransformationMatrix
  # transform the single point in place
  ctm.createAffineTransform.transform(coords, 0, coords, 0, 1)
  coords[1] = fixY(coords[1])
  java.awt.geom.Point2D::Float.new(coords[0], coords[1])
end
|
318
|
+
|
319
|
+
end
|
data/lib/tabula/pdf_render.rb
CHANGED
@@ -1,7 +1,5 @@
|
|
1
1
|
require 'java'
|
2
2
|
|
3
|
-
require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
4
|
-
|
5
3
|
java_import org.apache.pdfbox.pdmodel.PDDocument
|
6
4
|
java_import org.apache.pdfbox.pdfviewer.PageDrawer
|
7
5
|
java_import java.awt.image.BufferedImage
|
@@ -31,7 +29,6 @@ module Tabula
|
|
31
29
|
rotation = java.lang.Math.toRadians(page.findRotation)
|
32
30
|
|
33
31
|
scaling = width / (rotation == 0 ? widthPt : heightPt)
|
34
|
-
#widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
|
35
32
|
widthPx, heightPx = (java.lang.Math.java_send :round, [Java::float], widthPt * scaling ), (java.lang.Math.java_send :round, [Java::float], heightPt * scaling)
|
36
33
|
|
37
34
|
|
@@ -44,7 +41,7 @@ module Tabula
|
|
44
41
|
graphics.setBackground(TRANSPARENT_WHITE)
|
45
42
|
graphics.clearRect(0, 0, retval.getWidth, retval.getHeight)
|
46
43
|
if rotation != 0
|
47
|
-
graphics.translate
|
44
|
+
graphics.java_send :translate, [Java::int, Java::int], retval.getWidth, 0.0
|
48
45
|
graphics.rotate(rotation)
|
49
46
|
end
|
50
47
|
graphics.scale(scaling, scaling)
|
@@ -65,4 +62,3 @@ if __FILE__ == $0
|
|
65
62
|
ImageIO.write(bi, 'png',
|
66
63
|
java.io.File.new('notext.png'))
|
67
64
|
end
|
68
|
-
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module Tabula
  module Extraction

    warn 'Tabula::Extraction::SpreadsheetExtractor is DEPRECATED and will be removed. Use ObjectExtractor instead'

    class SpreadsheetExtractor < ObjectExtractor

      # Returns an Enumerator that yields each spreadsheet together with the
      # page it was found on, as (page, spreadsheet).
      #
      # A page can contain any number of spreadsheets, so the same page may be
      # yielded several times (or not at all); the extract.each_with_index
      # trick will therefore not give page numbers.
      #
      # The PDF file is closed once enumeration finishes or raises.
      #
      # TODO lots of repeated code with parent class
      # REFACTOR
      def extract(options={})
        Enumerator.new do |yielder|
          begin
            @pages.each do |page_number|
              # TODO: this can error out ungracefully if you try to extract a page that
              # doesn't exist (e.g. page 5 of a 4 page doc). we should catch and handle.
              pdfbox_page = @all_pages.get(page_number - 1)

              # pages without a content stream are skipped
              next if pdfbox_page.getContents.nil?

              self.clear!
              self.drawPage pdfbox_page

              crop_box_width = pdfbox_page.findCropBox.width
              crop_box_height = pdfbox_page.findCropBox.height
              page = Tabula::Page.new(@pdf_filename,
                                      crop_box_width,
                                      crop_box_height,
                                      pdfbox_page.getRotation.to_i,
                                      page_number, # one-indexed, like the entries of @pages
                                      self.characters,
                                      self.rulings)

              page.spreadsheets(options).each do |spreadsheet|
                spreadsheet.cells.each { |cell| cell.text_elements = page.get_cell_text(cell) }
                yielder.yield page, spreadsheet
              end
            end
          ensure
            @pdf_file.close
          end # begin
        end
      end
    end
  end
end
|
48
|
+
|
49
|
+
|
50
|
+
#new plan:
|
51
|
+
# find all the cells on the page (lines -> minimal rects)
|
52
|
+
# find all the spreadsheets from the cells (minimal rects -> maximal rects)
|