tabula-extractor 0.6.6-java → 0.7.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/AUTHORS.md +1 -0
- data/README.md +27 -11
- data/bin/tabula +61 -19
- data/ext/liblsd-linux32.so +0 -0
- data/ext/liblsd-linux64.so +0 -0
- data/ext/liblsd.dll +0 -0
- data/ext/liblsd.dylib +0 -0
- data/ext/liblsd64.dll +0 -0
- data/ext/lsd.c +137 -137
- data/ext/lsd.h +9 -9
- data/lib/tabula.rb +20 -3
- data/lib/tabula/core_ext.rb +261 -0
- data/lib/tabula/entities.rb +11 -456
- data/lib/tabula/entities/cell.rb +42 -0
- data/lib/tabula/entities/has_cells.rb +244 -0
- data/lib/tabula/entities/line.rb +39 -0
- data/lib/tabula/entities/page.rb +269 -0
- data/lib/tabula/entities/page_area.rb +7 -0
- data/lib/tabula/entities/ruling.rb +300 -0
- data/lib/tabula/entities/spreadsheet.rb +92 -0
- data/lib/tabula/entities/table.rb +81 -0
- data/lib/tabula/entities/text_chunk.rb +114 -0
- data/lib/tabula/entities/text_element.rb +112 -0
- data/lib/tabula/entities/zone_entity.rb +57 -0
- data/lib/tabula/extraction.rb +327 -0
- data/lib/tabula/line_segment_detector.rb +9 -7
- data/lib/tabula/pdf_line_extractor.rb +319 -0
- data/lib/tabula/pdf_render.rb +1 -5
- data/lib/tabula/spreadsheet_extractor.rb +52 -0
- data/lib/tabula/table_extractor.rb +50 -348
- data/lib/tabula/table_guesser.rb +21 -23
- data/lib/tabula/version.rb +1 -1
- data/lib/tabula/writers.rb +5 -6
- data/tabula-extractor.gemspec +1 -0
- data/target/pdfbox-app-2.0.0-SNAPSHOT.jar +0 -0
- data/test/data/47008204D_USA.page4.pdf +0 -0
- data/test/data/560015757GV_China.page1.pdf +0 -0
- data/test/data/GSK_2012_Q4.page437.pdf +0 -0
- data/test/data/S2MNCEbirdisland.pdf +0 -0
- data/test/data/campaign_donors.pdf +0 -0
- data/test/data/frx_2012_disclosure.tsv +88 -0
- data/test/data/no_tables.pdf +0 -0
- data/test/data/puertos1.pdf +0 -0
- data/test/data/spanning_cells.csv +21 -0
- data/test/data/spanning_cells.pdf +0 -0
- data/test/data/strongschools.pdf +0 -0
- data/{vertical_rulings_bug.pdf → test/data/vertical_rulings_bug.pdf} +0 -0
- data/test/data/vietnam3.pdf +0 -0
- data/test/heuristic-test-set/original/560015757GV_China.page1.pdf +0 -0
- data/test/heuristic-test-set/original/S2MNCEbirdisland.pdf +0 -0
- data/test/heuristic-test-set/original/bo_page24.pdf +0 -0
- data/test/heuristic-test-set/original/campaign_donors.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/47008204D_USA.page4.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/GSK_2012_Q4.page437.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/strongschools.pdf +0 -0
- data/test/heuristic-test-set/spreadsheet/tabla_subsidios.pdf +0 -0
- data/test/heuristic.rb +50 -0
- data/test/test_bin_tabula.sh +7 -0
- data/test/tests.rb +476 -63
- metadata +79 -28
- data/lib/geom/point.rb +0 -21
- data/lib/geom/rectangle.rb +0 -101
- data/lib/geom/segment.rb +0 -82
- data/lib/tabula/pdf_dump.rb +0 -132
- data/lib/tabula/whitespace.rb +0 -50
- data/vertical_rulings_bug.rb +0 -29
@@ -5,8 +5,7 @@ require 'ffi'
|
|
5
5
|
|
6
6
|
require_relative './entities'
|
7
7
|
require_relative './pdf_render'
|
8
|
-
require_relative './
|
9
|
-
require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
8
|
+
require_relative './extraction'
|
10
9
|
|
11
10
|
java_import javax.imageio.ImageIO
|
12
11
|
java_import java.awt.image.BufferedImage
|
@@ -55,6 +54,7 @@ module Tabula
|
|
55
54
|
lines
|
56
55
|
end
|
57
56
|
|
57
|
+
#zero-indexed page_number
|
58
58
|
def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
|
59
59
|
options = DETECT_LINES_DEFAULTS.merge(options)
|
60
60
|
|
@@ -79,7 +79,7 @@ module Tabula
|
|
79
79
|
raise ArgumentError, 'image must be a string or a BufferedImage'
|
80
80
|
end
|
81
81
|
|
82
|
-
image = LSD.
|
82
|
+
image = LSD.image_to_image_float(bimage)
|
83
83
|
|
84
84
|
lines_found_ptr = FFI::MemoryPointer.new(:int, 1)
|
85
85
|
|
@@ -89,7 +89,7 @@ module Tabula
|
|
89
89
|
|
90
90
|
rv = []
|
91
91
|
lines_found.times do |i|
|
92
|
-
a = out[7*
|
92
|
+
a = out[7*4*i].read_array_of_type(:float, 7)
|
93
93
|
|
94
94
|
a_round = a[0..3].map(&:round)
|
95
95
|
p1, p2 = [[a_round[0], a_round[1]], [a_round[2], a_round[3]]]
|
@@ -109,17 +109,19 @@ module Tabula
|
|
109
109
|
end
|
110
110
|
|
111
111
|
private
|
112
|
-
|
112
|
+
|
113
|
+
def LSD.image_to_image_float(buffered_image)
|
113
114
|
width = buffered_image.getWidth; height = buffered_image.getHeight
|
114
115
|
raster_size = width * height
|
115
116
|
|
116
|
-
|
117
|
+
image_float = FFI::MemoryPointer.new(:float, raster_size)
|
117
118
|
pixels = Java::int[width * height].new
|
118
119
|
buffered_image.getRGB(0, 0, width, height, pixels, 0, width)
|
119
120
|
|
120
|
-
|
121
|
+
image_float.put_array_of_float 0, pixels.to_a
|
121
122
|
end
|
122
123
|
|
124
|
+
|
123
125
|
end
|
124
126
|
end
|
125
127
|
|
@@ -0,0 +1,319 @@
|
|
1
|
+
java_import org.apache.pdfbox.util.operator.OperatorProcessor
|
2
|
+
java_import org.apache.pdfbox.pdfparser.PDFParser
|
3
|
+
java_import org.apache.pdfbox.util.PDFStreamEngine
|
4
|
+
java_import org.apache.pdfbox.util.ResourceLoader
|
5
|
+
|
6
|
+
java_import java.awt.geom.PathIterator
|
7
|
+
java_import java.awt.geom.Point2D
|
8
|
+
java_import java.awt.geom.GeneralPath
|
9
|
+
java_import java.awt.geom.AffineTransform
|
10
|
+
java_import java.awt.Color
|
11
|
+
|
12
|
+
warn 'Tabula::Extraction::LineExtractor is DEPRECATED and will be removed'
|
13
|
+
|
14
|
+
class Tabula::Extraction::LineExtractor < org.apache.pdfbox.util.PDFStreamEngine
|
15
|
+
|
16
|
+
attr_accessor :currentX, :currentY
|
17
|
+
attr_accessor :currentPath
|
18
|
+
attr_accessor :rulings
|
19
|
+
attr_accessor :options
|
20
|
+
field_accessor :page
|
21
|
+
|
22
|
+
DETECT_LINES_DEFAULTS = {
|
23
|
+
:snapping_grid_cell_size => 2
|
24
|
+
}
|
25
|
+
|
26
|
+
# Merges vertical rulings that share the same `left` and nearly intersect.
#
# lines - Array of rulings, all expected to be of one orientation (vertical).
#         The array is sorted in place (by left, then top), and a ruling that
#         absorbs a neighbour has its top/bottom updated in place.
#
# Returns a new Array holding the surviving ruling objects.
def self.collapse_vertical_rulings(lines)
  lines.sort! do |a, b|
    a.left == b.left ? a.top <=> b.top : a.left <=> b.left
  end

  lines.each_with_object([]) do |candidate, collapsed|
    previous = collapsed.last
    if previous && candidate.left == previous.left && previous.nearlyIntersects?(candidate)
      # extend the previously kept ruling so that it covers the candidate too
      previous.top = [candidate.top, previous.top].min
      previous.bottom = [candidate.bottom, previous.bottom].max
    else
      collapsed << candidate
    end
  end
end
|
38
|
+
|
39
|
+
# Merges horizontal rulings that share the same `top` and nearly intersect.
#
# lines - Array of rulings, all expected to be of one orientation (horizontal).
#         The array is sorted in place (by top, then left), and a ruling that
#         absorbs a neighbour has its left/right updated in place.
#
# Returns a new Array holding the surviving ruling objects.
def self.collapse_horizontal_rulings(lines)
  lines.sort! do |a, b|
    a.top == b.top ? a.left <=> b.left : a.top <=> b.top
  end

  lines.each_with_object([]) do |candidate, collapsed|
    previous = collapsed.last
    if previous && candidate.top == previous.top && previous.nearlyIntersects?(candidate)
      # extend the previously kept ruling so that it covers the candidate too
      previous.left = [candidate.left, previous.left].min
      previous.right = [candidate.right, previous.right].max
    else
      collapsed << candidate
    end
  end
end
|
51
|
+
|
52
|
+
# N.B. for the merge of `spreadsheets` into `text-extractor-refactor`:
# the only substantive change here is calling Tabula::Ruling::clean_rulings
# on the LSD output in this method; the rest is readability changes.
#
# Returns the rulings found on one page of a PDF.
#
# pdf_path    - path passed to the LSD detector or to Tabula::Extraction.openPDF
# page_number - zero-indexed page number
# options     - Hash of options layered over DETECT_LINES_DEFAULTS. Values
#               supplied by the caller take precedence over the defaults and
#               the caller's Hash is left untouched.
#               :render_pdf - when truthy, rulings come from
#               Tabula::LSD.detect_lines_in_pdf_page and are passed through
#               Tabula::Ruling.clean_rulings; otherwise the page's content
#               stream is processed by an instance of this class.
def self.lines_in_pdf_page(pdf_path, page_number, options={})
  # defaults first, caller's options on top (matches the merge used by Tabula::LSD)
  options = DETECT_LINES_DEFAULTS.merge(options)

  if options[:render_pdf]
    # only LSD rulings need to be "cleaned" with clean_rulings; might as well do this here
    # since there's no good reason to want unclean lines
    Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(pdf_path, page_number, options))
  else
    pdf_file = ::Tabula::Extraction.openPDF(pdf_path)
    begin
      page = pdf_file.getDocumentCatalog.getAllPages[page_number]
      le = self.new(options)
      le.processStream(page, page.findResources, page.getContents.getStream)
    ensure
      # close the document even when the page lookup or stream processing raises
      pdf_file.close
    end

    rulings = le.rulings.map do |l, color|
      ::Tabula::Ruling.new(l.getP1.getY,
                           l.getP1.getX,
                           l.getP2.getX - l.getP1.getX,
                           l.getP2.getY - l.getP1.getY,
                           color)
    end
    # drop zero-length rulings and rulings with any negative coordinate
    rulings.reject! { |l| (l.left == l.right && l.top == l.bottom) || [l.top, l.left, l.bottom, l.right].any? { |p| p < 0 } }
    collapse_vertical_rulings(rulings.select(&:vertical?)) + collapse_horizontal_rulings(rulings.select(&:horizontal?))
  end
end
|
79
|
+
|
80
|
+
# Operator processor registered for the 'l' operator in OPERATOR_PROCESSORS.
# Builds a segment from the extractor's current point to the transformed
# target point, keeps it only if it is horizontal or vertical, and then moves
# the current point to the target.
class LineToOperator < OperatorProcessor
  # operator  - unused
  # arguments - operands; the first two are read via floatValue as x and y
  def process(operator, arguments)
    extractor = self.context
    target = extractor.TransformedPoint(arguments[0].floatValue, arguments[1].floatValue)
    target_x, target_y = target.getX, target.getY

    segment = java.awt.geom.Line2D::Float.new(extractor.currentX, extractor.currentY, target_x, target_y)
    extractor.currentPath << segment if segment.horizontal? || segment.vertical?

    extractor.currentX, extractor.currentY = target_x, target_y
  end
end
|
93
|
+
|
94
|
+
# Operator processor registered for the 'm' operator in OPERATOR_PROCESSORS.
# Moves the extractor's current point to the transformed operand coordinates
# without adding anything to the current path.
class MoveToOperator < OperatorProcessor
  # operator  - unused
  # arguments - operands; the first two are read via floatValue as x and y
  def process(operator, arguments)
    extractor = self.context
    target = extractor.TransformedPoint(arguments[0].floatValue, arguments[1].floatValue)

    extractor.currentX, extractor.currentY = target.getX, target.getY
  end
end
|
104
|
+
|
105
|
+
# Operator processor registered for the 're' operator in OPERATOR_PROCESSORS.
# Turns a rectangle into line segments and appends the horizontal/vertical
# ones to the extractor's current path:
#   * a "thin" rectangle (smaller side under 2) becomes a single centre line;
#   * any other rectangle contributes its four edges.
class AppendRectangleToPathOperator < OperatorProcessor
  # operator  - unused
  # arguments - operands, read via floatValue as x, y, width, height
  def process(operator, arguments)
    extractor = self.context
    rect_x, rect_y, rect_w, rect_h = arguments.to_array.map(&:floatValue)

    origin = extractor.TransformedPoint(rect_x, rect_y)
    extent = extractor.ScaledPoint(rect_w, rect_h)

    left = origin.getX
    span_x = extent.getX
    span_y = extent.getY

    # clamp the top edge so it never goes above the page
    top = origin.getY - span_y
    top = 0 if top < 0

    width = span_x.abs
    height = span_y.abs
    right = left + span_x
    bottom = top + span_y

    segment = lambda { |x1, y1, x2, y2| java.awt.geom.Line2D::Float.new(x1, y1, x2, y2) }

    edges =
      if width > height && height < 2 # horizontal line, "thin" rectangle.
        mid_y = top + span_y / 2
        [segment.call(left, mid_y, right, mid_y)]
      elsif width < height && width < 2 # vertical line, "thin" rectangle
        mid_x = left + span_x / 2
        [segment.call(mid_x, top, mid_x, bottom)]
      else
        # every edge of the rectangle is a candidate for the current path
        [segment.call(left, top, right, top),
         segment.call(left, top, left, bottom),
         segment.call(right, top, right, bottom),
         segment.call(left, bottom, right, bottom)]
      end

    extractor.currentPath += edges.select { |edge| edge.horizontal? || edge.vertical? }
  end
end
|
138
|
+
|
139
|
+
# Operator processor registered for the 'S' and 's' operators in
# OPERATOR_PROCESSORS. Promotes the segments of the current path to rulings,
# tagged with the current stroking colour, then empties the current path.
class StrokePathOperator < OperatorProcessor
  # operator  - unused
  # arguments - unused
  def process(operator, arguments)
    extractor = self.context
    rgb = extractor.getGraphicsState.getStrokingColor.getJavaColor.getRGBColorComponents(nil)

    # by default, use all lines, regardless of color
    filter = extractor.options[:line_color_filter]
    accepted = filter ? filter.call(rgb) : true

    if accepted
      extractor.currentPath.each do |segment|
        extractor.addRuling(segment, rgb.to_a)
      end
    end

    extractor.currentPath = []
  end
end
|
151
|
+
|
152
|
+
# Operator processor registered for the 'b', 'b*', 'f' and 'f*' operators in
# OPERATOR_PROCESSORS. Promotes the segments of the current path to rulings,
# tagged with the current non-stroking (fill) colour, then empties the
# current path.
class CloseFillNonZeroAndStrokePathOperator < OperatorProcessor
  # operator  - unused
  # arguments - unused
  def process(operator, arguments)
    extractor = self.context
    rgb = extractor.getGraphicsState.getNonStrokingColor.getJavaColor.getRGBColorComponents(nil)

    # by default, use all lines, regardless of color
    filter = extractor.options[:line_color_filter]
    accepted = filter ? filter.call(rgb) : true

    if accepted
      extractor.currentPath.each do |segment|
        extractor.addRuling(segment, rgb.to_a)
      end
    end

    extractor.currentPath = []
  end
end
|
165
|
+
|
166
|
+
# Operator processor that promotes every segment of the current path to a
# ruling (without a colour, so addRuling applies its default) and then
# empties the current path. No colour filter is consulted here.
class CloseAndStrokePathOperator < OperatorProcessor
  # operator  - unused
  # arguments - unused
  def process(operator, arguments)
    extractor = self.context
    extractor.currentPath.each do |segment|
      extractor.addRuling(segment)
    end
    extractor.currentPath = []
  end
end
|
173
|
+
|
174
|
+
# Operator processor registered for the 'n' operator in OPERATOR_PROCESSORS.
# The path ends without being stroked, so its segments are of no interest
# and are thrown away.
class EndPathOperator < OperatorProcessor
  # operator  - unused
  # arguments - unused
  def process(operator, arguments)
    self.context.currentPath = []
  end
end
|
181
|
+
|
182
|
+
# Operator processor that discards the current path without recording any
# rulings.
class FillNonZeroRuleOperator < OperatorProcessor
  # operator  - unused
  # arguments - unused
  def process(operator, arguments)
    # no stroke, so the accumulated segments are simply dropped
    self.context.currentPath = []
  end
end
|
189
|
+
|
190
|
+
# Maps content-stream operator names to the processor instance registered
# for them in #initialize.
#
# The first group uses the path-handling processors defined in this class;
# the second group uses stock PDFBox processors (graphics state, colour and
# text operators).
OPERATOR_PROCESSORS = {
  # path construction / painting, handled by this class
  'm' => MoveToOperator.new,
  're' => AppendRectangleToPathOperator.new,
  'l' => LineToOperator.new,
  'S' => StrokePathOperator.new,
  's' => StrokePathOperator.new,
  'n' => EndPathOperator.new,
  'b' => CloseFillNonZeroAndStrokePathOperator.new,
  'b*' => CloseFillNonZeroAndStrokePathOperator.new,
  'f' => CloseFillNonZeroAndStrokePathOperator.new,
  'f*' => CloseFillNonZeroAndStrokePathOperator.new,

  # stock PDFBox processors
  'BT' => org.apache.pdfbox.util.operator.BeginText.new,
  'cm' => org.apache.pdfbox.util.operator.Concatenate.new,
  'CS' => org.apache.pdfbox.util.operator.SetStrokingColorSpace.new,
  'cs' => org.apache.pdfbox.util.operator.SetNonStrokingColorSpace.new,
  'ET' => org.apache.pdfbox.util.operator.EndText.new,
  'G' => org.apache.pdfbox.util.operator.SetStrokingGrayColor.new,
  'g' => org.apache.pdfbox.util.operator.SetNonStrokingGrayColor.new,
  'gs' => org.apache.pdfbox.util.operator.SetGraphicsStateParameters.new,
  'K' => org.apache.pdfbox.util.operator.SetStrokingCMYKColor.new,
  'k' => org.apache.pdfbox.util.operator.SetNonStrokingCMYKColor.new,
  'q' => org.apache.pdfbox.util.operator.GSave.new,
  'Q' => org.apache.pdfbox.util.operator.GRestore.new,
  'RG' => org.apache.pdfbox.util.operator.SetStrokingRGBColor.new,
  'rg' => org.apache.pdfbox.util.operator.SetNonStrokingRGBColor.new,
  'SC' => org.apache.pdfbox.util.operator.SetStrokingColor.new,
  'sc' => org.apache.pdfbox.util.operator.SetNonStrokingColor.new,
  'SCN' => org.apache.pdfbox.util.operator.SetStrokingColor.new,
  'scn' => org.apache.pdfbox.util.operator.SetNonStrokingColor.new,
  'T*' => org.apache.pdfbox.util.operator.NextLine.new,
  'Tc' => org.apache.pdfbox.util.operator.SetCharSpacing.new,
  'Td' => org.apache.pdfbox.util.operator.MoveText.new,
  'TD' => org.apache.pdfbox.util.operator.MoveTextSetLeading.new,
  'Tf' => org.apache.pdfbox.util.operator.SetTextFont.new,
  'Tj' => org.apache.pdfbox.util.operator.ShowText.new,
  'TJ' => org.apache.pdfbox.util.operator.ShowTextGlyph.new,
  'TL' => org.apache.pdfbox.util.operator.SetTextLeading.new,
  'Tm' => org.apache.pdfbox.util.operator.SetMatrix.new,
  'Tr' => org.apache.pdfbox.util.operator.SetTextRenderingMode.new,
  'Ts' => org.apache.pdfbox.util.operator.SetTextRise.new,
  'Tw' => org.apache.pdfbox.util.operator.SetWordSpacing.new,
  'Tz' => org.apache.pdfbox.util.operator.SetHorizontalTextScaling.new,
  "'" => org.apache.pdfbox.util.operator.MoveAndShow.new,
  # The operator name is the single character `"`. The previous key was the
  # single-quoted literal '\"', which Ruby reads as backslash + double quote,
  # so this processor was never registered under the real operator name.
  '"' => org.apache.pdfbox.util.operator.SetMoveAndShow.new,
}
|
235
|
+
|
236
|
+
# Sets up a line extractor.
#
# options - Hash of options layered over DETECT_LINES_DEFAULTS. Values
#           supplied by the caller take precedence over the defaults, and
#           the caller's Hash is not modified.
#
# Resets the extractor state via #clear! and registers every entry of
# OPERATOR_PROCESSORS with the stream engine.
def initialize(options={})
  super()
  # defaults first, caller's options on top; the previous
  # `options.merge!(DETECT_LINES_DEFAULTS)` let the defaults clobber
  # caller-supplied values and mutated the caller's hash
  @options = DETECT_LINES_DEFAULTS.merge(options)
  self.clear!
  OPERATOR_PROCESSORS.each { |k,v| registerOperatorProcessor(k, v) }
end
|
242
|
+
|
243
|
+
# Resets the per-page extraction state: no rulings, an empty current path,
# the current point at (-1, -1) and no cached page size.
def clear!
  self.currentPath = []
  self.rulings = []
  self.currentX = self.currentY = -1
  @pageSize = nil
end
|
250
|
+
|
251
|
+
# Records a ruling together with its colour.
#
# ruling - segment to record; it is rotated/translated in place when the page
#          rotation is one of 90, -270, -90 or 270, and always snapped to the
#          grid given by options[:snapping_grid_cell_size]
# color  - colour components to store with the ruling; [0,0,0] when nil
#
# Returns the rulings Array after appending [ruling, color].
def addRuling(ruling, color=nil)
  color = [0,0,0] if color.nil?

  rotation = page.getRotation
  if !rotation.nil? && [90, -270, -90, 270].include?(rotation)
    media_box = page.findMediaBox
    ruling.rotate!(media_box.getLowerLeftX, media_box.getLowerLeftY, rotation)

    # shift the rotated ruling back onto the page
    shift_x, shift_y =
      if rotation == 90 || rotation == -270
        [media_box.getHeight, 0]
      else
        [0, media_box.getWidth]
      end
    ruling.transform!(AffineTransform.getTranslateInstance(shift_x, shift_y))
  end

  # snapping to grid and joining lines that are close together
  ruling.snap!(options[:snapping_grid_cell_size])

  self.rulings << [ruling, color]
end
|
272
|
+
|
273
|
+
##
# Dimension of the current page's media box. The value is computed on first
# use and cached in @pageSize (which #clear! resets to nil).
def pageSize
  @pageSize = self.page.findMediaBox.createDimension unless @pageSize
  @pageSize
end
|
278
|
+
|
279
|
+
##
# Flips a y coordinate against the page height: returns the page height
# (pageSize.getHeight) minus y.
def fixY(y)
  page_height = pageSize.getHeight
  page_height - y
end
|
284
|
+
|
285
|
+
# Scales a point.
#
# args - either (x, y), in which case the scale factors are taken from the
#        current transformation matrix, or (x, y, scaleX, scaleY)
#
# A coordinate whose scale factor is not positive comes out as 0.0.
#
# Returns a java.awt.geom.Point2D::Float.
def ScaledPoint(*args)
  x, y, scaleX, scaleY = args

  # if scale factor not provided, get it from current transformation matrix
  if args.size == 2
    at = getGraphicsState.getCurrentTransformationMatrix.createAffineTransform
    scaleX, scaleY = at.getScaleX, at.getScaleY
  end

  finalX = scaleX > 0 ? x * scaleX : 0.0
  finalY = scaleY > 0 ? y * scaleY : 0.0

  java.awt.geom.Point2D::Float.new(finalX, finalY)
end
|
310
|
+
|
311
|
+
# Applies the current transformation matrix to (x, y) and then passes the
# resulting y coordinate through #fixY.
#
# Returns a java.awt.geom.Point2D::Float.
def TransformedPoint(x, y)
  coords = [x,y].to_java(:float)
  ctm = self.getGraphicsState.getCurrentTransformationMatrix
  # transform one point in place (source and destination are the same array)
  ctm.createAffineTransform.transform(coords, 0, coords, 0, 1)
  coords[1] = fixY(coords[1])
  java.awt.geom.Point2D::Float.new(coords[0], coords[1])
end
|
318
|
+
|
319
|
+
end
|
data/lib/tabula/pdf_render.rb
CHANGED
@@ -1,7 +1,5 @@
|
|
1
1
|
require 'java'
|
2
2
|
|
3
|
-
require File.join(File.dirname(__FILE__), '../../target/', Tabula::PDFBOX)
|
4
|
-
|
5
3
|
java_import org.apache.pdfbox.pdmodel.PDDocument
|
6
4
|
java_import org.apache.pdfbox.pdfviewer.PageDrawer
|
7
5
|
java_import java.awt.image.BufferedImage
|
@@ -31,7 +29,6 @@ module Tabula
|
|
31
29
|
rotation = java.lang.Math.toRadians(page.findRotation)
|
32
30
|
|
33
31
|
scaling = width / (rotation == 0 ? widthPt : heightPt)
|
34
|
-
#widthPx, heightPx = java.lang.Math.round(widthPt * scaling), java.lang.Math.round(heightPt * scaling)
|
35
32
|
widthPx, heightPx = (java.lang.Math.java_send :round, [Java::float], widthPt * scaling ), (java.lang.Math.java_send :round, [Java::float], heightPt * scaling)
|
36
33
|
|
37
34
|
|
@@ -44,7 +41,7 @@ module Tabula
|
|
44
41
|
graphics.setBackground(TRANSPARENT_WHITE)
|
45
42
|
graphics.clearRect(0, 0, retval.getWidth, retval.getHeight)
|
46
43
|
if rotation != 0
|
47
|
-
graphics.translate
|
44
|
+
graphics.java_send :translate, [Java::int, Java::int], retval.getWidth, 0.0
|
48
45
|
graphics.rotate(rotation)
|
49
46
|
end
|
50
47
|
graphics.scale(scaling, scaling)
|
@@ -65,4 +62,3 @@ if __FILE__ == $0
|
|
65
62
|
ImageIO.write(bi, 'png',
|
66
63
|
java.io.File.new('notext.png'))
|
67
64
|
end
|
68
|
-
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module Tabula
  module Extraction

    warn 'Tabula::Extraction::SpreadsheetExtractor is DEPRECATED and will be removed. Use ObjectExtractor instead'

    # Deprecated ObjectExtractor subclass whose #extract enumerates the
    # spreadsheets found on each requested page.
    class SpreadsheetExtractor < ObjectExtractor

      # Returns an Enumerator that yields each spreadsheet together with the
      # page it was found on, as (page, spreadsheet).
      #
      # Because a page can contain an arbitrary number of spreadsheets, the
      # same page can be yielded an arbitrary number of times, so the
      # extract.each_with_index trick will absolutely not work.
      #
      # options - Hash passed through to Tabula::Page#spreadsheets
      #
      # The PDF file is closed when the enumeration finishes or is aborted
      # by an exception.
      #
      # TODO lots of repeated code with parent class
      # REFACTOR
      def extract(options={})
        Enumerator.new do |yielder|
          begin
            @pages.each do |page_number|
              # TODO: this can error out ungracefully if you try to extract a page that
              # doesn't exist (e.g. page 5 of a 4 page doc). we should catch and handle.
              pdfbox_page = @all_pages.get(page_number - 1)
              next if pdfbox_page.getContents.nil?

              self.clear!
              self.drawPage pdfbox_page

              crop_box = pdfbox_page.findCropBox
              page = Tabula::Page.new(@pdf_filename,
                                      crop_box.width,
                                      crop_box.height,
                                      pdfbox_page.getRotation.to_i,
                                      page_number, # one-indexed, as in @pages
                                      self.characters,
                                      self.rulings)

              page.spreadsheets(options).each do |spreadsheet|
                spreadsheet.cells.each { |cell| cell.text_elements = page.get_cell_text(cell) }
                yielder.yield page, spreadsheet
              end
            end
          ensure
            @pdf_file.close
          end # begin
        end
      end
    end
  end
end
|
48
|
+
|
49
|
+
|
50
|
+
#new plan:
|
51
|
+
# find all the cells on the page (lines -> minimal rects)
|
52
|
+
# find all the spreadsheets from the cells (minimal rects -> maximal rects)
|