pdf-reader 1.2.0 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +7 -1
- data/README.rdoc +1 -0
- data/Rakefile +23 -8
- data/lib/pdf-reader.rb +3 -1
- data/lib/pdf/hash.rb +5 -1
- data/lib/pdf/reader.rb +8 -1
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier.afm +342 -0
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -0
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
- data/lib/pdf/reader/buffer.rb +14 -6
- data/lib/pdf/reader/cid_widths.rb +61 -0
- data/lib/pdf/reader/cmap.rb +8 -2
- data/lib/pdf/reader/encoding.rb +52 -27
- data/lib/pdf/reader/error.rb +16 -1
- data/lib/pdf/reader/filter.rb +2 -0
- data/lib/pdf/reader/filter/ascii85.rb +3 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +3 -1
- data/lib/pdf/reader/filter/depredict.rb +2 -0
- data/lib/pdf/reader/filter/flate.rb +3 -1
- data/lib/pdf/reader/filter/lzw.rb +1 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +2 -1
- data/lib/pdf/reader/font.rb +74 -18
- data/lib/pdf/reader/font_descriptor.rb +80 -0
- data/lib/pdf/reader/glyph_hash.rb +6 -0
- data/lib/pdf/reader/lzw.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -1
- data/lib/pdf/reader/object_hash.rb +1 -1
- data/lib/pdf/reader/page_layout.rb +125 -0
- data/lib/pdf/reader/page_state.rb +172 -69
- data/lib/pdf/reader/page_text_receiver.rb +50 -21
- data/lib/pdf/reader/pages_strategy.rb +17 -4
- data/lib/pdf/reader/parser.rb +25 -52
- data/lib/pdf/reader/print_receiver.rb +5 -0
- data/lib/pdf/reader/reference.rb +2 -0
- data/lib/pdf/reader/register_receiver.rb +1 -1
- data/lib/pdf/reader/standard_security_handler.rb +2 -0
- data/lib/pdf/reader/stream.rb +2 -0
- data/lib/pdf/reader/synchronized_cache.rb +32 -0
- data/lib/pdf/reader/text_receiver.rb +5 -4
- data/lib/pdf/reader/text_run.rb +80 -0
- data/lib/pdf/reader/token.rb +2 -0
- data/lib/pdf/reader/transformation_matrix.rb +194 -0
- data/lib/pdf/reader/width_calculator.rb +11 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +50 -0
- data/lib/pdf/reader/width_calculator/composite.rb +27 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +32 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +24 -0
- data/lib/pdf/reader/xref.rb +9 -2
- metadata +119 -13
@@ -1,22 +1,23 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
|
3
|
-
require '
|
3
|
+
require 'pdf/reader/transformation_matrix'
|
4
4
|
|
5
|
-
|
6
|
-
|
5
|
+
class PDF::Reader
|
6
|
+
# encapsulates logic for tracking graphics state as the instructions for
|
7
|
+
# a single page are processed. Most of the public methods correspond
|
8
|
+
# directly to PDF operators.
|
7
9
|
class PageState
|
8
10
|
|
9
11
|
DEFAULT_GRAPHICS_STATE = {
|
10
|
-
:
|
11
|
-
:
|
12
|
-
:
|
13
|
-
:
|
14
|
-
:
|
15
|
-
:text_font => nil,
|
12
|
+
:char_spacing => 0,
|
13
|
+
:word_spacing => 0,
|
14
|
+
:h_scaling => 1.0,
|
15
|
+
:text_leading => 0,
|
16
|
+
:text_font => nil,
|
16
17
|
:text_font_size => nil,
|
17
|
-
:text_mode
|
18
|
-
:text_rise
|
19
|
-
:text_knockout
|
18
|
+
:text_mode => 0,
|
19
|
+
:text_rise => 0,
|
20
|
+
:text_knockout => 0
|
20
21
|
}
|
21
22
|
|
22
23
|
# starting a new page
|
@@ -28,16 +29,23 @@ module PDF
|
|
28
29
|
@xobject_stack = [page.xobjects]
|
29
30
|
@cs_stack = [page.color_spaces]
|
30
31
|
@stack = [DEFAULT_GRAPHICS_STATE.dup]
|
32
|
+
state[:ctm] = identity_matrix
|
31
33
|
end
|
32
34
|
|
33
35
|
#####################################################
|
34
36
|
# Graphics State Operators
|
35
37
|
#####################################################
|
36
38
|
|
39
|
+
# Clones the current graphics state and pushes it onto the top of the stack.
|
40
|
+
# Any changes that are subsequently made to the state can then be reversed
|
41
|
+
# by calling restore_graphics_state.
|
42
|
+
#
|
37
43
|
def save_graphics_state
|
38
44
|
@stack.push clone_state
|
39
45
|
end
|
40
46
|
|
47
|
+
# Restore the state to the previous value on the stack.
|
48
|
+
#
|
41
49
|
def restore_graphics_state
|
42
50
|
@stack.pop
|
43
51
|
end
|
@@ -54,16 +62,12 @@ module PDF
|
|
54
62
|
# with the new matrix to form the updated matrix.
|
55
63
|
#
|
56
64
|
def concatenate_matrix(a, b, c, d, e, f)
|
57
|
-
transform = Matrix[
|
58
|
-
[a, b, 0],
|
59
|
-
[c, d, 0],
|
60
|
-
[e, f, 1]
|
61
|
-
]
|
62
65
|
if state[:ctm]
|
63
|
-
state[:ctm]
|
66
|
+
state[:ctm].multiply!(a,b,c,d,e,f)
|
64
67
|
else
|
65
|
-
state[:ctm] =
|
68
|
+
state[:ctm] = TransformationMatrix.new(a,b,c,d,e,f)
|
66
69
|
end
|
70
|
+
@text_rendering_matrix = nil # invalidate cached value
|
67
71
|
end
|
68
72
|
|
69
73
|
#####################################################
|
@@ -71,13 +75,13 @@ module PDF
|
|
71
75
|
#####################################################
|
72
76
|
|
73
77
|
def begin_text_object
|
74
|
-
@text_matrix =
|
75
|
-
@text_line_matrix =
|
78
|
+
@text_matrix = identity_matrix
|
79
|
+
@text_line_matrix = identity_matrix
|
80
|
+
@font_size = nil
|
76
81
|
end
|
77
82
|
|
78
83
|
def end_text_object
|
79
|
-
|
80
|
-
@text_line_matrix = Matrix.identity(3)
|
84
|
+
# don't need to do anything
|
81
85
|
end
|
82
86
|
|
83
87
|
#####################################################
|
@@ -89,7 +93,7 @@ module PDF
|
|
89
93
|
end
|
90
94
|
|
91
95
|
def set_horizontal_text_scaling(h_scaling)
|
92
|
-
state[:h_scaling] = h_scaling
|
96
|
+
state[:h_scaling] = h_scaling / 100.0
|
93
97
|
end
|
94
98
|
|
95
99
|
def set_text_font_and_size(label, size)
|
@@ -98,7 +102,7 @@ module PDF
|
|
98
102
|
end
|
99
103
|
|
100
104
|
def font_size
|
101
|
-
state[:text_font_size] * @text_matrix
|
105
|
+
@font_size ||= state[:text_font_size] * @text_matrix.a * ctm.a
|
102
106
|
end
|
103
107
|
|
104
108
|
def set_text_leading(leading)
|
@@ -122,12 +126,16 @@ module PDF
|
|
122
126
|
#####################################################
|
123
127
|
|
124
128
|
def move_text_position(x, y) # Td
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
129
|
+
temp = TransformationMatrix.new(1, 0,
|
130
|
+
0, 1,
|
131
|
+
x, y)
|
132
|
+
@text_line_matrix = temp.multiply!(
|
133
|
+
@text_line_matrix.a, @text_line_matrix.b,
|
134
|
+
@text_line_matrix.c, @text_line_matrix.d,
|
135
|
+
@text_line_matrix.e, @text_line_matrix.f
|
136
|
+
)
|
137
|
+
@text_matrix = @text_line_matrix.dup
|
138
|
+
@font_size = @text_rendering_matrix = nil # invalidate cached value
|
131
139
|
end
|
132
140
|
|
133
141
|
def move_text_position_and_set_leading(x, y) # TD
|
@@ -136,11 +144,13 @@ module PDF
|
|
136
144
|
end
|
137
145
|
|
138
146
|
def set_text_matrix_and_text_line_matrix(a, b, c, d, e, f) # Tm
|
139
|
-
@text_matrix =
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
147
|
+
@text_matrix = TransformationMatrix.new(
|
148
|
+
a, b,
|
149
|
+
c, d,
|
150
|
+
e, f
|
151
|
+
)
|
152
|
+
@text_line_matrix = @text_matrix.dup
|
153
|
+
@font_size = @text_rendering_matrix = nil # invalidate cached value
|
144
154
|
end
|
145
155
|
|
146
156
|
def move_to_start_of_next_line # T*
|
@@ -197,22 +207,29 @@ module PDF
|
|
197
207
|
# transform x and y co-ordinates from the current user space to the
|
198
208
|
# underlying device space.
|
199
209
|
#
|
200
|
-
def ctm_transform(x, y
|
210
|
+
def ctm_transform(x, y)
|
201
211
|
[
|
202
|
-
(ctm
|
203
|
-
(ctm
|
212
|
+
(ctm.a * x) + (ctm.c * y) + (ctm.e),
|
213
|
+
(ctm.b * x) + (ctm.d * y) + (ctm.f)
|
204
214
|
]
|
205
215
|
end
|
206
216
|
|
207
217
|
# transform x and y co-ordinates from the current text space to the
|
208
218
|
# underlying device space.
|
209
219
|
#
|
210
|
-
|
220
|
+
# transforming (0,0) is a really common case, so optimise for it to
|
221
|
+
# avoid unnecessary object allocations
|
222
|
+
#
|
223
|
+
def trm_transform(x, y)
|
211
224
|
trm = text_rendering_matrix
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
225
|
+
if x == 0 && y == 0
|
226
|
+
[trm.e, trm.f]
|
227
|
+
else
|
228
|
+
[
|
229
|
+
(trm.a * x) + (trm.c * y) + (trm.e),
|
230
|
+
(trm.b * x) + (trm.d * y) + (trm.f)
|
231
|
+
]
|
232
|
+
end
|
216
233
|
end
|
217
234
|
|
218
235
|
def current_font
|
@@ -240,16 +257,105 @@ module PDF
|
|
240
257
|
dict ? dict[label] : nil
|
241
258
|
end
|
242
259
|
|
260
|
+
# when save_graphics_state is called, we need to push a new copy of the
|
261
|
+
# current state onto the stack. That way any modifications to the state
|
262
|
+
# will be undone once restore_graphics_state is called.
|
263
|
+
#
|
264
|
+
def stack_depth
|
265
|
+
@stack.size
|
266
|
+
end
|
267
|
+
|
268
|
+
# This returns a deep clone of the current state, ensuring changes are
|
269
|
+
# kept separate from earlier states.
|
270
|
+
#
|
271
|
+
# Marshal is used to round-trip the state through a string to easily
|
272
|
+
# perform the deep clone. Kinda hacky, but effective.
|
273
|
+
#
|
274
|
+
def clone_state
|
275
|
+
if @stack.empty?
|
276
|
+
{}
|
277
|
+
else
|
278
|
+
Marshal.load Marshal.dump(@stack.last)
|
279
|
+
end
|
280
|
+
end
|
281
|
+
|
282
|
+
# after each glyph is painted onto the page the text matrix must be
|
283
|
+
# modified. There's no defined operator for this, but depending on
|
284
|
+
# the use case some receivers may need to mutate the state with this
|
285
|
+
# while walking a page.
|
286
|
+
#
|
287
|
+
# NOTE: some of the variable names in this method are obscure because
|
288
|
+
# they mirror variable names from the PDF spec
|
289
|
+
#
|
290
|
+
# NOTE: see Section 9.4.4, PDF 32000-1:2008, pp 252
|
291
|
+
#
|
292
|
+
# Arguments:
|
293
|
+
#
|
294
|
+
# w0 - the glyph width in *text space*. This generally means the width
|
295
|
+
# in glyph space should be divided by 1000 before being passed to
|
296
|
+
# this function
|
297
|
+
# tj - any kerning that should be applied to the text matrix before the
|
298
|
+
# following glyph is painted. This is usually the numeric arguments
|
299
|
+
# in the array passed to a TJ operator
|
300
|
+
# word_boundary - a boolean indicating if a word boundary was just
|
301
|
+
# reached. Depending on the current state extra space
|
302
|
+
# may need to be added
|
303
|
+
#
|
304
|
+
def process_glyph_displacement(w0, tj, word_boundary)
|
305
|
+
fs = font_size # font size
|
306
|
+
tc = state[:char_spacing]
|
307
|
+
if word_boundary
|
308
|
+
tw = state[:word_spacing]
|
309
|
+
else
|
310
|
+
tw = 0
|
311
|
+
end
|
312
|
+
th = state[:h_scaling]
|
313
|
+
# optimise the common path to reduce Float allocations
|
314
|
+
if th == 1 && tj == 0 && tc == 0 && tw == 0
|
315
|
+
glyph_width = w0 * fs
|
316
|
+
tx = glyph_width
|
317
|
+
else
|
318
|
+
glyph_width = ((w0 - (tj/1000.0)) * fs) * th
|
319
|
+
tx = glyph_width + ((tc + tw) * th)
|
320
|
+
end
|
321
|
+
ty = 0
|
322
|
+
|
323
|
+
# TODO: I'm pretty sure that tx shouldn't need to be divided by
|
324
|
+
# ctm.a here, but this gets my tests green and I'm out of
|
325
|
+
# ideas for now
|
326
|
+
# TODO: support ty > 0
|
327
|
+
if ctm.a == 1
|
328
|
+
@text_matrix.horizontal_displacement_multiply!(tx)
|
329
|
+
else
|
330
|
+
@text_matrix.horizontal_displacement_multiply!(tx/ctm.a)
|
331
|
+
end
|
332
|
+
@font_size = @text_rendering_matrix = nil # invalidate cached value
|
333
|
+
end
|
334
|
+
|
243
335
|
private
|
244
336
|
|
337
|
+
# used for many and varied text positioning calculations. We potentially
|
338
|
+
# need to access the results of this method many times when working with
|
339
|
+
# text, so memoize it
|
340
|
+
#
|
245
341
|
def text_rendering_matrix
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
342
|
+
@text_rendering_matrix ||= begin
|
343
|
+
state_matrix = TransformationMatrix.new(
|
344
|
+
font_size * state[:h_scaling], 0,
|
345
|
+
0, font_size,
|
346
|
+
0, state[:text_rise]
|
347
|
+
)
|
348
|
+
state_matrix.multiply!(
|
349
|
+
@text_matrix.a, @text_matrix.b,
|
350
|
+
@text_matrix.c, @text_matrix.d,
|
351
|
+
@text_matrix.e, @text_matrix.f
|
352
|
+
)
|
353
|
+
state_matrix.multiply!(
|
354
|
+
ctm.a, ctm.b,
|
355
|
+
ctm.c, ctm.d,
|
356
|
+
ctm.e, ctm.f
|
357
|
+
)
|
358
|
+
end
|
253
359
|
end
|
254
360
|
|
255
361
|
# return the current transformation matrix
|
@@ -272,25 +378,22 @@ module PDF
|
|
272
378
|
::Hash[wrapped_fonts]
|
273
379
|
end
|
274
380
|
|
275
|
-
|
276
|
-
#
|
277
|
-
|
278
|
-
|
279
|
-
# This
|
280
|
-
#
|
281
|
-
#
|
282
|
-
#
|
283
|
-
#
|
284
|
-
#
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
end
|
381
|
+
#####################################################
|
382
|
+
# Low-level Matrix Operations
|
383
|
+
#####################################################
|
384
|
+
|
385
|
+
# This class uses 3x3 matrices to represent geometric transformations
|
386
|
+
# These matrices are represented by arrays with 9 elements
|
387
|
+
# The array [a,b,c,d,e,f,g,h,i] would represent a matrix like:
|
388
|
+
# a b c
|
389
|
+
# d e f
|
390
|
+
# g h i
|
391
|
+
|
392
|
+
def identity_matrix
|
393
|
+
TransformationMatrix.new(1, 0,
|
394
|
+
0, 1,
|
395
|
+
0, 0)
|
291
396
|
end
|
292
397
|
|
293
398
|
end
|
294
|
-
end
|
295
399
|
end
|
296
|
-
|
@@ -1,13 +1,22 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
|
3
|
-
require 'matrix'
|
4
3
|
require 'forwardable'
|
4
|
+
require 'pdf/reader/page_layout'
|
5
5
|
|
6
6
|
module PDF
|
7
7
|
class Reader
|
8
|
+
|
9
|
+
# Builds a UTF-8 string of all the text on a single page by processing all
|
10
|
+
# the operators in a content stream.
|
11
|
+
#
|
8
12
|
class PageTextReceiver
|
9
13
|
extend Forwardable
|
10
14
|
|
15
|
+
SPACE = " "
|
16
|
+
|
17
|
+
attr_reader :state, :content, :options
|
18
|
+
|
19
|
+
########## BEGIN FORWARDERS ##########
|
11
20
|
# Graphics State Operators
|
12
21
|
def_delegators :@state, :save_graphics_state, :restore_graphics_state
|
13
22
|
|
@@ -26,41 +35,32 @@ module PDF
|
|
26
35
|
# Text Positioning Operators
|
27
36
|
def_delegators :@state, :move_text_position, :move_text_position_and_set_leading
|
28
37
|
def_delegators :@state, :set_text_matrix_and_text_line_matrix, :move_to_start_of_next_line
|
38
|
+
########## END FORWARDERS ##########
|
29
39
|
|
30
40
|
# starting a new page
|
31
41
|
def page=(page)
|
32
42
|
@state = PageState.new(page)
|
33
|
-
@content =
|
43
|
+
@content = []
|
44
|
+
@characters = []
|
45
|
+
@mediabox = page.attributes[:MediaBox]
|
34
46
|
end
|
35
47
|
|
36
48
|
def content
|
37
|
-
|
38
|
-
keys.map { |key|
|
39
|
-
@content[key]
|
40
|
-
}.join("\n")
|
49
|
+
PageLayout.new(@characters, @mediabox).to_s
|
41
50
|
end
|
42
51
|
|
43
52
|
#####################################################
|
44
53
|
# Text Showing Operators
|
45
54
|
#####################################################
|
46
|
-
|
47
55
|
# record text that is drawn on the page
|
48
|
-
def show_text(string) # Tj
|
49
|
-
|
50
|
-
newx, newy = @state.trm_transform(0,0)
|
51
|
-
@content[newy] ||= ""
|
52
|
-
@content[newy] << @state.current_font.to_utf8(string)
|
56
|
+
def show_text(string) # Tj (AWAY)
|
57
|
+
internal_show_text(string)
|
53
58
|
end
|
54
59
|
|
55
|
-
def show_text_with_positioning(params) # TJ
|
56
|
-
params.each
|
57
|
-
|
58
|
-
|
59
|
-
show_text(arg)
|
60
|
-
when Fixnum, Float
|
61
|
-
show_text(" ") if arg > 1000
|
62
|
-
end
|
63
|
-
}
|
60
|
+
def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
|
61
|
+
params.each_slice(2).each do |string, kerning|
|
62
|
+
internal_show_text(string, kerning || 0)
|
63
|
+
end
|
64
64
|
end
|
65
65
|
|
66
66
|
def move_to_next_line_and_show_text(str) # '
|
@@ -86,6 +86,35 @@ module PDF
|
|
86
86
|
end
|
87
87
|
end
|
88
88
|
|
89
|
+
private
|
90
|
+
|
91
|
+
def internal_show_text(string, kerning = 0)
|
92
|
+
if @state.current_font.nil?
|
93
|
+
raise PDF::Reader::MalformedPDFError, "current font is invalid"
|
94
|
+
end
|
95
|
+
glyphs = @state.current_font.unpack(string)
|
96
|
+
glyphs.each_with_index do |glyph_code, index|
|
97
|
+
# paint the current glyph
|
98
|
+
newx, newy = @state.trm_transform(0,0)
|
99
|
+
utf8_chars = @state.current_font.to_utf8(glyph_code)
|
100
|
+
|
101
|
+
# apply the glyph displacement for the current glyph so the next
|
102
|
+
# glyph will appear in the correct position
|
103
|
+
glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
|
104
|
+
th = 1
|
105
|
+
if kerning != 0 && index == glyphs.size - 1
|
106
|
+
tj = kerning
|
107
|
+
else
|
108
|
+
tj = 0
|
109
|
+
end
|
110
|
+
scaled_glyph_width = glyph_width * @state.font_size * th
|
111
|
+
unless utf8_chars == SPACE
|
112
|
+
@characters << TextRun.new(newx, newy, scaled_glyph_width, @state.font_size, utf8_chars)
|
113
|
+
end
|
114
|
+
@state.process_glyph_displacement(glyph_width, tj, utf8_chars == SPACE)
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
89
118
|
end
|
90
119
|
end
|
91
120
|
end
|