pdf-reader 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +7 -1
- data/README.rdoc +1 -0
- data/Rakefile +23 -8
- data/lib/pdf-reader.rb +3 -1
- data/lib/pdf/hash.rb +5 -1
- data/lib/pdf/reader.rb +8 -1
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier.afm +342 -0
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -0
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
- data/lib/pdf/reader/buffer.rb +14 -6
- data/lib/pdf/reader/cid_widths.rb +61 -0
- data/lib/pdf/reader/cmap.rb +8 -2
- data/lib/pdf/reader/encoding.rb +52 -27
- data/lib/pdf/reader/error.rb +16 -1
- data/lib/pdf/reader/filter.rb +2 -0
- data/lib/pdf/reader/filter/ascii85.rb +3 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +3 -1
- data/lib/pdf/reader/filter/depredict.rb +2 -0
- data/lib/pdf/reader/filter/flate.rb +3 -1
- data/lib/pdf/reader/filter/lzw.rb +1 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +2 -1
- data/lib/pdf/reader/font.rb +74 -18
- data/lib/pdf/reader/font_descriptor.rb +80 -0
- data/lib/pdf/reader/glyph_hash.rb +6 -0
- data/lib/pdf/reader/lzw.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -1
- data/lib/pdf/reader/object_hash.rb +1 -1
- data/lib/pdf/reader/page_layout.rb +125 -0
- data/lib/pdf/reader/page_state.rb +172 -69
- data/lib/pdf/reader/page_text_receiver.rb +50 -21
- data/lib/pdf/reader/pages_strategy.rb +17 -4
- data/lib/pdf/reader/parser.rb +25 -52
- data/lib/pdf/reader/print_receiver.rb +5 -0
- data/lib/pdf/reader/reference.rb +2 -0
- data/lib/pdf/reader/register_receiver.rb +1 -1
- data/lib/pdf/reader/standard_security_handler.rb +2 -0
- data/lib/pdf/reader/stream.rb +2 -0
- data/lib/pdf/reader/synchronized_cache.rb +32 -0
- data/lib/pdf/reader/text_receiver.rb +5 -4
- data/lib/pdf/reader/text_run.rb +80 -0
- data/lib/pdf/reader/token.rb +2 -0
- data/lib/pdf/reader/transformation_matrix.rb +194 -0
- data/lib/pdf/reader/width_calculator.rb +11 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +50 -0
- data/lib/pdf/reader/width_calculator/composite.rb +27 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +32 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +24 -0
- data/lib/pdf/reader/xref.rb +9 -2
- metadata +119 -13
@@ -1,22 +1,23 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
|
3
|
-
require '
|
3
|
+
require 'pdf/reader/transformation_matrix'
|
4
4
|
|
5
|
-
|
6
|
-
|
5
|
+
class PDF::Reader
|
6
|
+
# encapsulates logic for tracking graphics state as the instructions for
|
7
|
+
# a single page are processed. Most of the public methods correspond
|
8
|
+
# directly to PDF operators.
|
7
9
|
class PageState
|
8
10
|
|
9
11
|
DEFAULT_GRAPHICS_STATE = {
|
10
|
-
:
|
11
|
-
:
|
12
|
-
:
|
13
|
-
:
|
14
|
-
:
|
15
|
-
:text_font => nil,
|
12
|
+
:char_spacing => 0,
|
13
|
+
:word_spacing => 0,
|
14
|
+
:h_scaling => 1.0,
|
15
|
+
:text_leading => 0,
|
16
|
+
:text_font => nil,
|
16
17
|
:text_font_size => nil,
|
17
|
-
:text_mode
|
18
|
-
:text_rise
|
19
|
-
:text_knockout
|
18
|
+
:text_mode => 0,
|
19
|
+
:text_rise => 0,
|
20
|
+
:text_knockout => 0
|
20
21
|
}
|
21
22
|
|
22
23
|
# starting a new page
|
@@ -28,16 +29,23 @@ module PDF
|
|
28
29
|
@xobject_stack = [page.xobjects]
|
29
30
|
@cs_stack = [page.color_spaces]
|
30
31
|
@stack = [DEFAULT_GRAPHICS_STATE.dup]
|
32
|
+
state[:ctm] = identity_matrix
|
31
33
|
end
|
32
34
|
|
33
35
|
#####################################################
|
34
36
|
# Graphics State Operators
|
35
37
|
#####################################################
|
36
38
|
|
39
|
+
# Clones the current graphics state and pushes it onto the top of the stack.
|
40
|
+
# Any changes that are subsequently made to the state can then be reversed
|
41
|
+
# by calling restore_graphics_state.
|
42
|
+
#
|
37
43
|
def save_graphics_state
|
38
44
|
@stack.push clone_state
|
39
45
|
end
|
40
46
|
|
47
|
+
# Restore the state to the previous value on the stack.
|
48
|
+
#
|
41
49
|
def restore_graphics_state
|
42
50
|
@stack.pop
|
43
51
|
end
|
@@ -54,16 +62,12 @@ module PDF
|
|
54
62
|
# with the new matrix to form the updated matrix.
|
55
63
|
#
|
56
64
|
def concatenate_matrix(a, b, c, d, e, f)
|
57
|
-
transform = Matrix[
|
58
|
-
[a, b, 0],
|
59
|
-
[c, d, 0],
|
60
|
-
[e, f, 1]
|
61
|
-
]
|
62
65
|
if state[:ctm]
|
63
|
-
state[:ctm]
|
66
|
+
state[:ctm].multiply!(a,b,c,d,e,f)
|
64
67
|
else
|
65
|
-
state[:ctm] =
|
68
|
+
state[:ctm] = TransformationMatrix.new(a,b,c,d,e,f)
|
66
69
|
end
|
70
|
+
@text_rendering_matrix = nil # invalidate cached value
|
67
71
|
end
|
68
72
|
|
69
73
|
#####################################################
|
@@ -71,13 +75,13 @@ module PDF
|
|
71
75
|
#####################################################
|
72
76
|
|
73
77
|
def begin_text_object
|
74
|
-
@text_matrix =
|
75
|
-
@text_line_matrix =
|
78
|
+
@text_matrix = identity_matrix
|
79
|
+
@text_line_matrix = identity_matrix
|
80
|
+
@font_size = nil
|
76
81
|
end
|
77
82
|
|
78
83
|
def end_text_object
|
79
|
-
|
80
|
-
@text_line_matrix = Matrix.identity(3)
|
84
|
+
# don't need to do anything
|
81
85
|
end
|
82
86
|
|
83
87
|
#####################################################
|
@@ -89,7 +93,7 @@ module PDF
|
|
89
93
|
end
|
90
94
|
|
91
95
|
def set_horizontal_text_scaling(h_scaling)
|
92
|
-
state[:h_scaling] = h_scaling
|
96
|
+
state[:h_scaling] = h_scaling / 100.0
|
93
97
|
end
|
94
98
|
|
95
99
|
def set_text_font_and_size(label, size)
|
@@ -98,7 +102,7 @@ module PDF
|
|
98
102
|
end
|
99
103
|
|
100
104
|
def font_size
|
101
|
-
state[:text_font_size] * @text_matrix
|
105
|
+
@font_size ||= state[:text_font_size] * @text_matrix.a * ctm.a
|
102
106
|
end
|
103
107
|
|
104
108
|
def set_text_leading(leading)
|
@@ -122,12 +126,16 @@ module PDF
|
|
122
126
|
#####################################################
|
123
127
|
|
124
128
|
def move_text_position(x, y) # Td
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
129
|
+
temp = TransformationMatrix.new(1, 0,
|
130
|
+
0, 1,
|
131
|
+
x, y)
|
132
|
+
@text_line_matrix = temp.multiply!(
|
133
|
+
@text_line_matrix.a, @text_line_matrix.b,
|
134
|
+
@text_line_matrix.c, @text_line_matrix.d,
|
135
|
+
@text_line_matrix.e, @text_line_matrix.f
|
136
|
+
)
|
137
|
+
@text_matrix = @text_line_matrix.dup
|
138
|
+
@font_size = @text_rendering_matrix = nil # invalidate cached value
|
131
139
|
end
|
132
140
|
|
133
141
|
def move_text_position_and_set_leading(x, y) # TD
|
@@ -136,11 +144,13 @@ module PDF
|
|
136
144
|
end
|
137
145
|
|
138
146
|
def set_text_matrix_and_text_line_matrix(a, b, c, d, e, f) # Tm
|
139
|
-
@text_matrix =
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
147
|
+
@text_matrix = TransformationMatrix.new(
|
148
|
+
a, b,
|
149
|
+
c, d,
|
150
|
+
e, f
|
151
|
+
)
|
152
|
+
@text_line_matrix = @text_matrix.dup
|
153
|
+
@font_size = @text_rendering_matrix = nil # invalidate cached value
|
144
154
|
end
|
145
155
|
|
146
156
|
def move_to_start_of_next_line # T*
|
@@ -197,22 +207,29 @@ module PDF
|
|
197
207
|
# transform x and y co-ordinates from the current user space to the
|
198
208
|
# underlying device space.
|
199
209
|
#
|
200
|
-
def ctm_transform(x, y
|
210
|
+
def ctm_transform(x, y)
|
201
211
|
[
|
202
|
-
(ctm
|
203
|
-
(ctm
|
212
|
+
(ctm.a * x) + (ctm.c * y) + (ctm.e),
|
213
|
+
(ctm.b * x) + (ctm.d * y) + (ctm.f)
|
204
214
|
]
|
205
215
|
end
|
206
216
|
|
207
217
|
# transform x and y co-ordinates from the current text space to the
|
208
218
|
# underlying device space.
|
209
219
|
#
|
210
|
-
|
220
|
+
# transforming (0,0) is a really common case, so optimise for it to
|
221
|
+
# avoid unnecessary object allocations
|
222
|
+
#
|
223
|
+
def trm_transform(x, y)
|
211
224
|
trm = text_rendering_matrix
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
225
|
+
if x == 0 && y == 0
|
226
|
+
[trm.e, trm.f]
|
227
|
+
else
|
228
|
+
[
|
229
|
+
(trm.a * x) + (trm.c * y) + (trm.e),
|
230
|
+
(trm.b * x) + (trm.d * y) + (trm.f)
|
231
|
+
]
|
232
|
+
end
|
216
233
|
end
|
217
234
|
|
218
235
|
def current_font
|
@@ -240,16 +257,105 @@ module PDF
|
|
240
257
|
dict ? dict[label] : nil
|
241
258
|
end
|
242
259
|
|
260
|
+
# when save_graphics_state is called, we need to push a new copy of the
|
261
|
+
# current state onto the stack. That way any modifications to the state
|
262
|
+
# will be undone once restore_graphics_state is called.
|
263
|
+
#
|
264
|
+
def stack_depth
|
265
|
+
@stack.size
|
266
|
+
end
|
267
|
+
|
268
|
+
# This returns a deep clone of the current state, ensuring changes are
|
269
|
+
# kept separate from earlier states.
|
270
|
+
#
|
271
|
+
# Marshal is used to round-trip the state through a string to easily
|
272
|
+
# perform the deep clone. Kinda hacky, but effective.
|
273
|
+
#
|
274
|
+
def clone_state
|
275
|
+
if @stack.empty?
|
276
|
+
{}
|
277
|
+
else
|
278
|
+
Marshal.load Marshal.dump(@stack.last)
|
279
|
+
end
|
280
|
+
end
|
281
|
+
|
282
|
+
# after each glyph is painted onto the page the text matrix must be
|
283
|
+
# modified. There's no defined operator for this, but depending on
|
284
|
+
# the use case some receivers may need to mutate the state with this
|
285
|
+
# while walking a page.
|
286
|
+
#
|
287
|
+
# NOTE: some of the variable names in this method are obscure because
|
288
|
+
# they mirror variable names from the PDF spec
|
289
|
+
#
|
290
|
+
# NOTE: see Section 9.4.4, PDF 32000-1:2008, pp 252
|
291
|
+
#
|
292
|
+
# Arguments:
|
293
|
+
#
|
294
|
+
# w0 - the glyph width in *text space*. This generally means the width
|
295
|
+
# in glyph space should be divided by 1000 before being passed to
|
296
|
+
# this function
|
297
|
+
# tj - any kerning that should be applied to the text matrix before the
|
298
|
+
# following glyph is painted. This is usually the numeric arguments
|
299
|
+
# in the array passed to a TJ operator
|
300
|
+
# word_boundary - a boolean indicating if a word boundary was just
|
301
|
+
# reached. Depending on the current state extra space
|
302
|
+
# may need to be added
|
303
|
+
#
|
304
|
+
def process_glyph_displacement(w0, tj, word_boundary)
|
305
|
+
fs = font_size # font size
|
306
|
+
tc = state[:char_spacing]
|
307
|
+
if word_boundary
|
308
|
+
tw = state[:word_spacing]
|
309
|
+
else
|
310
|
+
tw = 0
|
311
|
+
end
|
312
|
+
th = state[:h_scaling]
|
313
|
+
# optimise the common path to reduce Float allocations
|
314
|
+
if th == 1 && tj == 0 && tc == 0 && tw == 0
|
315
|
+
glyph_width = w0 * fs
|
316
|
+
tx = glyph_width
|
317
|
+
else
|
318
|
+
glyph_width = ((w0 - (tj/1000.0)) * fs) * th
|
319
|
+
tx = glyph_width + ((tc + tw) * th)
|
320
|
+
end
|
321
|
+
ty = 0
|
322
|
+
|
323
|
+
# TODO: I'm pretty sure that tx shouldn't need to be divided by
|
324
|
+
# ctm[0] here, but this gets my tests green and I'm out of
|
325
|
+
# ideas for now
|
326
|
+
# TODO: support ty > 0
|
327
|
+
if ctm.a == 1
|
328
|
+
@text_matrix.horizontal_displacement_multiply!(tx)
|
329
|
+
else
|
330
|
+
@text_matrix.horizontal_displacement_multiply!(tx/ctm.a)
|
331
|
+
end
|
332
|
+
@font_size = @text_rendering_matrix = nil # invalidate cached value
|
333
|
+
end
|
334
|
+
|
243
335
|
private
|
244
336
|
|
337
|
+
# used for many and varied text positioning calculations. We potentially
|
338
|
+
# need to access the results of this method many times when working with
|
339
|
+
# text, so memoize it
|
340
|
+
#
|
245
341
|
def text_rendering_matrix
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
342
|
+
@text_rendering_matrix ||= begin
|
343
|
+
state_matrix = TransformationMatrix.new(
|
344
|
+
font_size * state[:h_scaling], 0,
|
345
|
+
0, font_size,
|
346
|
+
0, state[:text_rise]
|
347
|
+
)
|
348
|
+
state_matrix.multiply!(
|
349
|
+
@text_matrix.a, @text_matrix.b,
|
350
|
+
@text_matrix.c, @text_matrix.d,
|
351
|
+
@text_matrix.e, @text_matrix.f
|
352
|
+
)
|
353
|
+
state_matrix.multiply!(
|
354
|
+
ctm.a, ctm.b,
|
355
|
+
ctm.c, ctm.d,
|
356
|
+
ctm.e, ctm.f
|
357
|
+
)
|
358
|
+
end
|
253
359
|
end
|
254
360
|
|
255
361
|
# return the current transformation matrix
|
@@ -272,25 +378,22 @@ module PDF
|
|
272
378
|
::Hash[wrapped_fonts]
|
273
379
|
end
|
274
380
|
|
275
|
-
|
276
|
-
#
|
277
|
-
|
278
|
-
|
279
|
-
# This
|
280
|
-
#
|
281
|
-
#
|
282
|
-
#
|
283
|
-
#
|
284
|
-
#
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
end
|
381
|
+
#####################################################
|
382
|
+
# Low-level Matrix Operations
|
383
|
+
#####################################################
|
384
|
+
|
385
|
+
# This class uses 3x3 matrices to represent geometric transformations
|
386
|
+
# These matrices are represented by arrays with 9 elements
|
387
|
+
# The array [a,b,c,d,e,f,g,h,i] would represent a matrix like:
|
388
|
+
# a b c
|
389
|
+
# d e f
|
390
|
+
# g h i
|
391
|
+
|
392
|
+
def identity_matrix
|
393
|
+
TransformationMatrix.new(1, 0,
|
394
|
+
0, 1,
|
395
|
+
0, 0)
|
291
396
|
end
|
292
397
|
|
293
398
|
end
|
294
|
-
end
|
295
399
|
end
|
296
|
-
|
@@ -1,13 +1,22 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
|
3
|
-
require 'matrix'
|
4
3
|
require 'forwardable'
|
4
|
+
require 'pdf/reader/page_layout'
|
5
5
|
|
6
6
|
module PDF
|
7
7
|
class Reader
|
8
|
+
|
9
|
+
# Builds a UTF-8 string of all the text on a single page by processing all
|
10
|
+
# the operators in a content stream.
|
11
|
+
#
|
8
12
|
class PageTextReceiver
|
9
13
|
extend Forwardable
|
10
14
|
|
15
|
+
SPACE = " "
|
16
|
+
|
17
|
+
attr_reader :state, :content, :options
|
18
|
+
|
19
|
+
########## BEGIN FORWARDERS ##########
|
11
20
|
# Graphics State Operators
|
12
21
|
def_delegators :@state, :save_graphics_state, :restore_graphics_state
|
13
22
|
|
@@ -26,41 +35,32 @@ module PDF
|
|
26
35
|
# Text Positioning Operators
|
27
36
|
def_delegators :@state, :move_text_position, :move_text_position_and_set_leading
|
28
37
|
def_delegators :@state, :set_text_matrix_and_text_line_matrix, :move_to_start_of_next_line
|
38
|
+
########## END FORWARDERS ##########
|
29
39
|
|
30
40
|
# starting a new page
|
31
41
|
def page=(page)
|
32
42
|
@state = PageState.new(page)
|
33
|
-
@content =
|
43
|
+
@content = []
|
44
|
+
@characters = []
|
45
|
+
@mediabox = page.attributes[:MediaBox]
|
34
46
|
end
|
35
47
|
|
36
48
|
def content
|
37
|
-
|
38
|
-
keys.map { |key|
|
39
|
-
@content[key]
|
40
|
-
}.join("\n")
|
49
|
+
PageLayout.new(@characters, @mediabox).to_s
|
41
50
|
end
|
42
51
|
|
43
52
|
#####################################################
|
44
53
|
# Text Showing Operators
|
45
54
|
#####################################################
|
46
|
-
|
47
55
|
# record text that is drawn on the page
|
48
|
-
def show_text(string) # Tj
|
49
|
-
|
50
|
-
newx, newy = @state.trm_transform(0,0)
|
51
|
-
@content[newy] ||= ""
|
52
|
-
@content[newy] << @state.current_font.to_utf8(string)
|
56
|
+
def show_text(string) # Tj (AWAY)
|
57
|
+
internal_show_text(string)
|
53
58
|
end
|
54
59
|
|
55
|
-
def show_text_with_positioning(params) # TJ
|
56
|
-
params.each
|
57
|
-
|
58
|
-
|
59
|
-
show_text(arg)
|
60
|
-
when Fixnum, Float
|
61
|
-
show_text(" ") if arg > 1000
|
62
|
-
end
|
63
|
-
}
|
60
|
+
def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
|
61
|
+
params.each_slice(2).each do |string, kerning|
|
62
|
+
internal_show_text(string, kerning || 0)
|
63
|
+
end
|
64
64
|
end
|
65
65
|
|
66
66
|
def move_to_next_line_and_show_text(str) # '
|
@@ -86,6 +86,35 @@ module PDF
|
|
86
86
|
end
|
87
87
|
end
|
88
88
|
|
89
|
+
private
|
90
|
+
|
91
|
+
def internal_show_text(string, kerning = 0)
|
92
|
+
if @state.current_font.nil?
|
93
|
+
raise PDF::Reader::MalformedPDFError, "current font is invalid"
|
94
|
+
end
|
95
|
+
glyphs = @state.current_font.unpack(string)
|
96
|
+
glyphs.each_with_index do |glyph_code, index|
|
97
|
+
# paint the current glyph
|
98
|
+
newx, newy = @state.trm_transform(0,0)
|
99
|
+
utf8_chars = @state.current_font.to_utf8(glyph_code)
|
100
|
+
|
101
|
+
# apply the glyph displacement for the current glyph so the next
|
102
|
+
# glyph will appear in the correct position
|
103
|
+
glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
|
104
|
+
th = 1
|
105
|
+
if kerning != 0 && index == glyphs.size - 1
|
106
|
+
tj = kerning
|
107
|
+
else
|
108
|
+
tj = 0
|
109
|
+
end
|
110
|
+
scaled_glyph_width = glyph_width * @state.font_size * th
|
111
|
+
unless utf8_chars == SPACE
|
112
|
+
@characters << TextRun.new(newx, newy, scaled_glyph_width, @state.font_size, utf8_chars)
|
113
|
+
end
|
114
|
+
@state.process_glyph_displacement(glyph_width, tj, utf8_chars == SPACE)
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
89
118
|
end
|
90
119
|
end
|
91
120
|
end
|