pdf-reader 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +5 -0
- data/lib/pdf/reader.rb +1 -0
- data/lib/pdf/reader/buffer.rb +14 -2
- data/lib/pdf/reader/encoding.rb +31 -28
- data/lib/pdf/reader/font.rb +31 -13
- data/lib/pdf/reader/form_xobject.rb +2 -0
- data/lib/pdf/reader/object_hash.rb +1 -1
- data/lib/pdf/reader/page_state.rb +295 -0
- data/lib/pdf/reader/page_text_receiver.rb +32 -234
- data/lib/pdf/reader/parser.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +32 -0
- data/lib/pdf/reader/xref.rb +1 -1
- metadata +15 -14
data/CHANGELOG
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
v1.1.0 (25th March 2012)
|
2
|
+
- new PageState class for handling common state tracking in page receivers
|
3
|
+
- see PageTextReceiver for example usage
|
4
|
+
- various bugfixes to support reading more PDF dialects
|
5
|
+
|
1
6
|
v1.0.0 (16th January 2012)
|
2
7
|
- support a new encryption variation
|
3
8
|
- bugfix in PageTextRender (thanks Paul Gallagher)
|
data/lib/pdf/reader.rb
CHANGED
@@ -354,6 +354,7 @@ require 'pdf/reader/register_receiver'
|
|
354
354
|
require 'pdf/reader/standard_security_handler'
|
355
355
|
require 'pdf/reader/stream'
|
356
356
|
require 'pdf/reader/text_receiver'
|
357
|
+
require 'pdf/reader/page_state'
|
357
358
|
require 'pdf/reader/page_text_receiver'
|
358
359
|
require 'pdf/reader/token'
|
359
360
|
require 'pdf/reader/xref'
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -171,7 +171,11 @@ class PDF::Reader
|
|
171
171
|
when "<" then :hex_string
|
172
172
|
when "stream" then :stream
|
173
173
|
when "ID"
|
174
|
-
in_content_stream?
|
174
|
+
if in_content_stream? && @tokens[-2] != "/"
|
175
|
+
:inline
|
176
|
+
else
|
177
|
+
:regular
|
178
|
+
end
|
175
179
|
else
|
176
180
|
:regular
|
177
181
|
end
|
@@ -314,7 +318,7 @@ class PDF::Reader
|
|
314
318
|
@tokens << chr
|
315
319
|
tok = ""
|
316
320
|
break
|
317
|
-
when "\x28", "\x5B", "\x7B"
|
321
|
+
when "\x28", "\x5B", "\x7B"
|
318
322
|
# opening delimiter, start of new token
|
319
323
|
@tokens << tok if tok.size > 0
|
320
324
|
@tokens << chr
|
@@ -326,6 +330,14 @@ class PDF::Reader
|
|
326
330
|
@tokens << chr
|
327
331
|
tok = ""
|
328
332
|
break
|
333
|
+
when "\x2F"
|
334
|
+
# PDF name, start of new token
|
335
|
+
@tokens << tok if tok.size > 0
|
336
|
+
@tokens << chr
|
337
|
+
next_char = peek_char
|
338
|
+
@tokens << "" if chr == "/" && [nil, " ", "\n"].include?(next_char)
|
339
|
+
tok = ""
|
340
|
+
break
|
329
341
|
else
|
330
342
|
tok << chr
|
331
343
|
end
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -41,16 +41,12 @@ class PDF::Reader
|
|
41
41
|
enc = nil
|
42
42
|
end
|
43
43
|
|
44
|
-
@
|
44
|
+
@enc_name = enc
|
45
45
|
@unpack = get_unpack(enc)
|
46
46
|
@map_file = get_mapping_file(enc)
|
47
47
|
load_mapping(@map_file) if @map_file
|
48
48
|
end
|
49
49
|
|
50
|
-
def to_unicode_required?
|
51
|
-
@to_unicode_required
|
52
|
-
end
|
53
|
-
|
54
50
|
# set the differences table for this encoding. should be an array in the following format:
|
55
51
|
#
|
56
52
|
# [25, :A, 26, :B]
|
@@ -91,13 +87,40 @@ class PDF::Reader
|
|
91
87
|
# * pack the final array of Unicode codepoints into a utf-8 string
|
92
88
|
# * mark the string as utf-8 if we're running on a M17N aware VM
|
93
89
|
#
|
94
|
-
def to_utf8(str
|
90
|
+
def to_utf8(str)
|
91
|
+
if utf8_conversion_impossible?
|
92
|
+
little_boxes(str.unpack(unpack).size)
|
93
|
+
else
|
94
|
+
convert_to_utf8(str)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
private
|
99
|
+
|
100
|
+
def utf8_conversion_impossible?
|
101
|
+
@enc_name == :"Identity-H" || @enc_name == :"Identity-V"
|
102
|
+
end
|
103
|
+
|
104
|
+
def little_boxes(times)
|
105
|
+
codepoints = [ PDF::Reader::Encoding::UNKNOWN_CHAR ] * times
|
106
|
+
ret = codepoints.pack("U*")
|
107
|
+
ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
|
108
|
+
ret
|
109
|
+
end
|
110
|
+
|
111
|
+
def convert_to_utf8(str)
|
95
112
|
ret = str.unpack(unpack).map { |c|
|
96
113
|
differences[c] || c
|
97
|
-
}.map { |
|
98
|
-
|
114
|
+
}.map { |c|
|
115
|
+
mapping[c] || c
|
99
116
|
}.map { |c|
|
100
117
|
names_to_unicode[c] || c
|
118
|
+
}.map { |c|
|
119
|
+
if PDF::Reader::Encoding::CONTROL_CHARS.include?(c)
|
120
|
+
PDF::Reader::Encoding::UNKNOWN_CHAR
|
121
|
+
else
|
122
|
+
c
|
123
|
+
end
|
101
124
|
}.map { |c|
|
102
125
|
if c.nil? || !c.is_a?(Fixnum)
|
103
126
|
PDF::Reader::Encoding::UNKNOWN_CHAR
|
@@ -111,22 +134,6 @@ class PDF::Reader
|
|
111
134
|
ret
|
112
135
|
end
|
113
136
|
|
114
|
-
private
|
115
|
-
|
116
|
-
def original_codepoint_to_unicode(cp, tounicode = nil)
|
117
|
-
if tounicode && (code = tounicode.decode(cp))
|
118
|
-
code
|
119
|
-
elsif to_unicode_required? && (tounicode.nil? || tounicode.decode(cp).nil?)
|
120
|
-
PDF::Reader::Encoding::UNKNOWN_CHAR
|
121
|
-
elsif mapping[cp]
|
122
|
-
mapping[cp]
|
123
|
-
elsif PDF::Reader::Encoding::CONTROL_CHARS.include?(cp)
|
124
|
-
PDF::Reader::Encoding::UNKNOWN_CHAR
|
125
|
-
else
|
126
|
-
cp
|
127
|
-
end
|
128
|
-
end
|
129
|
-
|
130
137
|
def get_unpack(enc)
|
131
138
|
case enc
|
132
139
|
when :"Identity-H", :"Identity-V", :UTF16Encoding
|
@@ -157,10 +164,6 @@ class PDF::Reader
|
|
157
164
|
end
|
158
165
|
end
|
159
166
|
|
160
|
-
def unicode_required?(enc)
|
161
|
-
enc == :"Identity-H" or enc == :"Identity-V"
|
162
|
-
end
|
163
|
-
|
164
167
|
def mapping
|
165
168
|
@mapping ||= {}
|
166
169
|
end
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -35,6 +35,7 @@ class PDF::Reader
|
|
35
35
|
return
|
36
36
|
end
|
37
37
|
@ohash = ohash
|
38
|
+
@tounicode = nil
|
38
39
|
|
39
40
|
extract_base_info(obj)
|
40
41
|
extract_descriptor(obj)
|
@@ -58,24 +59,17 @@ class PDF::Reader
|
|
58
59
|
end
|
59
60
|
|
60
61
|
def to_utf8(params)
|
61
|
-
|
62
|
-
|
63
|
-
if params.class == String
|
64
|
-
encoding.to_utf8(params, tounicode)
|
65
|
-
elsif params.class == Array
|
66
|
-
params.collect { |param| self.to_utf8(param) }
|
62
|
+
if @tounicode
|
63
|
+
to_utf8_via_cmap(params)
|
67
64
|
else
|
68
|
-
params
|
65
|
+
to_utf8_via_encoding(params)
|
69
66
|
end
|
70
67
|
end
|
71
68
|
|
72
69
|
def glyph_width(c)
|
73
70
|
@missing_width ||= 0
|
74
|
-
|
75
|
-
|
76
|
-
else
|
77
|
-
@widths.fetch(c.codepoints.first - @first_char, @missing_width)
|
78
|
-
end
|
71
|
+
@widths ||= []
|
72
|
+
@widths.fetch(c - @first_char, @missing_width)
|
79
73
|
end
|
80
74
|
|
81
75
|
private
|
@@ -84,7 +78,7 @@ class PDF::Reader
|
|
84
78
|
@subtype = @ohash.object(obj[:Subtype])
|
85
79
|
@basefont = @ohash.object(obj[:BaseFont])
|
86
80
|
@encoding = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
|
87
|
-
@widths = @ohash.object(obj[:Widths])
|
81
|
+
@widths = @ohash.object(obj[:Widths]) || []
|
88
82
|
@first_char = @ohash.object(obj[:FirstChar])
|
89
83
|
if obj[:ToUnicode]
|
90
84
|
stream = @ohash.object(obj[:ToUnicode])
|
@@ -111,5 +105,29 @@ class PDF::Reader
|
|
111
105
|
}
|
112
106
|
end
|
113
107
|
|
108
|
+
def to_utf8_via_cmap(params)
|
109
|
+
if params.class == String
|
110
|
+
params.unpack(encoding.unpack).map { |c|
|
111
|
+
@tounicode.decode(c) || PDF::Reader::Encoding::UNKNOWN_CHAR
|
112
|
+
}.pack("U*")
|
113
|
+
elsif params.class == Array
|
114
|
+
params.collect { |param| to_utf8_via_cmap(param) }
|
115
|
+
else
|
116
|
+
params
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
def to_utf8_via_encoding(params)
|
121
|
+
raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported" if encoding.kind_of?(String)
|
122
|
+
|
123
|
+
if params.class == String
|
124
|
+
encoding.to_utf8(params)
|
125
|
+
elsif params.class == Array
|
126
|
+
params.collect { |param| to_utf8_via_encoding(param) }
|
127
|
+
else
|
128
|
+
params
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
114
132
|
end
|
115
133
|
end
|
@@ -0,0 +1,295 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require 'matrix'
|
4
|
+
|
5
|
+
module PDF
|
6
|
+
class Reader
|
7
|
+
class PageState
|
8
|
+
|
9
|
+
DEFAULT_GRAPHICS_STATE = {
|
10
|
+
:ctm => Matrix.identity(3),
|
11
|
+
:char_spacing => 0,
|
12
|
+
:word_spacing => 0,
|
13
|
+
:h_scaling => 100,
|
14
|
+
:text_leading => 0,
|
15
|
+
:text_font => nil,
|
16
|
+
:text_font_size => nil,
|
17
|
+
:text_mode => 0,
|
18
|
+
:text_rise => 0,
|
19
|
+
:text_knockout => 0
|
20
|
+
}
|
21
|
+
|
22
|
+
# starting a new page
|
23
|
+
def initialize(page)
|
24
|
+
@page = page
|
25
|
+
@objects = page.objects
|
26
|
+
@font_stack = [build_fonts(page.fonts)]
|
27
|
+
@xobject_stack = [page.xobjects]
|
28
|
+
@cs_stack = [page.color_spaces]
|
29
|
+
@stack = [DEFAULT_GRAPHICS_STATE.dup]
|
30
|
+
end
|
31
|
+
|
32
|
+
#####################################################
|
33
|
+
# Graphics State Operators
|
34
|
+
#####################################################
|
35
|
+
|
36
|
+
def save_graphics_state
|
37
|
+
@stack.push clone_state
|
38
|
+
end
|
39
|
+
|
40
|
+
def restore_graphics_state
|
41
|
+
@stack.pop
|
42
|
+
end
|
43
|
+
|
44
|
+
#####################################################
|
45
|
+
# Matrix Operators
|
46
|
+
#####################################################
|
47
|
+
|
48
|
+
# update the current transformation matrix.
|
49
|
+
#
|
50
|
+
# If the CTM is currently undefined, just store the new values.
|
51
|
+
#
|
52
|
+
# If there's an existing CTM, then multiply the existing matrix
|
53
|
+
# with the new matrix to form the updated matrix.
|
54
|
+
#
|
55
|
+
def concatenate_matrix(a, b, c, d, e, f)
|
56
|
+
transform = Matrix[
|
57
|
+
[a, b, 0],
|
58
|
+
[c, d, 0],
|
59
|
+
[e, f, 1]
|
60
|
+
]
|
61
|
+
if state[:ctm]
|
62
|
+
state[:ctm] = transform * state[:ctm]
|
63
|
+
else
|
64
|
+
state[:ctm] = transform
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
#####################################################
|
69
|
+
# Text Object Operators
|
70
|
+
#####################################################
|
71
|
+
|
72
|
+
def begin_text_object
|
73
|
+
@text_matrix = Matrix.identity(3)
|
74
|
+
@text_line_matrix = Matrix.identity(3)
|
75
|
+
end
|
76
|
+
|
77
|
+
def end_text_object
|
78
|
+
@text_matrix = Matrix.identity(3)
|
79
|
+
@text_line_matrix = Matrix.identity(3)
|
80
|
+
end
|
81
|
+
|
82
|
+
#####################################################
|
83
|
+
# Text State Operators
|
84
|
+
#####################################################
|
85
|
+
|
86
|
+
def set_character_spacing(char_spacing)
|
87
|
+
state[:char_spacing] = char_spacing
|
88
|
+
end
|
89
|
+
|
90
|
+
def set_horizontal_text_scaling(h_scaling)
|
91
|
+
state[:h_scaling] = h_scaling
|
92
|
+
end
|
93
|
+
|
94
|
+
def set_text_font_and_size(label, size)
|
95
|
+
state[:text_font] = label
|
96
|
+
state[:text_font_size] = size
|
97
|
+
end
|
98
|
+
|
99
|
+
def font_size
|
100
|
+
state[:text_font_size] * @text_matrix[0,0]
|
101
|
+
end
|
102
|
+
|
103
|
+
def set_text_leading(leading)
|
104
|
+
state[:text_leading] = leading
|
105
|
+
end
|
106
|
+
|
107
|
+
def set_text_rendering_mode(mode)
|
108
|
+
state[:text_mode] = mode
|
109
|
+
end
|
110
|
+
|
111
|
+
def set_text_rise(rise)
|
112
|
+
state[:text_rise] = rise
|
113
|
+
end
|
114
|
+
|
115
|
+
def set_word_spacing(word_spacing)
|
116
|
+
state[:word_spacing] = word_spacing
|
117
|
+
end
|
118
|
+
|
119
|
+
#####################################################
|
120
|
+
# Text Positioning Operators
|
121
|
+
#####################################################
|
122
|
+
|
123
|
+
def move_text_position(x, y) # Td
|
124
|
+
temp_matrix = Matrix[
|
125
|
+
[1, 0, 0],
|
126
|
+
[0, 1, 0],
|
127
|
+
[x, y, 1]
|
128
|
+
]
|
129
|
+
@text_matrix = @text_line_matrix = temp_matrix * @text_line_matrix
|
130
|
+
end
|
131
|
+
|
132
|
+
def move_text_position_and_set_leading(x, y) # TD
|
133
|
+
set_text_leading(-1 * y)
|
134
|
+
move_text_position(x, y)
|
135
|
+
end
|
136
|
+
|
137
|
+
def set_text_matrix_and_text_line_matrix(a, b, c, d, e, f) # Tm
|
138
|
+
@text_matrix = @text_line_matrix = Matrix[
|
139
|
+
[a, b, 0],
|
140
|
+
[c, d, 0],
|
141
|
+
[e, f, 1]
|
142
|
+
]
|
143
|
+
end
|
144
|
+
|
145
|
+
def move_to_start_of_next_line # T*
|
146
|
+
move_text_position(0, -state[:text_leading])
|
147
|
+
end
|
148
|
+
|
149
|
+
#####################################################
|
150
|
+
# Text Showing Operators
|
151
|
+
#####################################################
|
152
|
+
|
153
|
+
def show_text_with_positioning(params) # TJ
|
154
|
+
# TODO record position changes in state here
|
155
|
+
end
|
156
|
+
|
157
|
+
def move_to_next_line_and_show_text(str) # '
|
158
|
+
move_to_start_of_next_line
|
159
|
+
end
|
160
|
+
|
161
|
+
def set_spacing_next_line_show_text(aw, ac, string) # "
|
162
|
+
set_word_spacing(aw)
|
163
|
+
set_character_spacing(ac)
|
164
|
+
move_to_next_line_and_show_text(string)
|
165
|
+
end
|
166
|
+
|
167
|
+
#####################################################
|
168
|
+
# XObjects
|
169
|
+
#####################################################
|
170
|
+
def invoke_xobject(label)
|
171
|
+
save_graphics_state
|
172
|
+
xobject = find_xobject(label)
|
173
|
+
|
174
|
+
raise MalformedPDFError, "XObject #{label} not found" if xobject.nil?
|
175
|
+
matrix = xobject.hash[:Matrix]
|
176
|
+
concatenate_matrix(*matrix) if matrix
|
177
|
+
|
178
|
+
if xobject.hash[:Subtype] == :Form
|
179
|
+
form = PDF::Reader::FormXObject.new(@page, xobject)
|
180
|
+
@font_stack.unshift(form.font_objects)
|
181
|
+
@xobject_stack.unshift(form.xobjects)
|
182
|
+
yield form if block_given?
|
183
|
+
@font_stack.shift
|
184
|
+
@xobject_stack.shift
|
185
|
+
else
|
186
|
+
yield xobject if block_given?
|
187
|
+
end
|
188
|
+
|
189
|
+
restore_graphics_state
|
190
|
+
end
|
191
|
+
|
192
|
+
#####################################################
|
193
|
+
# Public Visible State
|
194
|
+
#####################################################
|
195
|
+
|
196
|
+
# transform x and y co-ordinates from the current user space to the
|
197
|
+
# underlying device space.
|
198
|
+
#
|
199
|
+
def ctm_transform(x, y, z = 1)
|
200
|
+
[
|
201
|
+
(ctm[0,0] * x) + (ctm[1,0] * y) + (ctm[2,0] * z),
|
202
|
+
(ctm[0,1] * x) + (ctm[1,1] * y) + (ctm[2,1] * z)
|
203
|
+
]
|
204
|
+
end
|
205
|
+
|
206
|
+
# transform x and y co-ordinates from the current text space to the
|
207
|
+
# underlying device space.
|
208
|
+
#
|
209
|
+
def trm_transform(x, y, z = 1)
|
210
|
+
trm = text_rendering_matrix
|
211
|
+
[
|
212
|
+
(trm[0,0] * x) + (trm[1,0] * y) + (trm[2,0] * z),
|
213
|
+
(trm[0,1] * x) + (trm[1,1] * y) + (trm[2,1] * z)
|
214
|
+
]
|
215
|
+
end
|
216
|
+
|
217
|
+
def current_font
|
218
|
+
find_font(state[:text_font])
|
219
|
+
end
|
220
|
+
|
221
|
+
def find_font(label)
|
222
|
+
dict = @font_stack.detect { |fonts|
|
223
|
+
fonts.has_key?(label)
|
224
|
+
}
|
225
|
+
dict ? dict[label] : nil
|
226
|
+
end
|
227
|
+
|
228
|
+
def find_color_space(label)
|
229
|
+
dict = @cs_stack.detect { |colorspaces|
|
230
|
+
colorspaces.has_key?(label)
|
231
|
+
}
|
232
|
+
dict ? dict[label] : nil
|
233
|
+
end
|
234
|
+
|
235
|
+
def find_xobject(label)
|
236
|
+
dict = @xobject_stack.detect { |xobjects|
|
237
|
+
xobjects.has_key?(label)
|
238
|
+
}
|
239
|
+
dict ? dict[label] : nil
|
240
|
+
end
|
241
|
+
|
242
|
+
private
|
243
|
+
|
244
|
+
def text_rendering_matrix
|
245
|
+
state_matrix = Matrix[
|
246
|
+
[font_size * state[:h_scaling], 0, 0],
|
247
|
+
[0, font_size, 0],
|
248
|
+
[0, state[:text_rise], 1]
|
249
|
+
]
|
250
|
+
|
251
|
+
state_matrix * @text_matrix * ctm
|
252
|
+
end
|
253
|
+
|
254
|
+
# return the current transformation matrix
|
255
|
+
#
|
256
|
+
def ctm
|
257
|
+
state[:ctm]
|
258
|
+
end
|
259
|
+
|
260
|
+
def state
|
261
|
+
@stack.last
|
262
|
+
end
|
263
|
+
|
264
|
+
# wrap the raw PDF Font objects in handy ruby Font objects.
|
265
|
+
#
|
266
|
+
def build_fonts(raw_fonts)
|
267
|
+
wrapped_fonts = raw_fonts.map { |label, font|
|
268
|
+
[label, PDF::Reader::Font.new(@objects, @objects.deref(font))]
|
269
|
+
}
|
270
|
+
|
271
|
+
::Hash[wrapped_fonts]
|
272
|
+
end
|
273
|
+
|
274
|
+
# when save_graphics_state is called, we need to push a new copy of the
|
275
|
+
# current state onto the stack. That way any modifications to the state
|
276
|
+
# will be undone once restore_graphics_state is called.
|
277
|
+
#
|
278
|
+
# This returns a deep clone of the current state, ensuring changes are
|
279
|
+
# kept separate from earlier states.
|
280
|
+
#
|
281
|
+
# Marshal is used to round-trip the state through a string to easily
|
282
|
+
# perform the deep clone. Kinda hacky, but effective.
|
283
|
+
#
|
284
|
+
def clone_state
|
285
|
+
if @stack.empty?
|
286
|
+
{}
|
287
|
+
else
|
288
|
+
Marshal.load Marshal.dump(@stack.last)
|
289
|
+
end
|
290
|
+
end
|
291
|
+
|
292
|
+
end
|
293
|
+
end
|
294
|
+
end
|
295
|
+
|
@@ -1,156 +1,43 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
|
3
3
|
require 'matrix'
|
4
|
+
require 'forwardable'
|
4
5
|
|
5
6
|
module PDF
|
6
7
|
class Reader
|
7
8
|
class PageTextReceiver
|
9
|
+
extend Forwardable
|
8
10
|
|
9
|
-
DEFAULT_GRAPHICS_STATE = {
|
10
|
-
:ctm => Matrix.identity(3),
|
11
|
-
:char_spacing => 0,
|
12
|
-
:word_spacing => 0,
|
13
|
-
:h_scaling => 100,
|
14
|
-
:text_leading => 0,
|
15
|
-
:text_font => nil,
|
16
|
-
:text_font_size => nil,
|
17
|
-
:text_mode => 0,
|
18
|
-
:text_rise => 0,
|
19
|
-
:text_knockout => 0
|
20
|
-
}
|
21
|
-
|
22
|
-
# starting a new page
|
23
|
-
def page=(page)
|
24
|
-
@page = page
|
25
|
-
@objects = page.objects
|
26
|
-
@font_stack = [build_fonts(page.fonts)]
|
27
|
-
@xobject_stack = [page.xobjects]
|
28
|
-
@content = {}
|
29
|
-
@stack = [DEFAULT_GRAPHICS_STATE.dup]
|
30
|
-
end
|
31
|
-
|
32
|
-
def content
|
33
|
-
keys = @content.keys.sort.reverse
|
34
|
-
keys.map { |key|
|
35
|
-
@content[key]
|
36
|
-
}.join("\n")
|
37
|
-
end
|
38
|
-
|
39
|
-
#####################################################
|
40
11
|
# Graphics State Operators
|
41
|
-
|
12
|
+
def_delegators :@state, :save_graphics_state, :restore_graphics_state
|
42
13
|
|
43
|
-
def save_graphics_state
|
44
|
-
@stack.push clone_state
|
45
|
-
end
|
46
|
-
|
47
|
-
def restore_graphics_state
|
48
|
-
@stack.pop
|
49
|
-
end
|
50
|
-
|
51
|
-
#####################################################
|
52
14
|
# Matrix Operators
|
53
|
-
|
15
|
+
def_delegators :@state, :concatenate_matrix
|
54
16
|
|
55
|
-
# update the current transformation matrix.
|
56
|
-
#
|
57
|
-
# If the CTM is currently undefined, just store the new values.
|
58
|
-
#
|
59
|
-
# If there's an existing CTM, then multiply the existing matrix
|
60
|
-
# with the new matrix to form the updated matrix.
|
61
|
-
#
|
62
|
-
def concatenate_matrix(a, b, c, d, e, f)
|
63
|
-
transform = Matrix[
|
64
|
-
[a, b, 0],
|
65
|
-
[c, d, 0],
|
66
|
-
[e, f, 1]
|
67
|
-
]
|
68
|
-
if state[:ctm]
|
69
|
-
state[:ctm] = transform * state[:ctm]
|
70
|
-
else
|
71
|
-
state[:ctm] = transform
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
#####################################################
|
76
17
|
# Text Object Operators
|
77
|
-
|
18
|
+
def_delegators :@state, :begin_text_object, :end_text_object
|
78
19
|
|
79
|
-
def begin_text_object
|
80
|
-
@text_matrix = Matrix.identity(3)
|
81
|
-
@text_line_matrix = Matrix.identity(3)
|
82
|
-
end
|
83
|
-
|
84
|
-
def end_text_object
|
85
|
-
@text_matrix = Matrix.identity(3)
|
86
|
-
@text_line_matrix = Matrix.identity(3)
|
87
|
-
end
|
88
|
-
|
89
|
-
#####################################################
|
90
20
|
# Text State Operators
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
end
|
21
|
+
def_delegators :@state, :set_character_spacing, :set_horizontal_text_scaling
|
22
|
+
def_delegators :@state, :set_text_font_and_size, :font_size
|
23
|
+
def_delegators :@state, :set_text_leading, :set_text_rendering_mode
|
24
|
+
def_delegators :@state, :set_text_rise, :set_word_spacing
|
96
25
|
|
97
|
-
def set_horizontal_text_scaling(h_scaling)
|
98
|
-
state[:h_scaling] = h_scaling
|
99
|
-
end
|
100
|
-
|
101
|
-
def set_text_font_and_size(label, size)
|
102
|
-
state[:text_font] = label
|
103
|
-
state[:text_font_size] = size
|
104
|
-
end
|
105
|
-
|
106
|
-
def font_size
|
107
|
-
state[:text_font_size] * @text_matrix[0,0]
|
108
|
-
end
|
109
|
-
|
110
|
-
def set_text_leading(leading)
|
111
|
-
state[:text_leading] = leading
|
112
|
-
end
|
113
|
-
|
114
|
-
def set_text_rendering_mode(mode)
|
115
|
-
state[:text_mode] = mode
|
116
|
-
end
|
117
|
-
|
118
|
-
def set_text_rise(rise)
|
119
|
-
state[:text_rise] = rise
|
120
|
-
end
|
121
|
-
|
122
|
-
def set_word_spacing(word_spacing)
|
123
|
-
state[:word_spacing] = word_spacing
|
124
|
-
end
|
125
|
-
|
126
|
-
#####################################################
|
127
26
|
# Text Positioning Operators
|
128
|
-
|
27
|
+
def_delegators :@state, :move_text_position, :move_text_position_and_set_leading
|
28
|
+
def_delegators :@state, :set_text_matrix_and_text_line_matrix, :move_to_start_of_next_line
|
129
29
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
[x, y, 1]
|
135
|
-
]
|
136
|
-
@text_matrix = @text_line_matrix = temp_matrix * @text_line_matrix
|
137
|
-
end
|
138
|
-
|
139
|
-
def move_text_position_and_set_leading(x, y) # TD
|
140
|
-
set_text_leading(-1 * y)
|
141
|
-
move_text_position(x, y)
|
142
|
-
end
|
143
|
-
|
144
|
-
def set_text_matrix_and_text_line_matrix(a, b, c, d, e, f) # Tm
|
145
|
-
@text_matrix = @text_line_matrix = Matrix[
|
146
|
-
[a, b, 0],
|
147
|
-
[c, d, 0],
|
148
|
-
[e, f, 1]
|
149
|
-
]
|
30
|
+
# starting a new page
|
31
|
+
def page=(page)
|
32
|
+
@state = PageState.new(page)
|
33
|
+
@content = {}
|
150
34
|
end
|
151
35
|
|
152
|
-
def
|
153
|
-
|
36
|
+
def content
|
37
|
+
keys = @content.keys.sort.reverse
|
38
|
+
keys.map { |key|
|
39
|
+
@content[key]
|
40
|
+
}.join("\n")
|
154
41
|
end
|
155
42
|
|
156
43
|
#####################################################
|
@@ -159,10 +46,10 @@ module PDF
|
|
159
46
|
|
160
47
|
# record text that is drawn on the page
|
161
48
|
def show_text(string) # Tj
|
162
|
-
raise PDF::Reader::MalformedPDFError, "current font is invalid" if current_font.nil?
|
163
|
-
|
164
|
-
@content[
|
165
|
-
@content[
|
49
|
+
raise PDF::Reader::MalformedPDFError, "current font is invalid" if @state.current_font.nil?
|
50
|
+
newx, newy = @state.trm_transform(0,0)
|
51
|
+
@content[newy] ||= ""
|
52
|
+
@content[newy] << @state.current_font.to_utf8(string)
|
166
53
|
end
|
167
54
|
|
168
55
|
def show_text_with_positioning(params) # TJ
|
@@ -177,13 +64,13 @@ module PDF
|
|
177
64
|
end
|
178
65
|
|
179
66
|
def move_to_next_line_and_show_text(str) # '
|
180
|
-
move_to_start_of_next_line
|
67
|
+
@state.move_to_start_of_next_line
|
181
68
|
show_text(str)
|
182
69
|
end
|
183
70
|
|
184
71
|
def set_spacing_next_line_show_text(aw, ac, string) # "
|
185
|
-
set_word_spacing(aw)
|
186
|
-
set_character_spacing(ac)
|
72
|
+
@state.set_word_spacing(aw)
|
73
|
+
@state.set_character_spacing(ac)
|
187
74
|
move_to_next_line_and_show_text(string)
|
188
75
|
end
|
189
76
|
|
@@ -191,103 +78,14 @@ module PDF
|
|
191
78
|
# XObjects
|
192
79
|
#####################################################
|
193
80
|
def invoke_xobject(label)
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
raise MalformedPDFError, "XObject #{label} not found" if xobject.nil?
|
201
|
-
matrix = xobject.hash[:Matrix]
|
202
|
-
concatenate_matrix(*matrix) if matrix
|
203
|
-
|
204
|
-
if xobject.hash[:Subtype] == :Form
|
205
|
-
form = PDF::Reader::FormXObject.new(@page, xobject)
|
206
|
-
@font_stack.unshift(form.font_objects)
|
207
|
-
@xobject_stack.unshift(form.xobjects)
|
208
|
-
form.walk(self)
|
209
|
-
@font_stack.shift
|
210
|
-
@xobject_stack.shift
|
211
|
-
end
|
212
|
-
|
213
|
-
restore_graphics_state
|
214
|
-
end
|
215
|
-
|
216
|
-
private
|
217
|
-
|
218
|
-
# wrap the raw PDF Font objects in handy ruby Font objects.
|
219
|
-
#
|
220
|
-
def build_fonts(raw_fonts)
|
221
|
-
wrapped_fonts = raw_fonts.map { |label, font|
|
222
|
-
[label, PDF::Reader::Font.new(@objects, @objects.deref(font))]
|
223
|
-
}
|
224
|
-
|
225
|
-
::Hash[wrapped_fonts]
|
226
|
-
end
|
227
|
-
|
228
|
-
# transform x and y co-ordinates from the current text space to the
|
229
|
-
# underlying device space.
|
230
|
-
#
|
231
|
-
def transform(point, z = 1)
|
232
|
-
point.transform(text_rendering_matrix, z)
|
233
|
-
end
|
234
|
-
|
235
|
-
def text_rendering_matrix
|
236
|
-
state_matrix = Matrix[
|
237
|
-
[font_size * state[:h_scaling], 0, 0],
|
238
|
-
[0, font_size, 0],
|
239
|
-
[0, state[:text_rise], 1]
|
240
|
-
]
|
241
|
-
|
242
|
-
state_matrix * @text_matrix * ctm
|
243
|
-
end
|
244
|
-
|
245
|
-
def state
|
246
|
-
@stack.last
|
247
|
-
end
|
248
|
-
|
249
|
-
# when save_graphics_state is called, we need to push a new copy of the
|
250
|
-
# current state onto the stack. That way any modifications to the state
|
251
|
-
# will be undone once restore_graphics_state is called.
|
252
|
-
#
|
253
|
-
# This returns a deep clone of the current state, ensuring changes are
|
254
|
-
# kept separate from earlier states.
|
255
|
-
#
|
256
|
-
# Marshal is used to round-trip the state through a string to easily
|
257
|
-
# perform the deep clone. Kinda hacky, but effective.
|
258
|
-
#
|
259
|
-
def clone_state
|
260
|
-
if @stack.empty?
|
261
|
-
{}
|
262
|
-
else
|
263
|
-
Marshal.load Marshal.dump(@stack.last)
|
81
|
+
@state.invoke_xobject(label) do |xobj|
|
82
|
+
case xobj
|
83
|
+
when PDF::Reader::FormXObject then
|
84
|
+
xobj.walk(self)
|
85
|
+
end
|
264
86
|
end
|
265
87
|
end
|
266
88
|
|
267
|
-
# return the current transformation matrix
|
268
|
-
#
|
269
|
-
def ctm
|
270
|
-
state[:ctm]
|
271
|
-
end
|
272
|
-
|
273
|
-
def current_font
|
274
|
-
dict = @font_stack.detect { |fonts|
|
275
|
-
fonts.has_key?(state[:text_font])
|
276
|
-
}
|
277
|
-
dict ? dict[state[:text_font]] : nil
|
278
|
-
end
|
279
|
-
|
280
|
-
# private class for representing points on a cartesian plane. Used
|
281
|
-
# to simplify maths.
|
282
|
-
#
|
283
|
-
class Point < Struct.new(:x, :y)
|
284
|
-
def transform(trm, z)
|
285
|
-
Point.new(
|
286
|
-
(trm[0,0] * x) + (trm[1,0] * y) + (trm[2,0] * z),
|
287
|
-
(trm[0,1] * x) + (trm[1,1] * y) + (trm[2,1] * z)
|
288
|
-
)
|
289
|
-
end
|
290
|
-
end
|
291
89
|
end
|
292
90
|
end
|
293
91
|
end
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -130,6 +130,7 @@ class PDF::Reader
|
|
130
130
|
# reads a PDF name from the buffer and converts it to a Ruby Symbol
|
131
131
|
def pdf_name
|
132
132
|
tok = @buffer.token
|
133
|
+
tok = " " if tok == "" && RUBY_VERSION < "1.9"
|
133
134
|
tok.gsub!(/#([A-Fa-f0-9]{2})/) do |match|
|
134
135
|
match[1, 2].hex.chr
|
135
136
|
end
|
@@ -8,12 +8,20 @@ module PDF
|
|
8
8
|
module ResourceMethods
|
9
9
|
# Returns a Hash of color spaces that are available to this page
|
10
10
|
#
|
11
|
+
# NOTE: this method de-serialises objects from the underlying PDF
|
12
|
+
# with no caching. You will want to cache the results instead
|
13
|
+
# of calling it over and over.
|
14
|
+
#
|
11
15
|
def color_spaces
|
12
16
|
@objects.deref!(resources[:ColorSpace]) || {}
|
13
17
|
end
|
14
18
|
|
15
19
|
# Returns a Hash of fonts that are available to this page
|
16
20
|
#
|
21
|
+
# NOTE: this method de-serialises objects from the underlying PDF
|
22
|
+
# with no caching. You will want to cache the results instead
|
23
|
+
# of calling it over and over.
|
24
|
+
#
|
17
25
|
def fonts
|
18
26
|
@objects.deref!(resources[:Font]) || {}
|
19
27
|
end
|
@@ -21,36 +29,60 @@ module PDF
|
|
21
29
|
# Returns a Hash of external graphic states that are available to this
|
22
30
|
# page
|
23
31
|
#
|
32
|
+
# NOTE: this method de-serialises objects from the underlying PDF
|
33
|
+
# with no caching. You will want to cache the results instead
|
34
|
+
# of calling it over and over.
|
35
|
+
#
|
24
36
|
def graphic_states
|
25
37
|
@objects.deref!(resources[:ExtGState]) || {}
|
26
38
|
end
|
27
39
|
|
28
40
|
# Returns a Hash of patterns that are available to this page
|
29
41
|
#
|
42
|
+
# NOTE: this method de-serialises objects from the underlying PDF
|
43
|
+
# with no caching. You will want to cache the results instead
|
44
|
+
# of calling it over and over.
|
45
|
+
#
|
30
46
|
def patterns
|
31
47
|
@objects.deref!(resources[:Pattern]) || {}
|
32
48
|
end
|
33
49
|
|
34
50
|
# Returns an Array of procedure sets that are available to this page
|
35
51
|
#
|
52
|
+
# NOTE: this method de-serialises objects from the underlying PDF
|
53
|
+
# with no caching. You will want to cache the results instead
|
54
|
+
# of calling it over and over.
|
55
|
+
#
|
36
56
|
def procedure_sets
|
37
57
|
@objects.deref!(resources[:ProcSet]) || []
|
38
58
|
end
|
39
59
|
|
40
60
|
# Returns a Hash of properties sets that are available to this page
|
41
61
|
#
|
62
|
+
# NOTE: this method de-serialises objects from the underlying PDF
|
63
|
+
# with no caching. You will want to cache the results instead
|
64
|
+
# of calling it over and over.
|
65
|
+
#
|
42
66
|
def properties
|
43
67
|
@objects.deref!(resources[:Properties]) || {}
|
44
68
|
end
|
45
69
|
|
46
70
|
# Returns a Hash of shadings that are available to this page
|
47
71
|
#
|
72
|
+
# NOTE: this method de-serialises objects from the underlying PDF
|
73
|
+
# with no caching. You will want to cache the results instead
|
74
|
+
# of calling it over and over.
|
75
|
+
#
|
48
76
|
def shadings
|
49
77
|
@objects.deref!(resources[:Shading]) || {}
|
50
78
|
end
|
51
79
|
|
52
80
|
# Returns a Hash of XObjects that are available to this page
|
53
81
|
#
|
82
|
+
# NOTE: this method de-serialises objects from the underlying PDF
|
83
|
+
# with no caching. You will want to cache the results instead
|
84
|
+
# of calling it over and over.
|
85
|
+
#
|
54
86
|
def xobjects
|
55
87
|
@objects.deref!(resources[:XObject]) || {}
|
56
88
|
end
|
data/lib/pdf/reader/xref.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-03-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement: &
|
16
|
+
requirement: &20774400 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *20774400
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: roodi
|
27
|
-
requirement: &
|
27
|
+
requirement: &20772820 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *20772820
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: rspec
|
38
|
-
requirement: &
|
38
|
+
requirement: &20771660 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '2.3'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *20771660
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: ZenTest
|
49
|
-
requirement: &
|
49
|
+
requirement: &20770840 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 4.4.2
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *20770840
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: Ascii85
|
60
|
-
requirement: &
|
60
|
+
requirement: &20770080 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: 1.0.0
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *20770080
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: ruby-rc4
|
71
|
-
requirement: &
|
71
|
+
requirement: &20769400 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,7 +76,7 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *20769400
|
80
80
|
description: The PDF::Reader library implements a PDF parser conforming as much as
|
81
81
|
possible to the PDF specification from Adobe
|
82
82
|
email:
|
@@ -114,6 +114,7 @@ files:
|
|
114
114
|
- lib/pdf/reader/filter.rb
|
115
115
|
- lib/pdf/reader/object_hash.rb
|
116
116
|
- lib/pdf/reader/stream.rb
|
117
|
+
- lib/pdf/reader/page_state.rb
|
117
118
|
- lib/pdf/reader/standard_security_handler.rb
|
118
119
|
- lib/pdf/reader/cmap.rb
|
119
120
|
- lib/pdf/reader/form_xobject.rb
|