pdf-reader 0.6.1 → 0.6.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +10 -0
- data/Rakefile +1 -1
- data/TODO +7 -2
- data/lib/pdf/reader.rb +0 -1
- data/lib/pdf/reader/buffer.rb +21 -3
- data/lib/pdf/reader/cmap.rb +41 -5
- data/lib/pdf/reader/content.rb +97 -10
- data/lib/pdf/reader/encoding.rb +3 -2
- data/lib/pdf/reader/filter.rb +11 -4
- data/lib/pdf/reader/parser.rb +18 -14
- data/lib/pdf/reader/xref.rb +40 -15
- metadata +2 -2
data/CHANGELOG
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
v0.6.2 (22nd March 2008)
|
2
|
+
- Catch low level errors when applying filters to a content stream and raise a MalformedPDFError instead.
|
3
|
+
- Added support for processing inline images
|
4
|
+
- Support for parsing XRef tables that have multiple subsections
|
5
|
+
- Added a few callbacks to improve the way we supply information on page resources
|
6
|
+
- Ignore whitespace in hex strings, as required by the spec (section 3.2.3)
|
7
|
+
- Use our "unknown character box" when a single character in an Identity-H string fails to decode
|
8
|
+
- Support ToUnicode CMaps that use the bfrange operator
|
9
|
+
- Tweaked tokenising code to ensure whitespace doesn't get in the way
|
10
|
+
|
1
11
|
v0.6.1 (12th March 2008)
|
2
12
|
- Tweaked behaviour when we encounter Identity-H encoded text that doesn't have a ToUnicode mapping. We
|
3
13
|
just replace each character with a little box.
|
data/Rakefile
CHANGED
data/TODO
CHANGED
@@ -4,7 +4,6 @@ v0.7
|
|
4
4
|
- maybe a third option to Reader.parse?
|
5
5
|
parse(io, receiver, {:pages => true, :fonts => false, :metadata => true, :bookmarks => false})
|
6
6
|
- detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
|
7
|
-
- When parsing a CMap into a ruby object, recognise ranged mappings defined by begincodespacerange (see spec, section 5.9.2)
|
8
7
|
- Provide a way to get raw access to a particular object. Good for testing purposes
|
9
8
|
|
10
9
|
v0.8
|
@@ -14,10 +13,14 @@ v0.8
|
|
14
13
|
|
15
14
|
v0.9
|
16
15
|
- Support for CJK text (convert to UTF-8 like all other encodings. See Section 5.9 of the PDF spec)
|
16
|
+
- Will require significantly improved handling of CMaps, including creating a bunch of predefined ones
|
17
17
|
- Add a way to extract raster images
|
18
|
-
|
18
|
+
- see XObjects section of spec (section 4.7)
|
19
|
+
- Add a way to extract font data?
|
19
20
|
|
20
21
|
Sometime
|
22
|
+
- Work out why specs/data/zlib*.pdf isn't parsed correctly when all the major PDF viewers can display it correctly
|
23
|
+
|
21
24
|
- Ship some extra receivers in the standard package, particularly ones that are useful for running
|
22
25
|
rspec over generated PDF files
|
23
26
|
|
@@ -33,3 +36,5 @@ Sometime
|
|
33
36
|
- Identity-V (I *think* this relates to vertical text. Not sure how we'd support it sensibly)
|
34
37
|
|
35
38
|
- Investigate how R->L text is handled
|
39
|
+
|
40
|
+
- Add support for object streams (spec section 3.4.6)
|
data/lib/pdf/reader.rb
CHANGED
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -56,6 +56,24 @@ class PDF::Reader
|
|
56
56
|
out
|
57
57
|
end
|
58
58
|
################################################################################
|
59
|
+
# Reads from the buffer until the specified token is found, or the end of the buffer
|
60
|
+
#
|
61
|
+
# bytes - the bytes to search for.
|
62
|
+
def read_until(bytes)
|
63
|
+
out = ""
|
64
|
+
size = bytes.size
|
65
|
+
|
66
|
+
loop do
|
67
|
+
out << @io.read(1)
|
68
|
+
if out[-1 * size,size].eql?(bytes)
|
69
|
+
out = out[0, out.size - size]
|
70
|
+
seek(pos - size)
|
71
|
+
break
|
72
|
+
end
|
73
|
+
end
|
74
|
+
out
|
75
|
+
end
|
76
|
+
################################################################################
|
59
77
|
# returns true if the underlying IO object is at end and the internal buffer
|
60
78
|
# is empty
|
61
79
|
def eof?
|
@@ -71,21 +89,21 @@ class PDF::Reader
|
|
71
89
|
end
|
72
90
|
################################################################################
|
73
91
|
# PDF files are processed by tokenising the content into a series of objects and commands.
|
74
|
-
# This prepares the buffer for use by
|
92
|
+
# This prepares the buffer for use by reading the next line of tokens into memory.
|
75
93
|
def ready_token (with_strip=true, skip_blanks=true)
|
76
94
|
while @buffer.nil? or @buffer.empty?
|
77
95
|
@buffer = @io.readline
|
78
96
|
@buffer.sub!(/%.*$/, '')
|
79
97
|
@buffer.chomp!
|
80
|
-
@buffer.lstrip! if with_strip
|
81
98
|
break unless skip_blanks
|
82
99
|
end
|
100
|
+
@buffer.lstrip! if with_strip
|
83
101
|
end
|
84
102
|
################################################################################
|
85
103
|
# return the next token from the underlying IO stream
|
86
104
|
def token
|
87
105
|
ready_token
|
88
|
-
|
106
|
+
|
89
107
|
i = @buffer.index(/[\[\]()<>{}\s\/]/) || @buffer.size
|
90
108
|
|
91
109
|
token_chars =
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -28,12 +28,24 @@ class PDF::Reader
|
|
28
28
|
|
29
29
|
def initialize(data)
|
30
30
|
@map = {}
|
31
|
-
|
31
|
+
in_char_mode = false
|
32
|
+
in_range_mode = false
|
33
|
+
|
32
34
|
data.each_line do |l|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
35
|
+
if l.include?("beginbfchar")
|
36
|
+
in_char_mode = true
|
37
|
+
elsif l.include?("endbfchar")
|
38
|
+
in_char_mode = false
|
39
|
+
elsif l.include?("beginbfrange")
|
40
|
+
in_range_mode = true
|
41
|
+
elsif l.include?("endbfrange")
|
42
|
+
in_range_mode = false
|
43
|
+
end
|
44
|
+
|
45
|
+
if in_char_mode
|
46
|
+
process_bfchar_line(l)
|
47
|
+
elsif in_range_mode
|
48
|
+
process_bfrange_line(l)
|
37
49
|
end
|
38
50
|
end
|
39
51
|
end
|
@@ -44,5 +56,29 @@ class PDF::Reader
|
|
44
56
|
@map[c]
|
45
57
|
end
|
46
58
|
|
59
|
+
private
|
60
|
+
|
61
|
+
def process_bfchar_line(l)
|
62
|
+
m, find, replace = *l.match(/<([0-9a-fA-F]+)> <([0-9a-fA-F]+)>/)
|
63
|
+
@map["0x#{find}".hex] = "0x#{replace}".hex if find && replace
|
64
|
+
end
|
65
|
+
|
66
|
+
def process_bfrange_line(l)
|
67
|
+
m, start_code, end_code, dst = *l.match(/<([0-9a-fA-F]+)> <([0-9a-fA-F]+)> <([0-9a-fA-F]+)>/)
|
68
|
+
if start_code && end_code && dst
|
69
|
+
start_code = "0x#{start_code}".hex
|
70
|
+
end_code = "0x#{end_code}".hex
|
71
|
+
dst = "0x#{dst}".hex
|
72
|
+
incr = 0
|
73
|
+
|
74
|
+
# add all values in the range to our mapping
|
75
|
+
(start_code..end_code).each do |val|
|
76
|
+
@map[val] = dst + incr
|
77
|
+
incr += 1
|
78
|
+
# ensure a single range does not exceed 255 chars
|
79
|
+
raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if incr > 255
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
47
83
|
end
|
48
84
|
end
|
data/lib/pdf/reader/content.rb
CHANGED
@@ -23,6 +23,7 @@
|
|
23
23
|
#
|
24
24
|
################################################################################
|
25
25
|
require 'stringio'
|
26
|
+
#require 'enumerable'
|
26
27
|
|
27
28
|
class PDF::Reader
|
28
29
|
################################################################################
|
@@ -144,6 +145,25 @@ class PDF::Reader
|
|
144
145
|
# - end_page_container
|
145
146
|
# - begin_page
|
146
147
|
# - end_page
|
148
|
+
#
|
149
|
+
# == Resource Callbacks
|
150
|
+
#
|
151
|
+
# Each page and page_container can contain a range of resources required for the page,
|
152
|
+
# including things like fonts and images. The following callbacks may appear
|
153
|
+
# after begin_page_container and begin_page if the relevant resources exist
|
154
|
+
# on a page:
|
155
|
+
#
|
156
|
+
# In most cases, these callbacks associate a name with each resource, allowing it
|
157
|
+
# to be referred to by name in the page content. For example, an XObject can hold an image.
|
158
|
+
# If it gets mapped to the name "IM1", then it can be placed on the page using
|
159
|
+
# invoke_xobject "IM1".
|
160
|
+
#
|
161
|
+
# - resource_procset
|
162
|
+
# - resource_xobject
|
163
|
+
# - resource_extgstate
|
164
|
+
# - resource_colorspace
|
165
|
+
# - resource_pattern
|
166
|
+
# - resource_font
|
147
167
|
class Content
|
148
168
|
OPERATORS = {
|
149
169
|
'b' => :close_fill_stroke,
|
@@ -240,20 +260,27 @@ class PDF::Reader
|
|
240
260
|
# Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
|
241
261
|
# its content
|
242
262
|
def walk_pages (page)
|
243
|
-
|
263
|
+
|
264
|
+
if page['Resources']
|
265
|
+
res = page['Resources']
|
266
|
+
page.delete('Resources')
|
267
|
+
end
|
244
268
|
|
245
269
|
# extract page content
|
246
270
|
if page['Type'] == "Pages"
|
247
271
|
callback(:begin_page_container, [page])
|
272
|
+
walk_resources(@xref.object(res)) if res
|
248
273
|
page['Kids'].each {|child| walk_pages(@xref.object(child))}
|
249
274
|
callback(:end_page_container)
|
250
275
|
elsif page['Type'] == "Page"
|
251
276
|
callback(:begin_page, [page])
|
277
|
+
walk_resources(@xref.object(res)) if res
|
252
278
|
@page = page
|
253
279
|
@params = []
|
254
280
|
|
255
281
|
page['Contents'].to_a.each do |cstream|
|
256
|
-
|
282
|
+
obj, stream = @xref.object(cstream)
|
283
|
+
content_stream(stream)
|
257
284
|
end if page.has_key?('Contents') and page['Contents']
|
258
285
|
|
259
286
|
callback(:end_page)
|
@@ -274,9 +301,19 @@ class PDF::Reader
|
|
274
301
|
if token.kind_of?(Token) and OPERATORS.has_key?(token)
|
275
302
|
@current_font = @params.first if OPERATORS[token] == :set_text_font_and_size
|
276
303
|
|
277
|
-
#
|
304
|
+
# handle special cases in response to certain operators
|
278
305
|
if OPERATORS[token].to_s.include?("show_text") && @fonts[@current_font]
|
306
|
+
# convert any text to utf-8
|
279
307
|
@params = @fonts[@current_font].to_utf8(@params)
|
308
|
+
elsif token == "ID"
|
309
|
+
# inline image data, first convert the current params into a more familiar hash
|
310
|
+
map = {}
|
311
|
+
@params.each_slice(2) do |a|
|
312
|
+
map[a.first] = a.last
|
313
|
+
end
|
314
|
+
@params = [map]
|
315
|
+
# read the raw image data from the buffer without tokenising
|
316
|
+
@params << @buffer.read_until("EI")
|
280
317
|
end
|
281
318
|
callback(OPERATORS[token], @params)
|
282
319
|
@params.clear
|
@@ -289,7 +326,43 @@ class PDF::Reader
|
|
289
326
|
rescue EOFError => e
|
290
327
|
end
|
291
328
|
################################################################################
|
292
|
-
def
|
329
|
+
def walk_resources(resources)
|
330
|
+
resources = resolve_references(resources)
|
331
|
+
|
332
|
+
# extract any procset information
|
333
|
+
if resources['ProcSet']
|
334
|
+
callback(:resource_procset, resources['ProcSet'])
|
335
|
+
end
|
336
|
+
|
337
|
+
# extract any xobject information
|
338
|
+
if resources['XObject']
|
339
|
+
@xref.object(resources['XObject']).each do |name, val|
|
340
|
+
obj, stream = @xref.object(val)
|
341
|
+
callback(:resource_xobject, [name, obj, stream])
|
342
|
+
end
|
343
|
+
end
|
344
|
+
|
345
|
+
# extract any extgstate information
|
346
|
+
if resources['ExtGState']
|
347
|
+
@xref.object(resources['ExtGState']).each do |name, val|
|
348
|
+
callback(:resource_extgstate, [name, @xref.object(val)])
|
349
|
+
end
|
350
|
+
end
|
351
|
+
|
352
|
+
# extract any colorspace information
|
353
|
+
if resources['ColorSpace']
|
354
|
+
@xref.object(resources['ColorSpace']).each do |name, val|
|
355
|
+
callback(:resource_colorspace, [name, @xref.object(val)])
|
356
|
+
end
|
357
|
+
end
|
358
|
+
|
359
|
+
# extract any pattern information
|
360
|
+
if resources['Pattern']
|
361
|
+
@xref.object(resources['Pattern']).each do |name, val|
|
362
|
+
callback(:resource_pattern, [name, @xref.object(val)])
|
363
|
+
end
|
364
|
+
end
|
365
|
+
|
293
366
|
# extract any font information
|
294
367
|
if resources['Font']
|
295
368
|
@xref.object(resources['Font']).each do |label, desc|
|
@@ -301,15 +374,29 @@ class PDF::Reader
|
|
301
374
|
@fonts[label].encoding = PDF::Reader::Encoding.factory(@xref.object(desc['Encoding']))
|
302
375
|
@fonts[label].descendantfonts = desc['DescendantFonts'] if desc['DescendantFonts']
|
303
376
|
if desc['ToUnicode']
|
304
|
-
|
305
|
-
|
377
|
+
obj, cmap = @xref.object(desc['ToUnicode'])
|
378
|
+
|
379
|
+
# this stream is a cmap
|
380
|
+
begin
|
381
|
+
@fonts[label].tounicode = PDF::Reader::CMap.new(cmap)
|
382
|
+
rescue
|
383
|
+
# if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
|
384
|
+
end
|
306
385
|
end
|
386
|
+
callback(:resource_font, [label, @fonts[label]])
|
307
387
|
end
|
308
388
|
end
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
389
|
+
end
|
390
|
+
################################################################################
|
391
|
+
# Convert any PDF::Reader::Reference objects into the real objects they refer to
|
392
|
+
def resolve_references(obj)
|
393
|
+
case obj
|
394
|
+
when PDF::Reader::Reference then resolve_references(@xref.object(obj))
|
395
|
+
when Hash then obj.each { |key,val| obj[key] = resolve_references(val) }
|
396
|
+
when Array then obj.collect { |item| resolve_references(item) }
|
397
|
+
else
|
398
|
+
obj
|
399
|
+
end
|
313
400
|
end
|
314
401
|
################################################################################
|
315
402
|
# calls the name callback method on the receiver class with params as the arguments
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -111,12 +111,13 @@ class PDF::Reader
|
|
111
111
|
# iterate over string, reading it in 2 byte chunks and interpreting those
|
112
112
|
# chunks as ints
|
113
113
|
str.unpack("n*").each do |c|
|
114
|
+
|
114
115
|
# convert the int to a unicode codepoint if possible.
|
115
116
|
# without a ToUnicode CMap, it's impossible to reliably convert this text
|
116
117
|
# to unicode, so just replace each character with a little box. Big smacks
|
117
118
|
# to the PDF producing app.
|
118
|
-
if map
|
119
|
-
array_enc <<
|
119
|
+
if map && (code = map.decode(c))
|
120
|
+
array_enc << code
|
120
121
|
else
|
121
122
|
array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
|
122
123
|
end
|
data/lib/pdf/reader/filter.rb
CHANGED
@@ -40,20 +40,27 @@ class PDF::Reader
|
|
40
40
|
|
41
41
|
case name
|
42
42
|
when "FlateDecode" then @filter = :flate
|
43
|
-
else raise UnsupportedFeatureError, "Unknown filter: #{name}"
|
43
|
+
#else raise UnsupportedFeatureError, "Unknown filter: #{name}"
|
44
44
|
end
|
45
45
|
end
|
46
46
|
################################################################################
|
47
47
|
# attempts to decode the specified data with the current filter
|
48
48
|
def filter (data)
|
49
|
+
# leave the data untouched if we don't support the required filter
|
50
|
+
return data if @filter.nil?
|
51
|
+
|
52
|
+
# decode the data
|
49
53
|
self.send(@filter, data)
|
50
54
|
end
|
51
55
|
################################################################################
|
52
56
|
# Decode the specified data with the Zlib compression algorithm
|
53
57
|
def flate (data)
|
54
|
-
|
55
|
-
|
56
|
-
|
58
|
+
begin
|
59
|
+
z = Zlib::Inflate.new
|
60
|
+
z.inflate(data)
|
61
|
+
rescue Exception => e
|
62
|
+
raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
|
63
|
+
end
|
57
64
|
end
|
58
65
|
################################################################################
|
59
66
|
end
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -9,10 +9,10 @@
|
|
9
9
|
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
10
|
# permit persons to whom the Software is furnished to do so, subject to
|
11
11
|
# the following conditions:
|
12
|
-
#
|
12
|
+
#
|
13
13
|
# The above copyright notice and this permission notice shall be
|
14
14
|
# included in all copies or substantial portions of the Software.
|
15
|
-
#
|
15
|
+
#
|
16
16
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
17
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
18
|
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
@@ -58,9 +58,9 @@ class PDF::Reader
|
|
58
58
|
when "obj", "endobj" then return Token.new(token)
|
59
59
|
when "stream", "endstream" then return Token.new(token)
|
60
60
|
when ">>", "]", ">" then return Token.new(token)
|
61
|
-
else
|
61
|
+
else
|
62
62
|
if operators.has_key?(token) then return Token.new(token)
|
63
|
-
else
|
63
|
+
else return token.to_f
|
64
64
|
end
|
65
65
|
end
|
66
66
|
end
|
@@ -72,7 +72,7 @@ class PDF::Reader
|
|
72
72
|
loop do
|
73
73
|
key = parse_token
|
74
74
|
break if key.kind_of?(Token) and key == ">>"
|
75
|
-
raise MalformedPDFError, "
|
75
|
+
raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name" unless key.kind_of?(Name)
|
76
76
|
|
77
77
|
value = parse_token
|
78
78
|
value.kind_of?(Token) and Error.str_assert_not(value, ">>")
|
@@ -97,9 +97,15 @@ class PDF::Reader
|
|
97
97
|
################################################################################
|
98
98
|
# Reads a PDF hex string from the buffer and converts it to a Ruby String
|
99
99
|
def hex_string
|
100
|
-
str =
|
101
|
-
|
100
|
+
str = ""
|
101
|
+
|
102
|
+
loop do
|
103
|
+
token = @buffer.token
|
104
|
+
break if token == ">"
|
105
|
+
str << token
|
106
|
+
end
|
102
107
|
|
108
|
+
# add a missing digit if required, as required by the spec
|
103
109
|
str << "0" unless str.size % 2 == 0
|
104
110
|
str.scan(/../).map {|i| i.hex.chr}.join
|
105
111
|
end
|
@@ -151,11 +157,12 @@ class PDF::Reader
|
|
151
157
|
|
152
158
|
@buffer.head(to_remove, false)
|
153
159
|
end
|
154
|
-
|
155
160
|
str
|
156
161
|
end
|
157
162
|
################################################################################
|
158
163
|
# Reads an entire PDF object from the buffer and returns it as a Ruby String.
|
164
|
+
# If the object is a content stream, returns both the stream and the dictionary
|
165
|
+
# that describes it
|
159
166
|
#
|
160
167
|
# id - the object ID to return
|
161
168
|
# gen - the object revision number to return
|
@@ -166,11 +173,10 @@ class PDF::Reader
|
|
166
173
|
|
167
174
|
obj = parse_token
|
168
175
|
post_obj = parse_token
|
169
|
-
|
170
176
|
case post_obj
|
171
177
|
when "endobj" then return obj
|
172
|
-
when "stream" then return stream(obj)
|
173
|
-
else
|
178
|
+
when "stream" then return obj, stream(obj)
|
179
|
+
else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
|
174
180
|
end
|
175
181
|
end
|
176
182
|
################################################################################
|
@@ -178,6 +184,7 @@ class PDF::Reader
|
|
178
184
|
def stream (dict)
|
179
185
|
raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?('Length')
|
180
186
|
data = @buffer.read(@xref.object(dict['Length']))
|
187
|
+
|
181
188
|
Error.str_assert(parse_token, "endstream")
|
182
189
|
Error.str_assert(parse_token, "endobj")
|
183
190
|
|
@@ -193,9 +200,6 @@ class PDF::Reader
|
|
193
200
|
end
|
194
201
|
end
|
195
202
|
|
196
|
-
# this stream is a cmap
|
197
|
-
data = PDF::Reader::CMap.new(data) if data.include?("begincmap") && data.include?("endcmap")
|
198
|
-
|
199
203
|
data
|
200
204
|
end
|
201
205
|
################################################################################
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -42,44 +42,69 @@ class PDF::Reader
|
|
42
42
|
#
|
43
43
|
# Raises a MalformedPDFError if there is no xref table at the requested offset.
|
44
44
|
def load (offset = nil)
|
45
|
-
|
45
|
+
offset ||= @buffer.find_first_xref_offset
|
46
|
+
@buffer.seek(offset)
|
46
47
|
token = @buffer.token
|
47
|
-
|
48
|
-
if token == "xref"
|
48
|
+
|
49
|
+
if token == "xref" || token == "ref"
|
49
50
|
load_xref_table
|
51
|
+
else
|
52
|
+
raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
|
50
53
|
end
|
51
54
|
end
|
52
55
|
################################################################################
|
53
56
|
# Return a string containing the contents of an entire PDF object. The object is requested
|
54
57
|
# by specifying a PDF::Reader::Reference object that contains the object's ID and revision
|
55
58
|
# number
|
59
|
+
#
|
60
|
+
# If the object is a stream, that is returned as well
|
56
61
|
def object (ref, save_pos = true)
|
57
62
|
return ref unless ref.kind_of?(Reference)
|
58
63
|
pos = @buffer.pos if save_pos
|
59
|
-
|
64
|
+
obj, stream = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
|
60
65
|
@buffer.seek(pos) if save_pos
|
61
|
-
|
66
|
+
if stream
|
67
|
+
return obj, stream
|
68
|
+
else
|
69
|
+
return obj
|
70
|
+
end
|
62
71
|
end
|
63
72
|
################################################################################
|
64
73
|
# Assumes the underlying buffer is positioned at the start of an Xref table and
|
65
74
|
# processes it into memory.
|
66
75
|
def load_xref_table
|
67
|
-
|
76
|
+
tok_one = tok_two = nil
|
77
|
+
|
78
|
+
begin
|
79
|
+
# loop over all subsections of the xref table
|
80
|
+
# In a well formed PDF, the 'trailer' token will indicate
|
81
|
+
# the end of the table. However we need to be careful in case
|
82
|
+
# we're processing a malformed pdf that is missing the trailer.
|
83
|
+
loop do
|
84
|
+
tok_one, tok_two = @buffer.token, @buffer.token
|
85
|
+
if tok_one != "trailer" && !tok_one.match(/\d+/)
|
86
|
+
raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
|
87
|
+
end
|
88
|
+
break if tok_one == "trailer" or tok_one.nil?
|
89
|
+
objid, count = tok_one.to_i, tok_two.to_i
|
68
90
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
91
|
+
count.times do
|
92
|
+
offset = @buffer.token.to_i
|
93
|
+
generation = @buffer.token.to_i
|
94
|
+
state = @buffer.token
|
73
95
|
|
74
|
-
|
75
|
-
|
96
|
+
store(objid, generation, offset) if state == "n"
|
97
|
+
objid += 1
|
98
|
+
end
|
99
|
+
end
|
100
|
+
rescue EOFError => e
|
101
|
+
raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
|
76
102
|
end
|
77
103
|
|
78
|
-
raise MalformedPDFError, "PDF malformed,
|
79
|
-
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless @buffer.token == "<<"
|
104
|
+
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless tok_two == "<<"
|
80
105
|
|
81
106
|
trailer = Parser.new(@buffer, self).dictionary
|
82
|
-
load(trailer['Prev']) if trailer.has_key?('Prev')
|
107
|
+
load(trailer['Prev'].to_i) if trailer.has_key?('Prev')
|
83
108
|
|
84
109
|
trailer
|
85
110
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Jones
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-03-
|
12
|
+
date: 2008-03-22 00:00:00 +11:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|