pdf-reader 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +10 -0
- data/Rakefile +1 -1
- data/TODO +7 -2
- data/lib/pdf/reader.rb +0 -1
- data/lib/pdf/reader/buffer.rb +21 -3
- data/lib/pdf/reader/cmap.rb +41 -5
- data/lib/pdf/reader/content.rb +97 -10
- data/lib/pdf/reader/encoding.rb +3 -2
- data/lib/pdf/reader/filter.rb +11 -4
- data/lib/pdf/reader/parser.rb +18 -14
- data/lib/pdf/reader/xref.rb +40 -15
- metadata +2 -2
data/CHANGELOG
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
v0.6.2 (22nd March 2008)
|
2
|
+
- Catch low level errors when applying filters to a content stream and raise a MalformedPDFError instead.
|
3
|
+
- Added support for processing inline images
|
4
|
+
- Support for parsing XRef tables that have multiple subsections
|
5
|
+
- Added a few callbacks to improve the way we supply information on page resources
|
6
|
+
- Ignore whitespace in hex strings, as required by the spec (section 3.2.3)
|
7
|
+
- Use our "unknown character box" when a single character in an Identity-H string fails to decode
|
8
|
+
- Support ToUnicode CMaps that use the bfrange operator
|
9
|
+
- Tweaked tokenising code to ensure whitespace doesn't get in the way
|
10
|
+
|
1
11
|
v0.6.1 (12th March 2008)
|
2
12
|
- Tweaked behaviour when we encounter Identity-H encoded text that doesn't have a ToUnicode mapping. We
|
3
13
|
just replace each character with a little box.
|
data/Rakefile
CHANGED
data/TODO
CHANGED
@@ -4,7 +4,6 @@ v0.7
|
|
4
4
|
- maybe a third option to Reader.parse?
|
5
5
|
parse(io, receiver, {:pages => true, :fonts => false, :metadata => true, :bookmarks => false})
|
6
6
|
- detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
|
7
|
-
- When parsing a CMap into a ruby object, recognise ranged mappings defined by begincodespacerange (see spec, section 5.9.2)
|
8
7
|
- Provide a way to get raw access to a particular object. Good for testing purposes
|
9
8
|
|
10
9
|
v0.8
|
@@ -14,10 +13,14 @@ v0.8
|
|
14
13
|
|
15
14
|
v0.9
|
16
15
|
- Support for CJK text (convert to UTF-8 like all other encodings. See Section 5.9 of the PDF spec)
|
16
|
+
- Will require significantly improved handling of CMaps, including creating a bunch of predefined ones
|
17
17
|
- Add a way to extract raster images
|
18
|
-
|
18
|
+
- see XObjects section of spec (section 4.7)
|
19
|
+
- Add a way to extract font data?
|
19
20
|
|
20
21
|
Sometime
|
22
|
+
- Work out why specs/data/zlib*.pdf isn't parsed correctly when all the major PDF viewers can display it correctly
|
23
|
+
|
21
24
|
- Ship some extra receivers in the standard package, particularly ones that are useful for running
|
22
25
|
rspec over generated PDF files
|
23
26
|
|
@@ -33,3 +36,5 @@ Sometime
|
|
33
36
|
- Identity-V (I *think* this relates to vertical text. Not sure how we'd support it sensibly)
|
34
37
|
|
35
38
|
- Investigate how R->L text is handled
|
39
|
+
|
40
|
+
- Add support for object streams (spec section 3.4.6)
|
data/lib/pdf/reader.rb
CHANGED
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -56,6 +56,24 @@ class PDF::Reader
|
|
56
56
|
out
|
57
57
|
end
|
58
58
|
################################################################################
|
59
|
+
# Reads from the buffer until the specified token is found, or the end of the buffer
|
60
|
+
#
|
61
|
+
# bytes - the bytes to search for.
|
62
|
+
def read_until(bytes)
|
63
|
+
out = ""
|
64
|
+
size = bytes.size
|
65
|
+
|
66
|
+
loop do
|
67
|
+
out << @io.read(1)
|
68
|
+
if out[-1 * size,size].eql?(bytes)
|
69
|
+
out = out[0, out.size - size]
|
70
|
+
seek(pos - size)
|
71
|
+
break
|
72
|
+
end
|
73
|
+
end
|
74
|
+
out
|
75
|
+
end
|
76
|
+
################################################################################
|
59
77
|
# returns true if the underlying IO object is at end and the internal buffer
|
60
78
|
# is empty
|
61
79
|
def eof?
|
@@ -71,21 +89,21 @@ class PDF::Reader
|
|
71
89
|
end
|
72
90
|
################################################################################
|
73
91
|
# PDF files are processed by tokenising the content into a series of objects and commands.
|
74
|
-
# This prepares the buffer for use by
|
92
|
+
# This prepares the buffer for use by reading the next line of tokens into memory.
|
75
93
|
def ready_token (with_strip=true, skip_blanks=true)
|
76
94
|
while @buffer.nil? or @buffer.empty?
|
77
95
|
@buffer = @io.readline
|
78
96
|
@buffer.sub!(/%.*$/, '')
|
79
97
|
@buffer.chomp!
|
80
|
-
@buffer.lstrip! if with_strip
|
81
98
|
break unless skip_blanks
|
82
99
|
end
|
100
|
+
@buffer.lstrip! if with_strip
|
83
101
|
end
|
84
102
|
################################################################################
|
85
103
|
# return the next token from the underlying IO stream
|
86
104
|
def token
|
87
105
|
ready_token
|
88
|
-
|
106
|
+
|
89
107
|
i = @buffer.index(/[\[\]()<>{}\s\/]/) || @buffer.size
|
90
108
|
|
91
109
|
token_chars =
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -28,12 +28,24 @@ class PDF::Reader
|
|
28
28
|
|
29
29
|
def initialize(data)
|
30
30
|
@map = {}
|
31
|
-
|
31
|
+
in_char_mode = false
|
32
|
+
in_range_mode = false
|
33
|
+
|
32
34
|
data.each_line do |l|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
35
|
+
if l.include?("beginbfchar")
|
36
|
+
in_char_mode = true
|
37
|
+
elsif l.include?("endbfchar")
|
38
|
+
in_char_mode = false
|
39
|
+
elsif l.include?("beginbfrange")
|
40
|
+
in_range_mode = true
|
41
|
+
elsif l.include?("endbfrange")
|
42
|
+
in_range_mode = false
|
43
|
+
end
|
44
|
+
|
45
|
+
if in_char_mode
|
46
|
+
process_bfchar_line(l)
|
47
|
+
elsif in_range_mode
|
48
|
+
process_bfrange_line(l)
|
37
49
|
end
|
38
50
|
end
|
39
51
|
end
|
@@ -44,5 +56,29 @@ class PDF::Reader
|
|
44
56
|
@map[c]
|
45
57
|
end
|
46
58
|
|
59
|
+
private
|
60
|
+
|
61
|
+
def process_bfchar_line(l)
|
62
|
+
m, find, replace = *l.match(/<([0-9a-fA-F]+)> <([0-9a-fA-F]+)>/)
|
63
|
+
@map["0x#{find}".hex] = "0x#{replace}".hex if find && replace
|
64
|
+
end
|
65
|
+
|
66
|
+
def process_bfrange_line(l)
|
67
|
+
m, start_code, end_code, dst = *l.match(/<([0-9a-fA-F]+)> <([0-9a-fA-F]+)> <([0-9a-fA-F]+)>/)
|
68
|
+
if start_code && end_code && dst
|
69
|
+
start_code = "0x#{start_code}".hex
|
70
|
+
end_code = "0x#{end_code}".hex
|
71
|
+
dst = "0x#{dst}".hex
|
72
|
+
incr = 0
|
73
|
+
|
74
|
+
# add all values in the range to our mapping
|
75
|
+
(start_code..end_code).each do |val|
|
76
|
+
@map[val] = dst + incr
|
77
|
+
incr += 1
|
78
|
+
# ensure a single range does not exceed 255 chars
|
79
|
+
raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if incr > 255
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
47
83
|
end
|
48
84
|
end
|
data/lib/pdf/reader/content.rb
CHANGED
@@ -23,6 +23,7 @@
|
|
23
23
|
#
|
24
24
|
################################################################################
|
25
25
|
require 'stringio'
|
26
|
+
#require 'enumerable'
|
26
27
|
|
27
28
|
class PDF::Reader
|
28
29
|
################################################################################
|
@@ -144,6 +145,25 @@ class PDF::Reader
|
|
144
145
|
# - end_page_container
|
145
146
|
# - begin_page
|
146
147
|
# - end_page
|
148
|
+
#
|
149
|
+
# == Resource Callbacks
|
150
|
+
#
|
151
|
+
# Each page and page_container can contain a range of resources required for the page,
|
152
|
+
# including things like fonts and images. The following callbacks may appear
|
153
|
+
# after begin_page_container and begin_page if the relevant resources exist
|
154
|
+
# on a page:
|
155
|
+
#
|
156
|
+
# In most cases, these callbacks associate a name with each resource, allowing it
|
157
|
+
# to be referred to by name in the page content. For example, an XObject can hold an image.
|
158
|
+
# If it gets mapped to the name "IM1", then it can be placed on the page using
|
159
|
+
# invoke_xobject "IM1".
|
160
|
+
#
|
161
|
+
# - resource_procset
|
162
|
+
# - resource_xobject
|
163
|
+
# - resource_extgstate
|
164
|
+
# - resource_colorspace
|
165
|
+
# - resource_pattern
|
166
|
+
# - resource_font
|
147
167
|
class Content
|
148
168
|
OPERATORS = {
|
149
169
|
'b' => :close_fill_stroke,
|
@@ -240,20 +260,27 @@ class PDF::Reader
|
|
240
260
|
# Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
|
241
261
|
# its content
|
242
262
|
def walk_pages (page)
|
243
|
-
|
263
|
+
|
264
|
+
if page['Resources']
|
265
|
+
res = page['Resources']
|
266
|
+
page.delete('Resources')
|
267
|
+
end
|
244
268
|
|
245
269
|
# extract page content
|
246
270
|
if page['Type'] == "Pages"
|
247
271
|
callback(:begin_page_container, [page])
|
272
|
+
walk_resources(@xref.object(res)) if res
|
248
273
|
page['Kids'].each {|child| walk_pages(@xref.object(child))}
|
249
274
|
callback(:end_page_container)
|
250
275
|
elsif page['Type'] == "Page"
|
251
276
|
callback(:begin_page, [page])
|
277
|
+
walk_resources(@xref.object(res)) if res
|
252
278
|
@page = page
|
253
279
|
@params = []
|
254
280
|
|
255
281
|
page['Contents'].to_a.each do |cstream|
|
256
|
-
|
282
|
+
obj, stream = @xref.object(cstream)
|
283
|
+
content_stream(stream)
|
257
284
|
end if page.has_key?('Contents') and page['Contents']
|
258
285
|
|
259
286
|
callback(:end_page)
|
@@ -274,9 +301,19 @@ class PDF::Reader
|
|
274
301
|
if token.kind_of?(Token) and OPERATORS.has_key?(token)
|
275
302
|
@current_font = @params.first if OPERATORS[token] == :set_text_font_and_size
|
276
303
|
|
277
|
-
#
|
304
|
+
# handle special cases in response to certain operators
|
278
305
|
if OPERATORS[token].to_s.include?("show_text") && @fonts[@current_font]
|
306
|
+
# convert any text to utf-8
|
279
307
|
@params = @fonts[@current_font].to_utf8(@params)
|
308
|
+
elsif token == "ID"
|
309
|
+
# inline image data, first convert the current params into a more familiar hash
|
310
|
+
map = {}
|
311
|
+
@params.each_slice(2) do |a|
|
312
|
+
map[a.first] = a.last
|
313
|
+
end
|
314
|
+
@params = [map]
|
315
|
+
# read the raw image data from the buffer without tokenising
|
316
|
+
@params << @buffer.read_until("EI")
|
280
317
|
end
|
281
318
|
callback(OPERATORS[token], @params)
|
282
319
|
@params.clear
|
@@ -289,7 +326,43 @@ class PDF::Reader
|
|
289
326
|
rescue EOFError => e
|
290
327
|
end
|
291
328
|
################################################################################
|
292
|
-
def
|
329
|
+
def walk_resources(resources)
|
330
|
+
resources = resolve_references(resources)
|
331
|
+
|
332
|
+
# extract any procset information
|
333
|
+
if resources['ProcSet']
|
334
|
+
callback(:resource_procset, resources['ProcSet'])
|
335
|
+
end
|
336
|
+
|
337
|
+
# extract any xobject information
|
338
|
+
if resources['XObject']
|
339
|
+
@xref.object(resources['XObject']).each do |name, val|
|
340
|
+
obj, stream = @xref.object(val)
|
341
|
+
callback(:resource_xobject, [name, obj, stream])
|
342
|
+
end
|
343
|
+
end
|
344
|
+
|
345
|
+
# extract any extgstate information
|
346
|
+
if resources['ExtGState']
|
347
|
+
@xref.object(resources['ExtGState']).each do |name, val|
|
348
|
+
callback(:resource_extgstate, [name, @xref.object(val)])
|
349
|
+
end
|
350
|
+
end
|
351
|
+
|
352
|
+
# extract any colorspace information
|
353
|
+
if resources['ColorSpace']
|
354
|
+
@xref.object(resources['ColorSpace']).each do |name, val|
|
355
|
+
callback(:resource_colorspace, [name, @xref.object(val)])
|
356
|
+
end
|
357
|
+
end
|
358
|
+
|
359
|
+
# extract any pattern information
|
360
|
+
if resources['Pattern']
|
361
|
+
@xref.object(resources['Pattern']).each do |name, val|
|
362
|
+
callback(:resource_pattern, [name, @xref.object(val)])
|
363
|
+
end
|
364
|
+
end
|
365
|
+
|
293
366
|
# extract any font information
|
294
367
|
if resources['Font']
|
295
368
|
@xref.object(resources['Font']).each do |label, desc|
|
@@ -301,15 +374,29 @@ class PDF::Reader
|
|
301
374
|
@fonts[label].encoding = PDF::Reader::Encoding.factory(@xref.object(desc['Encoding']))
|
302
375
|
@fonts[label].descendantfonts = desc['DescendantFonts'] if desc['DescendantFonts']
|
303
376
|
if desc['ToUnicode']
|
304
|
-
|
305
|
-
|
377
|
+
obj, cmap = @xref.object(desc['ToUnicode'])
|
378
|
+
|
379
|
+
# this stream is a cmap
|
380
|
+
begin
|
381
|
+
@fonts[label].tounicode = PDF::Reader::CMap.new(cmap)
|
382
|
+
rescue
|
383
|
+
# if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
|
384
|
+
end
|
306
385
|
end
|
386
|
+
callback(:resource_font, [label, @fonts[label]])
|
307
387
|
end
|
308
388
|
end
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
389
|
+
end
|
390
|
+
################################################################################
|
391
|
+
# Convert any PDF::Reader::Reference objects into the real objects they refer to
|
392
|
+
def resolve_references(obj)
|
393
|
+
case obj
|
394
|
+
when PDF::Reader::Reference then resolve_references(@xref.object(obj))
|
395
|
+
when Hash then obj.each { |key,val| obj[key] = resolve_references(val) }
|
396
|
+
when Array then obj.collect { |item| resolve_references(item) }
|
397
|
+
else
|
398
|
+
obj
|
399
|
+
end
|
313
400
|
end
|
314
401
|
################################################################################
|
315
402
|
# calls the name callback method on the receiver class with params as the arguments
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -111,12 +111,13 @@ class PDF::Reader
|
|
111
111
|
# iterate over string, reading it in 2 byte chunks and interpreting those
|
112
112
|
# chunks as ints
|
113
113
|
str.unpack("n*").each do |c|
|
114
|
+
|
114
115
|
# convert the int to a unicode codepoint if possible.
|
115
116
|
# without a ToUnicode CMap, it's impossible to reliably convert this text
|
116
117
|
# to unicode, so just replace each character with a little box. Big smacks
|
117
118
|
# to the PDF producing app.
|
118
|
-
if map
|
119
|
-
array_enc <<
|
119
|
+
if map && (code = map.decode(c))
|
120
|
+
array_enc << code
|
120
121
|
else
|
121
122
|
array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
|
122
123
|
end
|
data/lib/pdf/reader/filter.rb
CHANGED
@@ -40,20 +40,27 @@ class PDF::Reader
|
|
40
40
|
|
41
41
|
case name
|
42
42
|
when "FlateDecode" then @filter = :flate
|
43
|
-
else raise UnsupportedFeatureError, "Unknown filter: #{name}"
|
43
|
+
#else raise UnsupportedFeatureError, "Unknown filter: #{name}"
|
44
44
|
end
|
45
45
|
end
|
46
46
|
################################################################################
|
47
47
|
# attempts to decode the specified data with the current filter
|
48
48
|
def filter (data)
|
49
|
+
# leave the data untouched if we don't support the required filter
|
50
|
+
return data if @filter.nil?
|
51
|
+
|
52
|
+
# decode the data
|
49
53
|
self.send(@filter, data)
|
50
54
|
end
|
51
55
|
################################################################################
|
52
56
|
# Decode the specified data with the Zlib compression algorithm
|
53
57
|
def flate (data)
|
54
|
-
|
55
|
-
|
56
|
-
|
58
|
+
begin
|
59
|
+
z = Zlib::Inflate.new
|
60
|
+
z.inflate(data)
|
61
|
+
rescue Exception => e
|
62
|
+
raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
|
63
|
+
end
|
57
64
|
end
|
58
65
|
################################################################################
|
59
66
|
end
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -9,10 +9,10 @@
|
|
9
9
|
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
10
|
# permit persons to whom the Software is furnished to do so, subject to
|
11
11
|
# the following conditions:
|
12
|
-
#
|
12
|
+
#
|
13
13
|
# The above copyright notice and this permission notice shall be
|
14
14
|
# included in all copies or substantial portions of the Software.
|
15
|
-
#
|
15
|
+
#
|
16
16
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
17
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
18
|
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
@@ -58,9 +58,9 @@ class PDF::Reader
|
|
58
58
|
when "obj", "endobj" then return Token.new(token)
|
59
59
|
when "stream", "endstream" then return Token.new(token)
|
60
60
|
when ">>", "]", ">" then return Token.new(token)
|
61
|
-
else
|
61
|
+
else
|
62
62
|
if operators.has_key?(token) then return Token.new(token)
|
63
|
-
else
|
63
|
+
else return token.to_f
|
64
64
|
end
|
65
65
|
end
|
66
66
|
end
|
@@ -72,7 +72,7 @@ class PDF::Reader
|
|
72
72
|
loop do
|
73
73
|
key = parse_token
|
74
74
|
break if key.kind_of?(Token) and key == ">>"
|
75
|
-
raise MalformedPDFError, "
|
75
|
+
raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name" unless key.kind_of?(Name)
|
76
76
|
|
77
77
|
value = parse_token
|
78
78
|
value.kind_of?(Token) and Error.str_assert_not(value, ">>")
|
@@ -97,9 +97,15 @@ class PDF::Reader
|
|
97
97
|
################################################################################
|
98
98
|
# Reads a PDF hex string from the buffer and converts it to a Ruby String
|
99
99
|
def hex_string
|
100
|
-
str =
|
101
|
-
|
100
|
+
str = ""
|
101
|
+
|
102
|
+
loop do
|
103
|
+
token = @buffer.token
|
104
|
+
break if token == ">"
|
105
|
+
str << token
|
106
|
+
end
|
102
107
|
|
108
|
+
# add a missing digit if required, as required by the spec
|
103
109
|
str << "0" unless str.size % 2 == 0
|
104
110
|
str.scan(/../).map {|i| i.hex.chr}.join
|
105
111
|
end
|
@@ -151,11 +157,12 @@ class PDF::Reader
|
|
151
157
|
|
152
158
|
@buffer.head(to_remove, false)
|
153
159
|
end
|
154
|
-
|
155
160
|
str
|
156
161
|
end
|
157
162
|
################################################################################
|
158
163
|
# Reads an entire PDF object from the buffer and returns it as a Ruby String.
|
164
|
+
# If the object is a content stream, returns both the stream and the dictionary
|
165
|
+
# that describes it
|
159
166
|
#
|
160
167
|
# id - the object ID to return
|
161
168
|
# gen - the object revision number to return
|
@@ -166,11 +173,10 @@ class PDF::Reader
|
|
166
173
|
|
167
174
|
obj = parse_token
|
168
175
|
post_obj = parse_token
|
169
|
-
|
170
176
|
case post_obj
|
171
177
|
when "endobj" then return obj
|
172
|
-
when "stream" then return stream(obj)
|
173
|
-
else
|
178
|
+
when "stream" then return obj, stream(obj)
|
179
|
+
else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
|
174
180
|
end
|
175
181
|
end
|
176
182
|
################################################################################
|
@@ -178,6 +184,7 @@ class PDF::Reader
|
|
178
184
|
def stream (dict)
|
179
185
|
raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?('Length')
|
180
186
|
data = @buffer.read(@xref.object(dict['Length']))
|
187
|
+
|
181
188
|
Error.str_assert(parse_token, "endstream")
|
182
189
|
Error.str_assert(parse_token, "endobj")
|
183
190
|
|
@@ -193,9 +200,6 @@ class PDF::Reader
|
|
193
200
|
end
|
194
201
|
end
|
195
202
|
|
196
|
-
# this stream is a cmap
|
197
|
-
data = PDF::Reader::CMap.new(data) if data.include?("begincmap") && data.include?("endcmap")
|
198
|
-
|
199
203
|
data
|
200
204
|
end
|
201
205
|
################################################################################
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -42,44 +42,69 @@ class PDF::Reader
|
|
42
42
|
#
|
43
43
|
# Will fail silently if there is no xref table at the requested offset.
|
44
44
|
def load (offset = nil)
|
45
|
-
|
45
|
+
offset ||= @buffer.find_first_xref_offset
|
46
|
+
@buffer.seek(offset)
|
46
47
|
token = @buffer.token
|
47
|
-
|
48
|
-
if token == "xref"
|
48
|
+
|
49
|
+
if token == "xref" || token == "ref"
|
49
50
|
load_xref_table
|
51
|
+
else
|
52
|
+
raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
|
50
53
|
end
|
51
54
|
end
|
52
55
|
################################################################################
|
53
56
|
# Return a string containing the contents of an entire PDF object. The object is requested
|
54
57
|
# by specifying a PDF::Reader::Reference object that contains the object's ID and revision
|
55
58
|
# number
|
59
|
+
#
|
60
|
+
# If the object is a stream, that is returned as well
|
56
61
|
def object (ref, save_pos = true)
|
57
62
|
return ref unless ref.kind_of?(Reference)
|
58
63
|
pos = @buffer.pos if save_pos
|
59
|
-
|
64
|
+
obj, stream = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
|
60
65
|
@buffer.seek(pos) if save_pos
|
61
|
-
|
66
|
+
if stream
|
67
|
+
return obj, stream
|
68
|
+
else
|
69
|
+
return obj
|
70
|
+
end
|
62
71
|
end
|
63
72
|
################################################################################
|
64
73
|
# Assumes the underlying buffer is positioned at the start of an Xref table and
|
65
74
|
# processes it into memory.
|
66
75
|
def load_xref_table
|
67
|
-
|
76
|
+
tok_one = tok_two = nil
|
77
|
+
|
78
|
+
begin
|
79
|
+
# loop over all subsections of the xref table
|
80
|
+
# In a well formed PDF, the 'trailer' token will indicate
|
81
|
+
# the end of the table. However we need to be careful in case
|
82
|
+
# we're processing a malformed pdf that is missing the trailer.
|
83
|
+
loop do
|
84
|
+
tok_one, tok_two = @buffer.token, @buffer.token
|
85
|
+
if tok_one != "trailer" && !tok_one.match(/\d+/)
|
86
|
+
raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
|
87
|
+
end
|
88
|
+
break if tok_one == "trailer" or tok_one.nil?
|
89
|
+
objid, count = tok_one.to_i, tok_two.to_i
|
68
90
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
91
|
+
count.times do
|
92
|
+
offset = @buffer.token.to_i
|
93
|
+
generation = @buffer.token.to_i
|
94
|
+
state = @buffer.token
|
73
95
|
|
74
|
-
|
75
|
-
|
96
|
+
store(objid, generation, offset) if state == "n"
|
97
|
+
objid += 1
|
98
|
+
end
|
99
|
+
end
|
100
|
+
rescue EOFError => e
|
101
|
+
raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
|
76
102
|
end
|
77
103
|
|
78
|
-
raise MalformedPDFError, "PDF malformed,
|
79
|
-
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless @buffer.token == "<<"
|
104
|
+
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless tok_two == "<<"
|
80
105
|
|
81
106
|
trailer = Parser.new(@buffer, self).dictionary
|
82
|
-
load(trailer['Prev']) if trailer.has_key?('Prev')
|
107
|
+
load(trailer['Prev'].to_i) if trailer.has_key?('Prev')
|
83
108
|
|
84
109
|
trailer
|
85
110
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Jones
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-03-
|
12
|
+
date: 2008-03-22 00:00:00 +11:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|