pdf-reader 0.8.1 → 0.8.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +6 -0
- data/Rakefile +1 -1
- data/lib/pdf/hash.rb +1 -2
- data/lib/pdf/reader.rb +1 -3
- data/lib/pdf/reader/buffer.rb +248 -117
- data/lib/pdf/reader/content.rb +1 -1
- data/lib/pdf/reader/parser.rb +45 -83
- data/lib/pdf/reader/reference.rb +0 -10
- data/lib/pdf/reader/xref.rb +48 -52
- metadata +2 -2
data/CHANGELOG
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
v0.8.2 (1st January 2010)
|
2
|
+
- Fix parsing of files that use Form XObjects behind an indirect reference
|
3
|
+
(thanks Cornelius Illi and Patrick Crosby)
|
4
|
+
- Rewrote Buffer class to fix various speed issues reported over the years
|
5
|
+
- On my sample file extracting full text reduced from 220 seconds to 9 seconds.
|
6
|
+
|
1
7
|
v0.8.1 (27th November 2009)
|
2
8
|
- Added PDF::Hash#version. Provides access to the source file PDF version
|
3
9
|
|
data/Rakefile
CHANGED
data/lib/pdf/hash.rb
CHANGED
@@ -46,8 +46,7 @@ module PDF
|
|
46
46
|
raise ArgumentError, "input must be an IO-like object or a filename"
|
47
47
|
end
|
48
48
|
@version = read_version(io)
|
49
|
-
|
50
|
-
@xref = PDF::Reader::XRef.new(buffer)
|
49
|
+
@xref = PDF::Reader::XRef.new(io)
|
51
50
|
@trailer = @xref.load
|
52
51
|
end
|
53
52
|
|
data/lib/pdf/reader.rb
CHANGED
@@ -122,9 +122,7 @@ class PDF::Reader
|
|
122
122
|
################################################################################
|
123
123
|
# Given an IO object that contains PDF data, parse it.
|
124
124
|
def parse (io, receiver, opts = {})
|
125
|
-
@
|
126
|
-
@xref = XRef.new(@buffer)
|
127
|
-
@parser = Parser.new(@buffer, @xref)
|
125
|
+
@xref = XRef.new(io)
|
128
126
|
@content = (receiver == Explore ? Explore : Content).new(receiver, @xref)
|
129
127
|
|
130
128
|
options = {:pages => true, :metadata => true}
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
1
3
|
################################################################################
|
2
4
|
#
|
3
|
-
# Copyright (C)
|
5
|
+
# Copyright (C) 2010 James Healy (jimmy@deefa.com)
|
4
6
|
#
|
5
7
|
# Permission is hereby granted, free of charge, to any person obtaining
|
6
8
|
# a copy of this software and associated documentation files (the
|
@@ -24,140 +26,118 @@
|
|
24
26
|
################################################################################
|
25
27
|
|
26
28
|
class PDF::Reader
|
27
|
-
|
28
|
-
#
|
29
|
+
|
30
|
+
# A string tokeniser that recognises PDF grammar. When passed an IO stream or a
|
31
|
+
# string, repeated calls to token() will return the next token from the source.
|
32
|
+
#
|
33
|
+
# This is very low level, and getting the raw tokens is not very useful in itself.
|
34
|
+
#
|
35
|
+
# This will usually be used in conjunction with PDF::Reader::Parser, which converts
|
36
|
+
# the raw tokens into objects we can work with (strings, ints, arrays, etc)
|
37
|
+
#
|
29
38
|
class Buffer
|
30
|
-
|
31
|
-
|
32
|
-
|
39
|
+
|
40
|
+
attr_reader :pos
|
41
|
+
|
42
|
+
# Creates a new buffer.
|
43
|
+
#
|
44
|
+
# Params:
|
45
|
+
#
|
46
|
+
# io - an IO stream or string with the raw data to tokenise
|
47
|
+
#
|
48
|
+
# options:
|
49
|
+
#
|
50
|
+
# :seek - a byte offset to seek to before starting to tokenise
|
51
|
+
#
|
52
|
+
def initialize (io, opts = {})
|
33
53
|
@io = io
|
34
|
-
@
|
54
|
+
@tokens = []
|
55
|
+
@options = opts
|
56
|
+
|
57
|
+
@io.seek(opts[:seek]) if opts[:seek]
|
58
|
+
@pos = @io.pos
|
35
59
|
end
|
36
|
-
|
37
|
-
#
|
38
|
-
|
39
|
-
|
40
|
-
@
|
41
|
-
|
60
|
+
|
61
|
+
# return true if there are no more tokens left
|
62
|
+
#
|
63
|
+
def empty?
|
64
|
+
prepare_tokens if @tokens.size < 3
|
65
|
+
|
66
|
+
@tokens.empty?
|
42
67
|
end
|
43
|
-
|
44
|
-
#
|
68
|
+
|
69
|
+
# return raw bytes from the underlying IO stream.
|
45
70
|
#
|
46
|
-
#
|
47
|
-
|
48
|
-
|
71
|
+
# bytes - the number of bytes to read
|
72
|
+
#
|
73
|
+
# options:
|
74
|
+
#
|
75
|
+
# :skip_eol - if true, the IO stream is advanced past any LF or CR
|
76
|
+
# bytes before it reads any data. This is to handle
|
77
|
+
# content streams, which have a CRLF or LF after the stream
|
78
|
+
# token.
|
79
|
+
#
|
80
|
+
def read(bytes, opts = {})
|
81
|
+
reset_pos
|
49
82
|
|
50
|
-
if
|
51
|
-
|
52
|
-
|
83
|
+
if opts[:skip_eol]
|
84
|
+
done = false
|
85
|
+
while !done
|
86
|
+
chr = @io.read(1)
|
87
|
+
if chr.nil?
|
88
|
+
return nil
|
89
|
+
elsif chr != "\n" && chr != "\r"
|
90
|
+
@io.seek(-1, IO::SEEK_CUR)
|
91
|
+
done = true
|
92
|
+
end
|
93
|
+
end
|
53
94
|
end
|
54
95
|
|
55
|
-
|
56
|
-
|
96
|
+
bytes = @io.read(bytes)
|
97
|
+
save_pos
|
98
|
+
bytes
|
57
99
|
end
|
58
|
-
|
59
|
-
#
|
100
|
+
|
101
|
+
# return raw bytes from the underlying IO stream. All bytes up to the first
|
102
|
+
# occurrence of needle will be returned. The match (if any) is not returned.
|
103
|
+
# The IO stream cursor is left on the first byte of the match.
|
104
|
+
#
|
105
|
+
# needle - a string to search the IO stream for
|
60
106
|
#
|
61
|
-
|
62
|
-
|
107
|
+
def read_until(needle)
|
108
|
+
reset_pos
|
63
109
|
out = ""
|
64
|
-
size =
|
110
|
+
size = needle.size
|
65
111
|
|
66
|
-
|
67
|
-
|
68
|
-
offset = @buffer.index(bytes) + size
|
69
|
-
return head(offset)
|
70
|
-
else
|
71
|
-
out << head(@buffer.size)
|
72
|
-
end
|
112
|
+
while out[size * -1, size] != needle && !@io.eof?
|
113
|
+
out << @io.read(1)
|
73
114
|
end
|
74
115
|
|
75
|
-
|
76
|
-
out
|
77
|
-
|
78
|
-
out = out[0, out.size - size]
|
79
|
-
seek(pos - size)
|
80
|
-
break
|
81
|
-
end
|
116
|
+
if out[size * -1, size] == needle
|
117
|
+
out = out[0, out.size - size]
|
118
|
+
@io.seek(size * -1, IO::SEEK_CUR)
|
82
119
|
end
|
120
|
+
|
121
|
+
save_pos
|
83
122
|
out
|
84
123
|
end
|
85
|
-
################################################################################
|
86
|
-
# returns true if the underlying IO object is at end and the internal buffer
|
87
|
-
# is empty
|
88
|
-
def eof?
|
89
|
-
ready_token
|
90
|
-
if @buffer
|
91
|
-
@buffer.empty? && @io.eof?
|
92
|
-
else
|
93
|
-
@io.eof?
|
94
|
-
end
|
95
|
-
end
|
96
|
-
################################################################################
|
97
|
-
def pos
|
98
|
-
@io.pos
|
99
|
-
end
|
100
|
-
################################################################################
|
101
|
-
def pos_without_buf
|
102
|
-
@io.pos - @buffer.to_s.size
|
103
|
-
end
|
104
|
-
################################################################################
|
105
|
-
# PDF files are processed by tokenising the content into a series of objects and commands.
|
106
|
-
# This prepares the buffer for use by reading the next line of tokens into memory.
|
107
|
-
def ready_token (with_strip=true, skip_blanks=true)
|
108
|
-
while (@buffer.nil? or @buffer.empty?) && !@io.eof?
|
109
|
-
@buffer = @io.readline
|
110
|
-
@buffer.force_encoding("BINARY") if @buffer.respond_to?(:force_encoding)
|
111
|
-
#@buffer.sub!(/%.*$/, '') if strip_comments
|
112
|
-
@buffer.chomp!
|
113
|
-
break unless skip_blanks
|
114
|
-
end
|
115
|
-
@buffer.lstrip! if with_strip
|
116
|
-
end
|
117
|
-
################################################################################
|
118
|
-
# return the next token from the underlying IO stream
|
119
|
-
def token
|
120
|
-
ready_token
|
121
|
-
|
122
|
-
i = @buffer.index(/[\[\]()<>{}\s\/]/) || @buffer.size
|
123
124
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
tok = head(token_chars, strip_space)
|
125
|
+
# return the next token from the source. Returns a string if a token
|
126
|
+
# is found, nil if there are no tokens left.
|
127
|
+
#
|
128
|
+
def token
|
129
|
+
reset_pos
|
130
|
+
prepare_tokens if @tokens.size < 3
|
131
|
+
merge_indirect_reference
|
132
|
+
merge_tokens
|
133
133
|
|
134
|
-
|
135
|
-
nil
|
136
|
-
elsif tok[0,1] == "%"
|
137
|
-
@buffer = ""
|
138
|
-
token
|
139
|
-
else
|
140
|
-
tok
|
141
|
-
end
|
134
|
+
@tokens.shift
|
142
135
|
end
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
@buffer = @buffer[chars .. -1] || ""
|
147
|
-
@buffer.lstrip! if with_strip
|
148
|
-
val
|
149
|
-
end
|
150
|
-
################################################################################
|
151
|
-
# return the internal buffer used by this class when reading from the IO stream.
|
152
|
-
def raw
|
153
|
-
@buffer
|
154
|
-
end
|
155
|
-
################################################################################
|
156
|
-
# The Xref table in a PDF file acts as an aid for finding the location of various
|
157
|
-
# objects in the file. This method attempts to locate the byte offset of the xref
|
158
|
-
# table in the underlying IO stream.
|
136
|
+
|
137
|
+
# return the byte offset where the first XRef table in the source can be found.
|
138
|
+
#
|
159
139
|
def find_first_xref_offset
|
160
|
-
@io.seek(-1024, IO::SEEK_END) rescue seek(0)
|
140
|
+
@io.seek(-1024, IO::SEEK_END) rescue @io.seek(0)
|
161
141
|
data = @io.read(1024)
|
162
142
|
|
163
143
|
# the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
|
@@ -179,8 +159,159 @@ class PDF::Reader
|
|
179
159
|
raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
|
180
160
|
lines[eof_index+1].to_i
|
181
161
|
end
|
182
|
-
|
162
|
+
|
163
|
+
private
|
164
|
+
|
165
|
+
# Some bastard moved our IO stream cursor. Restore it.
|
166
|
+
#
|
167
|
+
def reset_pos
|
168
|
+
@io.seek(@pos) if @io.pos != @pos
|
169
|
+
end
|
170
|
+
|
171
|
+
# save the current position of the source IO stream. If someone else (like another buffer)
|
172
|
+
# moves the cursor, we can then restore it.
|
173
|
+
#
|
174
|
+
def save_pos
|
175
|
+
@pos = @io.pos
|
176
|
+
end
|
177
|
+
|
178
|
+
# attempt to prime the buffer with the next few tokens.
|
179
|
+
#
|
180
|
+
def prepare_tokens
|
181
|
+
10.times do
|
182
|
+
if state == :literal_string
|
183
|
+
prepare_literal_token
|
184
|
+
elsif state == :regular
|
185
|
+
prepare_regular_token
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
save_pos
|
190
|
+
end
|
191
|
+
|
192
|
+
# tokenising behaves slightly differently based on the current context.
|
193
|
+
# Determine the current context/state by examining the last token we found
|
194
|
+
#
|
195
|
+
def state
|
196
|
+
if @tokens[-1] == "("
|
197
|
+
:literal_string
|
198
|
+
elsif @tokens[-1] == "stream"
|
199
|
+
:stream
|
200
|
+
else
|
201
|
+
:regular
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
205
|
+
# detect a series of 3 tokens that make up an indirect object. If we find
|
206
|
+
# them, replace the tokens with a PDF::Reader::Reference instance.
|
207
|
+
#
|
208
|
+
# Merging them into a single string was another option, but that would mean
|
209
|
+
# code further up the stack would need to check every token to see if it looks
|
210
|
+
# like an indirect object. For optimisation reasons, I'd rather avoid
|
211
|
+
# that extra check.
|
212
|
+
#
|
213
|
+
def merge_indirect_reference
|
214
|
+
return if @tokens.size < 3
|
215
|
+
return if @tokens[2] != "R"
|
216
|
+
|
217
|
+
if @tokens[0].match(/\d+/) && @tokens[1].match(/\d+/)
|
218
|
+
@tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
|
219
|
+
@tokens[1] = nil
|
220
|
+
@tokens[2] = nil
|
221
|
+
@tokens.compact!
|
222
|
+
end
|
223
|
+
end
|
224
|
+
|
225
|
+
# merge any consecutive tokens that are actually 1 token. The only current
|
226
|
+
# time this is the case is << and >>. < and > are valid tokens (they indicate
|
227
|
+
# a hex string) but so are << and >> (they indicate a dictionary).
|
228
|
+
#
|
229
|
+
def merge_tokens
|
230
|
+
@tokens.each_with_index do |tok, idx|
|
231
|
+
if tok == "<" && @tokens[idx+1] == "<"
|
232
|
+
@tokens.inspect
|
233
|
+
@tokens[idx] = "<<"
|
234
|
+
@tokens[idx+1] = nil
|
235
|
+
elsif tok == ">" && @tokens[idx+1] == ">"
|
236
|
+
@tokens[idx] = ">>"
|
237
|
+
@tokens[idx+1] = nil
|
238
|
+
end
|
239
|
+
end
|
240
|
+
@tokens.compact!
|
241
|
+
end
|
242
|
+
|
243
|
+
# if we're currently inside a literal string we more or less just read bytes until
|
244
|
+
# we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
|
245
|
+
# start of a new token in regular mode are left untouched when inside a literal
|
246
|
+
# string.
|
247
|
+
#
|
248
|
+
# The entire literal string will be returned as a single token. It will need further
|
249
|
+
# processing to fix things like escaped new lines, but that's someone else's
|
250
|
+
# problem.
|
251
|
+
#
|
252
|
+
def prepare_literal_token
|
253
|
+
str = ""
|
254
|
+
count = 1
|
255
|
+
|
256
|
+
while count > 0
|
257
|
+
chr = @io.read(1)
|
258
|
+
if chr.nil?
|
259
|
+
count = 0 # unbalanced params
|
260
|
+
elsif chr == "(" && str[-1,1] != "\x5C"
|
261
|
+
str << "("
|
262
|
+
count += 1
|
263
|
+
elsif chr == ")" && str[-1,1] != "\x5C"
|
264
|
+
count -= 1
|
265
|
+
str << ")" unless count == 0
|
266
|
+
else
|
267
|
+
str << chr unless count == 0
|
268
|
+
end
|
269
|
+
end
|
270
|
+
|
271
|
+
@tokens << str if str.size > 0
|
272
|
+
@tokens << ")"
|
273
|
+
end
|
274
|
+
|
275
|
+
# Extract the next regular token and stock it in our buffer, ready to be returned.
|
276
|
+
#
|
277
|
+
# What each byte means is complex, check out section "3.1.1 Character Set" of the 1.7 spec
|
278
|
+
# to read up on it.
|
279
|
+
#
|
280
|
+
def prepare_regular_token
|
281
|
+
tok = ""
|
282
|
+
|
283
|
+
while chr = @io.read(1)
|
284
|
+
case chr
|
285
|
+
when "\x25"
|
286
|
+
# comment, ignore everything until the next EOL char
|
287
|
+
done = false
|
288
|
+
while !done
|
289
|
+
chr = @io.read(1)
|
290
|
+
done = true if chr.nil? || chr == "\x0A" || chr == "\x0D"
|
291
|
+
end
|
292
|
+
when "\x00", "\x09", "\x0A", "\x0C", "\x0D", "\x20"
|
293
|
+
# white space, token finished
|
294
|
+
@tokens << tok if tok.size > 0
|
295
|
+
tok = ""
|
296
|
+
break
|
297
|
+
when "\x28", "\x3C", "\x5B", "\x7B", "\x2F"
|
298
|
+
# opening delimiter, start of new token
|
299
|
+
@tokens << tok if tok.size > 0
|
300
|
+
@tokens << chr
|
301
|
+
tok = ""
|
302
|
+
break
|
303
|
+
when "\x29", "\x3E", "\x5D", "\x7D"
|
304
|
+
# closing delimiter
|
305
|
+
@tokens << tok if tok.size > 0
|
306
|
+
@tokens << chr
|
307
|
+
tok = ""
|
308
|
+
break
|
309
|
+
else
|
310
|
+
tok << chr
|
311
|
+
end
|
312
|
+
end
|
313
|
+
|
314
|
+
@tokens << tok if tok.size > 0
|
315
|
+
end
|
183
316
|
end
|
184
|
-
################################################################################
|
185
317
|
end
|
186
|
-
################################################################################
|
data/lib/pdf/reader/content.rb
CHANGED
@@ -323,7 +323,7 @@ class PDF::Reader
|
|
323
323
|
# like a regular page content stream.
|
324
324
|
#
|
325
325
|
def walk_xobject_form(label)
|
326
|
-
xobjects = current_resources[:XObject] || {}
|
326
|
+
xobjects = @xref.object(current_resources[:XObject]) || {}
|
327
327
|
xobject = @xref.object(xobjects[label])
|
328
328
|
|
329
329
|
if xobject && xobject.hash[:Subtype] == :Form
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -43,10 +43,10 @@ class PDF::Reader
|
|
43
43
|
#
|
44
44
|
# operators - a hash of supported operators to read from the underlying buffer.
|
45
45
|
def parse_token (operators={})
|
46
|
-
ref = Reference.from_buffer(@buffer) and return ref
|
47
46
|
token = @buffer.token
|
48
47
|
|
49
48
|
case token
|
49
|
+
when PDF::Reader::Reference then return token
|
50
50
|
when nil then return nil
|
51
51
|
when "/" then return @buffer.token.to_sym
|
52
52
|
when "<<" then return dictionary()
|
@@ -58,7 +58,7 @@ class PDF::Reader
|
|
58
58
|
when "null" then return nil
|
59
59
|
when "obj", "endobj" then return Token.new(token)
|
60
60
|
when "stream", "endstream" then return Token.new(token)
|
61
|
-
when ">>", "]", ">"
|
61
|
+
when ">>", "]", ">", ")" then return Token.new(token)
|
62
62
|
else
|
63
63
|
if operators.has_key?(token) then return Token.new(token)
|
64
64
|
elsif token =~ /\d*\.\d/ then return token.to_f
|
@@ -66,6 +66,29 @@ class PDF::Reader
|
|
66
66
|
end
|
67
67
|
end
|
68
68
|
end
|
69
|
+
################################################################################
|
70
|
+
# Reads an entire PDF object from the buffer and returns it as a Ruby String.
|
71
|
+
# If the object is a content stream, returns both the stream and the dictionary
|
72
|
+
# that describes it
|
73
|
+
#
|
74
|
+
# id - the object ID to return
|
75
|
+
# gen - the object revision number to return
|
76
|
+
def object (id, gen)
|
77
|
+
Error.assert_equal(parse_token, id)
|
78
|
+
Error.assert_equal(parse_token, gen)
|
79
|
+
Error.str_assert(parse_token, "obj")
|
80
|
+
|
81
|
+
obj = parse_token
|
82
|
+
post_obj = parse_token
|
83
|
+
case post_obj
|
84
|
+
when "endobj" then return obj
|
85
|
+
when "stream" then return stream(obj)
|
86
|
+
else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
private
|
91
|
+
|
69
92
|
################################################################################
|
70
93
|
# reads a PDF dict from the buffer and converts it to a Ruby Hash.
|
71
94
|
def dictionary
|
@@ -114,95 +137,34 @@ class PDF::Reader
|
|
114
137
|
################################################################################
|
115
138
|
# Reads a PDF String from the buffer and converts it to a Ruby String
|
116
139
|
def string
|
117
|
-
str =
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
end
|
136
|
-
|
137
|
-
if i.nil?
|
138
|
-
str << @buffer.raw + "\n"
|
139
|
-
@buffer.raw.replace("")
|
140
|
-
# if a content stream opens a string, but never closes it, we'll
|
141
|
-
# hit the end of the stream and still be appending stuff to the
|
142
|
-
# string. bad! This check prevents a hard loop.
|
143
|
-
raise MalformedPDFError, 'unterminated string in content stream' if @buffer.eof?
|
144
|
-
next
|
145
|
-
end
|
140
|
+
str = @buffer.token
|
141
|
+
return "" if str == ")"
|
142
|
+
Error.assert_equal(parse_token, ")")
|
143
|
+
|
144
|
+
str.gsub!("\\n","\n")
|
145
|
+
str.gsub!("\\r","\r")
|
146
|
+
str.gsub!("\\t","\t")
|
147
|
+
str.gsub!("\\b","\b")
|
148
|
+
str.gsub!("\\f","\f")
|
149
|
+
str.gsub!("\\(","(")
|
150
|
+
str.gsub!("\\)",")")
|
151
|
+
str.gsub!("\\\\","\\")
|
152
|
+
str.gsub!(/\\\n/m,"")
|
153
|
+
str.gsub!(/(\n\r|\r\n|\r)/m,"\n")
|
154
|
+
|
155
|
+
str.scan(/\\\d{1,3}/).each do |octal|
|
156
|
+
str.gsub!(octal, octal[1,3].oct.chr)
|
157
|
+
end
|
146
158
|
|
147
|
-
|
148
|
-
to_remove = 1
|
149
|
-
|
150
|
-
case @buffer.raw[0, 1]
|
151
|
-
when "("
|
152
|
-
str << "("
|
153
|
-
count += 1
|
154
|
-
when ")"
|
155
|
-
count -= 1
|
156
|
-
str << ")" unless count == 0
|
157
|
-
when "\\"
|
158
|
-
to_remove = 2
|
159
|
-
case @buffer.raw[1, 1]
|
160
|
-
when "" then to_remove = 1
|
161
|
-
when "n" then str << "\n"
|
162
|
-
when "r" then str << "\r"
|
163
|
-
when "t" then str << "\t"
|
164
|
-
when "b" then str << "\b"
|
165
|
-
when "f" then str << "\f"
|
166
|
-
when "(" then str << "("
|
167
|
-
when ")" then str << ")"
|
168
|
-
when "\\" then str << "\\"
|
169
|
-
else
|
170
|
-
if m = @buffer.raw.match(/^\\(\d{1,3})/)
|
171
|
-
to_remove = m[0].size
|
172
|
-
str << m[1].oct.chr
|
173
|
-
end
|
174
|
-
end
|
175
|
-
end
|
159
|
+
str.gsub!(/\\([^\\])/,'\1')
|
176
160
|
|
177
|
-
@buffer.head(to_remove, false)
|
178
|
-
end
|
179
161
|
str
|
180
162
|
end
|
181
163
|
################################################################################
|
182
|
-
# Reads an entire PDF object from the buffer and returns it as a Ruby String.
|
183
|
-
# If the object is a content stream, returns both the stream and the dictionary
|
184
|
-
# that describes it
|
185
|
-
#
|
186
|
-
# id - the object ID to return
|
187
|
-
# gen - the object revision number to return
|
188
|
-
def object (id, gen)
|
189
|
-
Error.assert_equal(parse_token, id)
|
190
|
-
Error.assert_equal(parse_token, gen)
|
191
|
-
Error.str_assert(parse_token, "obj")
|
192
|
-
|
193
|
-
obj = parse_token
|
194
|
-
post_obj = parse_token
|
195
|
-
case post_obj
|
196
|
-
when "endobj" then return obj
|
197
|
-
when "stream" then return stream(obj)
|
198
|
-
else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
|
199
|
-
end
|
200
|
-
end
|
201
|
-
################################################################################
|
202
164
|
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
|
203
165
|
def stream (dict)
|
204
166
|
raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
|
205
|
-
data = @buffer.read(@xref.object(dict[:Length]))
|
167
|
+
data = @buffer.read(@xref.object(dict[:Length]), :skip_eol => true)
|
206
168
|
|
207
169
|
Error.str_assert(parse_token, "endstream")
|
208
170
|
Error.str_assert(parse_token, "endobj")
|
data/lib/pdf/reader/reference.rb
CHANGED
@@ -27,16 +27,6 @@ class PDF::Reader
|
|
27
27
|
################################################################################
|
28
28
|
# An internal PDF::Reader class that represents an indirect reference to a PDF Object
|
29
29
|
class Reference
|
30
|
-
################################################################################
|
31
|
-
# check if the next token in the buffer is a reference, and return a PDF::Reader::Reference
|
32
|
-
# instance. Returns nil if the next token isn't an indirect reference.
|
33
|
-
def self.from_buffer (buffer)
|
34
|
-
buffer.ready_token
|
35
|
-
return nil unless m = buffer.raw.match(/^(\d+)\s+(\d+)\s+R\b/)
|
36
|
-
buffer.head(m[0].size)
|
37
|
-
self.new(m[1].to_i, m[2].to_i)
|
38
|
-
end
|
39
|
-
################################################################################
|
40
30
|
attr_reader :id, :gen
|
41
31
|
################################################################################
|
42
32
|
# Create a new Reference to an object with the specified id and revision number
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -32,8 +32,8 @@ class PDF::Reader
|
|
32
32
|
class XRef
|
33
33
|
################################################################################
|
34
34
|
# create a new Xref table based on the contents of the supplied PDF::Reader::Buffer object
|
35
|
-
def initialize (
|
36
|
-
@
|
35
|
+
def initialize (io)
|
36
|
+
@io = io
|
37
37
|
@xref = {}
|
38
38
|
end
|
39
39
|
def size
|
@@ -44,8 +44,8 @@ class PDF::Reader
|
|
44
44
|
# table, but it is one of the lowest level data items in the file, so we've lumped it in
|
45
45
|
# with the cross reference code.
|
46
46
|
def pdf_version
|
47
|
-
@
|
48
|
-
m, version = *@
|
47
|
+
@io.seek(0)
|
48
|
+
m, version = *@io.read(8).match(/%PDF-(\d.\d)/)
|
49
49
|
raise MalformedPDFError, 'invalid PDF version' if version.nil?
|
50
50
|
return version.to_f
|
51
51
|
end
|
@@ -55,13 +55,14 @@ class PDF::Reader
|
|
55
55
|
#
|
56
56
|
# Will fail silently if there is no xref table at the requested offset.
|
57
57
|
def load (offset = nil)
|
58
|
-
offset ||=
|
59
|
-
|
60
|
-
|
58
|
+
offset ||= new_buffer.find_first_xref_offset
|
59
|
+
|
60
|
+
buf = new_buffer(offset)
|
61
|
+
token = buf.token
|
61
62
|
|
62
63
|
if token == "xref" || token == "ref"
|
63
|
-
load_xref_table
|
64
|
-
elsif token.to_i >= 0 &&
|
64
|
+
load_xref_table(buf)
|
65
|
+
elsif token.to_i >= 0 && buf.token.to_i >= 0 && buf.token == "obj"
|
65
66
|
raise PDF::Reader::UnsupportedFeatureError, "XRef streams are not supported in PDF::Reader yet"
|
66
67
|
else
|
67
68
|
raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
|
@@ -73,52 +74,12 @@ class PDF::Reader
|
|
73
74
|
# number
|
74
75
|
#
|
75
76
|
# If the object is a stream, that is returned as well
|
76
|
-
def object (ref
|
77
|
+
def object (ref)
|
77
78
|
return ref unless ref.kind_of?(Reference)
|
78
|
-
|
79
|
-
obj = Parser.new(
|
80
|
-
@buffer.seek(pos) if save_pos
|
79
|
+
buf = new_buffer(offset_for(ref))
|
80
|
+
obj = Parser.new(buf, self).object(ref.id, ref.gen)
|
81
81
|
return obj
|
82
82
|
end
|
83
|
-
################################################################################
|
84
|
-
# Assumes the underlying buffer is positioned at the start of an Xref table and
|
85
|
-
# processes it into memory.
|
86
|
-
def load_xref_table
|
87
|
-
tok_one = tok_two = nil
|
88
|
-
|
89
|
-
begin
|
90
|
-
# loop over all subsections of the xref table
|
91
|
-
# In a well formed PDF, the 'trailer' token will indicate
|
92
|
-
# the end of the table. However we need to be careful in case
|
93
|
-
# we're processing a malformed pdf that is missing the trailer.
|
94
|
-
loop do
|
95
|
-
tok_one, tok_two = @buffer.token, @buffer.token
|
96
|
-
if tok_one != "trailer" && !tok_one.match(/\d+/)
|
97
|
-
raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
|
98
|
-
end
|
99
|
-
break if tok_one == "trailer" or tok_one.nil?
|
100
|
-
objid, count = tok_one.to_i, tok_two.to_i
|
101
|
-
|
102
|
-
count.times do
|
103
|
-
offset = @buffer.token.to_i
|
104
|
-
generation = @buffer.token.to_i
|
105
|
-
state = @buffer.token
|
106
|
-
|
107
|
-
store(objid, generation, offset) if state == "n"
|
108
|
-
objid += 1
|
109
|
-
end
|
110
|
-
end
|
111
|
-
rescue EOFError => e
|
112
|
-
raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
|
113
|
-
end
|
114
|
-
|
115
|
-
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless tok_two == "<<"
|
116
|
-
|
117
|
-
trailer = Parser.new(@buffer, self).dictionary
|
118
|
-
load(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
|
119
|
-
|
120
|
-
trailer
|
121
|
-
end
|
122
83
|
# returns the type of object a ref points to
|
123
84
|
def obj_type(ref)
|
124
85
|
obj = object(ref)
|
@@ -154,6 +115,41 @@ class PDF::Reader
|
|
154
115
|
(@xref[id] ||= {})[gen] ||= offset
|
155
116
|
end
|
156
117
|
################################################################################
|
118
|
+
private
|
119
|
+
################################################################################
|
120
|
+
# Assumes the underlying buffer is positioned at the start of an Xref table and
|
121
|
+
# processes it into memory.
|
122
|
+
def load_xref_table(buf)
|
123
|
+
params = []
|
124
|
+
|
125
|
+
while !params.include?("trailer") && !params.include?(nil)
|
126
|
+
if params.size == 2
|
127
|
+
objid, count = params[0].to_i, params[1].to_i
|
128
|
+
count.times do
|
129
|
+
offset = buf.token.to_i
|
130
|
+
generation = buf.token.to_i
|
131
|
+
state = buf.token
|
132
|
+
|
133
|
+
store(objid, generation, offset) if state == "n"
|
134
|
+
objid += 1
|
135
|
+
params.clear
|
136
|
+
end
|
137
|
+
end
|
138
|
+
params << buf.token
|
139
|
+
end
|
140
|
+
|
141
|
+
trailer = Parser.new(buf, self).parse_token
|
142
|
+
|
143
|
+
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)
|
144
|
+
|
145
|
+
load(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
|
146
|
+
|
147
|
+
trailer
|
148
|
+
end
|
149
|
+
|
150
|
+
def new_buffer(offset = 0)
|
151
|
+
PDF::Reader::Buffer.new(@io, :seek => offset)
|
152
|
+
end
|
157
153
|
end
|
158
154
|
################################################################################
|
159
155
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Jones
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-01-01 00:00:00 +11:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|