pdf-reader 0.8.1 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +6 -0
- data/Rakefile +1 -1
- data/lib/pdf/hash.rb +1 -2
- data/lib/pdf/reader.rb +1 -3
- data/lib/pdf/reader/buffer.rb +248 -117
- data/lib/pdf/reader/content.rb +1 -1
- data/lib/pdf/reader/parser.rb +45 -83
- data/lib/pdf/reader/reference.rb +0 -10
- data/lib/pdf/reader/xref.rb +48 -52
- metadata +2 -2
data/CHANGELOG
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
v0.8.2 (1st January 2010)
|
2
|
+
- Fix parsing of files that use Form XObjects behind an indirect reference
|
3
|
+
(thanks Cornelius Illi and Patrick Crosby)
|
4
|
+
- Rewrote Buffer class to fix various speed issues reported over the years
|
5
|
+
- On my sample file extracting full text reduced from 220 seconds to 9 seconds.
|
6
|
+
|
1
7
|
v0.8.1 (27th November 2009)
|
2
8
|
- Added PDF::Hash#version. Provides access to the source file PDF version
|
3
9
|
|
data/Rakefile
CHANGED
data/lib/pdf/hash.rb
CHANGED
@@ -46,8 +46,7 @@ module PDF
|
|
46
46
|
raise ArgumentError, "input must be an IO-like object or a filename"
|
47
47
|
end
|
48
48
|
@version = read_version(io)
|
49
|
-
|
50
|
-
@xref = PDF::Reader::XRef.new(buffer)
|
49
|
+
@xref = PDF::Reader::XRef.new(io)
|
51
50
|
@trailer = @xref.load
|
52
51
|
end
|
53
52
|
|
data/lib/pdf/reader.rb
CHANGED
@@ -122,9 +122,7 @@ class PDF::Reader
|
|
122
122
|
################################################################################
|
123
123
|
# Given an IO object that contains PDF data, parse it.
|
124
124
|
def parse (io, receiver, opts = {})
|
125
|
-
@
|
126
|
-
@xref = XRef.new(@buffer)
|
127
|
-
@parser = Parser.new(@buffer, @xref)
|
125
|
+
@xref = XRef.new(io)
|
128
126
|
@content = (receiver == Explore ? Explore : Content).new(receiver, @xref)
|
129
127
|
|
130
128
|
options = {:pages => true, :metadata => true}
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
1
3
|
################################################################################
|
2
4
|
#
|
3
|
-
# Copyright (C)
|
5
|
+
# Copyright (C) 2010 James Healy (jimmy@deefa.com)
|
4
6
|
#
|
5
7
|
# Permission is hereby granted, free of charge, to any person obtaining
|
6
8
|
# a copy of this software and associated documentation files (the
|
@@ -24,140 +26,118 @@
|
|
24
26
|
################################################################################
|
25
27
|
|
26
28
|
class PDF::Reader
|
27
|
-
|
28
|
-
#
|
29
|
+
|
30
|
+
# A string tokeniser that recognises PDF grammar. When passed an IO stream or a
|
31
|
+
# string, repeated calls to token() will return the next token from the source.
|
32
|
+
#
|
33
|
+
# This is very low level, and getting the raw tokens is not very useful in itself.
|
34
|
+
#
|
35
|
+
# This will usually be used in conjunction with PDF::Reader::Parser, which converts
|
36
|
+
# the raw tokens into objects we can work with (strings, ints, arrays, etc)
|
37
|
+
#
|
29
38
|
class Buffer
|
30
|
-
|
31
|
-
|
32
|
-
|
39
|
+
|
40
|
+
attr_reader :pos
|
41
|
+
|
42
|
+
# Creates a new buffer.
|
43
|
+
#
|
44
|
+
# Params:
|
45
|
+
#
|
46
|
+
# io - an IO stream or string with the raw data to tokenise
|
47
|
+
#
|
48
|
+
# options:
|
49
|
+
#
|
50
|
+
# :seek - a byte offset to seek to before starting to tokenise
|
51
|
+
#
|
52
|
+
def initialize (io, opts = {})
|
33
53
|
@io = io
|
34
|
-
@
|
54
|
+
@tokens = []
|
55
|
+
@options = opts
|
56
|
+
|
57
|
+
@io.seek(opts[:seek]) if opts[:seek]
|
58
|
+
@pos = @io.pos
|
35
59
|
end
|
36
|
-
|
37
|
-
#
|
38
|
-
|
39
|
-
|
40
|
-
@
|
41
|
-
|
60
|
+
|
61
|
+
# return true if there are no more tokens left
|
62
|
+
#
|
63
|
+
def empty?
|
64
|
+
prepare_tokens if @tokens.size < 3
|
65
|
+
|
66
|
+
@tokens.empty?
|
42
67
|
end
|
43
|
-
|
44
|
-
#
|
68
|
+
|
69
|
+
# return raw bytes from the underlying IO stream.
|
45
70
|
#
|
46
|
-
#
|
47
|
-
|
48
|
-
|
71
|
+
# bytes - the number of bytes to read
|
72
|
+
#
|
73
|
+
# options:
|
74
|
+
#
|
75
|
+
# :skip_eol - if true, the IO stream is advanced past any LF or CR
|
76
|
+
# bytes before it reads any data. This is to handle
|
77
|
+
# content streams, which have a CRLF or LF after the stream
|
78
|
+
# token.
|
79
|
+
#
|
80
|
+
def read(bytes, opts = {})
|
81
|
+
reset_pos
|
49
82
|
|
50
|
-
if
|
51
|
-
|
52
|
-
|
83
|
+
if opts[:skip_eol]
|
84
|
+
done = false
|
85
|
+
while !done
|
86
|
+
chr = @io.read(1)
|
87
|
+
if chr.nil?
|
88
|
+
return nil
|
89
|
+
elsif chr != "\n" && chr != "\r"
|
90
|
+
@io.seek(-1, IO::SEEK_CUR)
|
91
|
+
done = true
|
92
|
+
end
|
93
|
+
end
|
53
94
|
end
|
54
95
|
|
55
|
-
|
56
|
-
|
96
|
+
bytes = @io.read(bytes)
|
97
|
+
save_pos
|
98
|
+
bytes
|
57
99
|
end
|
58
|
-
|
59
|
-
#
|
100
|
+
|
101
|
+
# return raw bytes from the underlying IO stream. All bytes up to the first
|
102
|
+
# occurrence of needle will be returned. The match (if any) is not returned.
|
103
|
+
# The IO stream cursor is left on the first byte of the match.
|
104
|
+
#
|
105
|
+
# needle - a string to search the IO stream for
|
60
106
|
#
|
61
|
-
|
62
|
-
|
107
|
+
def read_until(needle)
|
108
|
+
reset_pos
|
63
109
|
out = ""
|
64
|
-
size =
|
110
|
+
size = needle.size
|
65
111
|
|
66
|
-
|
67
|
-
|
68
|
-
offset = @buffer.index(bytes) + size
|
69
|
-
return head(offset)
|
70
|
-
else
|
71
|
-
out << head(@buffer.size)
|
72
|
-
end
|
112
|
+
while out[size * -1, size] != needle && !@io.eof?
|
113
|
+
out << @io.read(1)
|
73
114
|
end
|
74
115
|
|
75
|
-
|
76
|
-
out
|
77
|
-
|
78
|
-
out = out[0, out.size - size]
|
79
|
-
seek(pos - size)
|
80
|
-
break
|
81
|
-
end
|
116
|
+
if out[size * -1, size] == needle
|
117
|
+
out = out[0, out.size - size]
|
118
|
+
@io.seek(size * -1, IO::SEEK_CUR)
|
82
119
|
end
|
120
|
+
|
121
|
+
save_pos
|
83
122
|
out
|
84
123
|
end
|
85
|
-
################################################################################
|
86
|
-
# returns true if the underlying IO object is at end and the internal buffer
|
87
|
-
# is empty
|
88
|
-
def eof?
|
89
|
-
ready_token
|
90
|
-
if @buffer
|
91
|
-
@buffer.empty? && @io.eof?
|
92
|
-
else
|
93
|
-
@io.eof?
|
94
|
-
end
|
95
|
-
end
|
96
|
-
################################################################################
|
97
|
-
def pos
|
98
|
-
@io.pos
|
99
|
-
end
|
100
|
-
################################################################################
|
101
|
-
def pos_without_buf
|
102
|
-
@io.pos - @buffer.to_s.size
|
103
|
-
end
|
104
|
-
################################################################################
|
105
|
-
# PDF files are processed by tokenising the content into a series of objects and commands.
|
106
|
-
# This prepares the buffer for use by reading the next line of tokens into memory.
|
107
|
-
def ready_token (with_strip=true, skip_blanks=true)
|
108
|
-
while (@buffer.nil? or @buffer.empty?) && !@io.eof?
|
109
|
-
@buffer = @io.readline
|
110
|
-
@buffer.force_encoding("BINARY") if @buffer.respond_to?(:force_encoding)
|
111
|
-
#@buffer.sub!(/%.*$/, '') if strip_comments
|
112
|
-
@buffer.chomp!
|
113
|
-
break unless skip_blanks
|
114
|
-
end
|
115
|
-
@buffer.lstrip! if with_strip
|
116
|
-
end
|
117
|
-
################################################################################
|
118
|
-
# return the next token from the underlying IO stream
|
119
|
-
def token
|
120
|
-
ready_token
|
121
|
-
|
122
|
-
i = @buffer.index(/[\[\]()<>{}\s\/]/) || @buffer.size
|
123
124
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
tok = head(token_chars, strip_space)
|
125
|
+
# return the next token from the source. Returns a string if a token
|
126
|
+
# is found, nil if there are no tokens left.
|
127
|
+
#
|
128
|
+
def token
|
129
|
+
reset_pos
|
130
|
+
prepare_tokens if @tokens.size < 3
|
131
|
+
merge_indirect_reference
|
132
|
+
merge_tokens
|
133
133
|
|
134
|
-
|
135
|
-
nil
|
136
|
-
elsif tok[0,1] == "%"
|
137
|
-
@buffer = ""
|
138
|
-
token
|
139
|
-
else
|
140
|
-
tok
|
141
|
-
end
|
134
|
+
@tokens.shift
|
142
135
|
end
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
@buffer = @buffer[chars .. -1] || ""
|
147
|
-
@buffer.lstrip! if with_strip
|
148
|
-
val
|
149
|
-
end
|
150
|
-
################################################################################
|
151
|
-
# return the internal buffer used by this class when reading from the IO stream.
|
152
|
-
def raw
|
153
|
-
@buffer
|
154
|
-
end
|
155
|
-
################################################################################
|
156
|
-
# The Xref table in a PDF file acts as an aid for finding the location of various
|
157
|
-
# objects in the file. This method attempts to locate the byte offset of the xref
|
158
|
-
# table in the underlying IO stream.
|
136
|
+
|
137
|
+
# return the byte offset where the first XRef table in the source can be found.
|
138
|
+
#
|
159
139
|
def find_first_xref_offset
|
160
|
-
@io.seek(-1024, IO::SEEK_END) rescue seek(0)
|
140
|
+
@io.seek(-1024, IO::SEEK_END) rescue @io.seek(0)
|
161
141
|
data = @io.read(1024)
|
162
142
|
|
163
143
|
# the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
|
@@ -179,8 +159,159 @@ class PDF::Reader
|
|
179
159
|
raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
|
180
160
|
lines[eof_index+1].to_i
|
181
161
|
end
|
182
|
-
|
162
|
+
|
163
|
+
private
|
164
|
+
|
165
|
+
# Some bastard moved our IO stream cursor. Restore it.
|
166
|
+
#
|
167
|
+
def reset_pos
|
168
|
+
@io.seek(@pos) if @io.pos != @pos
|
169
|
+
end
|
170
|
+
|
171
|
+
# save the current position of the source IO stream. If someone else (like another buffer)
|
172
|
+
# moves the cursor, we can then restore it.
|
173
|
+
#
|
174
|
+
def save_pos
|
175
|
+
@pos = @io.pos
|
176
|
+
end
|
177
|
+
|
178
|
+
# attempt to prime the buffer with the next few tokens.
|
179
|
+
#
|
180
|
+
def prepare_tokens
|
181
|
+
10.times do
|
182
|
+
if state == :literal_string
|
183
|
+
prepare_literal_token
|
184
|
+
elsif state == :regular
|
185
|
+
prepare_regular_token
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
save_pos
|
190
|
+
end
|
191
|
+
|
192
|
+
# tokenising behaves slightly differently based on the current context.
|
193
|
+
# Determine the current context/state by examining the last token we found
|
194
|
+
#
|
195
|
+
def state
|
196
|
+
if @tokens[-1] == "("
|
197
|
+
:literal_string
|
198
|
+
elsif @tokens[-1] == "stream"
|
199
|
+
:stream
|
200
|
+
else
|
201
|
+
:regular
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
205
|
+
# detect a series of 3 tokens that make up an indirect object. If we find
|
206
|
+
# them, replace the tokens with a PDF::Reader::Reference instance.
|
207
|
+
#
|
208
|
+
# Merging them into a single string was another option, but that would mean
|
209
|
+
# code further up the stack would need to check every token to see if it looks
|
210
|
+
# like an indirect object. For optimisation reasons, I'd rather avoid
|
211
|
+
# that extra check.
|
212
|
+
#
|
213
|
+
def merge_indirect_reference
|
214
|
+
return if @tokens.size < 3
|
215
|
+
return if @tokens[2] != "R"
|
216
|
+
|
217
|
+
if @tokens[0].match(/\d+/) && @tokens[1].match(/\d+/)
|
218
|
+
@tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
|
219
|
+
@tokens[1] = nil
|
220
|
+
@tokens[2] = nil
|
221
|
+
@tokens.compact!
|
222
|
+
end
|
223
|
+
end
|
224
|
+
|
225
|
+
# merge any consecutive tokens that are actually 1 token. The only current
|
226
|
+
# time this is the case is << and >>. < and > are valid tokens (they indicate
|
227
|
+
# a hex string) but so are << and >> (they indicate a dictionary).
|
228
|
+
#
|
229
|
+
def merge_tokens
|
230
|
+
@tokens.each_with_index do |tok, idx|
|
231
|
+
if tok == "<" && @tokens[idx+1] == "<"
|
232
|
+
@tokens.inspect
|
233
|
+
@tokens[idx] = "<<"
|
234
|
+
@tokens[idx+1] = nil
|
235
|
+
elsif tok == ">" && @tokens[idx+1] == ">"
|
236
|
+
@tokens[idx] = ">>"
|
237
|
+
@tokens[idx+1] = nil
|
238
|
+
end
|
239
|
+
end
|
240
|
+
@tokens.compact!
|
241
|
+
end
|
242
|
+
|
243
|
+
# if we're currently inside a literal string we more or less just read bytes until
|
244
|
+
# we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
|
245
|
+
# start of a new token in regular mode are left untouched when inside a literal
|
246
|
+
# string.
|
247
|
+
#
|
248
|
+
# The entire literal string will be returned as a single token. It will need further
|
249
|
+
# processing to fix things like escaped new lines, but that's someone else's
|
250
|
+
# problem.
|
251
|
+
#
|
252
|
+
def prepare_literal_token
|
253
|
+
str = ""
|
254
|
+
count = 1
|
255
|
+
|
256
|
+
while count > 0
|
257
|
+
chr = @io.read(1)
|
258
|
+
if chr.nil?
|
259
|
+
count = 0 # unbalanced params
|
260
|
+
elsif chr == "(" && str[-1,1] != "\x5C"
|
261
|
+
str << "("
|
262
|
+
count += 1
|
263
|
+
elsif chr == ")" && str[-1,1] != "\x5C"
|
264
|
+
count -= 1
|
265
|
+
str << ")" unless count == 0
|
266
|
+
else
|
267
|
+
str << chr unless count == 0
|
268
|
+
end
|
269
|
+
end
|
270
|
+
|
271
|
+
@tokens << str if str.size > 0
|
272
|
+
@tokens << ")"
|
273
|
+
end
|
274
|
+
|
275
|
+
# Extract the next regular token and stock it in our buffer, ready to be returned.
|
276
|
+
#
|
277
|
+
# What each byte means is complex, check out section "3.1.1 Character Set" of the 1.7 spec
|
278
|
+
# to read up on it.
|
279
|
+
#
|
280
|
+
def prepare_regular_token
|
281
|
+
tok = ""
|
282
|
+
|
283
|
+
while chr = @io.read(1)
|
284
|
+
case chr
|
285
|
+
when "\x25"
|
286
|
+
# comment, ignore everything until the next EOL char
|
287
|
+
done = false
|
288
|
+
while !done
|
289
|
+
chr = @io.read(1)
|
290
|
+
done = true if chr.nil? || chr == "\x0A" || chr == "\x0D"
|
291
|
+
end
|
292
|
+
when "\x00", "\x09", "\x0A", "\x0C", "\x0D", "\x20"
|
293
|
+
# white space, token finished
|
294
|
+
@tokens << tok if tok.size > 0
|
295
|
+
tok = ""
|
296
|
+
break
|
297
|
+
when "\x28", "\x3C", "\x5B", "\x7B", "\x2F"
|
298
|
+
# opening delimiter, start of new token
|
299
|
+
@tokens << tok if tok.size > 0
|
300
|
+
@tokens << chr
|
301
|
+
tok = ""
|
302
|
+
break
|
303
|
+
when "\x29", "\x3E", "\x5D", "\x7D"
|
304
|
+
# closing delimiter
|
305
|
+
@tokens << tok if tok.size > 0
|
306
|
+
@tokens << chr
|
307
|
+
tok = ""
|
308
|
+
break
|
309
|
+
else
|
310
|
+
tok << chr
|
311
|
+
end
|
312
|
+
end
|
313
|
+
|
314
|
+
@tokens << tok if tok.size > 0
|
315
|
+
end
|
183
316
|
end
|
184
|
-
################################################################################
|
185
317
|
end
|
186
|
-
################################################################################
|
data/lib/pdf/reader/content.rb
CHANGED
@@ -323,7 +323,7 @@ class PDF::Reader
|
|
323
323
|
# like a regular page content stream.
|
324
324
|
#
|
325
325
|
def walk_xobject_form(label)
|
326
|
-
xobjects = current_resources[:XObject] || {}
|
326
|
+
xobjects = @xref.object(current_resources[:XObject]) || {}
|
327
327
|
xobject = @xref.object(xobjects[label])
|
328
328
|
|
329
329
|
if xobject && xobject.hash[:Subtype] == :Form
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -43,10 +43,10 @@ class PDF::Reader
|
|
43
43
|
#
|
44
44
|
# operators - a hash of supported operators to read from the underlying buffer.
|
45
45
|
def parse_token (operators={})
|
46
|
-
ref = Reference.from_buffer(@buffer) and return ref
|
47
46
|
token = @buffer.token
|
48
47
|
|
49
48
|
case token
|
49
|
+
when PDF::Reader::Reference then return token
|
50
50
|
when nil then return nil
|
51
51
|
when "/" then return @buffer.token.to_sym
|
52
52
|
when "<<" then return dictionary()
|
@@ -58,7 +58,7 @@ class PDF::Reader
|
|
58
58
|
when "null" then return nil
|
59
59
|
when "obj", "endobj" then return Token.new(token)
|
60
60
|
when "stream", "endstream" then return Token.new(token)
|
61
|
-
when ">>", "]", ">"
|
61
|
+
when ">>", "]", ">", ")" then return Token.new(token)
|
62
62
|
else
|
63
63
|
if operators.has_key?(token) then return Token.new(token)
|
64
64
|
elsif token =~ /\d*\.\d/ then return token.to_f
|
@@ -66,6 +66,29 @@ class PDF::Reader
|
|
66
66
|
end
|
67
67
|
end
|
68
68
|
end
|
69
|
+
################################################################################
|
70
|
+
# Reads an entire PDF object from the buffer and returns it as a Ruby String.
|
71
|
+
# If the object is a content stream, returns both the stream and the dictionary
|
72
|
+
# that describes it
|
73
|
+
#
|
74
|
+
# id - the object ID to return
|
75
|
+
# gen - the object revision number to return
|
76
|
+
def object (id, gen)
|
77
|
+
Error.assert_equal(parse_token, id)
|
78
|
+
Error.assert_equal(parse_token, gen)
|
79
|
+
Error.str_assert(parse_token, "obj")
|
80
|
+
|
81
|
+
obj = parse_token
|
82
|
+
post_obj = parse_token
|
83
|
+
case post_obj
|
84
|
+
when "endobj" then return obj
|
85
|
+
when "stream" then return stream(obj)
|
86
|
+
else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
private
|
91
|
+
|
69
92
|
################################################################################
|
70
93
|
# reads a PDF dict from the buffer and converts it to a Ruby Hash.
|
71
94
|
def dictionary
|
@@ -114,95 +137,34 @@ class PDF::Reader
|
|
114
137
|
################################################################################
|
115
138
|
# Reads a PDF String from the buffer and converts it to a Ruby String
|
116
139
|
def string
|
117
|
-
str =
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
end
|
136
|
-
|
137
|
-
if i.nil?
|
138
|
-
str << @buffer.raw + "\n"
|
139
|
-
@buffer.raw.replace("")
|
140
|
-
# if a content stream opens a string, but never closes it, we'll
|
141
|
-
# hit the end of the stream and still be appending stuff to the
|
142
|
-
# string. bad! This check prevents a hard loop.
|
143
|
-
raise MalformedPDFError, 'unterminated string in content stream' if @buffer.eof?
|
144
|
-
next
|
145
|
-
end
|
140
|
+
str = @buffer.token
|
141
|
+
return "" if str == ")"
|
142
|
+
Error.assert_equal(parse_token, ")")
|
143
|
+
|
144
|
+
str.gsub!("\\n","\n")
|
145
|
+
str.gsub!("\\r","\r")
|
146
|
+
str.gsub!("\\t","\t")
|
147
|
+
str.gsub!("\\b","\b")
|
148
|
+
str.gsub!("\\f","\f")
|
149
|
+
str.gsub!("\\(","(")
|
150
|
+
str.gsub!("\\)",")")
|
151
|
+
str.gsub!("\\\\","\\")
|
152
|
+
str.gsub!(/\\\n/m,"")
|
153
|
+
str.gsub!(/(\n\r|\r\n|\r)/m,"\n")
|
154
|
+
|
155
|
+
str.scan(/\\\d{1,3}/).each do |octal|
|
156
|
+
str.gsub!(octal, octal[1,3].oct.chr)
|
157
|
+
end
|
146
158
|
|
147
|
-
|
148
|
-
to_remove = 1
|
149
|
-
|
150
|
-
case @buffer.raw[0, 1]
|
151
|
-
when "("
|
152
|
-
str << "("
|
153
|
-
count += 1
|
154
|
-
when ")"
|
155
|
-
count -= 1
|
156
|
-
str << ")" unless count == 0
|
157
|
-
when "\\"
|
158
|
-
to_remove = 2
|
159
|
-
case @buffer.raw[1, 1]
|
160
|
-
when "" then to_remove = 1
|
161
|
-
when "n" then str << "\n"
|
162
|
-
when "r" then str << "\r"
|
163
|
-
when "t" then str << "\t"
|
164
|
-
when "b" then str << "\b"
|
165
|
-
when "f" then str << "\f"
|
166
|
-
when "(" then str << "("
|
167
|
-
when ")" then str << ")"
|
168
|
-
when "\\" then str << "\\"
|
169
|
-
else
|
170
|
-
if m = @buffer.raw.match(/^\\(\d{1,3})/)
|
171
|
-
to_remove = m[0].size
|
172
|
-
str << m[1].oct.chr
|
173
|
-
end
|
174
|
-
end
|
175
|
-
end
|
159
|
+
str.gsub!(/\\([^\\])/,'\1')
|
176
160
|
|
177
|
-
@buffer.head(to_remove, false)
|
178
|
-
end
|
179
161
|
str
|
180
162
|
end
|
181
163
|
################################################################################
|
182
|
-
# Reads an entire PDF object from the buffer and returns it as a Ruby String.
|
183
|
-
# If the object is a content stream, returns both the stream and the dictionary
|
184
|
-
# that describes it
|
185
|
-
#
|
186
|
-
# id - the object ID to return
|
187
|
-
# gen - the object revision number to return
|
188
|
-
def object (id, gen)
|
189
|
-
Error.assert_equal(parse_token, id)
|
190
|
-
Error.assert_equal(parse_token, gen)
|
191
|
-
Error.str_assert(parse_token, "obj")
|
192
|
-
|
193
|
-
obj = parse_token
|
194
|
-
post_obj = parse_token
|
195
|
-
case post_obj
|
196
|
-
when "endobj" then return obj
|
197
|
-
when "stream" then return stream(obj)
|
198
|
-
else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
|
199
|
-
end
|
200
|
-
end
|
201
|
-
################################################################################
|
202
164
|
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
|
203
165
|
def stream (dict)
|
204
166
|
raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
|
205
|
-
data = @buffer.read(@xref.object(dict[:Length]))
|
167
|
+
data = @buffer.read(@xref.object(dict[:Length]), :skip_eol => true)
|
206
168
|
|
207
169
|
Error.str_assert(parse_token, "endstream")
|
208
170
|
Error.str_assert(parse_token, "endobj")
|
data/lib/pdf/reader/reference.rb
CHANGED
@@ -27,16 +27,6 @@ class PDF::Reader
|
|
27
27
|
################################################################################
|
28
28
|
# An internal PDF::Reader class that represents an indirect reference to a PDF Object
|
29
29
|
class Reference
|
30
|
-
################################################################################
|
31
|
-
# check if the next token in the buffer is a reference, and return a PDF::Reader::Reference
|
32
|
-
# instance. Returns nil if the next token isn't an indirect reference.
|
33
|
-
def self.from_buffer (buffer)
|
34
|
-
buffer.ready_token
|
35
|
-
return nil unless m = buffer.raw.match(/^(\d+)\s+(\d+)\s+R\b/)
|
36
|
-
buffer.head(m[0].size)
|
37
|
-
self.new(m[1].to_i, m[2].to_i)
|
38
|
-
end
|
39
|
-
################################################################################
|
40
30
|
attr_reader :id, :gen
|
41
31
|
################################################################################
|
42
32
|
# Create a new Reference to an object with the specified id and revision number
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -32,8 +32,8 @@ class PDF::Reader
|
|
32
32
|
class XRef
|
33
33
|
################################################################################
|
34
34
|
# create a new Xref table based on the contents of the supplied IO object
|
35
|
-
def initialize (
|
36
|
-
@
|
35
|
+
def initialize (io)
|
36
|
+
@io = io
|
37
37
|
@xref = {}
|
38
38
|
end
|
39
39
|
def size
|
@@ -44,8 +44,8 @@ class PDF::Reader
|
|
44
44
|
# table, but it is one of the lowest level data items in the file, so we've lumped it in
|
45
45
|
# with the cross reference code.
|
46
46
|
def pdf_version
|
47
|
-
@
|
48
|
-
m, version = *@
|
47
|
+
@io.seek(0)
|
48
|
+
m, version = *@io.read(8).match(/%PDF-(\d.\d)/)
|
49
49
|
raise MalformedPDFError, 'invalid PDF version' if version.nil?
|
50
50
|
return version.to_f
|
51
51
|
end
|
@@ -55,13 +55,14 @@ class PDF::Reader
|
|
55
55
|
#
|
56
56
|
# Will fail silently if there is no xref table at the requested offset.
|
57
57
|
def load (offset = nil)
|
58
|
-
offset ||=
|
59
|
-
|
60
|
-
|
58
|
+
offset ||= new_buffer.find_first_xref_offset
|
59
|
+
|
60
|
+
buf = new_buffer(offset)
|
61
|
+
token = buf.token
|
61
62
|
|
62
63
|
if token == "xref" || token == "ref"
|
63
|
-
load_xref_table
|
64
|
-
elsif token.to_i >= 0 &&
|
64
|
+
load_xref_table(buf)
|
65
|
+
elsif token.to_i >= 0 && buf.token.to_i >= 0 && buf.token == "obj"
|
65
66
|
raise PDF::Reader::UnsupportedFeatureError, "XRef streams are not supported in PDF::Reader yet"
|
66
67
|
else
|
67
68
|
raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
|
@@ -73,52 +74,12 @@ class PDF::Reader
|
|
73
74
|
# number
|
74
75
|
#
|
75
76
|
# If the object is a stream, that is returned as well
|
76
|
-
def object (ref
|
77
|
+
def object (ref)
|
77
78
|
return ref unless ref.kind_of?(Reference)
|
78
|
-
|
79
|
-
obj = Parser.new(
|
80
|
-
@buffer.seek(pos) if save_pos
|
79
|
+
buf = new_buffer(offset_for(ref))
|
80
|
+
obj = Parser.new(buf, self).object(ref.id, ref.gen)
|
81
81
|
return obj
|
82
82
|
end
|
83
|
-
################################################################################
|
84
|
-
# Assumes the underlying buffer is positioned at the start of an Xref table and
|
85
|
-
# processes it into memory.
|
86
|
-
def load_xref_table
|
87
|
-
tok_one = tok_two = nil
|
88
|
-
|
89
|
-
begin
|
90
|
-
# loop over all subsections of the xref table
|
91
|
-
# In a well formed PDF, the 'trailer' token will indicate
|
92
|
-
# the end of the table. However we need to be careful in case
|
93
|
-
# we're processing a malformed pdf that is missing the trailer.
|
94
|
-
loop do
|
95
|
-
tok_one, tok_two = @buffer.token, @buffer.token
|
96
|
-
if tok_one != "trailer" && !tok_one.match(/\d+/)
|
97
|
-
raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
|
98
|
-
end
|
99
|
-
break if tok_one == "trailer" or tok_one.nil?
|
100
|
-
objid, count = tok_one.to_i, tok_two.to_i
|
101
|
-
|
102
|
-
count.times do
|
103
|
-
offset = @buffer.token.to_i
|
104
|
-
generation = @buffer.token.to_i
|
105
|
-
state = @buffer.token
|
106
|
-
|
107
|
-
store(objid, generation, offset) if state == "n"
|
108
|
-
objid += 1
|
109
|
-
end
|
110
|
-
end
|
111
|
-
rescue EOFError => e
|
112
|
-
raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
|
113
|
-
end
|
114
|
-
|
115
|
-
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless tok_two == "<<"
|
116
|
-
|
117
|
-
trailer = Parser.new(@buffer, self).dictionary
|
118
|
-
load(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
|
119
|
-
|
120
|
-
trailer
|
121
|
-
end
|
122
83
|
# returns the type of object a ref points to
|
123
84
|
def obj_type(ref)
|
124
85
|
obj = object(ref)
|
@@ -154,6 +115,41 @@ class PDF::Reader
|
|
154
115
|
(@xref[id] ||= {})[gen] ||= offset
|
155
116
|
end
|
156
117
|
################################################################################
|
118
|
+
private
|
119
|
+
################################################################################
|
120
|
+
# Assumes the underlying buffer is positioned at the start of an Xref table and
|
121
|
+
# processes it into memory.
|
122
|
+
def load_xref_table(buf)
|
123
|
+
params = []
|
124
|
+
|
125
|
+
while !params.include?("trailer") && !params.include?(nil)
|
126
|
+
if params.size == 2
|
127
|
+
objid, count = params[0].to_i, params[1].to_i
|
128
|
+
count.times do
|
129
|
+
offset = buf.token.to_i
|
130
|
+
generation = buf.token.to_i
|
131
|
+
state = buf.token
|
132
|
+
|
133
|
+
store(objid, generation, offset) if state == "n"
|
134
|
+
objid += 1
|
135
|
+
params.clear
|
136
|
+
end
|
137
|
+
end
|
138
|
+
params << buf.token
|
139
|
+
end
|
140
|
+
|
141
|
+
trailer = Parser.new(buf, self).parse_token
|
142
|
+
|
143
|
+
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)
|
144
|
+
|
145
|
+
load(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
|
146
|
+
|
147
|
+
trailer
|
148
|
+
end
|
149
|
+
|
150
|
+
def new_buffer(offset = 0)
|
151
|
+
PDF::Reader::Buffer.new(@io, :seek => offset)
|
152
|
+
end
|
157
153
|
end
|
158
154
|
################################################################################
|
159
155
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Jones
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-01-01 00:00:00 +11:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|