pdf-reader 0.8.1 → 0.8.2

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,9 @@
1
+ v0.8.2 (1st January 2010)
2
+ - Fix parsing of files that use Form XObjects behind an indirect reference
3
+ (thanks Cornelius Illi and Patrick Crosby)
4
+ - Rewrote Buffer class to fix various speed issues reported over the years
5
+ - On my sample file extracting full text reduced from 220 seconds to 9 seconds.
6
+
1
7
  v0.8.1 (27th November 2009)
2
8
  - Added PDF::Hash#version. Provides access to the source file PDF version
3
9
 
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
6
6
  require "rake/gempackagetask"
7
7
  require 'spec/rake/spectask'
8
8
 
9
- PKG_VERSION = "0.8.1"
9
+ PKG_VERSION = "0.8.2"
10
10
  PKG_NAME = "pdf-reader"
11
11
  PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
12
12
 
@@ -46,8 +46,7 @@ module PDF
46
46
  raise ArgumentError, "input must be an IO-like object or a filename"
47
47
  end
48
48
  @version = read_version(io)
49
- buffer = PDF::Reader::Buffer.new(io)
50
- @xref = PDF::Reader::XRef.new(buffer)
49
+ @xref = PDF::Reader::XRef.new(io)
51
50
  @trailer = @xref.load
52
51
  end
53
52
 
@@ -122,9 +122,7 @@ class PDF::Reader
122
122
  ################################################################################
123
123
  # Given an IO object that contains PDF data, parse it.
124
124
  def parse (io, receiver, opts = {})
125
- @buffer = Buffer.new(io)
126
- @xref = XRef.new(@buffer)
127
- @parser = Parser.new(@buffer, @xref)
125
+ @xref = XRef.new(io)
128
126
  @content = (receiver == Explore ? Explore : Content).new(receiver, @xref)
129
127
 
130
128
  options = {:pages => true, :metadata => true}
@@ -1,6 +1,8 @@
1
+ # coding: utf-8
2
+
1
3
  ################################################################################
2
4
  #
3
- # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
5
+ # Copyright (C) 2010 James Healy (jimmy@deefa.com)
4
6
  #
5
7
  # Permission is hereby granted, free of charge, to any person obtaining
6
8
  # a copy of this software and associated documentation files (the
@@ -24,140 +26,118 @@
24
26
  ################################################################################
25
27
 
26
28
  class PDF::Reader
27
- ################################################################################
28
- # An internal PDF::Reader class that mediates access to the underlying PDF File or IO Stream
29
+
30
+ # A string tokeniser that recognises PDF grammar. When passed an IO stream or a
31
+ # string, repeated calls to token() will return the next token from the source.
32
+ #
33
+ # This is very low level, and getting the raw tokens is not very useful in itself.
34
+ #
35
+ # This will usually be used in conjunction with PDF::Reader::Parser, which converts
36
+ # the raw tokens into objects we can work with (strings, ints, arrays, etc)
37
+ #
29
38
  class Buffer
30
- ################################################################################
31
- # Creates a new buffer around the specified IO object
32
- def initialize (io)
39
+
40
+ attr_reader :pos
41
+
42
+ # Creates a new buffer.
43
+ #
44
+ # Params:
45
+ #
46
+ # io - an IO stream or string with the raw data to tokenise
47
+ #
48
+ # options:
49
+ #
50
+ # :seek - a byte offset to seek to before starting to tokenise
51
+ #
52
+ def initialize (io, opts = {})
33
53
  @io = io
34
- @buffer = nil
54
+ @tokens = []
55
+ @options = opts
56
+
57
+ @io.seek(opts[:seek]) if opts[:seek]
58
+ @pos = @io.pos
35
59
  end
36
- ################################################################################
37
- # Seek to the requested byte in the IO stream.
38
- def seek (offset)
39
- @io.seek(offset, IO::SEEK_SET)
40
- @buffer = nil
41
- self
60
+
61
+ # return true if there are no more tokens left
62
+ #
63
+ def empty?
64
+ prepare_tokens if @tokens.size < 3
65
+
66
+ @tokens.empty?
42
67
  end
43
- ################################################################################
44
- # reads the requested number of bytes from the underlying IO stream.
68
+
69
+ # return raw bytes from the underlying IO stream.
45
70
  #
46
- # length should be a positive integer.
47
- def read (length)
48
- out = ""
71
+ # bytes - the number of bytes to read
72
+ #
73
+ # options:
74
+ #
75
+ # :skip_eol - if true, the IO stream is advanced past any LF or CR
76
+ # bytes before it reads any data. This is to handle
77
+ # content streams, which have a CRLF or LF after the stream
78
+ # token.
79
+ #
80
+ def read(bytes, opts = {})
81
+ reset_pos
49
82
 
50
- if @buffer and !@buffer.empty?
51
- out << head(length)
52
- length -= out.length
83
+ if opts[:skip_eol]
84
+ done = false
85
+ while !done
86
+ chr = @io.read(1)
87
+ if chr.nil?
88
+ return nil
89
+ elsif chr != "\n" && chr != "\r"
90
+ @io.seek(-1, IO::SEEK_CUR)
91
+ done = true
92
+ end
93
+ end
53
94
  end
54
95
 
55
- out << @io.read(length) if length > 0
56
- out
96
+ bytes = @io.read(bytes)
97
+ save_pos
98
+ bytes
57
99
  end
58
- ################################################################################
59
- # Reads from the buffer until the specified token is found, or the end of the buffer
100
+
101
+ # return raw bytes from the underlying IO stream. All bytes up to the first
102
+ # occurrence of needle will be returned. The match (if any) is not returned.
103
+ # The IO stream cursor is left on the first byte of the match.
104
+ #
105
+ # needle - a string to search the IO stream for
60
106
  #
61
- # bytes - the bytes to search for.
62
- def read_until(bytes)
107
+ def read_until(needle)
108
+ reset_pos
63
109
  out = ""
64
- size = bytes.size
110
+ size = needle.size
65
111
 
66
- if @buffer && !@buffer.empty?
67
- if @buffer.include?(bytes)
68
- offset = @buffer.index(bytes) + size
69
- return head(offset)
70
- else
71
- out << head(@buffer.size)
72
- end
112
+ while out[size * -1, size] != needle && !@io.eof?
113
+ out << @io.read(1)
73
114
  end
74
115
 
75
- loop do
76
- out << @io.read(1)
77
- if out[-1 * size,size].eql?(bytes)
78
- out = out[0, out.size - size]
79
- seek(pos - size)
80
- break
81
- end
116
+ if out[size * -1, size] == needle
117
+ out = out[0, out.size - size]
118
+ @io.seek(size * -1, IO::SEEK_CUR)
82
119
  end
120
+
121
+ save_pos
83
122
  out
84
123
  end
85
- ################################################################################
86
- # returns true if the underlying IO object is at end and the internal buffer
87
- # is empty
88
- def eof?
89
- ready_token
90
- if @buffer
91
- @buffer.empty? && @io.eof?
92
- else
93
- @io.eof?
94
- end
95
- end
96
- ################################################################################
97
- def pos
98
- @io.pos
99
- end
100
- ################################################################################
101
- def pos_without_buf
102
- @io.pos - @buffer.to_s.size
103
- end
104
- ################################################################################
105
- # PDF files are processed by tokenising the content into a series of objects and commands.
106
- # This prepares the buffer for use by reading the next line of tokens into memory.
107
- def ready_token (with_strip=true, skip_blanks=true)
108
- while (@buffer.nil? or @buffer.empty?) && !@io.eof?
109
- @buffer = @io.readline
110
- @buffer.force_encoding("BINARY") if @buffer.respond_to?(:force_encoding)
111
- #@buffer.sub!(/%.*$/, '') if strip_comments
112
- @buffer.chomp!
113
- break unless skip_blanks
114
- end
115
- @buffer.lstrip! if with_strip
116
- end
117
- ################################################################################
118
- # return the next token from the underlying IO stream
119
- def token
120
- ready_token
121
-
122
- i = @buffer.index(/[\[\]()<>{}\s\/]/) || @buffer.size
123
124
 
124
- token_chars =
125
- if i == 0 and @buffer[i,2] == "<<" then 2
126
- elsif i == 0 and @buffer[i,2] == ">>" then 2
127
- elsif i == 0 then 1
128
- else i
129
- end
130
-
131
- strip_space = !(i == 0 and @buffer[0,1] == '(')
132
- tok = head(token_chars, strip_space)
125
+ # return the next token from the source. Returns a string if a token
126
+ # is found, nil if there are no tokens left.
127
+ #
128
+ def token
129
+ reset_pos
130
+ prepare_tokens if @tokens.size < 3
131
+ merge_indirect_reference
132
+ merge_tokens
133
133
 
134
- if tok == ""
135
- nil
136
- elsif tok[0,1] == "%"
137
- @buffer = ""
138
- token
139
- else
140
- tok
141
- end
134
+ @tokens.shift
142
135
  end
143
- ################################################################################
144
- def head (chars, with_strip=true)
145
- val = @buffer[0, chars]
146
- @buffer = @buffer[chars .. -1] || ""
147
- @buffer.lstrip! if with_strip
148
- val
149
- end
150
- ################################################################################
151
- # return the internal buffer used by this class when reading from the IO stream.
152
- def raw
153
- @buffer
154
- end
155
- ################################################################################
156
- # The Xref table in a PDF file acts as an aid for finding the location of various
157
- # objects in the file. This method attempts to locate the byte offset of the xref
158
- # table in the underlying IO stream.
136
+
137
+ # return the byte offset where the first XRef table in the source can be found.
138
+ #
159
139
  def find_first_xref_offset
160
- @io.seek(-1024, IO::SEEK_END) rescue seek(0)
140
+ @io.seek(-1024, IO::SEEK_END) rescue @io.seek(0)
161
141
  data = @io.read(1024)
162
142
 
163
143
  # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
@@ -179,8 +159,159 @@ class PDF::Reader
179
159
  raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
180
160
  lines[eof_index+1].to_i
181
161
  end
182
- ################################################################################
162
+
163
+ private
164
+
165
+ # Some bastard moved our IO stream cursor. Restore it.
166
+ #
167
+ def reset_pos
168
+ @io.seek(@pos) if @io.pos != @pos
169
+ end
170
+
171
+ # save the current position of the source IO stream. If someone else (like another buffer)
172
+ # moves the cursor, we can then restore it.
173
+ #
174
+ def save_pos
175
+ @pos = @io.pos
176
+ end
177
+
178
+ # attempt to prime the buffer with the next few tokens.
179
+ #
180
+ def prepare_tokens
181
+ 10.times do
182
+ if state == :literal_string
183
+ prepare_literal_token
184
+ elsif state == :regular
185
+ prepare_regular_token
186
+ end
187
+ end
188
+
189
+ save_pos
190
+ end
191
+
192
+ # tokenising behaves slightly differently based on the current context.
193
+ # Determine the current context/state by examining the last token we found
194
+ #
195
+ def state
196
+ if @tokens[-1] == "("
197
+ :literal_string
198
+ elsif @tokens[-1] == "stream"
199
+ :stream
200
+ else
201
+ :regular
202
+ end
203
+ end
204
+
205
+ # detect a series of 3 tokens that make up an indirect object. If we find
206
+ # them, replace the tokens with a PDF::Reader::Reference instance.
207
+ #
208
+ # Merging them into a single string was another option, but that would mean
209
+ # code further up the stack would need to check every token to see if it looks
210
+ # like an indirect object. For optimisation reasons, I'd rather avoid
211
+ # that extra check.
212
+ #
213
+ def merge_indirect_reference
214
+ return if @tokens.size < 3
215
+ return if @tokens[2] != "R"
216
+
217
+ if @tokens[0].match(/\d+/) && @tokens[1].match(/\d+/)
218
+ @tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
219
+ @tokens[1] = nil
220
+ @tokens[2] = nil
221
+ @tokens.compact!
222
+ end
223
+ end
224
+
225
+ # merge any consecutive tokens that are actually 1 token. The only current
226
+ # time this is the case is << and >>. < and > are valid tokens (they indicate
227
+ # a hex string) but so are << and >> (they indicate a dictionary).
228
+ #
229
+ def merge_tokens
230
+ @tokens.each_with_index do |tok, idx|
231
+ if tok == "<" && @tokens[idx+1] == "<"
232
+ @tokens.inspect
233
+ @tokens[idx] = "<<"
234
+ @tokens[idx+1] = nil
235
+ elsif tok == ">" && @tokens[idx+1] == ">"
236
+ @tokens[idx] = ">>"
237
+ @tokens[idx+1] = nil
238
+ end
239
+ end
240
+ @tokens.compact!
241
+ end
242
+
243
+ # if we're currently inside a literal string we more or less just read bytes until
244
+ # we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
245
+ # start of a new token in regular mode are left untouched when inside a literal
246
+ # string.
247
+ #
248
+ # The entire literal string will be returned as a single token. It will need further
249
+ # processing to fix things like escaped new lines, but that's someone else's
250
+ # problem.
251
+ #
252
+ def prepare_literal_token
253
+ str = ""
254
+ count = 1
255
+
256
+ while count > 0
257
+ chr = @io.read(1)
258
+ if chr.nil?
259
+ count = 0 # unbalanced params
260
+ elsif chr == "(" && str[-1,1] != "\x5C"
261
+ str << "("
262
+ count += 1
263
+ elsif chr == ")" && str[-1,1] != "\x5C"
264
+ count -= 1
265
+ str << ")" unless count == 0
266
+ else
267
+ str << chr unless count == 0
268
+ end
269
+ end
270
+
271
+ @tokens << str if str.size > 0
272
+ @tokens << ")"
273
+ end
274
+
275
+ # Extract the next regular token and stock it in our buffer, ready to be returned.
276
+ #
277
+ # What each byte means is complex, check out section "3.1.1 Character Set" of the 1.7 spec
278
+ # to read up on it.
279
+ #
280
+ def prepare_regular_token
281
+ tok = ""
282
+
283
+ while chr = @io.read(1)
284
+ case chr
285
+ when "\x25"
286
+ # comment, ignore everything until the next EOL char
287
+ done = false
288
+ while !done
289
+ chr = @io.read(1)
290
+ done = true if chr.nil? || chr == "\x0A" || chr == "\x0D"
291
+ end
292
+ when "\x00", "\x09", "\x0A", "\x0C", "\x0D", "\x20"
293
+ # white space, token finished
294
+ @tokens << tok if tok.size > 0
295
+ tok = ""
296
+ break
297
+ when "\x28", "\x3C", "\x5B", "\x7B", "\x2F"
298
+ # opening delimiter, start of new token
299
+ @tokens << tok if tok.size > 0
300
+ @tokens << chr
301
+ tok = ""
302
+ break
303
+ when "\x29", "\x3E", "\x5D", "\x7D"
304
+ # closing delimiter
305
+ @tokens << tok if tok.size > 0
306
+ @tokens << chr
307
+ tok = ""
308
+ break
309
+ else
310
+ tok << chr
311
+ end
312
+ end
313
+
314
+ @tokens << tok if tok.size > 0
315
+ end
183
316
  end
184
- ################################################################################
185
317
  end
186
- ################################################################################
@@ -323,7 +323,7 @@ class PDF::Reader
323
323
  # like a regular page content stream.
324
324
  #
325
325
  def walk_xobject_form(label)
326
- xobjects = current_resources[:XObject] || {}
326
+ xobjects = @xref.object(current_resources[:XObject]) || {}
327
327
  xobject = @xref.object(xobjects[label])
328
328
 
329
329
  if xobject && xobject.hash[:Subtype] == :Form
@@ -43,10 +43,10 @@ class PDF::Reader
43
43
  #
44
44
  # operators - a hash of supported operators to read from the underlying buffer.
45
45
  def parse_token (operators={})
46
- ref = Reference.from_buffer(@buffer) and return ref
47
46
  token = @buffer.token
48
47
 
49
48
  case token
49
+ when PDF::Reader::Reference then return token
50
50
  when nil then return nil
51
51
  when "/" then return @buffer.token.to_sym
52
52
  when "<<" then return dictionary()
@@ -58,7 +58,7 @@ class PDF::Reader
58
58
  when "null" then return nil
59
59
  when "obj", "endobj" then return Token.new(token)
60
60
  when "stream", "endstream" then return Token.new(token)
61
- when ">>", "]", ">" then return Token.new(token)
61
+ when ">>", "]", ">", ")" then return Token.new(token)
62
62
  else
63
63
  if operators.has_key?(token) then return Token.new(token)
64
64
  elsif token =~ /\d*\.\d/ then return token.to_f
@@ -66,6 +66,29 @@ class PDF::Reader
66
66
  end
67
67
  end
68
68
  end
69
+ ################################################################################
70
+ # Reads an entire PDF object from the buffer and returns it as a Ruby String.
71
+ # If the object is a content stream, returns both the stream and the dictionary
72
+ # that describes it
73
+ #
74
+ # id - the object ID to return
75
+ # gen - the object revision number to return
76
+ def object (id, gen)
77
+ Error.assert_equal(parse_token, id)
78
+ Error.assert_equal(parse_token, gen)
79
+ Error.str_assert(parse_token, "obj")
80
+
81
+ obj = parse_token
82
+ post_obj = parse_token
83
+ case post_obj
84
+ when "endobj" then return obj
85
+ when "stream" then return stream(obj)
86
+ else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
87
+ end
88
+ end
89
+
90
+ private
91
+
69
92
  ################################################################################
70
93
  # reads a PDF dict from the buffer and converts it to a Ruby Hash.
71
94
  def dictionary
@@ -114,95 +137,34 @@ class PDF::Reader
114
137
  ################################################################################
115
138
  # Reads a PDF String from the buffer and converts it to a Ruby String
116
139
  def string
117
- str = ""
118
- count = 1
119
-
120
- while count != 0
121
- @buffer.ready_token(false, false)
122
-
123
- # find the first occurance of ( ) [ \ or ]
124
- #
125
- # I originally just used the regexp form of index(), but it seems to be
126
- # buggy on some OSX systems (returns nil when there is a match). This
127
- # version is more reliable and was suggested by Andrès Koetsier.
128
- #
129
- i = nil
130
- @buffer.raw.unpack("C*").each_with_index do |charint, idx|
131
- if [40, 41, 92].include?(charint)
132
- i = idx
133
- break
134
- end
135
- end
136
-
137
- if i.nil?
138
- str << @buffer.raw + "\n"
139
- @buffer.raw.replace("")
140
- # if a content stream opens a string, but never closes it, we'll
141
- # hit the end of the stream and still be appending stuff to the
142
- # string. bad! This check prevents a hard loop.
143
- raise MalformedPDFError, 'unterminated string in content stream' if @buffer.eof?
144
- next
145
- end
140
+ str = @buffer.token
141
+ return "" if str == ")"
142
+ Error.assert_equal(parse_token, ")")
143
+
144
+ str.gsub!("\\n","\n")
145
+ str.gsub!("\\r","\r")
146
+ str.gsub!("\\t","\t")
147
+ str.gsub!("\\b","\b")
148
+ str.gsub!("\\f","\f")
149
+ str.gsub!("\\(","(")
150
+ str.gsub!("\\)",")")
151
+ str.gsub!("\\\\","\\")
152
+ str.gsub!(/\\\n/m,"")
153
+ str.gsub!(/(\n\r|\r\n|\r)/m,"\n")
154
+
155
+ str.scan(/\\\d{1,3}/).each do |octal|
156
+ str.gsub!(octal, octal[1,3].oct.chr)
157
+ end
146
158
 
147
- str << @buffer.head(i, false)
148
- to_remove = 1
149
-
150
- case @buffer.raw[0, 1]
151
- when "("
152
- str << "("
153
- count += 1
154
- when ")"
155
- count -= 1
156
- str << ")" unless count == 0
157
- when "\\"
158
- to_remove = 2
159
- case @buffer.raw[1, 1]
160
- when "" then to_remove = 1
161
- when "n" then str << "\n"
162
- when "r" then str << "\r"
163
- when "t" then str << "\t"
164
- when "b" then str << "\b"
165
- when "f" then str << "\f"
166
- when "(" then str << "("
167
- when ")" then str << ")"
168
- when "\\" then str << "\\"
169
- else
170
- if m = @buffer.raw.match(/^\\(\d{1,3})/)
171
- to_remove = m[0].size
172
- str << m[1].oct.chr
173
- end
174
- end
175
- end
159
+ str.gsub!(/\\([^\\])/,'\1')
176
160
 
177
- @buffer.head(to_remove, false)
178
- end
179
161
  str
180
162
  end
181
163
  ################################################################################
182
- # Reads an entire PDF object from the buffer and returns it as a Ruby String.
183
- # If the object is a content stream, returns both the stream and the dictionary
184
- # that describes it
185
- #
186
- # id - the object ID to return
187
- # gen - the object revision number to return
188
- def object (id, gen)
189
- Error.assert_equal(parse_token, id)
190
- Error.assert_equal(parse_token, gen)
191
- Error.str_assert(parse_token, "obj")
192
-
193
- obj = parse_token
194
- post_obj = parse_token
195
- case post_obj
196
- when "endobj" then return obj
197
- when "stream" then return stream(obj)
198
- else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
199
- end
200
- end
201
- ################################################################################
202
164
  # Decodes the contents of a PDF Stream and returns it as a Ruby String.
203
165
  def stream (dict)
204
166
  raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
205
- data = @buffer.read(@xref.object(dict[:Length]))
167
+ data = @buffer.read(@xref.object(dict[:Length]), :skip_eol => true)
206
168
 
207
169
  Error.str_assert(parse_token, "endstream")
208
170
  Error.str_assert(parse_token, "endobj")
@@ -27,16 +27,6 @@ class PDF::Reader
27
27
  ################################################################################
28
28
  # An internal PDF::Reader class that represents an indirect reference to a PDF Object
29
29
  class Reference
30
- ################################################################################
31
- # check if the next token in the buffer is a reference, and return a PDF::Reader::Reference
32
- # instance. Returns nil if the next token isn't an indirect reference.
33
- def self.from_buffer (buffer)
34
- buffer.ready_token
35
- return nil unless m = buffer.raw.match(/^(\d+)\s+(\d+)\s+R\b/)
36
- buffer.head(m[0].size)
37
- self.new(m[1].to_i, m[2].to_i)
38
- end
39
- ################################################################################
40
30
  attr_reader :id, :gen
41
31
  ################################################################################
42
32
  # Create a new Reference to an object with the specified id and revision number
@@ -32,8 +32,8 @@ class PDF::Reader
32
32
  class XRef
33
33
  ################################################################################
34
34
  # create a new Xref table based on the contents of the supplied PDF::Reader::Buffer object
35
- def initialize (buffer)
36
- @buffer = buffer
35
+ def initialize (io)
36
+ @io = io
37
37
  @xref = {}
38
38
  end
39
39
  def size
@@ -44,8 +44,8 @@ class PDF::Reader
44
44
  # table, but it is one of the lowest level data items in the file, so we've lumped it in
45
45
  # with the cross reference code.
46
46
  def pdf_version
47
- @buffer.seek(0)
48
- m, version = *@buffer.read(8).match(/%PDF-(\d.\d)/)
47
+ @io.seek(0)
48
+ m, version = *@io.read(8).match(/%PDF-(\d.\d)/)
49
49
  raise MalformedPDFError, 'invalid PDF version' if version.nil?
50
50
  return version.to_f
51
51
  end
@@ -55,13 +55,14 @@ class PDF::Reader
55
55
  #
56
56
  # Will fail silently if there is no xref table at the requested offset.
57
57
  def load (offset = nil)
58
- offset ||= @buffer.find_first_xref_offset
59
- @buffer.seek(offset)
60
- token = @buffer.token
58
+ offset ||= new_buffer.find_first_xref_offset
59
+
60
+ buf = new_buffer(offset)
61
+ token = buf.token
61
62
 
62
63
  if token == "xref" || token == "ref"
63
- load_xref_table
64
- elsif token.to_i >= 0 && @buffer.token.to_i >= 0 && @buffer.token == "obj"
64
+ load_xref_table(buf)
65
+ elsif token.to_i >= 0 && buf.token.to_i >= 0 && buf.token == "obj"
65
66
  raise PDF::Reader::UnsupportedFeatureError, "XRef streams are not supported in PDF::Reader yet"
66
67
  else
67
68
  raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
@@ -73,52 +74,12 @@ class PDF::Reader
73
74
  # number
74
75
  #
75
76
  # If the object is a stream, that is returned as well
76
- def object (ref, save_pos = true)
77
+ def object (ref)
77
78
  return ref unless ref.kind_of?(Reference)
78
- pos = @buffer.pos_without_buf if save_pos
79
- obj = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
80
- @buffer.seek(pos) if save_pos
79
+ buf = new_buffer(offset_for(ref))
80
+ obj = Parser.new(buf, self).object(ref.id, ref.gen)
81
81
  return obj
82
82
  end
83
- ################################################################################
84
- # Assumes the underlying buffer is positioned at the start of an Xref table and
85
- # processes it into memory.
86
- def load_xref_table
87
- tok_one = tok_two = nil
88
-
89
- begin
90
- # loop over all subsections of the xref table
91
- # In a well formed PDF, the 'trailer' token will indicate
92
- # the end of the table. However we need to be careful in case
93
- # we're processing a malformed pdf that is missing the trailer.
94
- loop do
95
- tok_one, tok_two = @buffer.token, @buffer.token
96
- if tok_one != "trailer" && !tok_one.match(/\d+/)
97
- raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
98
- end
99
- break if tok_one == "trailer" or tok_one.nil?
100
- objid, count = tok_one.to_i, tok_two.to_i
101
-
102
- count.times do
103
- offset = @buffer.token.to_i
104
- generation = @buffer.token.to_i
105
- state = @buffer.token
106
-
107
- store(objid, generation, offset) if state == "n"
108
- objid += 1
109
- end
110
- end
111
- rescue EOFError => e
112
- raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
113
- end
114
-
115
- raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless tok_two == "<<"
116
-
117
- trailer = Parser.new(@buffer, self).dictionary
118
- load(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
119
-
120
- trailer
121
- end
122
83
  # returns the type of object a ref points to
123
84
  def obj_type(ref)
124
85
  obj = object(ref)
@@ -154,6 +115,41 @@ class PDF::Reader
154
115
  (@xref[id] ||= {})[gen] ||= offset
155
116
  end
156
117
  ################################################################################
118
+ private
119
+ ################################################################################
120
+ # Assumes the underlying buffer is positioned at the start of an Xref table and
121
+ # processes it into memory.
122
+ def load_xref_table(buf)
123
+ params = []
124
+
125
+ while !params.include?("trailer") && !params.include?(nil)
126
+ if params.size == 2
127
+ objid, count = params[0].to_i, params[1].to_i
128
+ count.times do
129
+ offset = buf.token.to_i
130
+ generation = buf.token.to_i
131
+ state = buf.token
132
+
133
+ store(objid, generation, offset) if state == "n"
134
+ objid += 1
135
+ params.clear
136
+ end
137
+ end
138
+ params << buf.token
139
+ end
140
+
141
+ trailer = Parser.new(buf, self).parse_token
142
+
143
+ raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)
144
+
145
+ load(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
146
+
147
+ trailer
148
+ end
149
+
150
+ def new_buffer(offset = 0)
151
+ PDF::Reader::Buffer.new(@io, :seek => offset)
152
+ end
157
153
  end
158
154
  ################################################################################
159
155
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.1
4
+ version: 0.8.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Jones
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-11-27 00:00:00 +11:00
12
+ date: 2010-01-01 00:00:00 +11:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency