pdf-reader 0.8.1 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,9 @@
1
+ v0.8.2 (1st January 2010)
2
+ - Fix parsing of files that use Form XObjects behind an indirect reference
3
+ (thanks Cornelius Illi and Patrick Crosby)
4
+ - Rewrote Buffer class to fix various speed issues reported over the years
5
+ - On my sample file extracting full text reduced from 220 seconds to 9 seconds.
6
+
1
7
  v0.8.1 (27th November 2009)
2
8
  - Added PDF::Hash#version. Provides access to the source file PDF version
3
9
 
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
6
6
  require "rake/gempackagetask"
7
7
  require 'spec/rake/spectask'
8
8
 
9
- PKG_VERSION = "0.8.1"
9
+ PKG_VERSION = "0.8.2"
10
10
  PKG_NAME = "pdf-reader"
11
11
  PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
12
12
 
@@ -46,8 +46,7 @@ module PDF
46
46
  raise ArgumentError, "input must be an IO-like object or a filename"
47
47
  end
48
48
  @version = read_version(io)
49
- buffer = PDF::Reader::Buffer.new(io)
50
- @xref = PDF::Reader::XRef.new(buffer)
49
+ @xref = PDF::Reader::XRef.new(io)
51
50
  @trailer = @xref.load
52
51
  end
53
52
 
@@ -122,9 +122,7 @@ class PDF::Reader
122
122
  ################################################################################
123
123
  # Given an IO object that contains PDF data, parse it.
124
124
  def parse (io, receiver, opts = {})
125
- @buffer = Buffer.new(io)
126
- @xref = XRef.new(@buffer)
127
- @parser = Parser.new(@buffer, @xref)
125
+ @xref = XRef.new(io)
128
126
  @content = (receiver == Explore ? Explore : Content).new(receiver, @xref)
129
127
 
130
128
  options = {:pages => true, :metadata => true}
@@ -1,6 +1,8 @@
1
+ # coding: utf-8
2
+
1
3
  ################################################################################
2
4
  #
3
- # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
5
+ # Copyright (C) 2010 James Healy (jimmy@deefa.com)
4
6
  #
5
7
  # Permission is hereby granted, free of charge, to any person obtaining
6
8
  # a copy of this software and associated documentation files (the
@@ -24,140 +26,118 @@
24
26
  ################################################################################
25
27
 
26
28
  class PDF::Reader
27
- ################################################################################
28
- # An internal PDF::Reader class that mediates access to the underlying PDF File or IO Stream
29
+
30
+ # A string tokeniser that recognises PDF grammar. When passed an IO stream or a
31
+ # string, repeated calls to token() will return the next token from the source.
32
+ #
33
+ # This is very low level, and getting the raw tokens is not very useful in itself.
34
+ #
35
+ # This will usually be used in conjunction with PDF::Reader::Parser, which converts
36
+ # the raw tokens into objects we can work with (strings, ints, arrays, etc)
37
+ #
29
38
  class Buffer
30
- ################################################################################
31
- # Creates a new buffer around the specified IO object
32
- def initialize (io)
39
+
40
+ attr_reader :pos
41
+
42
+ # Creates a new buffer.
43
+ #
44
+ # Params:
45
+ #
46
+ # io - an IO stream or string with the raw data to tokenise
47
+ #
48
+ # options:
49
+ #
50
+ # :seek - a byte offset to seek to before starting to tokenise
51
+ #
52
+ def initialize (io, opts = {})
33
53
  @io = io
34
- @buffer = nil
54
+ @tokens = []
55
+ @options = opts
56
+
57
+ @io.seek(opts[:seek]) if opts[:seek]
58
+ @pos = @io.pos
35
59
  end
36
- ################################################################################
37
- # Seek to the requested byte in the IO stream.
38
- def seek (offset)
39
- @io.seek(offset, IO::SEEK_SET)
40
- @buffer = nil
41
- self
60
+
61
+ # return true if there are no more tokens left
62
+ #
63
+ def empty?
64
+ prepare_tokens if @tokens.size < 3
65
+
66
+ @tokens.empty?
42
67
  end
43
- ################################################################################
44
- # reads the requested number of bytes from the underlying IO stream.
68
+
69
+ # return raw bytes from the underlying IO stream.
45
70
  #
46
- # length should be a positive integer.
47
- def read (length)
48
- out = ""
71
+ # bytes - the number of bytes to read
72
+ #
73
+ # options:
74
+ #
75
+ # :skip_eol - if true, the IO stream is advanced past any LF or CR
76
+ # bytes before it reads any data. This is to handle
77
+ # content streams, which have a CRLF or LF after the stream
78
+ # token.
79
+ #
80
+ def read(bytes, opts = {})
81
+ reset_pos
49
82
 
50
- if @buffer and !@buffer.empty?
51
- out << head(length)
52
- length -= out.length
83
+ if opts[:skip_eol]
84
+ done = false
85
+ while !done
86
+ chr = @io.read(1)
87
+ if chr.nil?
88
+ return nil
89
+ elsif chr != "\n" && chr != "\r"
90
+ @io.seek(-1, IO::SEEK_CUR)
91
+ done = true
92
+ end
93
+ end
53
94
  end
54
95
 
55
- out << @io.read(length) if length > 0
56
- out
96
+ bytes = @io.read(bytes)
97
+ save_pos
98
+ bytes
57
99
  end
58
- ################################################################################
59
- # Reads from the buffer until the specified token is found, or the end of the buffer
100
+
101
+ # return raw bytes from the underlying IO stream. All bytes up to the first
102
+ # occurrence of needle will be returned. The match (if any) is not returned.
103
+ # The IO stream cursor is left on the first byte of the match.
104
+ #
105
+ # needle - a string to search the IO stream for
60
106
  #
61
- # bytes - the bytes to search for.
62
- def read_until(bytes)
107
+ def read_until(needle)
108
+ reset_pos
63
109
  out = ""
64
- size = bytes.size
110
+ size = needle.size
65
111
 
66
- if @buffer && !@buffer.empty?
67
- if @buffer.include?(bytes)
68
- offset = @buffer.index(bytes) + size
69
- return head(offset)
70
- else
71
- out << head(@buffer.size)
72
- end
112
+ while out[size * -1, size] != needle && !@io.eof?
113
+ out << @io.read(1)
73
114
  end
74
115
 
75
- loop do
76
- out << @io.read(1)
77
- if out[-1 * size,size].eql?(bytes)
78
- out = out[0, out.size - size]
79
- seek(pos - size)
80
- break
81
- end
116
+ if out[size * -1, size] == needle
117
+ out = out[0, out.size - size]
118
+ @io.seek(size * -1, IO::SEEK_CUR)
82
119
  end
120
+
121
+ save_pos
83
122
  out
84
123
  end
85
- ################################################################################
86
- # returns true if the underlying IO object is at end and the internal buffer
87
- # is empty
88
- def eof?
89
- ready_token
90
- if @buffer
91
- @buffer.empty? && @io.eof?
92
- else
93
- @io.eof?
94
- end
95
- end
96
- ################################################################################
97
- def pos
98
- @io.pos
99
- end
100
- ################################################################################
101
- def pos_without_buf
102
- @io.pos - @buffer.to_s.size
103
- end
104
- ################################################################################
105
- # PDF files are processed by tokenising the content into a series of objects and commands.
106
- # This prepares the buffer for use by reading the next line of tokens into memory.
107
- def ready_token (with_strip=true, skip_blanks=true)
108
- while (@buffer.nil? or @buffer.empty?) && !@io.eof?
109
- @buffer = @io.readline
110
- @buffer.force_encoding("BINARY") if @buffer.respond_to?(:force_encoding)
111
- #@buffer.sub!(/%.*$/, '') if strip_comments
112
- @buffer.chomp!
113
- break unless skip_blanks
114
- end
115
- @buffer.lstrip! if with_strip
116
- end
117
- ################################################################################
118
- # return the next token from the underlying IO stream
119
- def token
120
- ready_token
121
-
122
- i = @buffer.index(/[\[\]()<>{}\s\/]/) || @buffer.size
123
124
 
124
- token_chars =
125
- if i == 0 and @buffer[i,2] == "<<" then 2
126
- elsif i == 0 and @buffer[i,2] == ">>" then 2
127
- elsif i == 0 then 1
128
- else i
129
- end
130
-
131
- strip_space = !(i == 0 and @buffer[0,1] == '(')
132
- tok = head(token_chars, strip_space)
125
+ # return the next token from the source. Returns a string if a token
126
+ # is found, nil if there are no tokens left.
127
+ #
128
+ def token
129
+ reset_pos
130
+ prepare_tokens if @tokens.size < 3
131
+ merge_indirect_reference
132
+ merge_tokens
133
133
 
134
- if tok == ""
135
- nil
136
- elsif tok[0,1] == "%"
137
- @buffer = ""
138
- token
139
- else
140
- tok
141
- end
134
+ @tokens.shift
142
135
  end
143
- ################################################################################
144
- def head (chars, with_strip=true)
145
- val = @buffer[0, chars]
146
- @buffer = @buffer[chars .. -1] || ""
147
- @buffer.lstrip! if with_strip
148
- val
149
- end
150
- ################################################################################
151
- # return the internal buffer used by this class when reading from the IO stream.
152
- def raw
153
- @buffer
154
- end
155
- ################################################################################
156
- # The Xref table in a PDF file acts as an aid for finding the location of various
157
- # objects in the file. This method attempts to locate the byte offset of the xref
158
- # table in the underlying IO stream.
136
+
137
+ # return the byte offset where the first XRef table in the source can be found.
138
+ #
159
139
  def find_first_xref_offset
160
- @io.seek(-1024, IO::SEEK_END) rescue seek(0)
140
+ @io.seek(-1024, IO::SEEK_END) rescue @io.seek(0)
161
141
  data = @io.read(1024)
162
142
 
163
143
  # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
@@ -179,8 +159,159 @@ class PDF::Reader
179
159
  raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
180
160
  lines[eof_index+1].to_i
181
161
  end
182
- ################################################################################
162
+
163
+ private
164
+
165
+ # Some bastard moved our IO stream cursor. Restore it.
166
+ #
167
+ def reset_pos
168
+ @io.seek(@pos) if @io.pos != @pos
169
+ end
170
+
171
+ # save the current position of the source IO stream. If someone else (like another buffer)
172
+ # moves the cursor, we can then restore it.
173
+ #
174
+ def save_pos
175
+ @pos = @io.pos
176
+ end
177
+
178
+ # attempt to prime the buffer with the next few tokens.
179
+ #
180
+ def prepare_tokens
181
+ 10.times do
182
+ if state == :literal_string
183
+ prepare_literal_token
184
+ elsif state == :regular
185
+ prepare_regular_token
186
+ end
187
+ end
188
+
189
+ save_pos
190
+ end
191
+
192
+ # tokenising behaves slightly differently based on the current context.
193
+ # Determine the current context/state by examining the last token we found
194
+ #
195
+ def state
196
+ if @tokens[-1] == "("
197
+ :literal_string
198
+ elsif @tokens[-1] == "stream"
199
+ :stream
200
+ else
201
+ :regular
202
+ end
203
+ end
204
+
205
+ # detect a series of 3 tokens that make up an indirect object. If we find
206
+ # them, replace the tokens with a PDF::Reader::Reference instance.
207
+ #
208
+ # Merging them into a single string was another option, but that would mean
209
+ # code further up the stack would need to check every token to see if it looks
210
+ # like an indirect object. For optimisation reasons, I'd rather avoid
211
+ # that extra check.
212
+ #
213
+ def merge_indirect_reference
214
+ return if @tokens.size < 3
215
+ return if @tokens[2] != "R"
216
+
217
+ if @tokens[0].match(/\d+/) && @tokens[1].match(/\d+/)
218
+ @tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
219
+ @tokens[1] = nil
220
+ @tokens[2] = nil
221
+ @tokens.compact!
222
+ end
223
+ end
224
+
225
+ # merge any consecutive tokens that are actually 1 token. The only current
226
+ # time this is the case is << and >>. < and > are valid tokens (they indicate
227
+ # a hex string) but so are << and >> (they indicate a dictionary).
228
+ #
229
+ def merge_tokens
230
+ @tokens.each_with_index do |tok, idx|
231
+ if tok == "<" && @tokens[idx+1] == "<"
232
+ @tokens.inspect
233
+ @tokens[idx] = "<<"
234
+ @tokens[idx+1] = nil
235
+ elsif tok == ">" && @tokens[idx+1] == ">"
236
+ @tokens[idx] = ">>"
237
+ @tokens[idx+1] = nil
238
+ end
239
+ end
240
+ @tokens.compact!
241
+ end
242
+
243
+ # if we're currently inside a literal string we more or less just read bytes until
244
+ # we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
245
+ # start of a new token in regular mode are left untouched when inside a literal
246
+ # string.
247
+ #
248
+ # The entire literal string will be returned as a single token. It will need further
249
+ # processing to fix things like escaped new lines, but that's someone else's
250
+ # problem.
251
+ #
252
+ def prepare_literal_token
253
+ str = ""
254
+ count = 1
255
+
256
+ while count > 0
257
+ chr = @io.read(1)
258
+ if chr.nil?
259
+ count = 0 # unbalanced params
260
+ elsif chr == "(" && str[-1,1] != "\x5C"
261
+ str << "("
262
+ count += 1
263
+ elsif chr == ")" && str[-1,1] != "\x5C"
264
+ count -= 1
265
+ str << ")" unless count == 0
266
+ else
267
+ str << chr unless count == 0
268
+ end
269
+ end
270
+
271
+ @tokens << str if str.size > 0
272
+ @tokens << ")"
273
+ end
274
+
275
+ # Extract the next regular token and stock it in our buffer, ready to be returned.
276
+ #
277
+ # What each byte means is complex, check out section "3.1.1 Character Set" of the 1.7 spec
278
+ # to read up on it.
279
+ #
280
+ def prepare_regular_token
281
+ tok = ""
282
+
283
+ while chr = @io.read(1)
284
+ case chr
285
+ when "\x25"
286
+ # comment, ignore everything until the next EOL char
287
+ done = false
288
+ while !done
289
+ chr = @io.read(1)
290
+ done = true if chr.nil? || chr == "\x0A" || chr == "\x0D"
291
+ end
292
+ when "\x00", "\x09", "\x0A", "\x0C", "\x0D", "\x20"
293
+ # white space, token finished
294
+ @tokens << tok if tok.size > 0
295
+ tok = ""
296
+ break
297
+ when "\x28", "\x3C", "\x5B", "\x7B", "\x2F"
298
+ # opening delimiter, start of new token
299
+ @tokens << tok if tok.size > 0
300
+ @tokens << chr
301
+ tok = ""
302
+ break
303
+ when "\x29", "\x3E", "\x5D", "\x7D"
304
+ # closing delimiter
305
+ @tokens << tok if tok.size > 0
306
+ @tokens << chr
307
+ tok = ""
308
+ break
309
+ else
310
+ tok << chr
311
+ end
312
+ end
313
+
314
+ @tokens << tok if tok.size > 0
315
+ end
183
316
  end
184
- ################################################################################
185
317
  end
186
- ################################################################################
@@ -323,7 +323,7 @@ class PDF::Reader
323
323
  # like a regular page content stream.
324
324
  #
325
325
  def walk_xobject_form(label)
326
- xobjects = current_resources[:XObject] || {}
326
+ xobjects = @xref.object(current_resources[:XObject]) || {}
327
327
  xobject = @xref.object(xobjects[label])
328
328
 
329
329
  if xobject && xobject.hash[:Subtype] == :Form
@@ -43,10 +43,10 @@ class PDF::Reader
43
43
  #
44
44
  # operators - a hash of supported operators to read from the underlying buffer.
45
45
  def parse_token (operators={})
46
- ref = Reference.from_buffer(@buffer) and return ref
47
46
  token = @buffer.token
48
47
 
49
48
  case token
49
+ when PDF::Reader::Reference then return token
50
50
  when nil then return nil
51
51
  when "/" then return @buffer.token.to_sym
52
52
  when "<<" then return dictionary()
@@ -58,7 +58,7 @@ class PDF::Reader
58
58
  when "null" then return nil
59
59
  when "obj", "endobj" then return Token.new(token)
60
60
  when "stream", "endstream" then return Token.new(token)
61
- when ">>", "]", ">" then return Token.new(token)
61
+ when ">>", "]", ">", ")" then return Token.new(token)
62
62
  else
63
63
  if operators.has_key?(token) then return Token.new(token)
64
64
  elsif token =~ /\d*\.\d/ then return token.to_f
@@ -66,6 +66,29 @@ class PDF::Reader
66
66
  end
67
67
  end
68
68
  end
69
+ ################################################################################
70
+ # Reads an entire PDF object from the buffer and returns it as a Ruby String.
71
+ # If the object is a content stream, returns both the stream and the dictionary
72
+ # that describes it
73
+ #
74
+ # id - the object ID to return
75
+ # gen - the object revision number to return
76
+ def object (id, gen)
77
+ Error.assert_equal(parse_token, id)
78
+ Error.assert_equal(parse_token, gen)
79
+ Error.str_assert(parse_token, "obj")
80
+
81
+ obj = parse_token
82
+ post_obj = parse_token
83
+ case post_obj
84
+ when "endobj" then return obj
85
+ when "stream" then return stream(obj)
86
+ else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
87
+ end
88
+ end
89
+
90
+ private
91
+
69
92
  ################################################################################
70
93
  # reads a PDF dict from the buffer and converts it to a Ruby Hash.
71
94
  def dictionary
@@ -114,95 +137,34 @@ class PDF::Reader
114
137
  ################################################################################
115
138
  # Reads a PDF String from the buffer and converts it to a Ruby String
116
139
  def string
117
- str = ""
118
- count = 1
119
-
120
- while count != 0
121
- @buffer.ready_token(false, false)
122
-
123
- # find the first occurance of ( ) [ \ or ]
124
- #
125
- # I originally just used the regexp form of index(), but it seems to be
126
- # buggy on some OSX systems (returns nil when there is a match). This
127
- # version is more reliable and was suggested by Andrès Koetsier.
128
- #
129
- i = nil
130
- @buffer.raw.unpack("C*").each_with_index do |charint, idx|
131
- if [40, 41, 92].include?(charint)
132
- i = idx
133
- break
134
- end
135
- end
136
-
137
- if i.nil?
138
- str << @buffer.raw + "\n"
139
- @buffer.raw.replace("")
140
- # if a content stream opens a string, but never closes it, we'll
141
- # hit the end of the stream and still be appending stuff to the
142
- # string. bad! This check prevents a hard loop.
143
- raise MalformedPDFError, 'unterminated string in content stream' if @buffer.eof?
144
- next
145
- end
140
+ str = @buffer.token
141
+ return "" if str == ")"
142
+ Error.assert_equal(parse_token, ")")
143
+
144
+ str.gsub!("\\n","\n")
145
+ str.gsub!("\\r","\r")
146
+ str.gsub!("\\t","\t")
147
+ str.gsub!("\\b","\b")
148
+ str.gsub!("\\f","\f")
149
+ str.gsub!("\\(","(")
150
+ str.gsub!("\\)",")")
151
+ str.gsub!("\\\\","\\")
152
+ str.gsub!(/\\\n/m,"")
153
+ str.gsub!(/(\n\r|\r\n|\r)/m,"\n")
154
+
155
+ str.scan(/\\\d{1,3}/).each do |octal|
156
+ str.gsub!(octal, octal[1,3].oct.chr)
157
+ end
146
158
 
147
- str << @buffer.head(i, false)
148
- to_remove = 1
149
-
150
- case @buffer.raw[0, 1]
151
- when "("
152
- str << "("
153
- count += 1
154
- when ")"
155
- count -= 1
156
- str << ")" unless count == 0
157
- when "\\"
158
- to_remove = 2
159
- case @buffer.raw[1, 1]
160
- when "" then to_remove = 1
161
- when "n" then str << "\n"
162
- when "r" then str << "\r"
163
- when "t" then str << "\t"
164
- when "b" then str << "\b"
165
- when "f" then str << "\f"
166
- when "(" then str << "("
167
- when ")" then str << ")"
168
- when "\\" then str << "\\"
169
- else
170
- if m = @buffer.raw.match(/^\\(\d{1,3})/)
171
- to_remove = m[0].size
172
- str << m[1].oct.chr
173
- end
174
- end
175
- end
159
+ str.gsub!(/\\([^\\])/,'\1')
176
160
 
177
- @buffer.head(to_remove, false)
178
- end
179
161
  str
180
162
  end
181
163
  ################################################################################
182
- # Reads an entire PDF object from the buffer and returns it as a Ruby String.
183
- # If the object is a content stream, returns both the stream and the dictionary
184
- # that describes it
185
- #
186
- # id - the object ID to return
187
- # gen - the object revision number to return
188
- def object (id, gen)
189
- Error.assert_equal(parse_token, id)
190
- Error.assert_equal(parse_token, gen)
191
- Error.str_assert(parse_token, "obj")
192
-
193
- obj = parse_token
194
- post_obj = parse_token
195
- case post_obj
196
- when "endobj" then return obj
197
- when "stream" then return stream(obj)
198
- else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
199
- end
200
- end
201
- ################################################################################
202
164
  # Decodes the contents of a PDF Stream and returns it as a Ruby String.
203
165
  def stream (dict)
204
166
  raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
205
- data = @buffer.read(@xref.object(dict[:Length]))
167
+ data = @buffer.read(@xref.object(dict[:Length]), :skip_eol => true)
206
168
 
207
169
  Error.str_assert(parse_token, "endstream")
208
170
  Error.str_assert(parse_token, "endobj")
@@ -27,16 +27,6 @@ class PDF::Reader
27
27
  ################################################################################
28
28
  # An internal PDF::Reader class that represents an indirect reference to a PDF Object
29
29
  class Reference
30
- ################################################################################
31
- # check if the next token in the buffer is a reference, and return a PDF::Reader::Reference
32
- # instance. Returns nil if the next token isn't an indirect reference.
33
- def self.from_buffer (buffer)
34
- buffer.ready_token
35
- return nil unless m = buffer.raw.match(/^(\d+)\s+(\d+)\s+R\b/)
36
- buffer.head(m[0].size)
37
- self.new(m[1].to_i, m[2].to_i)
38
- end
39
- ################################################################################
40
30
  attr_reader :id, :gen
41
31
  ################################################################################
42
32
  # Create a new Reference to an object with the specified id and revision number
@@ -32,8 +32,8 @@ class PDF::Reader
32
32
  class XRef
33
33
  ################################################################################
34
34
  # create a new Xref table based on the contents of the supplied PDF::Reader::Buffer object
35
- def initialize (buffer)
36
- @buffer = buffer
35
+ def initialize (io)
36
+ @io = io
37
37
  @xref = {}
38
38
  end
39
39
  def size
@@ -44,8 +44,8 @@ class PDF::Reader
44
44
  # table, but it is one of the lowest level data items in the file, so we've lumped it in
45
45
  # with the cross reference code.
46
46
  def pdf_version
47
- @buffer.seek(0)
48
- m, version = *@buffer.read(8).match(/%PDF-(\d.\d)/)
47
+ @io.seek(0)
48
+ m, version = *@io.read(8).match(/%PDF-(\d.\d)/)
49
49
  raise MalformedPDFError, 'invalid PDF version' if version.nil?
50
50
  return version.to_f
51
51
  end
@@ -55,13 +55,14 @@ class PDF::Reader
55
55
  #
56
56
  # Will fail silently if there is no xref table at the requested offset.
57
57
  def load (offset = nil)
58
- offset ||= @buffer.find_first_xref_offset
59
- @buffer.seek(offset)
60
- token = @buffer.token
58
+ offset ||= new_buffer.find_first_xref_offset
59
+
60
+ buf = new_buffer(offset)
61
+ token = buf.token
61
62
 
62
63
  if token == "xref" || token == "ref"
63
- load_xref_table
64
- elsif token.to_i >= 0 && @buffer.token.to_i >= 0 && @buffer.token == "obj"
64
+ load_xref_table(buf)
65
+ elsif token.to_i >= 0 && buf.token.to_i >= 0 && buf.token == "obj"
65
66
  raise PDF::Reader::UnsupportedFeatureError, "XRef streams are not supported in PDF::Reader yet"
66
67
  else
67
68
  raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
@@ -73,52 +74,12 @@ class PDF::Reader
73
74
  # number
74
75
  #
75
76
  # If the object is a stream, that is returned as well
76
- def object (ref, save_pos = true)
77
+ def object (ref)
77
78
  return ref unless ref.kind_of?(Reference)
78
- pos = @buffer.pos_without_buf if save_pos
79
- obj = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
80
- @buffer.seek(pos) if save_pos
79
+ buf = new_buffer(offset_for(ref))
80
+ obj = Parser.new(buf, self).object(ref.id, ref.gen)
81
81
  return obj
82
82
  end
83
- ################################################################################
84
- # Assumes the underlying buffer is positioned at the start of an Xref table and
85
- # processes it into memory.
86
- def load_xref_table
87
- tok_one = tok_two = nil
88
-
89
- begin
90
- # loop over all subsections of the xref table
91
- # In a well formed PDF, the 'trailer' token will indicate
92
- # the end of the table. However we need to be careful in case
93
- # we're processing a malformed pdf that is missing the trailer.
94
- loop do
95
- tok_one, tok_two = @buffer.token, @buffer.token
96
- if tok_one != "trailer" && !tok_one.match(/\d+/)
97
- raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
98
- end
99
- break if tok_one == "trailer" or tok_one.nil?
100
- objid, count = tok_one.to_i, tok_two.to_i
101
-
102
- count.times do
103
- offset = @buffer.token.to_i
104
- generation = @buffer.token.to_i
105
- state = @buffer.token
106
-
107
- store(objid, generation, offset) if state == "n"
108
- objid += 1
109
- end
110
- end
111
- rescue EOFError => e
112
- raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
113
- end
114
-
115
- raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless tok_two == "<<"
116
-
117
- trailer = Parser.new(@buffer, self).dictionary
118
- load(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
119
-
120
- trailer
121
- end
122
83
  # returns the type of object a ref points to
123
84
  def obj_type(ref)
124
85
  obj = object(ref)
@@ -154,6 +115,41 @@ class PDF::Reader
154
115
  (@xref[id] ||= {})[gen] ||= offset
155
116
  end
156
117
  ################################################################################
118
+ private
119
+ ################################################################################
120
+ # Assumes the underlying buffer is positioned at the start of an Xref table and
121
+ # processes it into memory.
122
+ def load_xref_table(buf)
123
+ params = []
124
+
125
+ while !params.include?("trailer") && !params.include?(nil)
126
+ if params.size == 2
127
+ objid, count = params[0].to_i, params[1].to_i
128
+ count.times do
129
+ offset = buf.token.to_i
130
+ generation = buf.token.to_i
131
+ state = buf.token
132
+
133
+ store(objid, generation, offset) if state == "n"
134
+ objid += 1
135
+ params.clear
136
+ end
137
+ end
138
+ params << buf.token
139
+ end
140
+
141
+ trailer = Parser.new(buf, self).parse_token
142
+
143
+ raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)
144
+
145
+ load(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
146
+
147
+ trailer
148
+ end
149
+
150
+ def new_buffer(offset = 0)
151
+ PDF::Reader::Buffer.new(@io, :seek => offset)
152
+ end
157
153
  end
158
154
  ################################################################################
159
155
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.1
4
+ version: 0.8.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Jones
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-11-27 00:00:00 +11:00
12
+ date: 2010-01-01 00:00:00 +11:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency