pdf-reader 0.8.2 → 0.8.3

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,7 @@
1
+ v0.8.3 (14th February 2010)
2
+ - Fix a bug in tokenising of hex strings inside dictionaries
3
+ - Thanks to Brad Ediger for detecting the issue and proposing a solution
4
+
1
5
  v0.8.2 (1st January 2010)
2
6
  - Fix parsing of files that use Form XObjects behind an indirect reference
3
7
  (thanks Cornelius Illi and Patrick Crosby)
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
6
6
  require "rake/gempackagetask"
7
7
  require 'spec/rake/spectask'
8
8
 
9
- PKG_VERSION = "0.8.2"
9
+ PKG_VERSION = "0.8.3"
10
10
  PKG_NAME = "pdf-reader"
11
11
  PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
12
12
 
@@ -57,27 +57,25 @@ end
57
57
  # RSpec files aren't included, as they depend on the PDF files,
58
58
  # which will make the gem filesize irritatingly large
59
59
  spec = Gem::Specification.new do |spec|
60
- spec.name = PKG_NAME
61
- spec.version = PKG_VERSION
62
- spec.platform = Gem::Platform::RUBY
63
- spec.summary = "A library for accessing the content of PDF files"
64
- spec.files = Dir.glob("{examples,lib}/**/**/*") +
65
- ["Rakefile"]
66
-
60
+ spec.name = PKG_NAME
61
+ spec.version = PKG_VERSION
62
+ spec.platform = Gem::Platform::RUBY
63
+ spec.summary = "A library for accessing the content of PDF files"
64
+ spec.files = Dir.glob("{examples,lib}/**/**/*") + ["Rakefile"]
67
65
  spec.require_path = "lib"
68
66
  spec.bindir = "bin"
69
67
  spec.executables << "pdf_object"
70
68
  spec.executables << "pdf_text"
71
69
  spec.executables << "pdf_list_callbacks"
72
- spec.has_rdoc = true
73
- spec.extra_rdoc_files = %w{README.rdoc TODO CHANGELOG MIT-LICENSE }
74
- spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
75
- '--main' << 'README.rdoc' << '-q'
76
- spec.author = "Peter Jones"
77
- spec.email = "pjones@pmade.com"
78
- spec.rubyforge_project = "pdf-reader"
79
- spec.homepage = "http://software.pmade.com/pdfreader"
80
- spec.description = "The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe"
70
+ spec.has_rdoc = true
71
+ spec.extra_rdoc_files = %w{README.rdoc TODO CHANGELOG MIT-LICENSE }
72
+ spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
73
+ '--main' << 'README.rdoc' << '-q'
74
+ spec.author = "James Healy"
75
+ spec.email = "jimmy@deefa.com"
76
+ spec.rubyforge_project = "pdf-reader"
77
+ spec.homepage = "http://github.com/yob/pdf-reader"
78
+ spec.description = "The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe"
81
79
  spec.add_dependency('Ascii85', '>=0.9')
82
80
  end
83
81
 
@@ -137,8 +137,7 @@ class PDF::Reader
137
137
  ################################################################################
138
138
  # Given an IO object that contains PDF data, return the contents of a single object
139
139
  def object (io, id, gen)
140
- @buffer = Buffer.new(io)
141
- @xref = XRef.new(@buffer)
140
+ @xref = XRef.new(io)
142
141
  @xref.load
143
142
 
144
143
  @xref.object(Reference.new(id, gen))
@@ -27,7 +27,7 @@
27
27
 
28
28
  class PDF::Reader
29
29
 
30
- # A string tokeniser that recognises PDF grammer. When passed an IO stream or a
30
+ # A string tokeniser that recognises PDF grammar. When passed an IO stream or a
31
31
  # string, repeated calls to token() will return the next token from the source.
32
32
  #
33
33
  # This is very low level, and getting the raw tokens is not very useful in itself.
@@ -99,7 +99,7 @@ class PDF::Reader
99
99
  end
100
100
 
101
101
  # return raw bytes from the underlying IO stream. All bytes up to the first
102
- # occurance of needle will be returned. The match (if any) is not returned.
102
+ # occurrence of needle will be returned. The match (if any) is not returned.
103
103
  # The IO stream cursor is left on the first byte of the match.
104
104
  #
105
105
  # needle - a string to search the IO stream for
@@ -129,7 +129,7 @@ class PDF::Reader
129
129
  reset_pos
130
130
  prepare_tokens if @tokens.size < 3
131
131
  merge_indirect_reference
132
- merge_tokens
132
+ prepare_tokens if @tokens.size < 3
133
133
 
134
134
  @tokens.shift
135
135
  end
@@ -206,10 +206,14 @@ class PDF::Reader
206
206
  # them, replace the tokens with a PDF::Reader::Reference instance.
207
207
  #
208
208
  # Merging them into a single string was another option, but that would mean
209
- # code further up the stact would need to check every token to see if it looks
209
+ # code further up the stack would need to check every token to see if it looks
210
210
  # like an indirect object. For optimisation reasons, I'd rather avoid
211
211
  # that extra check.
212
212
  #
213
+ # It's incredibly likely that the next 3 tokens in the buffer are NOT an
214
+ # indirect reference, so test for that case first and avoid the relatively
215
+ # expensive regexp checks if possible.
216
+ #
213
217
  def merge_indirect_reference
214
218
  return if @tokens.size < 3
215
219
  return if @tokens[2] != "R"
@@ -222,26 +226,8 @@ class PDF::Reader
222
226
  end
223
227
  end
224
228
 
225
- # merge any consequtive tokens that are actually 1 token. The only current
226
- # time this is the case is << and >>. < and > are valid tokens (they indicate
227
- # a hex string) but so are << and >> (they indicate a dictionary).
228
- #
229
- def merge_tokens
230
- @tokens.each_with_index do |tok, idx|
231
- if tok == "<" && @tokens[idx+1] == "<"
232
- @tokens.inspect
233
- @tokens[idx] = "<<"
234
- @tokens[idx+1] = nil
235
- elsif tok == ">" && @tokens[idx+1] == ">"
236
- @tokens[idx] = ">>"
237
- @tokens[idx+1] = nil
238
- end
239
- end
240
- @tokens.compact!
241
- end
242
-
243
229
  # if we're currently inside a literal string we more or less just read bytes until
244
- # we find the closes ) delimiter. Lots of bytes that would otherwise indicate the
230
+ # we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
245
231
  # start of a new token in regular mode are left untouched when inside a literal
246
232
  # string.
247
233
  #
@@ -294,13 +280,27 @@ class PDF::Reader
294
280
  @tokens << tok if tok.size > 0
295
281
  tok = ""
296
282
  break
297
- when "\x28", "\x3C", "\x5B", "\x7B", "\x2F"
283
+ when "\x3C"
284
+ # opening delimiter '<', start of new token
285
+ @tokens << tok if tok.size > 0
286
+ chr << @io.read(1) if peek_char == "\x3C" # check if token is actually '<<'
287
+ @tokens << chr
288
+ tok = ""
289
+ break
290
+ when "\x3E"
291
+ # closing delimiter '>', start of new token
292
+ @tokens << tok if tok.size > 0
293
+ chr << @io.read(1) if peek_char == "\x3E" # check if token is actually '>>'
294
+ @tokens << chr
295
+ tok = ""
296
+ break
297
+ when "\x28", "\x5B", "\x7B", "\x2F"
298
298
  # opening delimiter, start of new token
299
299
  @tokens << tok if tok.size > 0
300
300
  @tokens << chr
301
301
  tok = ""
302
302
  break
303
- when "\x29", "\x3E", "\x5D", "\x7D"
303
+ when "\x29", "\x5D", "\x7D"
304
304
  # closing delimiter
305
305
  @tokens << tok if tok.size > 0
306
306
  @tokens << chr
@@ -313,5 +313,14 @@ class PDF::Reader
313
313
 
314
314
  @tokens << tok if tok.size > 0
315
315
  end
316
+
317
+ # peek at the next character in the io stream, leaving the stream position
318
+ # untouched
319
+ #
320
+ def peek_char
321
+ chr = @io.read(1)
322
+ @io.seek(-1, IO::SEEK_CUR) unless chr.nil?
323
+ chr
324
+ end
316
325
  end
317
326
  end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.2
4
+ version: 0.8.3
5
5
  platform: ruby
6
6
  authors:
7
- - Peter Jones
7
+ - James Healy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-01 00:00:00 +11:00
12
+ date: 2010-02-14 00:00:00 +11:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -23,7 +23,7 @@ dependencies:
23
23
  version: "0.9"
24
24
  version:
25
25
  description: The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe
26
- email: pjones@pmade.com
26
+ email: jimmy@deefa.com
27
27
  executables:
28
28
  - pdf_object
29
29
  - pdf_text
@@ -76,7 +76,7 @@ files:
76
76
  - CHANGELOG
77
77
  - MIT-LICENSE
78
78
  has_rdoc: true
79
- homepage: http://software.pmade.com/pdfreader
79
+ homepage: http://github.com/yob/pdf-reader
80
80
  licenses: []
81
81
 
82
82
  post_install_message: