RubyGems - pdf-reader - Versions diffs - 0.8.2 → 0.8.3 - Mend

pdf-reader 0.8.2 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

data/CHANGELOG CHANGED

@@ -1,3 +1,7 @@
+v0.8.3 (14th February 2010)
+- Fix a bug in tokenising of hex strings inside dictionaries
+  - Thanks to Brad Ediger for detecting the issue and proposing a solution
 v0.8.2 (1st January 2010)
 - Fix parsing of files that use Form XObjects behind an indirect reference
   (thanks Cornelius Illi and Patrick Crosby)

data/Rakefile CHANGED

@@ -6,7 +6,7 @@ require 'rake/testtask'
 require "rake/gempackagetask"
 require 'spec/rake/spectask'
-PKG_VERSION = "0.8.2"
+PKG_VERSION = "0.8.3"
 PKG_NAME = "pdf-reader"
 PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
@@ -57,27 +57,25 @@ end
 # RSpec files aren't included, as they depend on the PDF files,
 # which will make the gem filesize irritatingly large
 spec = Gem::Specification.new do |spec|
-	spec.name = PKG_NAME
-	spec.version = PKG_VERSION
-	spec.platform = Gem::Platform::RUBY
-	spec.summary = "A library for accessing the content of PDF files"
-	spec.files =  Dir.glob("{examples,lib}/**/**/*") +
-                      ["Rakefile"]
+  spec.name = PKG_NAME
+  spec.version = PKG_VERSION
+  spec.platform = Gem::Platform::RUBY
+  spec.summary = "A library for accessing the content of PDF files"
+  spec.files =  Dir.glob("{examples,lib}/**/**/*") + ["Rakefile"]
   spec.require_path = "lib"
   spec.bindir = "bin"
   spec.executables << "pdf_object"
   spec.executables << "pdf_text"
   spec.executables << "pdf_list_callbacks"
-	spec.has_rdoc = true
-	spec.extra_rdoc_files = %w{README.rdoc TODO CHANGELOG MIT-LICENSE }
-	spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
-	                     '--main'  << 'README.rdoc' << '-q'
-  spec.author = "Peter Jones"
-	spec.email = "pjones@pmade.com"
-	spec.rubyforge_project = "pdf-reader"
-	spec.homepage = "http://software.pmade.com/pdfreader"
-	spec.description = "The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe"
+  spec.has_rdoc = true
+  spec.extra_rdoc_files = %w{README.rdoc TODO CHANGELOG MIT-LICENSE }
+  spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
+                       '--main'  << 'README.rdoc' << '-q'
+  spec.author = "James Healy"
+  spec.email = "jimmy@deefa.com"
+  spec.rubyforge_project = "pdf-reader"
+  spec.homepage = "http://github.com/yob/pdf-reader"
+  spec.description = "The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe"
   spec.add_dependency('Ascii85', '>=0.9')
 end

data/lib/pdf/reader.rb CHANGED

@@ -137,8 +137,7 @@ class PDF::Reader
   ################################################################################
   # Given an IO object that contains PDF data, return the contents of a single object
   def object (io, id, gen)
-    @buffer   = Buffer.new(io)
-    @xref     = XRef.new(@buffer)
+    @xref     = XRef.new(io)
     @xref.load
     @xref.object(Reference.new(id, gen))

data/lib/pdf/reader/buffer.rb CHANGED

@@ -27,7 +27,7 @@
 class PDF::Reader
-  # A string tokeniser that recognises PDF grammer. When passed an IO stream or a
+  # A string tokeniser that recognises PDF grammar. When passed an IO stream or a
   # string, repeated calls to token() will return the next token from the source.
   #
   # This is very low level, and getting the raw tokens is not very useful in itself.
@@ -99,7 +99,7 @@ class PDF::Reader
     end
     # return raw bytes from the underlying IO stream. All bytes up to the first
-    # occurance of needle will be returned. The match (if any) is not returned.
+    # occurrence of needle will be returned. The match (if any) is not returned.
     # The IO stream cursor is left on the first byte of the match.
     #
     #   needle - a string to search the IO stream for
@@ -129,7 +129,7 @@ class PDF::Reader
       reset_pos
       prepare_tokens if @tokens.size < 3
       merge_indirect_reference
-      merge_tokens
+      prepare_tokens if @tokens.size < 3
       @tokens.shift
     end
@@ -206,10 +206,14 @@ class PDF::Reader
     # them, replace the tokens with a PDF::Reader::Reference instance.
     #
     # Merging them into a single string was another option, but that would mean
-    # code further up the stact would need to check every token  to see if it looks
+    # code further up the stack would need to check every token  to see if it looks
     # like an indirect object. For optimisation reasons, I'd rather avoid
     # that extra check.
     #
+    # It's incredibly likely that the next 3 tokens in the buffer are NOT an
+    # indirect reference, so test for that case first and avoid the relatively
+    # expensive regexp checks if possible.
+    #
     def merge_indirect_reference
       return if @tokens.size < 3
       return if @tokens[2] != "R"
@@ -222,26 +226,8 @@ class PDF::Reader
       end
     end
-    # merge any consequtive tokens that are actually 1 token. The only current
-    # time this is the case is << and >>. < and > are valid tokens (they indicate
-    # a hex string) but so are << and >> (they indicate a dictionary).
-    #
-    def merge_tokens
-      @tokens.each_with_index do |tok, idx|
-        if tok == "<" && @tokens[idx+1] == "<"
-          @tokens.inspect
-          @tokens[idx] = "<<"
-          @tokens[idx+1] = nil
-        elsif tok == ">" && @tokens[idx+1] == ">"
-          @tokens[idx] = ">>"
-          @tokens[idx+1] = nil
-        end
-      end
-      @tokens.compact!
-    end
     # if we're currently inside a literal string we more or less just read bytes until
-    # we find the closes ) delimiter. Lots of bytes that would otherwise indicate the
+    # we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
     # start of a new token in regular mode are left untouched when inside a literal
     # string.
     #
@@ -294,13 +280,27 @@ class PDF::Reader
           @tokens << tok if tok.size > 0
           tok = ""
           break
-        when "\x28", "\x3C", "\x5B", "\x7B", "\x2F"
+        when "\x3C"
+          # opening delimiter '<', start of new token
+          @tokens << tok if tok.size > 0
+          chr << @io.read(1) if peek_char == "\x3C" # check if token is actually '<<'
+          @tokens << chr
+          tok = ""
+          break
+        when "\x3E"
+          # closing delimiter '>', start of new token
+          @tokens << tok if tok.size > 0
+          chr << @io.read(1) if peek_char == "\x3E" # check if token is actually '>>'
+          @tokens << chr
+          tok = ""
+          break
+        when "\x28", "\x5B", "\x7B", "\x2F"
           # opening delimiter, start of new token
           @tokens << tok if tok.size > 0
           @tokens << chr
           tok = ""
           break
-        when "\x29", "\x3E", "\x5D", "\x7D"
+        when "\x29", "\x5D", "\x7D"
           # closing delimiter
           @tokens << tok if tok.size > 0
           @tokens << chr
@@ -313,5 +313,14 @@ class PDF::Reader
       @tokens << tok if tok.size > 0
     end
+    # peek at the next character in the io stream, leaving the stream position
+    # untouched
+    #
+    def peek_char
+      chr = @io.read(1)
+      @io.seek(-1, IO::SEEK_CUR) unless chr.nil?
+      chr
+    end
   end
 end

metadata CHANGED

@@ -1,15 +1,15 @@
 --- !ruby/object:Gem::Specification
 name: pdf-reader
 version: !ruby/object:Gem::Version
-  version: 0.8.2
+  version: 0.8.3
 platform: ruby
 authors:
-- Peter Jones
+- James Healy
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-01-01 00:00:00 +11:00
+date: 2010-02-14 00:00:00 +11:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -23,7 +23,7 @@ dependencies:
         version: "0.9"
     version:
 description: The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe
-email: pjones@pmade.com
+email: jimmy@deefa.com
 executables:
 - pdf_object
 - pdf_text
@@ -76,7 +76,7 @@ files:
 - CHANGELOG
 - MIT-LICENSE
 has_rdoc: true
-homepage: http://software.pmade.com/pdfreader
+homepage: http://github.com/yob/pdf-reader
 licenses: []
 post_install_message: