RubyGems - pdf-reader - Versions diffs - 0.8.6 → 0.9.0 - Mend

pdf-reader 0.8.6 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

data/CHANGELOG +17 -0
data/README.rdoc +7 -15
data/Rakefile +10 -63
data/TODO +6 -8
data/bin/pdf_object +3 -0
data/bin/pdf_text +4 -2
data/examples/extract_images.rb +108 -0
data/examples/hash.rb +1 -1
data/examples/text.rb +3 -0
data/lib/pdf/hash.rb +8 -225
data/lib/pdf/reader.rb +79 -55
data/lib/pdf/reader/abstract_strategy.rb +77 -0
data/lib/pdf/reader/buffer.rb +61 -40
data/lib/pdf/reader/cmap.rb +11 -10
data/lib/pdf/reader/encoding.rb +85 -79
data/lib/pdf/reader/error.rb +1 -2
data/lib/pdf/reader/filter.rb +109 -6
data/lib/pdf/reader/font.rb +11 -11
data/lib/pdf/reader/lzw.rb +123 -0
data/lib/pdf/reader/metadata_strategy.rb +53 -0
data/lib/pdf/reader/object_hash.rb +275 -0
data/lib/pdf/reader/object_stream.rb +51 -0
data/lib/pdf/reader/{content.rb → pages_strategy.rb} +63 -100
data/lib/pdf/reader/parser.rb +74 -37
data/lib/pdf/reader/print_receiver.rb +0 -1
data/lib/pdf/reader/register_receiver.rb +21 -0
data/lib/pdf/reader/stream.rb +5 -1
data/lib/pdf/reader/text_receiver.rb +3 -1
data/lib/pdf/reader/token.rb +1 -1
data/lib/pdf/reader/xref.rb +126 -64
metadata +61 -13
data/lib/pdf/reader/explore.rb +0 -116

data/lib/pdf/reader/parser.rb CHANGED

@@ -32,10 +32,10 @@ class PDF::Reader
     # Create a new parser around a PDF::Reader::Buffer object
     #
     # buffer - a PDF::Reader::Buffer object that contains PDF data
-    # xref   - a PDF::Reader::XRef object that represents the document's object offsets
-    def initialize (buffer, xref=nil)
+    # ohash  - a PDF::Reader::ObjectHash object that can return objects from the PDF file
+    def initialize (buffer, ohash=nil)
       @buffer = buffer
-      @xref   = xref
+      @ohash  = ohash
     end
     ################################################################################
     # Reads the next token from the underlying buffer and convets it to an appropriate
@@ -46,23 +46,22 @@ class PDF::Reader
       token = @buffer.token
       case token
-      when PDF::Reader::Reference     then return token
-      when nil                        then return nil
-      when "/"                        then return pdf_name()
-      when "<<"                       then return dictionary()
-      when "["                        then return array()
-      when "("                        then return string()
-      when "<"                        then return hex_string()
-      when "true"                     then return true
-      when "false"                    then return false
-      when "null"                     then return nil
-      when "obj", "endobj"            then return Token.new(token)
-      when "stream", "endstream"      then return Token.new(token)
-      when ">>", "]", ">", ")"        then return Token.new(token)
+      when PDF::Reader::Reference, nil then return token
+      when "/"                         then return pdf_name()
+      when "<<"                        then return dictionary()
+      when "["                         then return array()
+      when "("                         then return string()
+      when "<"                         then return hex_string()
+      when "true"                      then return true
+      when "false"                     then return false
+      when "null"                      then return nil
+      when "obj", "endobj", "stream", "endstream" then return Token.new(token)
+      when "stream", "endstream"       then return Token.new(token)
+      when ">>", "]", ">", ")"         then return Token.new(token)
       else
-        if operators.has_key?(token)  then return Token.new(token)
-        elsif token =~ /\d*\.\d/      then return token.to_f
-        else                          return token.to_i
+        if operators.has_key?(token)   then return Token.new(token)
+        elsif token =~ /\d*\.\d/       then return token.to_f
+        else                           return token.to_i
         end
       end
     end
@@ -151,30 +150,68 @@ class PDF::Reader
       return "" if str == ")"
       Error.assert_equal(parse_token, ")")
-      str.gsub!(/([^\\])(\n\r|\r\n|\r)/m,'\1\n')
-      str.gsub!("\\n","\n")
-      str.gsub!("\\r","\r")
-      str.gsub!("\\t","\t")
-      str.gsub!("\\b","\b")
-      str.gsub!("\\f","\f")
-      str.gsub!("\\(","(")
-      str.gsub!("\\)",")")
-      str.gsub!("\\\\","\\")
-      str.gsub!(/\\\n/m,"")
-      str.scan(/\\\d{1,3}/).each do |octal|
-        str.gsub!(octal, octal[1,3].oct.chr)
+      ret = ""
+      idx = 0
+      while idx < str.size
+        chr = str[idx,1]
+        jump = 1
+        if chr == "\\"
+          jump = 2
+          case str[idx+1, 1]
+          when "" then jump = 1
+          when "n"  then chr = "\n"
+          when "r"  then chr = "\r"
+          when "t"  then chr = "\t"
+          when "b"  then chr = "\b"
+          when "f"  then chr = "\f"
+          when "("  then chr = "("
+          when ")"  then chr = ")"
+          when "\\" then chr = "\\"
+          when "\n" then
+            chr = ""
+            jump = 2
+          else
+            if str[idx+1,3].match(/\d{3}/)
+              jump = 4
+              chr = str[idx+1,3].oct.chr
+            elsif str[idx+1,2].match(/\d{2}/)
+              jump = 3
+              chr = ("0"+str[idx+1,2]).oct.chr
+            elsif str[idx+1,1].match(/\d/)
+              jump = 2
+              chr = ("00"+str[idx+1,1]).oct.chr
+            else
+              jump = 1
+              chr = ""
+            end
+          end
+        elsif chr == "\r" && str[idx+1,1] == "\n"
+          chr = "\n"
+          jump = 2
+        elsif chr == "\n" && str[idx+1,1] == "\r"
+          chr = "\n"
+          jump = 2
+        elsif chr == "\r"
+          chr = "\n"
+        end
+        ret << chr
+        idx += jump
       end
-      str.gsub!(/\\([^\\])/,'\1')
-      str
+      ret
     end
     ################################################################################
     # Decodes the contents of a PDF Stream and returns it as a Ruby String.
     def stream (dict)
       raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
-      data = @buffer.read(@xref.object(dict[:Length]), :skip_eol => true)
+      if @ohash
+        length = @ohash.object(dict[:Length])
+      else
+        length = dict[:Length] || 0
+      end
+      data = @buffer.read(length, :skip_eol => true)
       Error.str_assert(parse_token, "endstream")
       Error.str_assert(parse_token, "endobj")

data/lib/pdf/reader/print_receiver.rb CHANGED

@@ -8,7 +8,6 @@ class PDF::Reader
     end
     def respond_to?(meth)
-      return false if [:begin_inline_image_data].include?(meth)
       true
     end

data/lib/pdf/reader/register_receiver.rb CHANGED

@@ -1,4 +1,21 @@
+# coding: utf-8
+# Copyright (C) 2010 James Healy (jimmy@deefa.com)
 class PDF::Reader
+  # An example receiver that just records all callbacks generated by parsing
+  # a PDF file.
+  #
+  # Useful for testing the contents of a file in an rspec/test-unit suite.
+  #
+  # Usage:
+  #
+  #  receiver = PDF::Reader::RegisterReceiver.new
+  #  PDF::Reader.file("somefile.pdf", receiver)
+  #  callback = receiver.first_occurance_of(:show_text)
+  #  callback[:args].first.should == "Hellow World"
+  #
   class RegisterReceiver
     attr_accessor :callbacks
@@ -31,6 +48,10 @@ class PDF::Reader
       return ret
     end
+    def all_args(methodname)
+      all(methodname).map { |cb| cb[:args] }
+    end
     # return the details for the first time the specified callback was fired
     def first_occurance_of(methodname)
       callbacks.each do |cb|

data/lib/pdf/reader/stream.rb CHANGED

@@ -50,7 +50,11 @@ class PDF::Reader
         options = []
         if hash.has_key?(:DecodeParms)
-          options = Array(hash[:DecodeParms])
+          if hash[:DecodeParms].is_a?(Hash)
+            options = [hash[:DecodeParms]]
+          else
+            options = hash[:DecodeParms]
+          end
         end
         Array(hash[:Filter]).each_with_index do |filter, index|

data/lib/pdf/reader/text_receiver.rb CHANGED

@@ -96,7 +96,9 @@ class PDF::Reader
     end
     ################################################################################
     # PDF operator Tm
-    def set_text_matrix_and_text_line_matrix (a, b, c, d, e, f)
+    def set_text_matrix_and_text_line_matrix (*args)
+      # these variable names look bad, but they're from the PDF spec
+      a, b, c, d, e, f = *args
       calculate_line_and_location(f)
     end
     ################################################################################

data/lib/pdf/reader/token.rb CHANGED

@@ -28,7 +28,7 @@ class PDF::Reader
   # An internal PDF::Reader class that represents a single token from a PDF file.
   #
   # Behaves exactly like a Ruby String - it basically exists for convenience.
-  class Token < String
+  class Token < String # :nodoc:
     ################################################################################
     # Creates a new token with the specified value
     def initialize (val)

data/lib/pdf/reader/xref.rb CHANGED

@@ -25,76 +25,48 @@
 class PDF::Reader
   ################################################################################
-  # An internal PDF::Reader class that represents the Xref table in a PDF file
+  # An internal PDF::Reader class that represents the XRef table in a PDF file as a
+  # hash-like object.
+  #
   # An Xref table is a map of object identifiers and byte offsets. Any time a particular
   # object needs to be found, the Xref table is used to find where it is stored in the
   # file.
+  #
+  # Hash keys are object ids, values are either:
+  #
+  # * a byte offset where the object starts (regular PDF objects)
+  # * a PDF::Reader::Reference instance that points to a stream that contains the
+  #   desired object (PDF objects embedded in an object stream)
+  #
+  # The class behaves much like a standard Ruby hash, including the use of
+  # the Enumerable mixin. The key difference is no []= method - the hash
+  # is read only.
+  #
   class XRef
+    include Enumerable
+    attr_reader :trailer
     ################################################################################
-    # create a new Xref table based on the contents of the supplied PDF::Reader::Buffer object
+    # create a new Xref table based on the contents of the supplied io object
+    #
+    # io - must be an IO object, generally either a file or a StringIO
+    #
     def initialize (io)
       @io = io
       @xref = {}
+      @trailer = load_offsets
     end
+    ################################################################################
+    # return the number of objects in this file. Objects with multiple generations are
+    # only counter once.
     def size
       @xref.size
     end
     ################################################################################
-    # returns the PDF version of the current document. Technically this isn't part of the XRef
-    # table, but it is one of the lowest level data items in the file, so we've lumped it in
-    # with the cross reference code.
-    def pdf_version
-      @io.seek(0)
-      m, version = *@io.read(8).match(/%PDF-(\d.\d)/)
-      raise MalformedPDFError, 'invalid PDF version' if version.nil?
-      return version.to_f
-    end
-    ################################################################################
-    # Read the xref table from the underlying buffer. If offset is specified the table
-    # will be loaded from there, otherwise the default offset will be located and used.
-    #
-    # Will fail silently if there is no xref table at the requested offset.
-    def load (offset = nil)
-      offset ||= new_buffer.find_first_xref_offset
-      buf = new_buffer(offset)
-      token = buf.token
-      if token == "xref" || token == "ref"
-        load_xref_table(buf)
-      elsif token.to_i >= 0 && buf.token.to_i >= 0 && buf.token == "obj"
-        raise PDF::Reader::UnsupportedFeatureError, "XRef streams are not supported in PDF::Reader yet"
-      else
-        raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
-      end
-    end
-    ################################################################################
-    # Return a string containing the contents of an entire PDF object. The object is requested
-    # by specifying a PDF::Reader::Reference object that contains the objects ID and revision
-    # number
-    #
-    # If the object is a stream, that is returned as well
-    def object (ref)
-      return ref unless ref.kind_of?(Reference)
-      buf = new_buffer(offset_for(ref))
-      obj = Parser.new(buf, self).object(ref.id, ref.gen)
-      return obj
-    end
-    # returns the type of object a ref points to
-    def obj_type(ref)
-      obj = object(ref)
-      obj.class.to_s.to_sym
-    end
-    # returns true if the supplied references points to an object with a stream
-    def stream?(ref)
-      obj, stream = @xref.object(ref)
-      stream ? true : false
-    end
-    ################################################################################
     # returns the byte offset for the specified PDF object.
     #
     # ref - a PDF::Reader::Reference object containing an object ID and revision number
-    def offset_for (ref)
+    def [](ref)
       @xref[ref.id][ref.gen]
     rescue
       raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
@@ -105,20 +77,42 @@ class PDF::Reader
       ids = @xref.keys.sort
       ids.each do |id|
         gen = @xref[id].keys.sort[-1]
-        ref = PDF::Reader::Reference.new(id, gen)
-        yield ref, object(ref)
+        yield PDF::Reader::Reference.new(id, gen)
       end
     end
     ################################################################################
-    # Stores an offset value for a particular PDF object ID and revision number
-    def store (id, gen, offset)
-      (@xref[id] ||= {})[gen] ||= offset
-    end
-    ################################################################################
     private
     ################################################################################
-    # Assumes the underlying buffer is positioned at the start of an Xref table and
-    # processes it into memory.
+    # Read a xref table from the underlying buffer.
+    #
+    # If offset is specified the table will be loaded from there, otherwise the
+    # default offset will be located and used.
+    #
+    # After seeking to the offset, processing is handed of to either load_xref_table()
+    # or load_xref_stream() based on what we find there.
+    #
+    def load_offsets(offset = nil)
+      offset ||= new_buffer.find_first_xref_offset
+      buf = new_buffer(offset)
+      tok_one = buf.token
+      return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"
+      tok_two   = buf.token
+      tok_three = buf.token
+      if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
+        buf = new_buffer(offset)
+        stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
+        return load_xref_stream(stream)
+      end
+      raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{tok_one} != xref)"
+    end
+    ################################################################################
+    # Assumes the underlying buffer is positioned at the start of a traditional
+    # Xref table and processes it into memory.
     def load_xref_table(buf)
       params = []
@@ -142,14 +136,82 @@ class PDF::Reader
       raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)
-      load(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
+      load_offsets(trailer[:XRefStm])   if trailer.has_key?(:XRefStm)
+      load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
       trailer
     end
+    ################################################################################
+    # Read a XReaf stream from the underlying buffer instead of a traditional xref table.
+    #
+    def load_xref_stream(stream)
+      unless stream.hash[:Type] == :XRef
+        raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
+      end
+      trailer = {}
+      trailer[:Root] = stream.hash[:Root] if stream.hash[:Root]
+      trailer[:Info] = stream.hash[:Info] if stream.hash[:Info]
+      trailer[:Prev] = stream.hash[:Prev] if stream.hash[:Prev]
+      widths = stream.hash[:W]
+      entry_length = widths.inject(0) { |s, w| s + w }
+      raw_data = stream.unfiltered_data
+      if stream.hash[:Index]
+        index = stream.hash[:Index][0]
+      else
+        index = 0
+      end
+      stream.hash[:Size].times do |i|
+        entry = raw_data[i*entry_length, entry_length] || ""
+        f1    = unpack_bytes(entry[0,widths[0]])
+        f2    = unpack_bytes(entry[widths[0],widths[1]])
+        f3    = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
+        if f1 == 1
+          store(index + i, f3, f2)
+        elsif f1 == 2
+          store(index + i, 0, PDF::Reader::Reference.new(f2, 0))
+        end
+      end
+      load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
+      trailer
+    end
+    ################################################################################
+    # XRef streams pack info into integers 1-N bytes wide. Depending on the number of
+    # bytes they need to be converted to an int in different ways.
+    #
+    def unpack_bytes(bytes)
+      if bytes.to_s.size == 0
+        0
+      elsif bytes.size == 1
+        bytes.unpack("C")[0]
+      elsif bytes.size == 2
+        bytes.unpack("n")[0]
+      elsif bytes.size == 3
+        ("\x00" + bytes).unpack("N")[0]
+      elsif bytes.size == 4
+        bytes.unpack("N")[0]
+      else
+        raise UnsupportedFeatureError, "Unable to unpack xref stream entries with more than 4 bytes"
+      end
+    end
+    ################################################################################
+    # Wrap the io stream we're working with in a buffer that can tokenise it for us.
+    #
+    # We create multiple buffers so we can be tokenising multiple sections of the file
+    # at the same time without worring about clearing the buffers contents.
+    #
     def new_buffer(offset = 0)
       PDF::Reader::Buffer.new(@io, :seek => offset)
     end
+    ################################################################################
+    # Stores an offset value for a particular PDF object ID and revision number
+    #
+    def store (id, gen, offset)
+      (@xref[id] ||= {})[gen] ||= offset
+    end
   end
   ################################################################################
 end