RubyGems - pdf-reader - Versions diffs - 0.6.1 → 0.6.2 - Mend

pdf-reader 0.6.1 → 0.6.2

Files changed (12) hide show

data/CHANGELOG CHANGED Viewed

@@ -1,3 +1,13 @@
+v0.6.2 (22nd March 2008)
+- Catch low level errors when applying filters to a content stream and raise a MalformedPDFError instead.
+- Added support for processing inline images
+- Support for parsing XRef tables that have multiple subsections
+- Added a few callbacks to improve the way we supply information on page resources
+- Ignore whitespace in hex strings, as required by the spec (section 3.2.3)
+- Use our "unknown character box" when a single character in an Identity-H string fails to decode
+- Support ToUnicode CMaps that use the bfrange operator
+- Tweaked tokenising code to ensure whitespace doesn't get in the way
 v0.6.1 (12th March 2008)
 - Tweaked behaviour when we encounter Identity-H encoded text that doesn't have a ToUnicode mapping. We
   just replace each character with a little box.

data/Rakefile CHANGED Viewed

@@ -6,7 +6,7 @@ require 'rake/testtask'
 require "rake/gempackagetask"
 require 'spec/rake/spectask'
-PKG_VERSION = "0.6.1"
+PKG_VERSION = "0.6.2"
 PKG_NAME = "pdf-reader"
 PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"

data/TODO CHANGED Viewed

@@ -4,7 +4,6 @@ v0.7
   - maybe a third option to Reader.parse?
     parse(io, receiver, {:pages => true, :fonts => false, :metadata => true, :bookmarks => false})
 - detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte Asian encodings), and display a user-friendly error
-- When parsing a CMap into a ruby object, recognise ranged mappings defined by begincodespacerange (see spec, section 5.9.2)
 - Provide a way to get raw access to a particular object. Good for testing purposes
 v0.8
@@ -14,10 +13,14 @@ v0.8
 v0.9
 - Support for CJK text (convert to UTF-8 like all other encodings. See Section 5.9 of the PDF spec)
+  - Will require significantly improved handling of CMaps, including creating a bunch of predefined ones
 - Add a way to extract raster images
+  - see XObjects section of spec (section 4.7)
+- Add a way to extract font data?
 Sometime
+- Work out why specs/data/zlib*.pdf isn't parsed correctly when all the major PDF viewers can display it correctly
 - Ship some extra receivers in the standard package, particularly ones that are useful for running
   rspec over generated PDF files
@@ -33,3 +36,5 @@ Sometime
  - Identity-V (I *think* this relates to vertical text. Not sure how we'd support it sensibly)
 - Investigate how R->L text is handled
+- Add support for object streams (spec section 3.4.6)

data/lib/pdf/reader.rb CHANGED Viewed

@@ -87,7 +87,6 @@ require 'pdf/reader/text_receiver'
 require 'pdf/reader/token'
 require 'pdf/reader/xref'
 class PDF::Reader
   ################################################################################
   # Initialize a new PDF::Reader

data/lib/pdf/reader/buffer.rb CHANGED Viewed

@@ -56,6 +56,24 @@ class PDF::Reader
       out
     end
     ################################################################################
+    # Reads from the buffer until the specified token is found, or the end of the buffer
+    #
+    # bytes - the bytes to search for.
+    def read_until(bytes)
+      out = ""
+      size = bytes.size
+      loop do
+        out << @io.read(1)
+        if out[-1 * size,size].eql?(bytes)
+          out = out[0, out.size - size]
+          seek(pos - size)
+          break
+        end
+      end
+      out
+    end
+    ################################################################################
     # returns true if the underlying IO object is at end and the internal buffer
     # is empty
     def eof?
@@ -71,21 +89,21 @@ class PDF::Reader
     end
     ################################################################################
     # PDF files are processed by tokenising the content into a series of objects and commands.
-    # This prepares the buffer for use by rerading the next line of tokens into memory.
+    # This prepares the buffer for use by reading the next line of tokens into memory.
     def ready_token (with_strip=true, skip_blanks=true)
       while @buffer.nil? or @buffer.empty?
         @buffer = @io.readline
         @buffer.sub!(/%.*$/, '')
         @buffer.chomp!
-        @buffer.lstrip! if with_strip
         break unless skip_blanks
       end
+      @buffer.lstrip! if with_strip
     end
     ################################################################################
     # return the next token from the underlying IO stream
     def token
       ready_token
       i = @buffer.index(/[\[\]()<>{}\s\/]/) || @buffer.size
       token_chars =

data/lib/pdf/reader/cmap.rb CHANGED Viewed

@@ -28,12 +28,24 @@ class PDF::Reader
     def initialize(data)
       @map = {}
-      inmap = false
+      in_char_mode = false
+      in_range_mode = false
       data.each_line do |l|
-        inmap = true if l.include?("beginbfchar")
-        if inmap
-          m, find, replace = *l.match(/<([0-9a-fA-F]+)> <([0-9a-fA-F]+)>/)
-          @map["0x#{find}".hex] = "0x#{replace}".hex if find && replace
+        if l.include?("beginbfchar")
+          in_char_mode = true
+        elsif l.include?("endbfchar")
+          in_char_mode = false
+        elsif l.include?("beginbfrange")
+          in_range_mode = true
+        elsif l.include?("endbfrange")
+          in_range_mode = false
+        end
+        if in_char_mode
+          process_bfchar_line(l)
+        elsif in_range_mode
+          process_bfrange_line(l)
         end
       end
     end
@@ -44,5 +56,29 @@ class PDF::Reader
       @map[c]
     end
+    private
+    def process_bfchar_line(l)
+      m, find, replace = *l.match(/<([0-9a-fA-F]+)> <([0-9a-fA-F]+)>/)
+      @map["0x#{find}".hex] = "0x#{replace}".hex if find && replace
+    end
+    def process_bfrange_line(l)
+      m, start_code, end_code, dst = *l.match(/<([0-9a-fA-F]+)> <([0-9a-fA-F]+)> <([0-9a-fA-F]+)>/)
+      if start_code && end_code && dst
+        start_code = "0x#{start_code}".hex
+        end_code   = "0x#{end_code}".hex
+        dst        = "0x#{dst}".hex
+        incr       = 0
+        # add all values in the range to our mapping
+        (start_code..end_code).each do |val|
+          @map[val] = dst + incr
+          incr += 1
+          # ensure a single range does not exceed 255 chars
+          raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if incr > 255
+        end
+      end
+    end
   end
 end

data/lib/pdf/reader/content.rb CHANGED Viewed

@@ -23,6 +23,7 @@
 #
 ################################################################################
 require 'stringio'
+#require 'enumerable'
 class PDF::Reader
   ################################################################################
@@ -144,6 +145,25 @@ class PDF::Reader
   # - end_page_container
   # - begin_page
   # - end_page
+  #
+  # == Resource Callbacks
+  #
+  # Each page and page_container can contain a range of resources required for the page,
+  # including things like fonts and images. The callbacks listed at the end of this
+  # section may appear after begin_page_container and begin_page if the relevant
+  # resources exist on a page.
+  #
+  # In most cases, these callbacks associate a name with each resource, allowing it
+  # to be referred to by name in the page content. For example, an XObject can hold an image.
+  # If it gets mapped to the name "IM1", then it can be placed on the page using
+  # invoke_xobject "IM1".
+  #
+  # - resource_procset
+  # - resource_xobject
+  # - resource_extgstate
+  # - resource_colorspace
+  # - resource_pattern
+  # - resource_font
   class Content
     OPERATORS = {
       'b'   => :close_fill_stroke,
@@ -240,20 +260,27 @@ class PDF::Reader
     # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
     # its content
     def walk_pages (page)
-      resolve_resources(@xref.object(page['Resources'])) if page['Resources']
+      if page['Resources']
+        res = page['Resources']
+        page.delete('Resources')
+      end
       # extract page content
       if page['Type'] == "Pages"
         callback(:begin_page_container, [page])
+        walk_resources(@xref.object(res)) if res
         page['Kids'].each {|child| walk_pages(@xref.object(child))}
         callback(:end_page_container)
       elsif page['Type'] == "Page"
         callback(:begin_page, [page])
+        walk_resources(@xref.object(res)) if res
         @page = page
         @params = []
         page['Contents'].to_a.each do |cstream|
-          content_stream(@xref.object(cstream))
+          obj, stream = @xref.object(cstream)
+          content_stream(stream)
         end if page.has_key?('Contents') and page['Contents']
         callback(:end_page)
@@ -274,9 +301,19 @@ class PDF::Reader
           if token.kind_of?(Token) and OPERATORS.has_key?(token)
             @current_font = @params.first if OPERATORS[token] == :set_text_font_and_size
-            # convert any text to utf-8
+            # handle special cases in response to certain operators
             if OPERATORS[token].to_s.include?("show_text") && @fonts[@current_font]
+              # convert any text to utf-8
               @params = @fonts[@current_font].to_utf8(@params)
+            elsif token == "ID"
+              # inline image data, first convert the current params into a more familiar hash
+              map = {}
+              @params.each_slice(2) do |a|
+                map[a.first] = a.last
+              end
+              @params = [map]
+              # read the raw image data from the buffer without tokenising
+              @params << @buffer.read_until("EI")
             end
             callback(OPERATORS[token], @params)
             @params.clear
@@ -289,7 +326,43 @@ class PDF::Reader
     rescue EOFError => e
     end
     ################################################################################
-    def resolve_resources(resources)
+    def walk_resources(resources)
+      resources = resolve_references(resources)
+      # extract any procset information
+      if resources['ProcSet']
+        callback(:resource_procset, resources['ProcSet'])
+      end
+      # extract any xobject information
+      if resources['XObject']
+        @xref.object(resources['XObject']).each do |name, val|
+          obj, stream = @xref.object(val)
+          callback(:resource_xobject, [name, obj, stream])
+        end
+      end
+      # extract any extgstate information
+      if resources['ExtGState']
+        @xref.object(resources['ExtGState']).each do |name, val|
+          callback(:resource_extgstate, [name, @xref.object(val)])
+        end
+      end
+      # extract any colorspace information
+      if resources['ColorSpace']
+        @xref.object(resources['ColorSpace']).each do |name, val|
+          callback(:resource_colorspace, [name, @xref.object(val)])
+        end
+      end
+      # extract any pattern information
+      if resources['Pattern']
+        @xref.object(resources['Pattern']).each do |name, val|
+          callback(:resource_pattern, [name, @xref.object(val)])
+        end
+      end
       # extract any font information
       if resources['Font']
         @xref.object(resources['Font']).each do |label, desc|
@@ -301,15 +374,29 @@ class PDF::Reader
           @fonts[label].encoding = PDF::Reader::Encoding.factory(@xref.object(desc['Encoding']))
           @fonts[label].descendantfonts = desc['DescendantFonts'] if desc['DescendantFonts']
           if desc['ToUnicode']
-            @fonts[label].tounicode = desc['ToUnicode']
-            @fonts[label].tounicode = @xref.object(@fonts[label].tounicode)
+            obj, cmap = @xref.object(desc['ToUnicode'])
+            # this stream is a cmap
+            begin
+              @fonts[label].tounicode = PDF::Reader::CMap.new(cmap)
+            rescue
+              # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
+            end
           end
+          callback(:resource_font, [label, @fonts[label]])
         end
       end
-      #@fonts.each do |key,val|
-      #  puts "#{key}: #{val.inspect}"
-      #  puts
-      #end
+    end
+    ################################################################################
+    # Recursively replace any PDF::Reader::Reference objects with the objects they point to
+    def resolve_references(obj)
+      case obj
+      when PDF::Reader::Reference then resolve_references(@xref.object(obj))
+      when Hash                   then obj.each { |key,val| obj[key] = resolve_references(val) }
+      when Array                  then obj.collect { |item| resolve_references(item) }
+      else
+        obj
+      end
     end
     ################################################################################
     # calls the name callback method on the receiver class with params as the arguments

data/lib/pdf/reader/encoding.rb CHANGED Viewed

@@ -111,12 +111,13 @@ class PDF::Reader
         # iterate over string, reading it in 2 byte chunks and interpreting those
         # chunks as ints
         str.unpack("n*").each do |c|
           # convert the int to a unicode codepoint if possible.
           # without a ToUnicode CMap, it's impossible to reliably convert this text
           # to unicode, so just replace each character with a little box. Big smacks
           # to the PDF producing app.
-          if map
-            array_enc << map.decode(c)
+          if map && (code = map.decode(c))
+            array_enc << code
           else
             array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
           end

data/lib/pdf/reader/filter.rb CHANGED Viewed

@@ -40,20 +40,27 @@ class PDF::Reader
       case name
       when "FlateDecode"    then @filter = :flate
-      else                    raise UnsupportedFeatureError, "Unknown filter: #{name}"
+      #else                    raise UnsupportedFeatureError, "Unknown filter: #{name}"
       end
     end
     ################################################################################
     # attempts to decode the specified data with the current filter
     def filter (data)
+      # leave the data untouched if we don't support the required filter
+      return data if @filter.nil?
+      # decode the data
       self.send(@filter, data)
     end
     ################################################################################
     # Decode the specified data with the Zlib compression algorithm
     def flate (data)
-      z = Zlib::Inflate.new
-      z << data
-      z.inflate(nil)
+      begin
+        z = Zlib::Inflate.new
+        z.inflate(data)
+      rescue Exception => e
+        raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
+      end
     end
     ################################################################################
   end

data/lib/pdf/reader/parser.rb CHANGED Viewed

@@ -9,10 +9,10 @@
 # distribute, sublicense, and/or sell copies of the Software, and to
 # permit persons to whom the Software is furnished to do so, subject to
 # the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be
 # included in all copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -58,9 +58,9 @@ class PDF::Reader
       when "obj", "endobj"            then return Token.new(token)
       when "stream", "endstream"      then return Token.new(token)
       when ">>", "]", ">"             then return Token.new(token)
-      else
+      else
         if operators.has_key?(token)  then return Token.new(token)
-        else                            return token.to_f
+        else                          return token.to_f
         end
       end
     end
@@ -72,7 +72,7 @@ class PDF::Reader
       loop do
         key = parse_token
         break if key.kind_of?(Token) and key == ">>"
-        raise MalformedPDFError, "PDF malformed, dictionary key is not a name" unless key.kind_of?(Name)
+        raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name" unless key.kind_of?(Name)
         value = parse_token
         value.kind_of?(Token) and Error.str_assert_not(value, ">>")
@@ -97,9 +97,15 @@ class PDF::Reader
     ################################################################################
     # Reads a PDF hex string from the buffer and converts it to a Ruby String
     def hex_string
-      str = @buffer.token
-      Error.str_assert(@buffer.token, ">")
+      str = ""
+      loop do
+        token = @buffer.token
+        break if token == ">"
+        str << token
+      end
+      # pad an odd-length hex string with a trailing 0, as required by the spec
       str << "0" unless str.size % 2 == 0
       str.scan(/../).map {|i| i.hex.chr}.join
     end
@@ -151,11 +157,12 @@ class PDF::Reader
         @buffer.head(to_remove, false)
       end
       str
     end
     ################################################################################
     # Reads an entire PDF object from the buffer and returns it as a Ruby String.
+    # If the object is a content stream, returns both the stream and the dictionary
+    # that describes it
     #
     # id  - the object ID to return
     # gen - the object revision number to return
@@ -166,11 +173,10 @@ class PDF::Reader
       obj = parse_token
       post_obj = parse_token
       case post_obj
       when "endobj"   then return obj
-      when "stream"   then return stream(obj)
-      else              raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
+      when "stream"   then return obj, stream(obj)
+      else            raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
       end
     end
     ################################################################################
@@ -178,6 +184,7 @@ class PDF::Reader
     def stream (dict)
       raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?('Length')
       data = @buffer.read(@xref.object(dict['Length']))
       Error.str_assert(parse_token, "endstream")
       Error.str_assert(parse_token, "endobj")
@@ -193,9 +200,6 @@ class PDF::Reader
         end
       end
-      # this stream is a cmap
-      data = PDF::Reader::CMap.new(data) if data.include?("begincmap") && data.include?("endcmap")
       data
     end
     ################################################################################

data/lib/pdf/reader/xref.rb CHANGED Viewed

@@ -42,44 +42,69 @@ class PDF::Reader
     #
     # Raises a MalformedPDFError if there is no xref table at the requested offset.
     def load (offset = nil)
-      @buffer.seek(offset || @buffer.find_first_xref_offset)
+      offset ||= @buffer.find_first_xref_offset
+      @buffer.seek(offset)
       token = @buffer.token
-      if token == "xref"
+      if token == "xref" || token == "ref"
         load_xref_table
+      else
+        raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
       end
     end
     ################################################################################
     # Return a string containing the contents of an entire PDF object. The object is requested
     # by specifying a PDF::Reader::Reference object that contains the object's ID and revision
     # number
+    #
+    # If the object is a stream, that is returned as well
     def object (ref, save_pos = true)
       return ref unless ref.kind_of?(Reference)
       pos = @buffer.pos if save_pos
-      parser = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
+      obj, stream = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
       @buffer.seek(pos) if save_pos
-      parser
+      if stream
+        return obj, stream
+      else
+        return obj
+      end
     end
     ################################################################################
     # Assumes the underlying buffer is positioned at the start of an Xref table and
     # processes it into memory.
     def load_xref_table
-      objid, count = @buffer.token.to_i, @buffer.token.to_i
+      tok_one = tok_two = nil
+      begin
+        # loop over all subsections of the xref table
+        # In a well formed PDF, the 'trailer' token will indicate
+        # the end of the table. However we need to be careful in case
+        # we're processing a malformed pdf that is missing the trailer.
+        loop do
+          tok_one, tok_two = @buffer.token, @buffer.token
+          if tok_one != "trailer" && !tok_one.match(/\d+/)
+            raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
+          end
+          break if tok_one == "trailer" or tok_one.nil?
+          objid, count = tok_one.to_i, tok_two.to_i
-      count.times do
-        offset = @buffer.token.to_i
-        generation = @buffer.token.to_i
-        state = @buffer.token
+          count.times do
+            offset = @buffer.token.to_i
+            generation = @buffer.token.to_i
+            state = @buffer.token
-        store(objid, generation, offset) if state == "n"
-        objid += 1
+            store(objid, generation, offset) if state == "n"
+            objid += 1
+          end
+        end
+      rescue EOFError => e
+        raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
       end
-      raise MalformedPDFError, "PDF malformed, missing trailer after cross reference" unless @buffer.token == "trailer"
-      raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless @buffer.token == "<<"
+      raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless tok_two == "<<"
       trailer = Parser.new(@buffer, self).dictionary
-      load(trailer['Prev']) if trailer.has_key?('Prev')
+      load(trailer['Prev'].to_i) if trailer.has_key?('Prev')
       trailer
     end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pdf-reader
 version: !ruby/object:Gem::Version
-  version: 0.6.1
+  version: 0.6.2
 platform: ruby
 authors:
 - Peter Jones
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-03-12 00:00:00 +11:00
+date: 2008-03-22 00:00:00 +11:00
 default_executable:
 dependencies: []