RubyGems - pdf-reader - Versions diffs - 0.6.1 → 0.6.2 - Mend

pdf-reader 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

data/CHANGELOG CHANGED Viewed

@@ -1,3 +1,13 @@
+v0.6.2 (22nd March 2008)
+- Catch low level errors when applying filters to a content stream and raise a MalformedPDFError instead.
+- Added support for processing inline images
+- Support for parsing XRef tables that have multiple subsections
+- Added a few callbacks to improve the way we supply information on page resources
+- Ignore whitespace in hex strings, as required by the spec (section 3.2.3)
+- Use our "unknown character box" when a single character in an Identity-H string fails to decode
+- Support ToUnicode CMaps that use the bfrange operator
+- Tweaked tokenising code to ensure whitespace doesn't get in the way
 v0.6.1 (12th March 2008)
 - Tweaked behaviour when we encounter Identity-H encoded text that doesn't have a ToUnicode mapping. We
   just replace each character with a little box.

data/Rakefile CHANGED Viewed

@@ -6,7 +6,7 @@ require 'rake/testtask'
 require "rake/gempackagetask"
 require 'spec/rake/spectask'
-PKG_VERSION = "0.6.1"
+PKG_VERSION = "0.6.2"
 PKG_NAME = "pdf-reader"
 PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"

data/TODO CHANGED Viewed

@@ -4,7 +4,6 @@ v0.7
   - maybe a third option to Reader.parse?
     parse(io, receiver, {:pages => true, :fonts => false, :metadata => true, :bookmarks => false})
 - detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
-- When parsing a CMap into a ruby object, recognise ranged mappings defined by begincodespacerange (see spec, section 5.9.2)
 - Provide a way to get raw access to a particular object. Good for testing purposes
 v0.8
@@ -14,10 +13,14 @@ v0.8
 v0.9
 - Support for CJK text (convert to UTF-8 like all other encodings. See Section 5.9 of the PDF spec)
+  - Will require significantly improved handling of CMaps, including creating a bunch of predefined ones
 - Add a way to extract raster images
+  - see XObjects section of spec (section 4.7)
+- Add a way to extract font data?
 Sometime
+- Work out why specs/data/zlib*.pdf isn't parsed correctly when all the major PDF viewers can display it correctly
 - Ship some extra receivers in the standard package, particularly ones that are useful for running
   rspec over generated PDF files
@@ -33,3 +36,5 @@ Sometime
   - Identity-V (I *think* this relates to vertical text. Not sure how we'd support it sensibly)
 - Investigate how R->L text is handled
+- Add support for object streams (spec section 3.4.6)

data/lib/pdf/reader.rb CHANGED Viewed

@@ -87,7 +87,6 @@ require 'pdf/reader/text_receiver'
 require 'pdf/reader/token'
 require 'pdf/reader/xref'
 class PDF::Reader
   ################################################################################
   # Initialize a new PDF::Reader

data/lib/pdf/reader/buffer.rb CHANGED Viewed

@@ -56,6 +56,24 @@ class PDF::Reader
       out
     end
     ################################################################################
+    # Reads from the buffer until the specified token is found, or the end of the buffer
+    #
+    # bytes - the bytes to search for.
+    def read_until(bytes)
+      out = ""
+      size = bytes.size
+      loop do
+        out << @io.read(1)
+        if out[-1 * size,size].eql?(bytes)
+          out = out[0, out.size - size]
+          seek(pos - size)
+          break
+        end
+      end
+      out
+    end
+    ################################################################################
     # returns true if the underlying IO object is at end and the internal buffer
     # is empty
     def eof?
@@ -71,21 +89,21 @@ class PDF::Reader
     end
     ################################################################################
     # PDF files are processed by tokenising the content into a series of objects and commands.
-    # This prepares the buffer for use by rerading the next line of tokens into memory.
+    # This prepares the buffer for use by reading the next line of tokens into memory.
     def ready_token (with_strip=true, skip_blanks=true)
       while @buffer.nil? or @buffer.empty?
         @buffer = @io.readline
         @buffer.sub!(/%.*$/, '')
         @buffer.chomp!
-        @buffer.lstrip! if with_strip
         break unless skip_blanks
       end
+      @buffer.lstrip! if with_strip
     end
     ################################################################################
     # return the next token from the underlying IO stream
     def token
       ready_token
       i = @buffer.index(/[\[\]()<>{}\s\/]/) || @buffer.size
       token_chars =

data/lib/pdf/reader/cmap.rb CHANGED Viewed

@@ -28,12 +28,24 @@ class PDF::Reader
     def initialize(data)
       @map = {}
-      inmap = false
+      in_char_mode = false
+      in_range_mode = false
       data.each_line do |l|
-        inmap = true if l.include?("beginbfchar")
-        if inmap
-          m, find, replace = *l.match(/<([0-9a-fA-F]+)> <([0-9a-fA-F]+)>/)
-          @map["0x#{find}".hex] = "0x#{replace}".hex if find && replace
+        if l.include?("beginbfchar")
+          in_char_mode = true
+        elsif l.include?("endbfchar")
+          in_char_mode = false
+        elsif l.include?("beginbfrange")
+          in_range_mode = true
+        elsif l.include?("endbfrange")
+          in_range_mode = false
+        end
+        if in_char_mode
+          process_bfchar_line(l)
+        elsif in_range_mode
+          process_bfrange_line(l)
         end
       end
     end
@@ -44,5 +56,29 @@ class PDF::Reader
       @map[c]
     end
+    private
+    def process_bfchar_line(l)
+      m, find, replace = *l.match(/<([0-9a-fA-F]+)> <([0-9a-fA-F]+)>/)
+      @map["0x#{find}".hex] = "0x#{replace}".hex if find && replace
+    end
+    def process_bfrange_line(l)
+      m, start_code, end_code, dst = *l.match(/<([0-9a-fA-F]+)> <([0-9a-fA-F]+)> <([0-9a-fA-F]+)>/)
+      if start_code && end_code && dst
+        start_code = "0x#{start_code}".hex
+        end_code   = "0x#{end_code}".hex
+        dst        = "0x#{dst}".hex
+        incr       = 0
+        # add all values in the range to our mapping
+        (start_code..end_code).each do |val|
+          @map[val] = dst + incr
+          incr += 1
+          # ensure a single range does not exceed 255 chars
+          raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if incr > 255
+        end
+      end
+    end
   end
 end

data/lib/pdf/reader/content.rb CHANGED Viewed

@@ -23,6 +23,7 @@
 #
 ################################################################################
 require 'stringio'
+#require 'enumerable'
 class PDF::Reader
   ################################################################################
@@ -144,6 +145,25 @@ class PDF::Reader
   # - end_page_container
   # - begin_page
   # - end_page
+  #
+  # == Resource Callbacks
+  #
+  # Each page and page_container can contain a range of resources required for the page,
+  # including things like fonts and images. The following callbacks may appear
+  # after begin_page_container and begin_page if the relevant resources exist
+  # on a page:
+  #
+  # In most cases, these callbacks associate a name with each resource, allowing it
+  # to be referred to by name in the page content. For example, an XObject can hold an image.
+  # If it gets mapped to the name "IM1", then it can be placed on the page using
+  # invoke_xobject "IM1".
+  #
+  # - resource_procset
+  # - resource_xobject
+  # - resource_extgstate
+  # - resource_colorspace
+  # - resource_pattern
+  # - resource_font
   class Content
     OPERATORS = {
       'b'   => :close_fill_stroke,
@@ -240,20 +260,27 @@ class PDF::Reader
     # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
     # its content
     def walk_pages (page)
-      resolve_resources(@xref.object(page['Resources'])) if page['Resources']
+      if page['Resources']
+        res = page['Resources']
+        page.delete('Resources')
+      end
       # extract page content
       if page['Type'] == "Pages"
         callback(:begin_page_container, [page])
+        walk_resources(@xref.object(res)) if res
         page['Kids'].each {|child| walk_pages(@xref.object(child))}
         callback(:end_page_container)
       elsif page['Type'] == "Page"
         callback(:begin_page, [page])
+        walk_resources(@xref.object(res)) if res
         @page = page
         @params = []
         page['Contents'].to_a.each do |cstream|
-          content_stream(@xref.object(cstream))
+          obj, stream = @xref.object(cstream)
+          content_stream(stream)
         end if page.has_key?('Contents') and page['Contents']
         callback(:end_page)
@@ -274,9 +301,19 @@ class PDF::Reader
           if token.kind_of?(Token) and OPERATORS.has_key?(token)
             @current_font = @params.first if OPERATORS[token] == :set_text_font_and_size
-            # convert any text to utf-8
+            # handle special cases in response to certain operators
             if OPERATORS[token].to_s.include?("show_text") && @fonts[@current_font]
+              # convert any text to utf-8
               @params = @fonts[@current_font].to_utf8(@params)
+            elsif token == "ID"
+              # inline image data, first convert the current params into a more familiar hash
+              map = {}
+              @params.each_slice(2) do |a|
+                map[a.first] = a.last
+              end
+              @params = [map]
+              # read the raw image data from the buffer without tokenising
+              @params << @buffer.read_until("EI")
             end
             callback(OPERATORS[token], @params)
             @params.clear
@@ -289,7 +326,43 @@ class PDF::Reader
     rescue EOFError => e
     end
     ################################################################################
-    def resolve_resources(resources)
+    def walk_resources(resources)
+      resources = resolve_references(resources)
+      # extract any procset information
+      if resources['ProcSet']
+        callback(:resource_procset, resources['ProcSet'])
+      end
+      # extract any xobject information
+      if resources['XObject']
+        @xref.object(resources['XObject']).each do |name, val|
+          obj, stream = @xref.object(val)
+          callback(:resource_xobject, [name, obj, stream])
+        end
+      end
+      # extract any extgstate information
+      if resources['ExtGState']
+        @xref.object(resources['ExtGState']).each do |name, val|
+          callback(:resource_extgstate, [name, @xref.object(val)])
+        end
+      end
+      # extract any colorspace information
+      if resources['ColorSpace']
+        @xref.object(resources['ColorSpace']).each do |name, val|
+          callback(:resource_colorspace, [name, @xref.object(val)])
+        end
+      end
+      # extract any pattern information
+      if resources['Pattern']
+        @xref.object(resources['Pattern']).each do |name, val|
+          callback(:resource_pattern, [name, @xref.object(val)])
+        end
+      end
       # extract any font information
       if resources['Font']
         @xref.object(resources['Font']).each do |label, desc|
@@ -301,15 +374,29 @@ class PDF::Reader
           @fonts[label].encoding = PDF::Reader::Encoding.factory(@xref.object(desc['Encoding']))
           @fonts[label].descendantfonts = desc['DescendantFonts'] if desc['DescendantFonts']
           if desc['ToUnicode']
-            @fonts[label].tounicode = desc['ToUnicode']
-            @fonts[label].tounicode = @xref.object(@fonts[label].tounicode)
+            obj, cmap = @xref.object(desc['ToUnicode'])
+            # this stream is a cmap
+            begin
+              @fonts[label].tounicode = PDF::Reader::CMap.new(cmap)
+            rescue
+              # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
+            end
           end
+          callback(:resource_font, [label, @fonts[label]])
         end
       end
-      #@fonts.each do |key,val|
-      #  puts "#{key}: #{val.inspect}"
-      #  puts
-      #end
+    end
+    ################################################################################
+    # Convert any PDF::Reader::Reference objects into a real object
+    def resolve_references(obj)
+      case obj
+      when PDF::Reader::Reference then resolve_references(@xref.object(obj))
+      when Hash                   then obj.each { |key,val| obj[key] = resolve_references(val) }
+      when Array                  then obj.collect { |item| resolve_references(item) }
+      else
+        obj
+      end
     end
     ################################################################################
     # calls the name callback method on the receiver class with params as the arguments

data/lib/pdf/reader/encoding.rb CHANGED Viewed

@@ -111,12 +111,13 @@ class PDF::Reader
         # iterate over string, reading it in 2 byte chunks and interpreting those
         # chunks as ints
         str.unpack("n*").each do |c|
           # convert the int to a unicode codepoint if possible.
           # without a ToUnicode CMap, it's impossible to reliably convert this text
           # to unicode, so just replace each character with a little box. Big smacks
           # to the PDF producing app.
-          if map
-            array_enc << map.decode(c)
+          if map && (code = map.decode(c))
+            array_enc << code
           else
             array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
           end

data/lib/pdf/reader/filter.rb CHANGED Viewed

@@ -40,20 +40,27 @@ class PDF::Reader
       case name
       when "FlateDecode"    then @filter = :flate
-      else                    raise UnsupportedFeatureError, "Unknown filter: #{name}"
+      #else                    raise UnsupportedFeatureError, "Unknown filter: #{name}"
       end
     end
     ################################################################################
     # attempts to decode the specified data with the current filter
     def filter (data)
+      # leave the data untouched if we don't support the required filter
+      return data if @filter.nil?
+      # decode the data
       self.send(@filter, data)
     end
     ################################################################################
     # Decode the specified data with the Zlib compression algorithm
     def flate (data)
-      z = Zlib::Inflate.new
-      z << data
-      z.inflate(nil)
+      begin
+        z = Zlib::Inflate.new
+        z.inflate(data)
+      rescue Exception => e
+        raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
+      end
     end
     ################################################################################
   end

data/lib/pdf/reader/parser.rb CHANGED Viewed

@@ -9,10 +9,10 @@
 # distribute, sublicense, and/or sell copies of the Software, and to
 # permit persons to whom the Software is furnished to do so, subject to
 # the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be
 # included in all copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -58,9 +58,9 @@ class PDF::Reader
       when "obj", "endobj"            then return Token.new(token)
       when "stream", "endstream"      then return Token.new(token)
       when ">>", "]", ">"             then return Token.new(token)
-      else
+      else
         if operators.has_key?(token)  then return Token.new(token)
-        else                            return token.to_f
+        else                          return token.to_f
         end
       end
     end
@@ -72,7 +72,7 @@ class PDF::Reader
       loop do
         key = parse_token
         break if key.kind_of?(Token) and key == ">>"
-        raise MalformedPDFError, "PDF malformed, dictionary key is not a name" unless key.kind_of?(Name)
+        raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name" unless key.kind_of?(Name)
         value = parse_token
         value.kind_of?(Token) and Error.str_assert_not(value, ">>")
@@ -97,9 +97,15 @@ class PDF::Reader
     ################################################################################
     # Reads a PDF hex string from the buffer and converts it to a Ruby String
     def hex_string
-      str = @buffer.token
-      Error.str_assert(@buffer.token, ">")
+      str = ""
+      loop do
+        token = @buffer.token
+        break if token == ">"
+        str << token
+      end
+      # add a missing digit if required, as required by the spec
       str << "0" unless str.size % 2 == 0
       str.scan(/../).map {|i| i.hex.chr}.join
     end
@@ -151,11 +157,12 @@ class PDF::Reader
         @buffer.head(to_remove, false)
       end
       str
     end
     ################################################################################
     # Reads an entire PDF object from the buffer and returns it as a Ruby String.
+    # If the object is a content stream, returns both the stream and the dictionary
+    # that describes it
     #
     # id  - the object ID to return
     # gen - the object revision number to return
@@ -166,11 +173,10 @@ class PDF::Reader
       obj = parse_token
       post_obj = parse_token
       case post_obj
       when "endobj"   then return obj
-      when "stream"   then return stream(obj)
-      else              raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
+      when "stream"   then return obj, stream(obj)
+      else            raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
       end
     end
     ################################################################################
@@ -178,6 +184,7 @@ class PDF::Reader
     def stream (dict)
       raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?('Length')
       data = @buffer.read(@xref.object(dict['Length']))
       Error.str_assert(parse_token, "endstream")
       Error.str_assert(parse_token, "endobj")
@@ -193,9 +200,6 @@ class PDF::Reader
         end
       end
-      # this stream is a cmap
-      data = PDF::Reader::CMap.new(data) if data.include?("begincmap") && data.include?("endcmap")
       data
     end
     ################################################################################

data/lib/pdf/reader/xref.rb CHANGED Viewed

@@ -42,44 +42,69 @@ class PDF::Reader
     #
     # Will fail silently if there is no xref table at the requested offset.
     def load (offset = nil)
-      @buffer.seek(offset || @buffer.find_first_xref_offset)
+      offset ||= @buffer.find_first_xref_offset
+      @buffer.seek(offset)
       token = @buffer.token
-      if token == "xref"
+      if token == "xref" || token == "ref"
         load_xref_table
+      else
+        raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
       end
     end
     ################################################################################
     # Return a string containing the contents of an entire PDF object. The object is requested
     # by specifying a PDF::Reader::Reference object that contains the object's ID and revision
     # number
+    #
+    # If the object is a stream, that is returned as well
     def object (ref, save_pos = true)
       return ref unless ref.kind_of?(Reference)
       pos = @buffer.pos if save_pos
-      parser = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
+      obj, stream = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
       @buffer.seek(pos) if save_pos
-      parser
+      if stream
+        return obj, stream
+      else
+        return obj
+      end
     end
     ################################################################################
     # Assumes the underlying buffer is positioned at the start of an Xref table and
     # processes it into memory.
     def load_xref_table
-      objid, count = @buffer.token.to_i, @buffer.token.to_i
+      tok_one = tok_two = nil
+      begin
+        # loop over all subsections of the xref table
+        # In a well formed PDF, the 'trailer' token will indicate
+        # the end of the table. However we need to be careful in case
+        # we're processing a malformed pdf that is missing the trailer.
+        loop do
+          tok_one, tok_two = @buffer.token, @buffer.token
+          if tok_one != "trailer" && !tok_one.match(/\d+/)
+            raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
+          end
+          break if tok_one == "trailer" or tok_one.nil?
+          objid, count = tok_one.to_i, tok_two.to_i
-      count.times do
-        offset = @buffer.token.to_i
-        generation = @buffer.token.to_i
-        state = @buffer.token
+          count.times do
+            offset = @buffer.token.to_i
+            generation = @buffer.token.to_i
+            state = @buffer.token
-        store(objid, generation, offset) if state == "n"
-        objid += 1
+            store(objid, generation, offset) if state == "n"
+            objid += 1
+          end
+        end
+      rescue EOFError => e
+        raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
       end
-      raise MalformedPDFError, "PDF malformed, missing trailer after cross reference" unless @buffer.token == "trailer"
-      raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless @buffer.token == "<<"
+      raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless tok_two == "<<"
       trailer = Parser.new(@buffer, self).dictionary
-      load(trailer['Prev']) if trailer.has_key?('Prev')
+      load(trailer['Prev'].to_i) if trailer.has_key?('Prev')
       trailer
     end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pdf-reader
 version: !ruby/object:Gem::Version
-  version: 0.6.1
+  version: 0.6.2
 platform: ruby
 authors:
 - Peter Jones
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-03-12 00:00:00 +11:00
+date: 2008-03-22 00:00:00 +11:00
 default_executable:
 dependencies: []