RubyGems - pdf-reader - Versions diffs - 0.8.3 → 0.8.4 - Mend

pdf-reader 0.8.3 → 0.8.4

Files changed (10) hide show

data/CHANGELOG CHANGED

@@ -1,3 +1,13 @@
+v0.8.4 (XXX)
+- fix parsing of files that use Form XObjects
+  - thanks to Andrea Barisani for reporting the issue
+- fix two issues that caused a small number of characters to convert to Unicode
+  incorrectly
+  - thanks to Andrea Barisani for reporting the issue
+- require 'pdf-reader' now works as well as 'pdf/reader'
+  - good practice to have the require file match the gem name
+  - thanks to Chris O'Meara for highlighting this
 v0.8.3 (14th February 2010)
 - Fix a bug in tokenising of hex strings inside dictionaries
   - Thanks to Brad Ediger for detecting the issue and proposing a solution

data/Rakefile CHANGED

@@ -6,7 +6,7 @@ require 'rake/testtask'
 require "rake/gempackagetask"
 require 'spec/rake/spectask'
-PKG_VERSION = "0.8.3"
+PKG_VERSION = "0.8.4"
 PKG_NAME = "pdf-reader"
 PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"

data/bin/pdf_text CHANGED

@@ -29,6 +29,7 @@ class PageTextReceiver
   def show_text_with_positioning(*params)
     params = params.first
+    params ||= []
     params.each { |str| show_text(str) if str.kind_of?(String)}
   end
 end

data/examples/version.rb ADDED

@@ -0,0 +1,25 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# Determine the PDF version of a file
+require 'rubygems'
+require 'pdf/reader'
+class VersionReceiver
+  attr_accessor :version
+  def initialize
+    @version = nil
+  end
+  # Called when document parsing starts
+  def pdf_version(arg = nil)
+    @version = arg
+  end
+end
+receiver = VersionReceiver.new
+pdf = PDF::Reader.file(ARGV.shift, receiver)
+puts receiver.version

data/lib/pdf-reader.rb ADDED

@@ -0,0 +1 @@
+require "pdf/reader"

data/lib/pdf/reader/cmap.rb CHANGED

@@ -9,10 +9,10 @@
 # distribute, sublicense, and/or sell copies of the Software, and to
 # permit persons to whom the Software is furnished to do so, subject to
 # the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be
 # included in all copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -30,26 +30,33 @@ class PDF::Reader
       @map = {}
       in_char_mode = false
       in_range_mode = false
+      instructions = ""
       data.each_line do |l|
         if l.include?("beginbfchar")
-          in_char_mode = true
+          in_char_mode = true
         elsif l.include?("endbfchar")
-          in_char_mode = false
+          process_bfchar_instructions(instructions)
+          instructions = ""
+          in_char_mode = false
         elsif l.include?("beginbfrange")
-          in_range_mode = true
+          in_range_mode = true
         elsif l.include?("endbfrange")
-          in_range_mode = false
+          process_bfrange_instructions(instructions)
+          instructions = ""
+          in_range_mode = false
         end
-        if in_char_mode
-          process_bfchar_line(l)
-        elsif in_range_mode
-          process_bfrange_line(l)
+        if !l.include?("begin") && (in_char_mode || in_range_mode)
+          instructions << l
         end
       end
     end
+    def size
+      @map.size
+    end
     def decode(c)
       # TODO: implement the conversion
       return c unless c.class == Fixnum
@@ -58,24 +65,72 @@ class PDF::Reader
     private
-    def process_bfchar_line(l)
-      m, find, replace = *l.match(/<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>/)
-      @map["0x#{find}".hex] = "0x#{replace}".hex if find && replace
+    def build_parser(instructions)
+      buffer = Buffer.new(StringIO.new(instructions))
+      Parser.new(buffer)
+    end
+    def str_to_int(str)
+      return nil if str.nil? || str.size == 0 || str.size >= 3
+      if str.size == 1
+        str.unpack("C*")[0]
+      else
+        str.unpack("n*")[0]
+      end
     end
-    def process_bfrange_line(l)
-      m, start_code, end_code, dst = *l.match(/<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>/)
-      if start_code && end_code && dst
-        start_code = "0x#{start_code}".hex
-        end_code   = "0x#{end_code}".hex
-        dst        = "0x#{dst}".hex
-        # add all values in the range to our mapping
-        (start_code..end_code).each_with_index do |val, idx|
-          @map[val] = dst + idx
-          # ensure a single range does not exceed 255 chars
-          raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if idx > 255
+    def process_bfchar_instructions(instructions)
+      parser  = build_parser(instructions)
+      find    = str_to_int(parser.parse_token)
+      replace = str_to_int(parser.parse_token)
+      while find && replace
+        @map[find] = replace
+        find       = str_to_int(parser.parse_token)
+        replace    = str_to_int(parser.parse_token)
+      end
+    end
+    def process_bfrange_instructions(instructions)
+      parser  = build_parser(instructions)
+      start   = parser.parse_token
+      finish  = parser.parse_token
+      to      = parser.parse_token
+      while start && finish && to
+        if start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(String)
+          bfrange_type_one(start, finish, to)
+        elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
+          bfrange_type_two(start, finish, to)
+        else
+          raise "invalid bfrange section"
         end
+        start   = parser.parse_token
+        finish  = parser.parse_token
+        to      = parser.parse_token
+      end
+    end
+    def bfrange_type_one(start_code, end_code, dst)
+      start_code = str_to_int(start_code)
+      end_code   = str_to_int(end_code)
+      dst        = str_to_int(dst)
+      # add all values in the range to our mapping
+      (start_code..end_code).each_with_index do |val, idx|
+        @map[val] = dst + idx
+        # ensure a single range does not exceed 255 chars
+        raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if idx > 255
+      end
+    end
+    def bfrange_type_two(start_code, end_code, dst)
+      start_code = str_to_int(start_code)
+      end_code   = str_to_int(end_code)
+      from_range = (start_code..end_code)
+      # add all values in the range to our mapping
+      from_range.each_with_index do |val, idx|
+        @map[val] = str_to_int(dst[idx])
       end
     end
   end

data/lib/pdf/reader/content.rb CHANGED

@@ -251,7 +251,6 @@ class PDF::Reader
     def initialize (receiver, xref)
       @receiver = receiver
       @xref     = xref
-      @fonts ||= {}
     end
     ################################################################################
     # Begin processing the document metadata
@@ -309,10 +308,14 @@ class PDF::Reader
           contents = [page[:Contents]]
         end
-        contents.each do |content|
-          obj = @xref.object(content)
-          content_stream(obj)
-        end if page.has_key?(:Contents) and page[:Contents]
+        fonts = font_hash_from_resources(current_resources)
+        if page.has_key?(:Contents) and page[:Contents]
+          contents.each do |content|
+            obj = @xref.object(content)
+            content_stream(obj, fonts)
+          end
+        end
         resources.pop if res
         callback(:end_page)
@@ -330,7 +333,8 @@ class PDF::Reader
         callback(:begin_form_xobject)
         resources = @xref.object(xobject.hash[:Resources])
         walk_resources(resources) if resources
-        content_stream(xobject)
+        fonts = font_hash_from_resources(resources)
+        content_stream(xobject, fonts)
         callback(:end_form_xobject)
       end
     end
@@ -348,42 +352,43 @@ class PDF::Reader
     ################################################################################
     # Reads a PDF content stream and calls all the appropriate callback methods for the operators
     # it contains
-    def content_stream (instructions)
+    def content_stream (instructions, fonts = {})
       instructions = instructions.unfiltered_data if instructions.kind_of?(PDF::Reader::Stream)
-      @buffer =   Buffer.new(StringIO.new(instructions))
-      @parser =   Parser.new(@buffer, @xref)
-      @params ||= []
+      buffer       = Buffer.new(StringIO.new(instructions))
+      parser       = Parser.new(buffer, @xref)
+      current_font = nil
+      params       = []
-      while (token = @parser.parse_token(OPERATORS))
+      while (token = parser.parse_token(OPERATORS))
         if token.kind_of?(Token) and OPERATORS.has_key?(token)
-          @current_font = @params.first if OPERATORS[token] == :set_text_font_and_size
+          current_font = params.first if OPERATORS[token] == :set_text_font_and_size
           # handle special cases in response to certain operators
-          if OPERATORS[token].to_s.include?("show_text") && @fonts[@current_font]
+          if OPERATORS[token].to_s.include?("show_text") && fonts[current_font]
             # convert any text to utf-8
-            @params = @fonts[@current_font].to_utf8(@params)
+            params = fonts[current_font].to_utf8(params)
           elsif token == "ID"
             # inline image data, first convert the current params into a more familiar hash
             map = {}
-            @params.each_slice(2) do |a|
+            params.each_slice(2) do |a|
               map[a.first] = a.last
             end
-            @params = [map]
+            params = [map]
             # read the raw image data from the buffer without tokenising
-            @params << @buffer.read_until("EI")
+            params << buffer.read_until("EI")
           end
-          callback(OPERATORS[token], @params)
+          callback(OPERATORS[token], params)
           if OPERATORS[token] == :invoke_xobject
-            xobject_label = @params.first
-            @params.clear
+            xobject_label = params.first
+            params.clear
             walk_xobject_form(xobject_label)
           else
-            @params.clear
+            params.clear
           end
         else
-          @params << token
+          params << token
         end
       end
     rescue EOFError => e
@@ -430,24 +435,9 @@ class PDF::Reader
       # extract any font information
       if resources[:Font]
-        @xref.object(resources[:Font]).each do |label, desc|
-          desc = @xref.object(desc)
-          @fonts[label] = PDF::Reader::Font.new
-          @fonts[label].label = label
-          @fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
-          @fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
-          @fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
-          @fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
-          if desc[:ToUnicode]
-            # this stream is a cmap
-            begin
-              stream = desc[:ToUnicode]
-              @fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
-            rescue
-              # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
-            end
-          end
-          callback(:resource_font, [label, @fonts[label]])
+        fonts = font_hash_from_resources(resources)
+        fonts.each do  |label, font|
+          callback(:resource_font, [label, fonts])
         end
       end
     end
@@ -473,6 +463,32 @@ class PDF::Reader
     end
     ################################################################################
     private
+    ################################################################################
+    def font_hash_from_resources(resources)
+      return {} unless resources.respond_to?(:[])
+      fonts = {}
+      resources = @xref.object(resources[:Font]) || {}
+      resources.each do |label, desc|
+        desc = @xref.object(desc)
+        fonts[label] = PDF::Reader::Font.new
+        fonts[label].label = label
+        fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
+        fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
+        fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
+        fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
+        if desc[:ToUnicode]
+          # this stream is a cmap
+          begin
+            stream = desc[:ToUnicode]
+            fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
+          rescue
+            # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
+          end
+        end
+      end
+      fonts
+    end
     # strings outside of page content should be in either PDFDocEncoding or UTF-16.
     def decode_strings(obj)
       case obj

data/lib/pdf/reader/encoding.rb CHANGED

@@ -30,9 +30,11 @@ class PDF::Reader
     UNKNOWN_CHAR = 0x25AF # ▯
-    attr_reader :differences
+    attr_reader :differences, :unpack
     def initialize(enc)
+      @to_unicode_required = false
       if enc.kind_of?(Hash)
         self.differences=enc[:Differences] if enc[:Differences]
         enc = enc[:Encoding] || enc[:BaseEncoding]
@@ -74,6 +76,10 @@ class PDF::Reader
       end
     end
+    def to_unicode_required?
+      @to_unicode_required
+    end
     # set the differences table for this encoding. should be an array in the following format:
     #
     #   [25, :A, 26, :B]
@@ -101,25 +107,22 @@ class PDF::Reader
     # convert the specified string to utf8
     def to_utf8(str, tounicode = nil)
       # unpack the single bytes
-      array_orig = str.unpack(@unpack)
+      array_orig = str.unpack(unpack)
       # replace any relevant bytes with a glyph name
       array_orig = process_differences(array_orig)
       # replace any remaining bytes with a unicode codepoint
-      array_enc = []
-      array_orig.each do |num|
+      array_enc = array_orig.map do |num|
         if tounicode && (code = tounicode.decode(num))
-          array_enc << code
-        elsif tounicode || ( tounicode.nil? && defined?(@to_unicode_required) &&
-                                               @to_unicode_required )
-          array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
-        elsif defined?(@mapping) && @mapping && @mapping[num]
-          array_enc << @mapping[num]
+          code
+        elsif tounicode || ( tounicode.nil? && to_unicode_required? )
+          PDF::Reader::Encoding::UNKNOWN_CHAR
+        elsif mapping[num]
+          mapping[num]
         else
-          array_enc << num
+          num
         end
       end
@@ -140,6 +143,14 @@ class PDF::Reader
     private
+    def mapping
+      @mapping ||= {}
+    end
+    def has_mapping?
+      mapping.size > 0
+    end
     # accepts an array of byte numbers, and replaces any that have entries in the differences table
     # with a glyph name
     def process_differences(arr)
@@ -154,12 +165,13 @@ class PDF::Reader
     end
     def load_mapping(file)
-      @mapping = {}
+      return if has_mapping?
       RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
       File.open(file, mode) do |f|
         f.each do |l|
           m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
-          @mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
+          mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
         end
       end
     end

data/lib/pdf/reader/parser.rb CHANGED

@@ -33,7 +33,7 @@ class PDF::Reader
     #
     # buffer - a PDF::Reader::Buffer object that contains PDF data
     # xref   - a PDF::Reader::XRef object that represents the document's object offsets
-    def initialize (buffer, xref)
+    def initialize (buffer, xref=nil)
       @buffer = buffer
       @xref   = xref
     end
@@ -48,7 +48,7 @@ class PDF::Reader
       case token
       when PDF::Reader::Reference     then return token
       when nil                        then return nil
-      when "/"                        then return @buffer.token.to_sym
+      when "/"                        then return pdf_name()
       when "<<"                       then return dictionary()
       when "["                        then return array()
       when "("                        then return string()
@@ -107,6 +107,16 @@ class PDF::Reader
       dict
     end
     ################################################################################
+    # reads a PDF name from the buffer and converts it to a Ruby Symbol
+    def pdf_name
+      tok = @buffer.token
+      tok.scan(/#(\d\d)/).each do |find|
+        replace = find[0].hex.chr
+        tok.gsub!("#"+find[0], replace)
+      end
+      tok.to_sym
+    end
+    ################################################################################
     # reads a PDF array from the buffer and converts it to a Ruby Array.
     def array
       a = []
@@ -141,6 +151,7 @@ class PDF::Reader
       return "" if str == ")"
       Error.assert_equal(parse_token, ")")
+      str.gsub!(/([^\\])(\n\r|\r\n|\r)/m,'\1\n')
       str.gsub!("\\n","\n")
       str.gsub!("\\r","\r")
       str.gsub!("\\t","\t")
@@ -150,7 +161,6 @@ class PDF::Reader
       str.gsub!("\\)",")")
       str.gsub!("\\\\","\\")
       str.gsub!(/\\\n/m,"")
-      str.gsub!(/(\n\r|\r\n|\r)/m,"\n")
       str.scan(/\\\d{1,3}/).each do |octal|
         str.gsub!(octal, octal[1,3].oct.chr)

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pdf-reader
 version: !ruby/object:Gem::Version
-  version: 0.8.3
+  version: 0.8.4
 platform: ruby
 authors:
 - James Healy
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-02-14 00:00:00 +11:00
+date: 2010-03-30 00:00:00 +05:30
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -43,6 +43,7 @@ files:
 - examples/hash.rb
 - examples/callbacks.rb
 - examples/text.rb
+- examples/version.rb
 - examples/page_counter_improved.rb
 - lib/pdf/reader/glyphlist.txt
 - lib/pdf/reader/content.rb
@@ -70,6 +71,7 @@ files:
 - lib/pdf/reader/parser.rb
 - lib/pdf/hash.rb
 - lib/pdf/reader.rb
+- lib/pdf-reader.rb
 - Rakefile
 - README.rdoc
 - TODO