RubyGems - pdf-reader - Versions diffs - 0.8.3 → 0.8.4 - Mend

pdf-reader 0.8.3 → 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

data/CHANGELOG CHANGED

@@ -1,3 +1,13 @@
+v0.8.4 (XXX)
+- fix parsing of files that use Form XObjects
+  - thanks to Andrea Barisani for reporting the issue
+- fix two issues that caused a small number of characters to convert to Unicode
+  incorrectly
+  - thanks to Andrea Barisani for reporting the issue
+- require 'pdf-reader' now works as well as 'pdf/reader'
+  - good practice to have the require file match the gem name
+  - thanks to Chris O'Meara for highlighting this
 v0.8.3 (14th February 2010)
 - Fix a bug in tokenising of hex strings inside dictionaries
   - Thanks to Brad Ediger for detecting the issue and proposing a solution

data/Rakefile CHANGED

@@ -6,7 +6,7 @@ require 'rake/testtask'
 require "rake/gempackagetask"
 require 'spec/rake/spectask'
-PKG_VERSION = "0.8.3"
+PKG_VERSION = "0.8.4"
 PKG_NAME = "pdf-reader"
 PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"

data/bin/pdf_text CHANGED

@@ -29,6 +29,7 @@ class PageTextReceiver
   def show_text_with_positioning(*params)
     params = params.first
+    params ||= []
     params.each { |str| show_text(str) if str.kind_of?(String)}
   end
 end

data/examples/version.rb ADDED

@@ -0,0 +1,25 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# Determine the PDF version of a file
+require 'rubygems'
+require 'pdf/reader'
+class VersionReceiver
+  attr_accessor :version
+  def initialize
+    @version = nil
+  end
+  # Called when document parsing starts
+  def pdf_version(arg = nil)
+    @version = arg
+  end
+end
+receiver = VersionReceiver.new
+pdf = PDF::Reader.file(ARGV.shift, receiver)
+puts receiver.version

data/lib/pdf-reader.rb ADDED

@@ -0,0 +1 @@
+require "pdf/reader"

data/lib/pdf/reader/cmap.rb CHANGED

@@ -9,10 +9,10 @@
 # distribute, sublicense, and/or sell copies of the Software, and to
 # permit persons to whom the Software is furnished to do so, subject to
 # the following conditions:
-#
+#
 # The above copyright notice and this permission notice shall be
 # included in all copies or substantial portions of the Software.
-#
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -30,26 +30,33 @@ class PDF::Reader
       @map = {}
       in_char_mode = false
       in_range_mode = false
+      instructions = ""
       data.each_line do |l|
         if l.include?("beginbfchar")
-          in_char_mode = true
+          in_char_mode = true
         elsif l.include?("endbfchar")
-          in_char_mode = false
+          process_bfchar_instructions(instructions)
+          instructions = ""
+          in_char_mode = false
         elsif l.include?("beginbfrange")
-          in_range_mode = true
+          in_range_mode = true
         elsif l.include?("endbfrange")
-          in_range_mode = false
+          process_bfrange_instructions(instructions)
+          instructions = ""
+          in_range_mode = false
         end
-        if in_char_mode
-          process_bfchar_line(l)
-        elsif in_range_mode
-          process_bfrange_line(l)
+        if !l.include?("begin") && (in_char_mode || in_range_mode)
+          instructions << l
         end
       end
     end
+    def size
+      @map.size
+    end
     def decode(c)
       # TODO: implement the conversion
       return c unless c.class == Fixnum
@@ -58,24 +65,72 @@ class PDF::Reader
     private
-    def process_bfchar_line(l)
-      m, find, replace = *l.match(/<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>/)
-      @map["0x#{find}".hex] = "0x#{replace}".hex if find && replace
+    def build_parser(instructions)
+      buffer = Buffer.new(StringIO.new(instructions))
+      Parser.new(buffer)
+    end
+    def str_to_int(str)
+      return nil if str.nil? || str.size == 0 || str.size >= 3
+      if str.size == 1
+        str.unpack("C*")[0]
+      else
+        str.unpack("n*")[0]
+      end
     end
-    def process_bfrange_line(l)
-      m, start_code, end_code, dst = *l.match(/<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>/)
-      if start_code && end_code && dst
-        start_code = "0x#{start_code}".hex
-        end_code   = "0x#{end_code}".hex
-        dst        = "0x#{dst}".hex
-        # add all values in the range to our mapping
-        (start_code..end_code).each_with_index do |val, idx|
-          @map[val] = dst + idx
-          # ensure a single range does not exceed 255 chars
-          raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if idx > 255
+    def process_bfchar_instructions(instructions)
+      parser  = build_parser(instructions)
+      find    = str_to_int(parser.parse_token)
+      replace = str_to_int(parser.parse_token)
+      while find && replace
+        @map[find] = replace
+        find       = str_to_int(parser.parse_token)
+        replace    = str_to_int(parser.parse_token)
+      end
+    end
+    def process_bfrange_instructions(instructions)
+      parser  = build_parser(instructions)
+      start   = parser.parse_token
+      finish  = parser.parse_token
+      to      = parser.parse_token
+      while start && finish && to
+        if start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(String)
+          bfrange_type_one(start, finish, to)
+        elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
+          bfrange_type_two(start, finish, to)
+        else
+          raise "invalid bfrange section"
         end
+        start   = parser.parse_token
+        finish  = parser.parse_token
+        to      = parser.parse_token
+      end
+    end
+    def bfrange_type_one(start_code, end_code, dst)
+      start_code = str_to_int(start_code)
+      end_code   = str_to_int(end_code)
+      dst        = str_to_int(dst)
+      # add all values in the range to our mapping
+      (start_code..end_code).each_with_index do |val, idx|
+        @map[val] = dst + idx
+        # ensure a single range does not exceed 255 chars
+        raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if idx > 255
+      end
+    end
+    def bfrange_type_two(start_code, end_code, dst)
+      start_code = str_to_int(start_code)
+      end_code   = str_to_int(end_code)
+      from_range = (start_code..end_code)
+      # add all values in the range to our mapping
+      from_range.each_with_index do |val, idx|
+        @map[val] = str_to_int(dst[idx])
       end
     end
   end

data/lib/pdf/reader/content.rb CHANGED

@@ -251,7 +251,6 @@ class PDF::Reader
     def initialize (receiver, xref)
       @receiver = receiver
       @xref     = xref
-      @fonts ||= {}
     end
     ################################################################################
     # Begin processing the document metadata
@@ -309,10 +308,14 @@ class PDF::Reader
           contents = [page[:Contents]]
         end
-        contents.each do |content|
-          obj = @xref.object(content)
-          content_stream(obj)
-        end if page.has_key?(:Contents) and page[:Contents]
+        fonts = font_hash_from_resources(current_resources)
+        if page.has_key?(:Contents) and page[:Contents]
+          contents.each do |content|
+            obj = @xref.object(content)
+            content_stream(obj, fonts)
+          end
+        end
         resources.pop if res
         callback(:end_page)
@@ -330,7 +333,8 @@ class PDF::Reader
         callback(:begin_form_xobject)
         resources = @xref.object(xobject.hash[:Resources])
         walk_resources(resources) if resources
-        content_stream(xobject)
+        fonts = font_hash_from_resources(resources)
+        content_stream(xobject, fonts)
         callback(:end_form_xobject)
       end
     end
@@ -348,42 +352,43 @@ class PDF::Reader
     ################################################################################
     # Reads a PDF content stream and calls all the appropriate callback methods for the operators
     # it contains
-    def content_stream (instructions)
+    def content_stream (instructions, fonts = {})
       instructions = instructions.unfiltered_data if instructions.kind_of?(PDF::Reader::Stream)
-      @buffer =   Buffer.new(StringIO.new(instructions))
-      @parser =   Parser.new(@buffer, @xref)
-      @params ||= []
+      buffer       = Buffer.new(StringIO.new(instructions))
+      parser       = Parser.new(buffer, @xref)
+      current_font = nil
+      params       = []
-      while (token = @parser.parse_token(OPERATORS))
+      while (token = parser.parse_token(OPERATORS))
         if token.kind_of?(Token) and OPERATORS.has_key?(token)
-          @current_font = @params.first if OPERATORS[token] == :set_text_font_and_size
+          current_font = params.first if OPERATORS[token] == :set_text_font_and_size
           # handle special cases in response to certain operators
-          if OPERATORS[token].to_s.include?("show_text") && @fonts[@current_font]
+          if OPERATORS[token].to_s.include?("show_text") && fonts[current_font]
             # convert any text to utf-8
-            @params = @fonts[@current_font].to_utf8(@params)
+            params = fonts[current_font].to_utf8(params)
           elsif token == "ID"
             # inline image data, first convert the current params into a more familiar hash
             map = {}
-            @params.each_slice(2) do |a|
+            params.each_slice(2) do |a|
               map[a.first] = a.last
             end
-            @params = [map]
+            params = [map]
             # read the raw image data from the buffer without tokenising
-            @params << @buffer.read_until("EI")
+            params << buffer.read_until("EI")
           end
-          callback(OPERATORS[token], @params)
+          callback(OPERATORS[token], params)
           if OPERATORS[token] == :invoke_xobject
-            xobject_label = @params.first
-            @params.clear
+            xobject_label = params.first
+            params.clear
             walk_xobject_form(xobject_label)
           else
-            @params.clear
+            params.clear
           end
         else
-          @params << token
+          params << token
         end
       end
     rescue EOFError => e
@@ -430,24 +435,9 @@ class PDF::Reader
       # extract any font information
       if resources[:Font]
-        @xref.object(resources[:Font]).each do |label, desc|
-          desc = @xref.object(desc)
-          @fonts[label] = PDF::Reader::Font.new
-          @fonts[label].label = label
-          @fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
-          @fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
-          @fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
-          @fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
-          if desc[:ToUnicode]
-            # this stream is a cmap
-            begin
-              stream = desc[:ToUnicode]
-              @fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
-            rescue
-              # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
-            end
-          end
-          callback(:resource_font, [label, @fonts[label]])
+        fonts = font_hash_from_resources(resources)
+        fonts.each do  |label, font|
+          callback(:resource_font, [label, fonts])
         end
       end
     end
@@ -473,6 +463,32 @@ class PDF::Reader
     end
     ################################################################################
     private
+    ################################################################################
+    def font_hash_from_resources(resources)
+      return {} unless resources.respond_to?(:[])
+      fonts = {}
+      resources = @xref.object(resources[:Font]) || {}
+      resources.each do |label, desc|
+        desc = @xref.object(desc)
+        fonts[label] = PDF::Reader::Font.new
+        fonts[label].label = label
+        fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
+        fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
+        fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
+        fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
+        if desc[:ToUnicode]
+          # this stream is a cmap
+          begin
+            stream = desc[:ToUnicode]
+            fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
+          rescue
+            # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
+          end
+        end
+      end
+      fonts
+    end
     # strings outside of page content should be in either PDFDocEncoding or UTF-16.
     def decode_strings(obj)
       case obj

data/lib/pdf/reader/encoding.rb CHANGED

@@ -30,9 +30,11 @@ class PDF::Reader
     UNKNOWN_CHAR = 0x25AF # ▯
-    attr_reader :differences
+    attr_reader :differences, :unpack
     def initialize(enc)
+      @to_unicode_required = false
       if enc.kind_of?(Hash)
         self.differences=enc[:Differences] if enc[:Differences]
         enc = enc[:Encoding] || enc[:BaseEncoding]
@@ -74,6 +76,10 @@ class PDF::Reader
       end
     end
+    def to_unicode_required?
+      @to_unicode_required
+    end
     # set the differences table for this encoding. should be an array in the following format:
     #
     #   [25, :A, 26, :B]
@@ -101,25 +107,22 @@ class PDF::Reader
     # convert the specified string to utf8
     def to_utf8(str, tounicode = nil)
       # unpack the single bytes
-      array_orig = str.unpack(@unpack)
+      array_orig = str.unpack(unpack)
       # replace any relevant bytes with a glyph name
       array_orig = process_differences(array_orig)
       # replace any remaining bytes with a unicode codepoint
-      array_enc = []
-      array_orig.each do |num|
+      array_enc = array_orig.map do |num|
         if tounicode && (code = tounicode.decode(num))
-          array_enc << code
-        elsif tounicode || ( tounicode.nil? && defined?(@to_unicode_required) &&
-                                               @to_unicode_required )
-          array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
-        elsif defined?(@mapping) && @mapping && @mapping[num]
-          array_enc << @mapping[num]
+          code
+        elsif tounicode || ( tounicode.nil? && to_unicode_required? )
+          PDF::Reader::Encoding::UNKNOWN_CHAR
+        elsif mapping[num]
+          mapping[num]
         else
-          array_enc << num
+          num
         end
       end
@@ -140,6 +143,14 @@ class PDF::Reader
     private
+    def mapping
+      @mapping ||= {}
+    end
+    def has_mapping?
+      mapping.size > 0
+    end
     # accepts an array of byte numbers, and replaces any that have entries in the differences table
     # with a glyph name
     def process_differences(arr)
@@ -154,12 +165,13 @@ class PDF::Reader
     end
     def load_mapping(file)
-      @mapping = {}
+      return if has_mapping?
       RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
       File.open(file, mode) do |f|
         f.each do |l|
           m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
-          @mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
+          mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
         end
       end
     end

data/lib/pdf/reader/parser.rb CHANGED

@@ -33,7 +33,7 @@ class PDF::Reader
     #
     # buffer - a PDF::Reader::Buffer object that contains PDF data
     # xref   - a PDF::Reader::XRef object that represents the document's object offsets
-    def initialize (buffer, xref)
+    def initialize (buffer, xref=nil)
       @buffer = buffer
       @xref   = xref
     end
@@ -48,7 +48,7 @@ class PDF::Reader
       case token
       when PDF::Reader::Reference     then return token
       when nil                        then return nil
-      when "/"                        then return @buffer.token.to_sym
+      when "/"                        then return pdf_name()
       when "<<"                       then return dictionary()
       when "["                        then return array()
       when "("                        then return string()
@@ -107,6 +107,16 @@ class PDF::Reader
       dict
     end
     ################################################################################
+    # reads a PDF name from the buffer and converts it to a Ruby Symbol
+    def pdf_name
+      tok = @buffer.token
+      tok.scan(/#(\d\d)/).each do |find|
+        replace = find[0].hex.chr
+        tok.gsub!("#"+find[0], replace)
+      end
+      tok.to_sym
+    end
+    ################################################################################
     # reads a PDF array from the buffer and converts it to a Ruby Array.
     def array
       a = []
@@ -141,6 +151,7 @@ class PDF::Reader
       return "" if str == ")"
       Error.assert_equal(parse_token, ")")
+      str.gsub!(/([^\\])(\n\r|\r\n|\r)/m,'\1\n')
       str.gsub!("\\n","\n")
       str.gsub!("\\r","\r")
       str.gsub!("\\t","\t")
@@ -150,7 +161,6 @@ class PDF::Reader
       str.gsub!("\\)",")")
       str.gsub!("\\\\","\\")
       str.gsub!(/\\\n/m,"")
-      str.gsub!(/(\n\r|\r\n|\r)/m,"\n")
       str.scan(/\\\d{1,3}/).each do |octal|
         str.gsub!(octal, octal[1,3].oct.chr)

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pdf-reader
 version: !ruby/object:Gem::Version
-  version: 0.8.3
+  version: 0.8.4
 platform: ruby
 authors:
 - James Healy
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-02-14 00:00:00 +11:00
+date: 2010-03-30 00:00:00 +05:30
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -43,6 +43,7 @@ files:
 - examples/hash.rb
 - examples/callbacks.rb
 - examples/text.rb
+- examples/version.rb
 - examples/page_counter_improved.rb
 - lib/pdf/reader/glyphlist.txt
 - lib/pdf/reader/content.rb
@@ -70,6 +71,7 @@ files:
 - lib/pdf/reader/parser.rb
 - lib/pdf/hash.rb
 - lib/pdf/reader.rb
+- lib/pdf-reader.rb
 - Rakefile
 - README.rdoc
 - TODO