RubyGems - pdf-reader - Versions diffs - 0.8.6 → 0.9.0 - Mend

pdf-reader 0.8.6 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

data/CHANGELOG +17 -0
data/README.rdoc +7 -15
data/Rakefile +10 -63
data/TODO +6 -8
data/bin/pdf_object +3 -0
data/bin/pdf_text +4 -2
data/examples/extract_images.rb +108 -0
data/examples/hash.rb +1 -1
data/examples/text.rb +3 -0
data/lib/pdf/hash.rb +8 -225
data/lib/pdf/reader.rb +79 -55
data/lib/pdf/reader/abstract_strategy.rb +77 -0
data/lib/pdf/reader/buffer.rb +61 -40
data/lib/pdf/reader/cmap.rb +11 -10
data/lib/pdf/reader/encoding.rb +85 -79
data/lib/pdf/reader/error.rb +1 -2
data/lib/pdf/reader/filter.rb +109 -6
data/lib/pdf/reader/font.rb +11 -11
data/lib/pdf/reader/lzw.rb +123 -0
data/lib/pdf/reader/metadata_strategy.rb +53 -0
data/lib/pdf/reader/object_hash.rb +275 -0
data/lib/pdf/reader/object_stream.rb +51 -0
data/lib/pdf/reader/{content.rb → pages_strategy.rb} +63 -100
data/lib/pdf/reader/parser.rb +74 -37
data/lib/pdf/reader/print_receiver.rb +0 -1
data/lib/pdf/reader/register_receiver.rb +21 -0
data/lib/pdf/reader/stream.rb +5 -1
data/lib/pdf/reader/text_receiver.rb +3 -1
data/lib/pdf/reader/token.rb +1 -1
data/lib/pdf/reader/xref.rb +126 -64
metadata +61 -13
data/lib/pdf/reader/explore.rb +0 -116

data/lib/pdf/reader/encoding.rb CHANGED

@@ -23,58 +23,28 @@
 #
 ################################################################################
-require 'enumerator'
 class PDF::Reader
-  class Encoding
+  class Encoding # :nodoc:
     CONTROL_CHARS = [0,1,2,3,4,5,6,7,8,11,12,14,15,16,17,18,19,20,21,22,23,
                      24,25,26,27,28,29,30,31]
     UNKNOWN_CHAR = 0x25AF # ▯
-    attr_reader :differences, :unpack
+    attr_reader :unpack
     def initialize(enc)
-      @to_unicode_required = false
       if enc.kind_of?(Hash)
-        self.differences=enc[:Differences] if enc[:Differences]
+        self.differences = enc[:Differences] if enc[:Differences]
         enc = enc[:Encoding] || enc[:BaseEncoding]
       elsif enc != nil
         enc = enc.to_sym
+      else
+        enc = nil
       end
-      case enc
-        when nil                   then
-          load_mapping File.dirname(__FILE__) + "/encodings/standard.txt"
-          @unpack = "C*"
-        when "Identity-H".to_sym   then
-          @unpack = "n*"
-          @to_unicode_required = true
-        when :MacRomanEncoding     then
-          load_mapping File.dirname(__FILE__) + "/encodings/mac_roman.txt"
-          @unpack = "C*"
-        when :MacExpertEncoding    then
-          load_mapping File.dirname(__FILE__) + "/encodings/mac_expert.txt"
-          @unpack = "C*"
-        when :PDFDocEncoding       then
-          load_mapping File.dirname(__FILE__) + "/encodings/pdf_doc.txt"
-          @unpack = "C*"
-        when :StandardEncoding     then
-          load_mapping File.dirname(__FILE__) + "/encodings/standard.txt"
-          @unpack = "C*"
-        when :SymbolEncoding       then
-          load_mapping File.dirname(__FILE__) + "/encodings/symbol.txt"
-          @unpack = "C*"
-        when :UTF16Encoding        then
-          @unpack = "n*"
-        when :WinAnsiEncoding      then
-          load_mapping File.dirname(__FILE__) + "/encodings/win_ansi.txt"
-          @unpack = "C*"
-        when :ZapfDingbatsEncoding then
-          load_mapping File.dirname(__FILE__) + "/encodings/zapf_dingbats.txt"
-          @unpack = "C*"
-        else raise UnsupportedFeatureError, "#{enc} is not currently a supported encoding"
-      end
+      @to_unicode_required = unicode_required?(enc)
+      @unpack   = get_unpack(enc)
+      @map_file = get_mapping_file(enc)
+      load_mapping(@map_file) if @map_file
     end
     def to_unicode_required?
@@ -85,9 +55,9 @@ class PDF::Reader
     #
     #   [25, :A, 26, :B]
     #
-    # The array alternates bewteen a decimal byte number and a glyph name to map to that byte
+    # The array alternates between a decimal byte number and a glyph name to map to that byte
     #
-    # To save space the following array is also valid and equivilant to the previous one
+    # To save space the following array is also valid and equivalent to the previous one
     #
     #   [25, :A, :B]
     def differences=(diff)
@@ -106,45 +76,90 @@ class PDF::Reader
       @differences
     end
+    def differences
+      @differences ||= {}
+    end
     # convert the specified string to utf8
+    #
+    # * unpack raw bytes into codepoints
+    # * replace any that have entries in the differences table with a glyph name
+    # * convert codepoints from source encoding to Unicode codepoints
+    # * convert any glyph names to Unicode codepoints
+    # * replace characters that didn't convert to Unicode nicely with something
+    #   valid
+    # * pack the final array of Unicode codepoints into a utf-8 string
+    # * mark the string as utf-8 if we're running on a M17N aware VM
+    #
     def to_utf8(str, tounicode = nil)
-      # unpack the single bytes
-      array_orig = str.unpack(unpack)
-      # replace any relevant bytes with a glyph name
-      array_orig = process_differences(array_orig)
-      # replace any remaining bytes with a unicode codepoint
-      array_enc = array_orig.map do |num|
-        if tounicode && (code = tounicode.decode(num))
-          code
-        elsif tounicode || ( tounicode.nil? && to_unicode_required? )
-          PDF::Reader::Encoding::UNKNOWN_CHAR
-        elsif mapping[num]
-          mapping[num]
-        elsif PDF::Reader::Encoding::CONTROL_CHARS.include?(num)
+      ret = str.unpack(unpack).map { |c|
+        differences[c] || c
+      }.map { |num|
+        original_codepoint_to_unicode(num, tounicode)
+      }.map { |c|
+        glyphnames[c] || c
+      }.map { |c|
+        if c.nil? || !c.is_a?(Fixnum)
           PDF::Reader::Encoding::UNKNOWN_CHAR
         else
-          num
+          c
         end
-      end
+      }.pack("U*")
+      ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
-      # convert any glyph names to unicode codepoints
-      array_enc = process_glyphnames(array_enc)
+      ret
+    end
-      # replace charcters that didn't convert to unicode nicely with something valid
-      array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
+    private
-      # pack all our Unicode codepoints into a UTF-8 string
-      ret = array_enc.pack("U*")
+    def original_codepoint_to_unicode(cp, tounicode = nil)
+      if tounicode && (code = tounicode.decode(cp))
+        code
+      elsif tounicode || ( tounicode.nil? && to_unicode_required? )
+        PDF::Reader::Encoding::UNKNOWN_CHAR
+      elsif mapping[cp]
+        mapping[cp]
+      elsif PDF::Reader::Encoding::CONTROL_CHARS.include?(cp)
+        PDF::Reader::Encoding::UNKNOWN_CHAR
+      else
+        cp
+      end
+    end
-      # set the strings encoding correctly under ruby 1.9+
-      ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
+    def get_unpack(enc)
+      case enc
+      when :"Identity-H", :UTF16Encoding
+        "n*"
+      else
+        "C*"
+      end
+    end
-      return ret
+    def get_mapping_file(enc)
+      return File.dirname(__FILE__) + "/encodings/standard.txt" if enc.nil?
+      files = {
+        :"Identity-H"      => nil,
+        :MacRomanEncoding  => File.dirname(__FILE__) + "/encodings/mac_roman.txt",
+        :MacExpertEncoding => File.dirname(__FILE__) + "/encodings/mac_expert.txt",
+        :PDFDocEncoding    => File.dirname(__FILE__) + "/encodings/pdf_doc.txt",
+        :StandardEncoding  => File.dirname(__FILE__) + "/encodings/standard.txt",
+        :SymbolEncoding    => File.dirname(__FILE__) + "/encodings/symbol.txt",
+        :UTF16Encoding     => nil,
+        :WinAnsiEncoding   => File.dirname(__FILE__) + "/encodings/win_ansi.txt",
+        :ZapfDingbatsEncoding => File.dirname(__FILE__) + "/encodings/zapf_dingbats.txt"
+      }
+      if files.has_key?(enc)
+        files[enc]
+      else
+        raise UnsupportedFeatureError, "#{enc} is not currently a supported encoding"
+      end
     end
-    private
+    def unicode_required?(enc)
+      enc == :"Identity-H"
+    end
     def mapping
       @mapping ||= {}
@@ -154,17 +169,8 @@ class PDF::Reader
       mapping.size > 0
     end
-    # accepts an array of byte numbers, and replaces any that have entries in the differences table
-    # with a glyph name
-    def process_differences(arr)
-      @differences ||= {}
-      arr.collect! { |n| @differences[n].nil? ? n : @differences[n]}
-    end
-    # accepts an array of unicode code points and glyphnames, and converts any glyph names to codepoints
-    def process_glyphnames(arr)
-      @differences ||= {}
-      arr.collect! { |n| n.kind_of?(Numeric) ? n : PDF::Reader::Font.glyphnames[n]}
+    def glyphnames
+      @glyphnames ||= PDF::Reader::Font.glyphnames
     end
     def load_mapping(file)

data/lib/pdf/reader/error.rb CHANGED

@@ -22,12 +22,11 @@
 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #
 class PDF::Reader
   ################################################################################
   # An internal PDF::Reader class that helps to verify various parts of the PDF file
   # are valid
-  class Error
+  class Error # :nodoc:
     ################################################################################
     def self.str_assert (lvalue, rvalue, chars=nil)
       raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)

data/lib/pdf/reader/filter.rb CHANGED

@@ -30,9 +30,7 @@ class PDF::Reader
   # support for features like compression and encryption. This class is for decoding that
   # content.
   #
-  # Currently only 1 filter type is supported. Hopefully support for others will be added
-  # in the future.
-  class Filter
+  class Filter # :nodoc:
     ################################################################################
     # creates a new filter for decoding content.
     #
@@ -49,6 +47,7 @@ class PDF::Reader
       when :DCTDecode      then @filter = nil
       when :FlateDecode    then @filter = :flate
       when :JBIG2Decode    then @filter = nil
+      when :LZWDecode      then @filter = :lzw
       else                 raise UnsupportedFeatureError, "Unknown filter: #{name}"
       end
     end
@@ -92,8 +91,9 @@ class PDF::Reader
     ################################################################################
     # Decode the specified data with the Zlib compression algorithm
     def flate (data)
+      deflated = nil
       begin
-        Zlib::Inflate.new.inflate(data)
+        deflated = Zlib::Inflate.new.inflate(data)
       rescue Zlib::DataError => e
         # by default, Ruby's Zlib assumes the data it's inflating
         # is RFC1951 deflated data, wrapped in a RFC1951 zlib container.
@@ -103,14 +103,117 @@ class PDF::Reader
         # See
         # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
         # - http://www.gzip.org/zlib/zlib_faq.html#faq38
-        Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
+        deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
       end
+      depredict(deflated, @options)
     rescue Exception => e
       # Oops, there was a problem inflating the stream
       raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
     end
     ################################################################################
+    # Decode the specified data with the LZW compression algorithm
+    def lzw(data)
+      data = PDF::Reader::LZW.decode(data)
+      depredict(data, @options)
+    end
+    ################################################################################
+    def depredict(data, opts = {})
+      predictor = (opts || {})[:Predictor].to_i
+      case predictor
+      when 0, 1 then
+        data
+      when 2    then
+        tiff_depredict(data, opts)
+      when 10, 11, 12, 13, 14, 15 then
+        png_depredict(data, opts)
+      else
+        raise  MalformedPDFError, "Unrecognised predictor value (#{predictor})"
+      end
+    end
+    ################################################################################
+    def tiff_depredict(data, opts = {})
+      raise UnsupportedFeatureError, "TIFF predictor not supported"
+    end
+    ################################################################################
+    def png_depredict(data, opts = {})
+      return data if opts.nil? || opts[:Predictor].to_i < 10
+      data = data.unpack("C*")
+      pixel_bytes     = 1 #pixel_bitlength / 8
+      scanline_length = (pixel_bytes * opts[:Columns]) + 1
+      row = 0
+      pixels = []
+      paeth, pa, pb, pc = nil
+      until data.empty? do
+        row_data = data.slice! 0, scanline_length
+        filter = row_data.shift
+        case filter
+        when 0 # None
+        when 1 # Sub
+          row_data.each_with_index do |byte, index|
+            left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
+            row_data[index] = (byte + left) % 256
+            #p [byte, left, row_data[index]]
+          end
+        when 2 # Up
+          row_data.each_with_index do |byte, index|
+            col = index / pixel_bytes
+            upper = row == 0 ? 0 : pixels[row-1][col][index % pixel_bytes]
+            row_data[index] = (upper + byte) % 256
+          end
+        when 3  # Average
+          row_data.each_with_index do |byte, index|
+            col = index / pixel_bytes
+            upper = row == 0 ? 0 : pixels[row-1][col][index % pixel_bytes]
+            left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
+            row_data[index] = (byte + ((left + upper)/2).floor) % 256
+          end
+        when 4 # Paeth
+          left = upper = upper_left = nil
+          row_data.each_with_index do |byte, index|
+            col = index / pixel_bytes
+            left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
+            if row.zero?
+              upper = upper_left = 0
+            else
+              upper = pixels[row-1][col][index % pixel_bytes]
+              upper_left = col.zero? ? 0 :
+                pixels[row-1][col-1][index % pixel_bytes]
+            end
+            p = left + upper - upper_left
+            pa = (p - left).abs
+            pb = (p - upper).abs
+            pc = (p - upper_left).abs
+            paeth = if pa <= pb && pa <= pc
+                      left
+                    elsif pb <= pc
+                      upper
+                    else
+                      upper_left
+                    end
+            row_data[index] = (byte + paeth) % 256
+          end
+        else
+          raise ArgumentError, "Invalid filter algorithm #{filter}"
+        end
+        s = []
+        row_data.each_slice pixel_bytes do |slice|
+          s << slice
+        end
+        pixels << s
+        row += 1
+      end
+      pixels.map { |row| row.flatten.pack("C*") }.join("")
+    end
   end
-  ################################################################################
 end
 ################################################################################

data/lib/pdf/reader/font.rb CHANGED

@@ -32,19 +32,17 @@ class PDF::Reader
     # a text file supplied by Adobe at:
     # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
     def self.glyphnames
-      @@glyphs ||= {}
+      glyphs = {}
-      if @@glyphs.empty?
-        RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
-        File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
-          f.each do |l|
-            m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
-            @@glyphs[name.to_sym] = "0x#{code}".hex if name
-          end
+      RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
+      File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
+        f.each do |l|
+          m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
+          glyphs[name.to_sym] = "0x#{code}".hex if name
         end
       end
-      @@glyphs
+      glyphs
     end
     def basefont=(font)
@@ -52,9 +50,11 @@ class PDF::Reader
       # with encoding= if required
       case font
       when "Symbol" then
-        self.encoding = PDF::Reader::Encoding.new("SymbolEncoding")
+        @encoding = PDF::Reader::Encoding.new("SymbolEncoding")
       when "ZapfDingbats" then
-        self.encoding = PDF::Reader::Encoding.new("ZapfDingbatsEncoding")
+        @encoding = PDF::Reader::Encoding.new("ZapfDingbatsEncoding")
+      else
+        @encoding = nil
       end
       @basefont = font
     end

data/lib/pdf/reader/lzw.rb ADDED

@@ -0,0 +1,123 @@
+# coding: utf-8
+module PDF
+  class Reader
+    # A general class for decoding LZW compressed data. LZW can be
+    # used in PDF files to compress streams, usually for image data sourced
+    # from a TIFF file.
+    #
+    # See the following links for more information:
+    #
+    #   ref http://www.fileformat.info/format/tiff/corion-lzw.htm
+    #   ref http://marknelson.us/1989/10/01/lzw-data-compression/
+    #
+    # The PDF spec also has some data on the algorithm.
+    #
+    class LZW # :nodoc:
+      class BitStream # :nodoc:
+        def initialize(data, bits_in_chunk)
+          @data = data
+          @data.force_encoding("BINARY") if @data.respond_to?(:force_encoding)
+          @bits_in_chunk = bits_in_chunk
+          @current_pos = 0
+          @bits_left_in_byte = 8
+        end
+        def set_bits_in_chunk(bits_in_chunk)
+          @bits_in_chunk = bits_in_chunk
+        end
+        def read
+          bits_left_in_chunk = @bits_in_chunk
+          chunk = nil
+          while bits_left_in_chunk > 0 and @current_pos < @data.size
+            chunk = 0 if chunk.nil?
+            codepoint = @data[@current_pos, 1].unpack("C*")[0]
+            current_byte = codepoint & (2**@bits_left_in_byte -1) #clear consumed bits
+            dif = bits_left_in_chunk - @bits_left_in_byte
+            if dif > 0 then  current_byte <<= dif
+            elsif dif < 0 then  current_byte >>= dif.abs
+            end
+            chunk |= current_byte #add bits to result
+            bits_left_in_chunk = if dif >= 0 then dif else 0 end
+            @bits_left_in_byte = if dif < 0 then dif.abs else 0 end
+            if @bits_left_in_byte.zero? #next byte
+              @current_pos += 1
+              @bits_left_in_byte = 8
+            end
+          end
+          chunk
+        end
+      end
+      CODE_EOD = 257 #end of data
+      CODE_CLEAR_TABLE = 256 #clear table
+      # stores the code => string pairs
+      class StringTable < Hash # :nodoc:
+        attr_reader :string_table_pos
+        def initialize
+          super
+          @string_table_pos = 258 #initial code
+        end
+        #if code less than 258 return fixed string
+        def [](key)
+          if key > 257 then super else key.chr end
+        end
+        def add(string)
+          store(@string_table_pos, string)
+          @string_table_pos += 1
+        end
+      end
+      # Decompresses a LZW compressed string.
+      #
+      def self.decode(data)
+        stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
+        result = ''
+        while not (code = stream.read) == CODE_EOD
+          if code == CODE_CLEAR_TABLE
+            string_table = StringTable.new
+            code = stream.read
+            break if code == CODE_EOD
+            result << string_table[code]
+            old_code = code
+          else
+            string = string_table[code]
+            if string
+              result << string
+              string_table.add create_new_string(string_table, old_code, code)
+              old_code = code
+            else
+              new_string = create_new_string(string_table, old_code, old_code)
+              result << new_string
+              string_table.add new_string
+              old_code = code
+            end
+            # increase the size of the codes when the limit is reached
+            case string_table.string_table_pos
+            when 511 then stream.set_bits_in_chunk(10)
+            when 1023 then stream.set_bits_in_chunk(11)
+            when 2047 then stream.set_bits_in_chunk(12)
+            end
+          end
+        end
+        result
+      end
+      private
+      def self.create_new_string(string_table,some_code, other_code)
+        string_table[some_code] + string_table[other_code][0].chr
+      end
+    end
+  end
+end