RubyGems - pdf-reader - Versions diffs - 2.8.0 → 2.9.1 - Mend

pdf-reader 2.8.0 → 2.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

checksums.yaml +4 -4
data/CHANGELOG +9 -0
data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
data/lib/pdf/reader/buffer.rb +36 -34
data/lib/pdf/reader/cmap.rb +64 -51
data/lib/pdf/reader/error.rb +8 -0
data/lib/pdf/reader/filter/ascii85.rb +1 -1
data/lib/pdf/reader/filter/ascii_hex.rb +1 -1
data/lib/pdf/reader/filter/depredict.rb +1 -1
data/lib/pdf/reader/filter/flate.rb +3 -3
data/lib/pdf/reader/filter/lzw.rb +1 -1
data/lib/pdf/reader/filter/null.rb +1 -2
data/lib/pdf/reader/filter/run_length.rb +1 -1
data/lib/pdf/reader/filter.rb +10 -11
data/lib/pdf/reader/font.rb +29 -17
data/lib/pdf/reader/font_descriptor.rb +18 -17
data/lib/pdf/reader/form_xobject.rb +14 -5
data/lib/pdf/reader/key_builder_v5.rb +138 -0
data/lib/pdf/reader/null_security_handler.rb +0 -4
data/lib/pdf/reader/object_hash.rb +247 -42
data/lib/pdf/reader/page.rb +38 -20
data/lib/pdf/reader/page_state.rb +1 -1
data/lib/pdf/reader/page_text_receiver.rb +4 -1
data/lib/pdf/reader/parser.rb +20 -8
data/lib/pdf/reader/point.rb +1 -1
data/lib/pdf/reader/rc4_security_handler.rb +38 -0
data/lib/pdf/reader/rectangle.rb +2 -2
data/lib/pdf/reader/{resource_methods.rb → resources.rb} +15 -13
data/lib/pdf/reader/security_handler_factory.rb +79 -0
data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -95
data/lib/pdf/reader/stream.rb +2 -2
data/lib/pdf/reader/type_check.rb +52 -0
data/lib/pdf/reader/validating_receiver.rb +262 -0
data/lib/pdf/reader/width_calculator/true_type.rb +1 -1
data/lib/pdf/reader/xref.rb +20 -3
data/lib/pdf/reader.rb +17 -9
data/rbi/pdf-reader.rbi +388 -173
metadata +15 -9
data/lib/pdf/reader/standard_security_handler_v5.rb +0 -92

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 6182ffd59631afba6a2c234547a428382b1ec2d7b414d89830b1143f1a0e1704
-  data.tar.gz: 6c0e6a7d32cf24912edc3aa96d72b7f70497d2fdd0e0913b86f871bbf9fa104f
+  metadata.gz: 07c734cf3cfc0abf1102f813976d4936d33b57815f114ce92224bbd605fe16a2
+  data.tar.gz: f52b1751f83717a7bc96c56e8d830559d387fb430cfa6fa2a78604d98c7476f4
 SHA512:
-  metadata.gz: 42dafbe0c36ce838da4c3120bf2187efde647e486971896d9a9c59c37dac3da0f2ccf3ecd98d8dd1d3acc5404bfcf26e64a327d7797648646afd6b40be02fec2
-  data.tar.gz: 40f0b0958024b558d6aca7eb2b3b6f042f034059c8fca52ce97fab7d55a39c313797605341331c65efd1099a1310ccbe386c354024dbd3cbc61c1d96c423842d
+  metadata.gz: 72fda8f6b32c20782adca6cca44d291c7cbe4ac9d858da5ed1c815af2a7d6680e3906cac47a8414923c8db639fd51365d9da8612c1c7f79a674b22448bb35cae
+  data.tar.gz: fa79a29d80a36d37e1188769bf7991d5108bbe08b11711a7c9bb1741cedd3682b77afe219a24ae7844fdbf10b23ca3eb5434f4b9418d7002f07fb8edf9dd6e26

data/CHANGELOG CHANGED Viewed

@@ -1,3 +1,12 @@
+v2.9.1 (4th February 2022)
+- Fix exception in Page#walk introduced in 2.9.0 (http://github.com/yob/pdf-reader/pull/442)
+- Other small bug fixes
+v2.9.0 (24th January 2022)
+- Support additional encryption standards (http://github.com/yob/pdf-reader/pull/419)
+- Return CropBox correctly from Page#rectangles (https://github.com/yob/pdf-reader/pull/420)
+- For sorbet users, additional type annotations are included in the gem
 v2.8.0 (28th December 2021)
 - Add PDF::Reader::Page#runs for extracting text from a page with positioning metadata (http://github.com/yob/pdf-reader/pull/411)
 - Add options to PDF::Reader::Page#text to make some behaviour configurable (http://github.com/yob/pdf-reader/pull/411)

data/lib/pdf/reader/aes_v2_security_handler.rb ADDED Viewed

@@ -0,0 +1,41 @@
+# coding: utf-8
+# typed: strict
+# frozen_string_literal: true
+require 'digest/md5'
+class PDF::Reader
+  # Decrypts data using the AESV2 algorithm defined in the PDF spec. Requires
+  # a decryption key, which is usually generated by PDF::Reader::StandardKeyBuilder
+  #
+  class AesV2SecurityHandler
+    def initialize(key)
+      @encrypt_key = key
+    end
+    ##7.6.2 General Encryption Algorithm
+    #
+    # Algorithm 1: Encryption of data using the AES-128-CBC algorithm
+    #
+    # version == 4 and CFM == AESV2
+    #
+    # buf - a string to decrypt
+    # ref - a PDF::Reader::Reference for the object to decrypt
+    #
+    def decrypt( buf, ref )
+      objKey = @encrypt_key.dup
+      (0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
+      (0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
+      objKey << 'sAlT'  # Algorithm 1, b)
+      length = objKey.length < 16 ? objKey.length : 16
+      cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
+      cipher.decrypt
+      cipher.key = Digest::MD5.digest(objKey)[0,length]
+      cipher.iv = buf[0..15]
+      cipher.update(buf[16..-1]) + cipher.final
+    end
+  end
+end

data/lib/pdf/reader/aes_v3_security_handler.rb ADDED Viewed

@@ -0,0 +1,38 @@
+# coding: utf-8
+# typed: strict
+# frozen_string_literal: true
+require 'digest'
+require 'openssl'
+class PDF::Reader
+  # Decrypts data using the AESV3 algorithm defined in the PDF 1.7, Extension Level 3 spec.
+  # Requires a decryption key, which is usually generated by PDF::Reader::KeyBuilderV5
+  #
+  class AesV3SecurityHandler
+    def initialize(key)
+      @encrypt_key = key
+      @cipher = "AES-256-CBC"
+    end
+    ##7.6.2 General Encryption Algorithm
+    #
+    # Algorithm 1: Encryption of data using the RC4 or AES algorithms
+    #
+    # used to decrypt RC4/AES encrypted PDF streams (buf)
+    #
+    # buf - a string to decrypt
+    # ref - a PDF::Reader::Reference for the object to decrypt
+    #
+    def decrypt( buf, ref )
+      cipher = OpenSSL::Cipher.new(@cipher)
+      cipher.decrypt
+      cipher.key = @encrypt_key.dup
+      cipher.iv = buf[0..15]
+      cipher.update(buf[16..-1]) + cipher.final
+    end
+  end
+end

data/lib/pdf/reader/buffer.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # coding: ASCII-8BIT
-# typed: false
+# typed: true
 # frozen_string_literal: true
 ################################################################################
@@ -59,6 +59,9 @@ class PDF::Reader
     # Allow for this here
     TRAILING_BYTECOUNT = 5000
+    # must match whole tokens
+    DIGITS_ONLY = %r{\A\d+\z}
     attr_reader :pos
     # Creates a new buffer.
@@ -143,13 +146,20 @@ class PDF::Reader
       @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
       data = @io.read(TRAILING_BYTECOUNT)
+      raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?
       # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
       lines = data.split(/[\n\r]+/).reverse
       eof_index = lines.index { |l| l.strip[/^%%EOF/] }
       raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
       raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
-      lines[eof_index+1].to_i
+      offset = lines[eof_index+1].to_i
+      # a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
+      # corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
+      raise MalformedPDFError, "invalid xref offset" if offset < 0
+      offset
     end
     private
@@ -230,13 +240,12 @@ class PDF::Reader
       return if @tokens.size < 3
       return if @tokens[2] != "R"
-      # must match whole tokens
-      digits_only = %r{\A\d+\z}
-      if @tokens[0].match(digits_only) && @tokens[1].match(digits_only)
-        @tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
-        @tokens[1] = nil
-        @tokens[2] = nil
-        @tokens.compact!
+      token_one = @tokens[0]
+      token_two = @tokens[1]
+      if token_one.is_a?(String) && token_two.is_a?(String) && token_one.match(DIGITS_ONLY) && token_two.match(DIGITS_ONLY)
+        @tokens[0] = PDF::Reader::Reference.new(token_one.to_i, token_two.to_i)
+        @tokens.delete_at(2)
+        @tokens.delete_at(1)
       end
     end
@@ -246,7 +255,7 @@ class PDF::Reader
     # This is to reduce the chance of accidentally matching an embedded EI
     def prepare_inline_token
       idstart = @io.pos
-      chr = prevchr = nil
+      prevchr = ''
       eisize = 0 # how many chars in the end marker
       seeking = 'E' # what are we looking for now?
       loop do
@@ -264,11 +273,11 @@ class PDF::Reader
           end
         when 'I'
           if chr == 'I'
-            seeking = :END
+            seeking = ''
           else
             seeking = 'E'
           end
-        when :END
+        when ''
           if WHITE_SPACE.include? chr
             eisize += 1 # Drop trailer
             break
@@ -276,28 +285,28 @@ class PDF::Reader
             seeking = 'E'
           end
         end
-        prevchr = chr
+        prevchr = chr.is_a?(String) ? chr : ''
       end
-      unless seeking == :END
+      unless seeking == ''
         raise MalformedPDFError, "EI terminator not found"
       end
       eiend = @io.pos
       @io.seek(idstart, IO::SEEK_SET)
       str = @io.read(eiend - eisize - idstart) # get the ID content
-      @tokens << string_token(str)
+      @tokens << str.freeze if str
     end
     # if we're currently inside a hex string, read hex nibbles until
     # we find a closing >
     #
     def prepare_hex_token
+      finished = :false
       str = "".dup
-      finished = false
-      while !finished
+      until finished == :true
         byte = @io.getbyte
         if byte.nil?
-          finished = true # unbalanced params
+          finished = :true # unbalanced params
         elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
           str << byte
         elsif byte <= 32
@@ -306,7 +315,7 @@ class PDF::Reader
           @tokens << str if str.size > 0
           @tokens << ">" if byte != 0x3E # '>'
           @tokens << byte.chr
-          finished = true
+          finished = :true
         end
       end
     end
@@ -353,14 +362,17 @@ class PDF::Reader
     def prepare_regular_token
       tok = "".dup
-      while byte = @io.getbyte
+      loop do
+        byte = @io.getbyte
         case byte
+        when nil
+          break
         when 0x25
           # comment, ignore everything until the next EOL char
-          done = false
-          while !done
-            byte = @io.getbyte
-            done = true if byte.nil? || byte == 0x0A || byte == 0x0D
+          loop do
+            commentbyte = @io.getbyte
+            break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
           end
         when *TOKEN_WHITESPACE
           # white space, token finished
@@ -430,15 +442,5 @@ class PDF::Reader
       byte
     end
-    # for a handful of tokens we want to tell the parser how to convert them
-    # into higher level tokens. This methods adds a to_token() method
-    # to tokens that should remain as strings.
-    #
-    def string_token(token)
-      def token.to_token
-        to_s
-      end
-      token
-    end
   end
 end

data/lib/pdf/reader/cmap.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # coding: utf-8
-# typed: false
+# typed: true
 # frozen_string_literal: true
 ################################################################################
@@ -35,15 +35,15 @@ class PDF::Reader
   class CMap # :nodoc:
     CMAP_KEYWORDS = {
-      "begincodespacerange" => 1,
-      "endcodespacerange" => 1,
-      "beginbfchar" => 1,
-      "endbfchar" => 1,
-      "beginbfrange" => 1,
-      "endbfrange" => 1,
-      "begin" => 1,
-      "begincmap" => 1,
-      "def" => 1
+      "begincodespacerange" => :noop,
+      "endcodespacerange" => :noop,
+      "beginbfchar" => :noop,
+      "endbfchar" => :noop,
+      "beginbfrange" => :noop,
+      "endbfrange" => :noop,
+      "begin" => :noop,
+      "begincmap" => :noop,
+      "def" => :noop
     }
     attr_reader :map
@@ -53,30 +53,6 @@ class PDF::Reader
       process_data(data)
     end
-    def process_data(data)
-      parser = build_parser(data)
-      mode = :none
-      instructions = []
-      while token = parser.parse_token(CMAP_KEYWORDS)
-        if token == "beginbfchar"
-          mode = :char
-        elsif token == "endbfchar"
-          process_bfchar_instructions(instructions)
-          instructions = []
-          mode = :none
-        elsif token == "beginbfrange"
-          mode = :range
-        elsif token == "endbfrange"
-          process_bfrange_instructions(instructions)
-          instructions = []
-          mode = :none
-        elsif mode == :char || mode == :range
-          instructions << token
-        end
-      end
-    end
     def size
       @map.size
     end
@@ -86,13 +62,40 @@ class PDF::Reader
     # Returns an array of Integers.
     #
     def decode(c)
-      # TODO: implement the conversion
-      return c unless Integer === c
-      @map[c]
+      @map.fetch(c, [])
     end
     private
+    def process_data(data, initial_mode = :none)
+      parser = build_parser(data)
+      mode = initial_mode
+      instructions = []
+      while token = parser.parse_token(CMAP_KEYWORDS)
+        if token.is_a?(String) || token.is_a?(Array)
+          if token == "beginbfchar"
+            mode = :char
+          elsif token == "endbfchar"
+            process_bfchar_instructions(instructions)
+            instructions = []
+            mode = :none
+          elsif token == "beginbfrange"
+            mode = :range
+          elsif token == "endbfrange"
+            process_bfrange_instructions(instructions)
+            instructions = []
+            mode = :none
+          elsif mode == :char
+            instructions << token.to_s
+          elsif mode == :range
+            instructions << token
+          end
+        end
+      end
+    end
     def build_parser(instructions)
       buffer = Buffer.new(StringIO.new(instructions))
       Parser.new(buffer)
@@ -107,7 +110,6 @@ class PDF::Reader
     # exception when we try converting broken UTF-16 to UTF-8
     #
     def str_to_int(str)
-      return nil if str.nil? || str.size == 0
       unpacked_string = if str.bytesize == 1 # UTF-8
         str.unpack("C*")
       else # UTF-16
@@ -115,12 +117,15 @@ class PDF::Reader
       end
       result = []
       while unpacked_string.any? do
-        if unpacked_string.size >= 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
+        if unpacked_string.size >= 2 &&
+            unpacked_string.first.to_i > 0xD800 &&
+            unpacked_string.first.to_i < 0xDBFF
           # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
           # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
           # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
-          points = [unpacked_string.shift, unpacked_string.shift]
-          result << (points[0] - 0xD800) * 0x400 + (points[1] - 0xDC00) + 0x10000
+          point_one = unpacked_string.shift.to_i
+          point_two = unpacked_string.shift.to_i
+          result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
         else
           result << unpacked_string.shift
         end
@@ -130,9 +135,11 @@ class PDF::Reader
     def process_bfchar_instructions(instructions)
       instructions.each_slice(2) do |one, two|
-        find    = str_to_int(one)
-        replace = str_to_int(two)
-        @map[find.first] = replace
+        find    = str_to_int(one.to_s)
+        replace = str_to_int(two.to_s)
+        if find.any? && replace.any?
+          @map[find.first.to_i] = replace
+        end
       end
     end
@@ -143,30 +150,36 @@ class PDF::Reader
         elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
           bfrange_type_two(start, finish, to)
         else
-          raise "invalid bfrange section"
+          raise MalformedPDFError, "invalid bfrange section"
         end
       end
     end
     def bfrange_type_one(start_code, end_code, dst)
-      start_code = str_to_int(start_code)[0]
-      end_code   = str_to_int(end_code)[0]
+      start_code = str_to_int(start_code).first
+      end_code   = str_to_int(end_code).first
       dst        = str_to_int(dst)
+      return if start_code.nil? || end_code.nil?
       # add all values in the range to our mapping
       (start_code..end_code).each_with_index do |val, idx|
-        @map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
+        @map[val] = dst.length == 1 ? [dst[0].to_i + idx] : [dst[0].to_i, dst[1].to_i + 1]
       end
     end
     def bfrange_type_two(start_code, end_code, dst)
-      start_code = str_to_int(start_code)[0]
-      end_code   = str_to_int(end_code)[0]
+      start_code = str_to_int(start_code).first
+      end_code   = str_to_int(end_code).first
+      return if start_code.nil? || end_code.nil?
       from_range = (start_code..end_code)
       # add all values in the range to our mapping
       from_range.each_with_index do |val, idx|
-        @map[val] = str_to_int(dst[idx])
+        dst_char = dst[idx]
+        @map[val.to_i] = str_to_int(dst_char) if dst_char
       end
     end
   end

data/lib/pdf/reader/error.rb CHANGED Viewed

@@ -51,9 +51,17 @@ class PDF::Reader
       raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
     end
     ################################################################################
+    def self.validate_type_as_malformed(object, name, klass)
+      raise MalformedPDFError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
+    end
+    ################################################################################
     def self.validate_not_nil(object, name)
       raise ArgumentError, "#{object} must not be nil" if object.nil?
     end
+    ################################################################################
+    def self.validate_not_nil_as_malformed(object, name)
+      raise MalformedPDFError, "#{object} must not be nil" if object.nil?
+    end
   end
   ################################################################################

data/lib/pdf/reader/filter/ascii85.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # coding: utf-8
-# typed: false
+# typed: strict
 # frozen_string_literal: true
 require 'ascii85'

data/lib/pdf/reader/filter/ascii_hex.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # coding: utf-8
-# typed: true
+# typed: strict
 # frozen_string_literal: true
 #

data/lib/pdf/reader/filter/depredict.rb CHANGED Viewed

@@ -9,7 +9,7 @@ class PDF::Reader
     class Depredict
       def initialize(options = {})
-        @options = options || {}
+        @options = options
       end
       ################################################################################

data/lib/pdf/reader/filter/flate.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # coding: utf-8
-# typed: true
+# typed: strict
 # frozen_string_literal: true
@@ -34,7 +34,7 @@ class PDF::Reader
       def zlib_inflate(data)
         begin
           return Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
-        rescue Zlib::DataError
+        rescue Zlib::Error
           # by default, Ruby's Zlib assumes the data it's inflating
           # is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
           # fails, swallow the exception and attempt to inflate the data as a raw
@@ -43,7 +43,7 @@ class PDF::Reader
         begin
           return Zlib::Inflate.new(ZLIB_RAW_DEFLATE).inflate(data)
-        rescue StandardError
+        rescue Zlib::Error
           # swallow this one too, so we can try some other fallback options
         end

data/lib/pdf/reader/filter/lzw.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # coding: utf-8
-# typed: true
+# typed: strict
 # frozen_string_literal: true
 #

data/lib/pdf/reader/filter/null.rb CHANGED Viewed

@@ -1,8 +1,7 @@
 # coding: utf-8
-# typed: true
+# typed: strict
 # frozen_string_literal: true
-#
 class PDF::Reader
   module Filter # :nodoc:
     # implementation of the null stream filter

data/lib/pdf/reader/filter/run_length.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # coding: utf-8
-# typed: true
+# typed: strict
 # frozen_string_literal: true
 #

data/lib/pdf/reader/filter.rb CHANGED Viewed

@@ -42,17 +42,16 @@ class PDF::Reader
     # returned untouched. At this stage PDF::Reader has no need to decode images.
     #
     def self.with(name, options = {})
-      case name.to_sym
-      when :ASCII85Decode   then PDF::Reader::Filter::Ascii85.new(options)
-      when :ASCIIHexDecode  then PDF::Reader::Filter::AsciiHex.new(options)
-      when :CCITTFaxDecode  then PDF::Reader::Filter::Null.new(options)
-      when :DCTDecode       then PDF::Reader::Filter::Null.new(options)
-      when :FlateDecode     then PDF::Reader::Filter::Flate.new(options)
-      when :Fl              then PDF::Reader::Filter::Flate.new(options)
-      when :JBIG2Decode     then PDF::Reader::Filter::Null.new(options)
-      when :JPXDecode       then PDF::Reader::Filter::Null.new(options)
-      when :LZWDecode       then PDF::Reader::Filter::Lzw.new(options)
-      when :RunLengthDecode then PDF::Reader::Filter::RunLength.new(options)
+      case name
+      when :ASCII85Decode, :A85   then PDF::Reader::Filter::Ascii85.new(options)
+      when :ASCIIHexDecode, :AHx  then PDF::Reader::Filter::AsciiHex.new(options)
+      when :CCITTFaxDecode, :CCF  then PDF::Reader::Filter::Null.new(options)
+      when :DCTDecode, :DCT       then PDF::Reader::Filter::Null.new(options)
+      when :FlateDecode, :Fl      then PDF::Reader::Filter::Flate.new(options)
+      when :JBIG2Decode           then PDF::Reader::Filter::Null.new(options)
+      when :JPXDecode             then PDF::Reader::Filter::Null.new(options)
+      when :LZWDecode, :LZW       then PDF::Reader::Filter::Lzw.new(options)
+      when :RunLengthDecode, :RL  then PDF::Reader::Filter::RunLength.new(options)
       else
         raise UnsupportedFeatureError, "Unknown filter: #{name}"
       end

data/lib/pdf/reader/font.rb CHANGED Viewed

@@ -149,27 +149,37 @@ class PDF::Reader
       end
     end
-    def extract_base_info(obj)
-      @subtype  = @ohash.object(obj[:Subtype])
-      @basefont = @ohash.object(obj[:BaseFont])
-      if @ohash.object(obj[:Encoding])
-        @encoding = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
+    def build_encoding(obj)
+      if obj[:Encoding].is_a?(Symbol)
+        # one of the standard encodings, referenced by name
+        # TODO pass in a standard shape, always a Hash
+        PDF::Reader::Encoding.new(obj[:Encoding])
+      elsif obj[:Encoding].is_a?(Hash) || obj[:Encoding].is_a?(PDF::Reader::Stream)
+        PDF::Reader::Encoding.new(obj[:Encoding])
+      elsif obj[:Encoding].nil?
+        default_encoding(@basefont)
       else
-        @encoding = default_encoding(@basefont)
+        raise MalformedPDFError, "Unexpected type for Encoding (#{obj[:Encoding].class})"
       end
-      @widths   = @ohash.object(obj[:Widths]) || []
-      @first_char = @ohash.object(obj[:FirstChar])
-      @last_char = @ohash.object(obj[:LastChar])
+    end
+    def extract_base_info(obj)
+      @subtype  = @ohash.deref_name(obj[:Subtype])
+      @basefont = @ohash.deref_name(obj[:BaseFont])
+      @encoding = build_encoding(obj)
+      @widths   = @ohash.deref_array_of_numbers(obj[:Widths]) || []
+      @first_char = @ohash.deref_integer(obj[:FirstChar])
+      @last_char = @ohash.deref_integer(obj[:LastChar])
       # CID Fonts are not required to have a W or DW entry, if they don't exist,
       # the default cid width = 1000, see Section 9.7.4.1 PDF 32000-1:2008 pp 269
-      @cid_widths         = @ohash.object(obj[:W])  || []
-      @cid_default_width  = @ohash.object(obj[:DW]) || 1000
+      @cid_widths         = @ohash.deref_array(obj[:W])  || []
+      @cid_default_width  = @ohash.deref_number(obj[:DW]) || 1000
       if obj[:ToUnicode]
         # ToUnicode is optional for Type1 and Type3
-        stream = @ohash.object(obj[:ToUnicode])
-        if stream.is_a?(PDF::Reader::Stream)
+        stream = @ohash.deref_stream(obj[:ToUnicode])
+        if stream
           @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
         end
       end
@@ -177,7 +187,9 @@ class PDF::Reader
     def extract_type3_info(obj)
       if @subtype == :Type3
-        @font_matrix = @ohash.object(obj[:FontMatrix]) || [ 0.001, 0, 0, 0.001, 0, 0 ]
+        @font_matrix = @ohash.deref_array_of_numbers(obj[:FontMatrix]) || [
+          0.001, 0, 0, 0.001, 0, 0
+        ]
       end
     end
@@ -185,7 +197,7 @@ class PDF::Reader
       if obj[:FontDescriptor]
         # create a font descriptor object if we can, in other words, unless this is
         # a CID Font
-        fd = @ohash.object(obj[:FontDescriptor])
+        fd = @ohash.deref_hash(obj[:FontDescriptor])
         @font_descriptor = PDF::Reader::FontDescriptor.new(@ohash, fd)
       else
         @font_descriptor = nil
@@ -197,9 +209,9 @@ class PDF::Reader
       # per PDF 32000-1:2008 pp. 280 :DescendantFonts is:
       # A one-element array specifying the CIDFont dictionary that is the
       # descendant of this Type 0 font.
-      descendants = @ohash.object(obj[:DescendantFonts])
+      descendants = @ohash.deref_array(obj[:DescendantFonts])
       @descendantfonts = descendants.map { |desc|
-        PDF::Reader::Font.new(@ohash, @ohash.object(desc))
+        PDF::Reader::Font.new(@ohash, @ohash.deref_hash(desc))
       }
     end