RubyGems - pdf-reader - Versions diffs - 2.14.1 → 2.15.1 - Mend

pdf-reader 2.14.1 → 2.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)

checksums.yaml +4 -4
data/CHANGELOG +19 -0
data/lib/pdf/reader/advanced_text_run_filter.rb +17 -2
data/lib/pdf/reader/aes_v2_security_handler.rb +30 -0
data/lib/pdf/reader/aes_v3_security_handler.rb +35 -3
data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +1 -0
data/lib/pdf/reader/buffer.rb +35 -17
data/lib/pdf/reader/cid_widths.rb +7 -1
data/lib/pdf/reader/cmap.rb +22 -6
data/lib/pdf/reader/encoding.rb +37 -12
data/lib/pdf/reader/error.rb +6 -0
data/lib/pdf/reader/filter/ascii85.rb +2 -0
data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
data/lib/pdf/reader/filter/depredict.rb +4 -0
data/lib/pdf/reader/filter/flate.rb +5 -2
data/lib/pdf/reader/filter/lzw.rb +2 -0
data/lib/pdf/reader/filter/null.rb +2 -0
data/lib/pdf/reader/filter/run_length.rb +2 -0
data/lib/pdf/reader/filter.rb +1 -0
data/lib/pdf/reader/font.rb +101 -25
data/lib/pdf/reader/font_descriptor.rb +76 -23
data/lib/pdf/reader/form_xobject.rb +11 -0
data/lib/pdf/reader/glyph_hash.rb +34 -9
data/lib/pdf/reader/key_builder_v5.rb +17 -9
data/lib/pdf/reader/lzw.rb +17 -6
data/lib/pdf/reader/no_text_filter.rb +1 -0
data/lib/pdf/reader/null_security_handler.rb +1 -0
data/lib/pdf/reader/object_cache.rb +7 -2
data/lib/pdf/reader/object_hash.rb +116 -9
data/lib/pdf/reader/object_stream.rb +19 -2
data/lib/pdf/reader/overlapping_runs_filter.rb +7 -1
data/lib/pdf/reader/page.rb +41 -7
data/lib/pdf/reader/page_layout.rb +25 -8
data/lib/pdf/reader/page_state.rb +5 -2
data/lib/pdf/reader/page_text_receiver.rb +6 -2
data/lib/pdf/reader/pages_strategy.rb +1 -1
data/lib/pdf/reader/parser.rb +51 -10
data/lib/pdf/reader/point.rb +9 -2
data/lib/pdf/reader/print_receiver.rb +2 -6
data/lib/pdf/reader/rc4_security_handler.rb +2 -0
data/lib/pdf/reader/rectangle.rb +24 -1
data/lib/pdf/reader/reference.rb +10 -1
data/lib/pdf/reader/register_receiver.rb +15 -2
data/lib/pdf/reader/resources.rb +9 -0
data/lib/pdf/reader/security_handler_factory.rb +13 -0
data/lib/pdf/reader/standard_key_builder.rb +37 -23
data/lib/pdf/reader/stream.rb +9 -3
data/lib/pdf/reader/synchronized_cache.rb +5 -2
data/lib/pdf/reader/text_run.rb +28 -1
data/lib/pdf/reader/token.rb +1 -0
data/lib/pdf/reader/transformation_matrix.rb +33 -2
data/lib/pdf/reader/type_check.rb +10 -3
data/lib/pdf/reader/unimplemented_security_handler.rb +2 -0
data/lib/pdf/reader/validating_receiver.rb +29 -0
data/lib/pdf/reader/width_calculator/built_in.rb +10 -3
data/lib/pdf/reader/width_calculator/composite.rb +5 -1
data/lib/pdf/reader/width_calculator/true_type.rb +5 -1
data/lib/pdf/reader/width_calculator/type_one_or_three.rb +3 -1
data/lib/pdf/reader/width_calculator/type_zero.rb +2 -0
data/lib/pdf/reader/xref.rb +28 -7
data/lib/pdf/reader/zero_width_runs_filter.rb +1 -0
data/lib/pdf/reader.rb +18 -2
data/rbi/pdf-reader.rbi +1511 -1594
metadata +17 -11

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 38765d176ae7b8f4cff7ea6f10fff00b811f6812629d76a2b966f36139c23188
-  data.tar.gz: a406d525e4fccb84cc9e86b28aab06a12854c6f0f297a1a479d26b3f845267f6
+  metadata.gz: 680d773fb89a823854ce986d7e35c5313df55087d0b4e8bfc3c70c51d97a8130
+  data.tar.gz: 7484cc4e28a01b9a74b869c2dede32a47bfd2519cded7ff6fc421c99d546a406
 SHA512:
-  metadata.gz: 010c16b1528d4c46d0175737c9694e2e326092b5e7091cbdd0e0ca41567e662b1adabe989c33b0b919a021bee9f985fa4f2862058bd144762c090e718b3089cc
-  data.tar.gz: 996fe5b0761280edd67c5523d00c04519b7c682c5ededd86d8dfd412df6e11d554d162ab5b4eb231709f4d3013c5963129b32358ef0b49a4521e8ba72dcf490b
+  metadata.gz: e9b2ad3cfb37fb76f731d646bf5097e0334d88b3f7e5ba39abfe9530e51a7dbcfcf08a75d81b6cc1e17f2b18015a25f3ef9ef0d9b75b0d5763eb38470263ee49
+  data.tar.gz: 7acdc84e89045708ac4c983deefa2bb1a6246f1a57d8d9a809efeb3b1477d69921238067b797f78d2b7a33eff9bd54c03e139fd4bed10102f365a4d48e8899a0

data/CHANGELOG CHANGED Viewed

@@ -1,3 +1,22 @@
+v2.15.1 (28th December 2025)
+- Add ruby 4.0 to the CI matrix (https://github.com/yob/pdf-reader/pull/575)
+- Avoid raising an error when ToUnicode points to the wrong object type (https://github.com/yob/pdf-reader/pull/573)
+- Skip invalid UTF-16 surrogate pairs in CMaps (https://github.com/yob/pdf-reader/pull/574)
+v2.15.0 (13th August 2025)
+- Overhaul sorbet types, moving from an external RBI file to inline comments in RBS syntax
+  - multiple PRs, but mainly https://github.com/yob/pdf-reader/pull/562
+  - See https://railsatscale.com/2025-04-23-rbs-support-for-sorbet/
+  - No impact expected for most users, but projects that use sorbet may find subtle changes in
+    the RBI file that is shipped with the gem
+- Relax version requirements for dependency `afm`, allow 1.x (https://github.com/yob/pdf-reader/pull/557)
+- Improve text positioning logic in some PDFs (https://github.com/yob/pdf-reader/pull/554)
+- Multiple fixes for encrypted files
+  - Some files with passwords > 32 bytes long (https://github.com/yob/pdf-reader/pull/555)
+  - Some files that contain cipher text with a 16 byte IV and no further blocks (https://github.com/yob/pdf-reader/pull/561)
+  - Some files that encrypted data with no padding (https://github.com/yob/pdf-reader/pull/564)
+- Add jruby 10 to CI matrix (https://github.com/yob/pdf-reader/pull/552)
 v2.14.1 (4th February 2025)
 - Fix issue in RBI signatures, introduced in v2.14.0(https://github.com/yob/pdf-reader/pull/550)

data/lib/pdf/reader/advanced_text_run_filter.rb CHANGED Viewed

@@ -46,28 +46,37 @@ class PDF::Reader
       less_than_or_equal
       include
       exclude
-    ]
+    ] #: Array[Symbol]
+    #: (Array[PDF::Reader::TextRun], Hash[Symbol, untyped]) -> Array[PDF::Reader::TextRun]
     def self.only(text_runs, filter_hash)
       new(text_runs, filter_hash).only
     end
+    #: (Array[PDF::Reader::TextRun], Hash[Symbol, untyped]) -> Array[PDF::Reader::TextRun]
     def self.exclude(text_runs, filter_hash)
       new(text_runs, filter_hash).exclude
     end
-    attr_reader :text_runs, :filter_hash
+    #: Array[PDF::Reader::TextRun]
+    attr_reader :text_runs
+    #: Hash[Symbol, untyped]
+    attr_reader :filter_hash
+    #: (Array[PDF::Reader::TextRun], Hash[Symbol, untyped]) -> void
     def initialize(text_runs, filter_hash)
       @text_runs = text_runs
       @filter_hash = filter_hash
     end
+    #: () -> Array[PDF::Reader::TextRun]
     def only
       return text_runs if filter_hash.empty?
       text_runs.select { |text_run| evaluate_filter(text_run) }
     end
+    #: () -> Array[PDF::Reader::TextRun]
     def exclude
       return text_runs if filter_hash.empty?
       text_runs.reject { |text_run| evaluate_filter(text_run) }
@@ -75,6 +84,7 @@ class PDF::Reader
     private
+    #: (PDF::Reader::TextRun) -> bool
     def evaluate_filter(text_run)
       if filter_hash[:or]
         evaluate_or_filters(text_run, filter_hash[:or])
@@ -85,24 +95,28 @@ class PDF::Reader
       end
     end
+    #: (PDF::Reader::TextRun, Array[Hash[Symbol, untyped]]) -> bool
     def evaluate_or_filters(text_run, conditions)
       conditions.any? do |condition|
         evaluate_filters(text_run, condition)
       end
     end
+    #: (PDF::Reader::TextRun, Array[Hash[Symbol, untyped]]) -> bool
     def evaluate_and_filters(text_run, conditions)
       conditions.all? do |condition|
         evaluate_filters(text_run, condition)
       end
     end
+    #: (PDF::Reader::TextRun, Hash[Symbol, untyped]) -> bool
     def evaluate_filters(text_run, filter_hash)
       filter_hash.all? do |attribute, conditions|
         evaluate_attribute_conditions(text_run, attribute, conditions)
       end
     end
+    #: (PDF::Reader::TextRun, Symbol, Hash[Symbol, untyped]) -> bool
     def evaluate_attribute_conditions(text_run, attribute, conditions)
       conditions.all? do |operator, value|
         unless VALID_OPERATORS.include?(operator)
@@ -113,6 +127,7 @@ class PDF::Reader
       end
     end
+    #: (untyped, Symbol, untyped) -> bool
     def apply_operator(attribute_value, operator, filter_value)
       case operator
       when :equal

data/lib/pdf/reader/aes_v2_security_handler.rb CHANGED Viewed

@@ -11,6 +11,7 @@ class PDF::Reader
   #
   class AesV2SecurityHandler
+    #: (String) -> void
     def initialize(key)
       @encrypt_key = key
     end
@@ -21,10 +22,38 @@ class PDF::Reader
     #
     # version == 4 and CFM == AESV2
     #
+    # used to decrypt PDF streams (buf). Input data should be in bytesizes of
+    # a multiple of 16, anything else is an error. The first 16 bytes are the initialization
+    # vector, so any input of exactly 16 bytes decrypts to an empty string
+    #
     # buf - a string to decrypt
     # ref - a PDF::Reader::Reference for the object to decrypt
     #
+    #: (String, PDF::Reader::Reference) -> String
     def decrypt( buf, ref )
+      if buf.bytesize % 16 > 0
+        raise PDF::Reader::MalformedPDFError.new("Ciphertext not a multiple of 16")
+      elsif buf.bytesize == 16
+        return ""
+      else
+        begin
+          internal_decrypt(buf, ref)
+        rescue OpenSSL::Cipher::CipherError
+          # If we failed to decrypt it might be a padding error, so try again
+          # and assume no padding in the ciphertext. This will "succeed" but might
+          # return garbage if the key is incorrect but that's OK - well before this
+          # class is used we have confirmed the user provided key is correct so if
+          # this works without error we can be confident the returned plaintext is
+          #  correct
+          internal_decrypt(buf, ref, false)
+        end
+      end
+    end
+    private
+    #: (String, PDF::Reader::Reference, ?bool) -> String
+    def internal_decrypt(buf, ref, padding = true)
       objKey = @encrypt_key.dup
       (0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
       (0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
@@ -32,6 +61,7 @@ class PDF::Reader
       length = objKey.length < 16 ? objKey.length : 16
       cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
       cipher.decrypt
+      cipher.padding = 0 unless padding
       cipher.key = Digest::MD5.digest(objKey)[0,length]
       cipher.iv = buf[0..15]
       cipher.update(buf[16..-1]) + cipher.final

data/lib/pdf/reader/aes_v3_security_handler.rb CHANGED Viewed

@@ -12,27 +12,59 @@ class PDF::Reader
   #
   class AesV3SecurityHandler
+    #: (String) -> void
     def initialize(key)
+      if key.bytesize != 32
+        raise PDF::Reader::MalformedPDFError.new(
+          "AES-256 key must be exactly 32 bytes, got #{key.bytesize}"
+        )
+      end
       @encrypt_key = key
-      @cipher = "AES-256-CBC"
+      @cipher = "AES-256-CBC" #: String
     end
     ##7.6.2 General Encryption Algorithm
     #
     # Algorithm 1: Encryption of data using the RC4 or AES algorithms
     #
-    # used to decrypt RC4/AES encrypted PDF streams (buf)
+    # used to decrypt RC4/AES encrypted PDF streams (buf). Input data should be in bytesizes of
+    # a multiple of 16, anything else is an error. The first 16 bytes are the initialization
+    # vector, so any input of exactly 16 bytes decrypts to an empty string
     #
     # buf - a string to decrypt
     # ref - a PDF::Reader::Reference for the object to decrypt
     #
+    #: (String, PDF::Reader::Reference) -> String
     def decrypt( buf, ref )
+      if buf.bytesize % 16 > 0
+        raise PDF::Reader::MalformedPDFError.new("Ciphertext not a multiple of 16")
+      elsif buf.bytesize == 16
+        return ""
+      else
+        begin
+          internal_decrypt(buf, ref)
+        rescue OpenSSL::Cipher::CipherError
+          # If we failed to decrypt it might be a padding error, so try again
+          # and assume no padding in the ciphertext. This will "succeed" but might
+          # return garbage if the key is incorrect but that's OK - well before this
+          # class is used we have confirmed the user provided key is correct so if
+          # this works without error we can be confident the returned plaintext is
+          #  correct
+         internal_decrypt(buf, ref, false)
+        end
+      end
+    end
+    private
+    #: (String, PDF::Reader::Reference, ?bool) -> String
+    def internal_decrypt(buf, ref, padding = true)
       cipher = OpenSSL::Cipher.new(@cipher)
       cipher.decrypt
+      cipher.padding = 0 unless padding
       cipher.key = @encrypt_key.dup
       cipher.iv = buf[0..15]
       cipher.update(buf[16..-1]) + cipher.final
     end
   end
 end

data/lib/pdf/reader/bounding_rectangle_runs_filter.rb CHANGED Viewed

@@ -8,6 +8,7 @@ class PDF::Reader
   # MediaBox or CropBox, but could be a user specified rectangle too
   class BoundingRectangleRunsFilter
+    #: (Array[PDF::Reader::TextRun], PDF::Reader::Rectangle) -> Array[PDF::Reader::TextRun]
     def self.runs_within_rect(runs, rect)
       runs.select { |run| rect.contains?(run.origin) }
     end

data/lib/pdf/reader/buffer.rb CHANGED Viewed

@@ -38,30 +38,31 @@ class PDF::Reader
   # the raw tokens into objects we can work with (strings, ints, arrays, etc)
   #
   class Buffer
-    TOKEN_WHITESPACE=[0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20]
-    TOKEN_DELIMITER=[0x25, 0x3C, 0x3E, 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D, 0x2F]
+    TOKEN_WHITESPACE=[0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20] #: Array[Integer]
+    TOKEN_DELIMITER=[0x25, 0x3C, 0x3E, 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D, 0x2F] #: Array[Integer]
     # some strings for comparisons. Declaring them here avoids creating new
     # strings that need GC over and over
-    LEFT_PAREN = "("
-    LESS_THAN = "<"
-    STREAM = "stream"
-    ID = "ID"
-    FWD_SLASH = "/"
-    NULL_BYTE = "\x00"
-    CR = "\r"
-    LF = "\n"
-    CRLF = "\r\n"
-    WHITE_SPACE = ["\n", "\r", ' ']
+    LEFT_PAREN = "(" #: String
+    LESS_THAN = "<" #: String
+    STREAM = "stream" #: String
+    ID = "ID" #: String
+    FWD_SLASH = "/" #: String
+    NULL_BYTE = "\x00" #: String
+    CR = "\r" #: String
+    LF = "\n" #: String
+    CRLF = "\r\n" #: String
+    WHITE_SPACE = ["\n", "\r", ' '] #: Array[String]
     # Quite a few PDFs have trailing junk.
     # This can be several k of nuls in some cases
     # Allow for this here
-    TRAILING_BYTECOUNT = 5000
+    TRAILING_BYTECOUNT = 5000 #: Integer
     # must match whole tokens
-    DIGITS_ONLY = %r{\A\d+\z}
+    DIGITS_ONLY = %r{\A\d+\z} #: Regexp
+    #: Integer
     attr_reader :pos
     # Creates a new buffer.
@@ -76,17 +77,19 @@ class PDF::Reader
     #   :content_stream - set to true if buffer will be tokenising a
     #                     content stream. Defaults to false
     #
+    #: ((StringIO | Tempfile | IO), ?Hash[Symbol, untyped]) -> void
     def initialize(io, opts = {})
       @io = io
-      @tokens = []
-      @in_content_stream = opts[:content_stream]
+      @tokens = [] #: Array[String | PDF::Reader::Reference]
+      @in_content_stream = opts[:content_stream] #: bool
       @io.seek(opts[:seek]) if opts[:seek]
-      @pos = @io.pos
+      @pos = @io.pos #: Integer
     end
     # return true if there are no more tokens left
     #
+    #: () -> bool
     def empty?
       prepare_tokens if @tokens.size < 3
@@ -105,6 +108,7 @@ class PDF::Reader
     #   Skipping a bare CR is not spec-compliant.
     #   This is because the data may start with LF.
     #   However we check for CRLF first, so the ambiguity is avoided.
+    #: (Integer, ?Hash[Symbol, untyped]) -> String?
     def read(bytes, opts = {})
       reset_pos
@@ -130,6 +134,7 @@ class PDF::Reader
     # return the next token from the source. Returns a string if a token
     # is found, nil if there are no tokens left.
     #
+    #: () -> (nil | String | PDF::Reader::Reference)
     def token
       reset_pos
       prepare_tokens if @tokens.size < 3
@@ -141,6 +146,7 @@ class PDF::Reader
     # return the byte offset where the first XRef table in the source can be found.
     #
+    #: () -> Integer
     def find_first_xref_offset
       check_size_is_non_zero
       @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
@@ -164,6 +170,7 @@ class PDF::Reader
     private
+    #: () -> void
     def check_size_is_non_zero
       @io.seek(-1, IO::SEEK_END)
       @io.seek(0)
@@ -173,12 +180,14 @@ class PDF::Reader
     # Returns true if this buffer is parsing a content stream
     #
+    #: () -> bool
     def in_content_stream?
       @in_content_stream ? true : false
     end
     # Some bastard moved our IO stream cursor. Restore it.
     #
+    #: () -> void
     def reset_pos
       @io.seek(@pos) if @io.pos != @pos
     end
@@ -186,12 +195,14 @@ class PDF::Reader
     # save the current position of the source IO stream. If someone else (like another buffer)
     # moves the cursor, we can then restore it.
     #
+    #: () -> void
     def save_pos
       @pos = @io.pos
     end
     # attempt to prime the buffer with the next few tokens.
     #
+    #: () -> void
     def prepare_tokens
       10.times do
         case state
@@ -208,6 +219,7 @@ class PDF::Reader
     # tokenising behaves slightly differently based on the current context.
     # Determine the current context/state by examining the last token we found
     #
+    #: () -> Symbol
     def state
       case @tokens.last
       when LEFT_PAREN then :literal_string
@@ -236,6 +248,7 @@ class PDF::Reader
     # indirect reference, so test for that case first and avoid the relatively
     # expensive regexp checks if possible.
     #
+    #: () -> void
     def merge_indirect_reference
       return if @tokens.size < 3
       return if @tokens[2] != "R"
@@ -253,6 +266,7 @@ class PDF::Reader
     # If the EI follows white-space the space is dropped from the data
     # The EI must followed by white-space or end of buffer
     # This is to reduce the chance of accidentally matching an embedded EI
+    #: () -> void
     def prepare_inline_token
       idstart = @io.pos
       prevchr = ''
@@ -299,6 +313,7 @@ class PDF::Reader
     # if we're currently inside a hex string, read hex nibbles until
     # we find a closing >
     #
+    #: () -> void
     def prepare_hex_token
       str = "".dup
@@ -328,6 +343,7 @@ class PDF::Reader
     # processing to fix things like escaped new lines, but that's someone else's
     # problem.
     #
+    #: () -> void
     def prepare_literal_token
       str = "".dup
       count = 1
@@ -358,6 +374,7 @@ class PDF::Reader
     # What each byte means is complex, check out section "3.1.1 Character Set" of the 1.7 spec
     # to read up on it.
     #
+    #: () -> void
     def prepare_regular_token
       tok = "".dup
@@ -435,6 +452,7 @@ class PDF::Reader
     # peek at the next character in the io stream, leaving the stream position
     # untouched
     #
+    #: () -> (Integer | nil)
     def peek_byte
       byte = @io.getbyte
       @io.seek(-1, IO::SEEK_CUR) if byte

data/lib/pdf/reader/cid_widths.rb CHANGED Viewed

@@ -18,12 +18,14 @@ class PDF::Reader
     # Graphics State Operators
     def_delegators :@widths, :[], :fetch
+    #: (Numeric, Array[Numeric]) -> void
     def initialize(default, array)
-      @widths = parse_array(default, array.dup)
+      @widths = parse_array(default, array.dup) #: Hash[Numeric, Numeric]
     end
     private
+    #: (Numeric, Array[Numeric]) -> Hash[Numeric, Numeric]
     def parse_array(default, array)
       widths  = Hash.new(default)
       params = []
@@ -43,6 +45,8 @@ class PDF::Reader
     # this is the form 10 [234 63 234 346 47 234] where width of index 10 is
     # 234, index 11 is 63, etc
+    #
+    #: (Integer, Array[Numeric]) -> Hash[Numeric, Numeric]
     def parse_first_form(first, widths)
       widths.inject({}) { |accum, glyph_width|
         accum[first + accum.size] = glyph_width
@@ -51,6 +55,8 @@ class PDF::Reader
     end
     # this is the form 10 20 123 where all index between 10 and 20 have width 123
+    #
+    #: (Integer, Integer, Numeric) -> Hash[Numeric, Numeric]
     def parse_second_form(first, final, width)
       if first > final
         raise MalformedPDFError, "CidWidths: #{first} must be less than #{final}"

data/lib/pdf/reader/cmap.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # coding: utf-8
-# typed: true
+# typed: strict
 # frozen_string_literal: true
 ################################################################################
@@ -44,15 +44,22 @@ class PDF::Reader
       "begin" => :noop,
       "begincmap" => :noop,
       "def" => :noop
-    }
+    } #: Hash[String, Symbol]
+    # Indicates the start of a UTF-16 surrogate pair, see
+    # https://en.wikipedia.org/wiki/Universal_Character_Set_characters
+    HIGH_SURROGATE_RANGE = (0xD800..0xDBFF) #: Range[Integer]
+    #: Hash[Integer, Array[Integer]]
     attr_reader :map
+    #: (String) -> void
     def initialize(data)
-      @map = {}
+      @map = {} #: Hash[Integer, Array[Integer]]
       process_data(data)
     end
+    #: () -> Integer
     def size
       @map.size
     end
@@ -61,12 +68,14 @@ class PDF::Reader
     #
     # Returns an array of Integers.
     #
+    #: (Integer) -> Array[Integer]
     def decode(c)
       @map.fetch(c, [])
     end
     private
+    #: (String, ?Symbol) -> void
     def process_data(data, initial_mode = :none)
       parser = build_parser(data)
       mode = initial_mode
@@ -96,6 +105,7 @@ class PDF::Reader
     end
+    #: (String) -> PDF::Reader::Parser
     def build_parser(instructions)
       buffer = Buffer.new(StringIO.new(instructions))
       Parser.new(buffer)
@@ -109,6 +119,7 @@ class PDF::Reader
     # However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
     # exception when we try converting broken UTF-16 to UTF-8
     #
+    #: (String) -> Array[Integer]
     def str_to_int(str)
       unpacked_string = if str.bytesize == 1 # UTF-8
         str.unpack("C*")
@@ -117,15 +128,16 @@ class PDF::Reader
       end
       result = []
       while unpacked_string.any? do
-        if unpacked_string.size >= 2 &&
-            unpacked_string.first.to_i >= 0xD800 &&
-            unpacked_string.first.to_i <= 0xDBFF
+        if unpacked_string.size >= 2 && HIGH_SURROGATE_RANGE.include?(unpacked_string.first.to_i)
           # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
           # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
           # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
           point_one = unpacked_string.shift.to_i
           point_two = unpacked_string.shift.to_i
           result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
+        elsif unpacked_string.size == 1 && HIGH_SURROGATE_RANGE.include?(unpacked_string.first.to_i)
+            # the start of a surrogate pair but the pair is missing. Skip it
+            unpacked_string.shift
         else
           result << unpacked_string.shift
         end
@@ -133,6 +145,7 @@ class PDF::Reader
       result
     end
+    #: (Array[String]) -> void
     def process_bfchar_instructions(instructions)
       instructions.each_slice(2) do |one, two|
         find    = str_to_int(one.to_s)
@@ -143,6 +156,7 @@ class PDF::Reader
       end
     end
+    #: (Array[Array[String] | String]) -> void
     def process_bfrange_instructions(instructions)
       instructions.each_slice(3) do |start, finish, to|
         if start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(String)
@@ -155,6 +169,7 @@ class PDF::Reader
       end
     end
+    #: (String, String, String) -> void
     def bfrange_type_one(start_code, end_code, dst)
       start_code = str_to_int(start_code).first
       end_code   = str_to_int(end_code).first
@@ -168,6 +183,7 @@ class PDF::Reader
       end
     end
+    #: (String, String, Array[String]) -> void
     def bfrange_type_two(start_code, end_code, dst)
       start_code = str_to_int(start_code).first
       end_code   = str_to_int(end_code).first