RubyGems - pdf-reader - Versions diffs - 2.4.1 → 2.7.0 - Mend

pdf-reader 2.4.1 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

checksums.yaml +4 -4
data/CHANGELOG +40 -0
data/README.md +16 -1
data/Rakefile +1 -1
data/examples/extract_fonts.rb +12 -7
data/examples/rspec.rb +1 -0
data/lib/pdf/reader/buffer.rb +63 -21
data/lib/pdf/reader/cid_widths.rb +1 -0
data/lib/pdf/reader/cmap.rb +5 -3
data/lib/pdf/reader/encoding.rb +3 -2
data/lib/pdf/reader/error.rb +11 -3
data/lib/pdf/reader/filter/ascii85.rb +7 -1
data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
data/lib/pdf/reader/filter/depredict.rb +10 -8
data/lib/pdf/reader/filter/flate.rb +27 -14
data/lib/pdf/reader/filter/lzw.rb +2 -0
data/lib/pdf/reader/filter/null.rb +1 -0
data/lib/pdf/reader/filter/run_length.rb +19 -13
data/lib/pdf/reader/filter.rb +1 -0
data/lib/pdf/reader/font.rb +1 -0
data/lib/pdf/reader/font_descriptor.rb +1 -0
data/lib/pdf/reader/form_xobject.rb +1 -0
data/lib/pdf/reader/glyph_hash.rb +16 -9
data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
data/lib/pdf/reader/lzw.rb +4 -2
data/lib/pdf/reader/null_security_handler.rb +1 -0
data/lib/pdf/reader/object_cache.rb +1 -0
data/lib/pdf/reader/object_hash.rb +8 -3
data/lib/pdf/reader/object_stream.rb +1 -0
data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
data/lib/pdf/reader/page.rb +60 -9
data/lib/pdf/reader/page_layout.rb +37 -23
data/lib/pdf/reader/page_state.rb +18 -23
data/lib/pdf/reader/page_text_receiver.rb +28 -5
data/lib/pdf/reader/pages_strategy.rb +1 -0
data/lib/pdf/reader/parser.rb +12 -7
data/lib/pdf/reader/point.rb +25 -0
data/lib/pdf/reader/print_receiver.rb +1 -0
data/lib/pdf/reader/rectangle.rb +95 -0
data/lib/pdf/reader/reference.rb +1 -0
data/lib/pdf/reader/register_receiver.rb +1 -0
data/lib/pdf/reader/resource_methods.rb +5 -0
data/lib/pdf/reader/standard_security_handler.rb +1 -0
data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
data/lib/pdf/reader/stream.rb +1 -0
data/lib/pdf/reader/synchronized_cache.rb +1 -0
data/lib/pdf/reader/text_run.rb +1 -0
data/lib/pdf/reader/token.rb +1 -0
data/lib/pdf/reader/transformation_matrix.rb +1 -0
data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
data/lib/pdf/reader/width_calculator/built_in.rb +8 -15
data/lib/pdf/reader/width_calculator/composite.rb +1 -0
data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
data/lib/pdf/reader/width_calculator.rb +1 -0
data/lib/pdf/reader/xref.rb +7 -1
data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
data/lib/pdf/reader.rb +14 -4
data/lib/pdf-reader.rb +1 -0
data/rbi/pdf-reader.rbi +1744 -0
metadata +17 -13
data/lib/pdf/reader/orientation_detector.rb +0 -34

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2b38615953615bfbca1a80ab344f26166377d8c08d2ba2e05badf43c10682415
-  data.tar.gz: 658b5d05a14300ad056ee31c10ea998533ccb1b91600e8bc9097070605d003ea
+  metadata.gz: 5ee0d8c3c55f6a0aebb60a0a6dce92428e8371b96a6beb6d75bfe90602bffae7
+  data.tar.gz: '0911d108353bf577aa9fd7b49b97dda1cf9d54816bf8ff6c4225281eeda63229'
 SHA512:
-  metadata.gz: 210b0bee8c4ac009808555c8ba945f3b17b85af22126ac1440eb9b49d91f542f1974b0984efb22726985f2cf8e03440511ebc4664ac5c4d91a6bddea9a43687e
-  data.tar.gz: 8fb60cb59dc4430179a4b9ba83d30ae6dc23aa13dbef5e8febe1569311ddf7e531783da7e7dd0a6542f0087748e97898af56d540e7c088832f213b48059aa7d3
+  metadata.gz: 917db2b1fb977b41e7b057ff3d215b8f249577254d9fe3df72f330b32ff49630874c58f480495ddcd137d9f31d014083438623cdf7260b0d7a87bbe3a5f3685a
+  data.tar.gz: cd9832f025264e54d586e81eff69727379e8646d741f53ae61e90a5b38945d852147853891d468bab683581bdd0beb68a9b7c7f5e54e064e9a3935262ea9d651

data/CHANGELOG CHANGED Viewed

@@ -1,3 +1,43 @@
+v2.7.0 (13th December 2021)
+- Include RBI type files in the gem
+  - Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
+    now be type checked by sorbet
+- Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
+  - Improved text extraction on some rotated pages, and rotated text on normal pages
+- Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
+  - Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
+- Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
+- Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
+- Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
+v2.6.0 (12th November 2021)
+- Text extraction improvements
+  - Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
+  - Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
+  - Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
+  - Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
+- Performance improvements
+  - Reduced memory pressure when decoding TIFF images (http://github.com/yob/pdf-reader/pull/360)
+  - Optional dependency on ascii85_native gem for faster processing of files using the ascii85 filter (http://github.com/yob/pdf-reader/pull/359)
+- Successfully parse more files
+  - Gracefully handle some non-spec compliant CR/LF issues (http://github.com/yob/pdf-reader/pull/364)
+  - Fix parsing of some escape sequences in content streams (http://github.com/yob/pdf-reader/pull/368)
+  - Increase the amount of junk bytes we detect and skip at the end of a file (http://github.com/yob/pdf-reader/pull/382)
+  - Ignore "/Prev 0" in trailers (http://github.com/yob/pdf-reader/pull/383)
+  - Fix parsing of some inline images (BI ID EI tokens) (http://github.com/yob/pdf-reader/pull/389)
+  - Gracefully handle some xref tables that incorrectly start with 1 (http://github.com/yob/pdf-reader/pull/384)
+v2.5.0 (6th June 2021)
+- bump minimum ruby version to 2.0
+- Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
+- Fix some character spacing issues with the TJ operator [#343](https://github.com/yob/pdf-reader/pull/343)
+- Fix crash with some encrypted PDFs [#348](https://github.com/yob/pdf-reader/pull/348/)
+- Fix positions of text on some PDFs with pages rotated 90° [#350](https://github.com/yob/pdf-reader/pull/350/)
+v2.4.2 (28th January 2021)
+- relax ASCII85 dependency to allow 1.x
+- improved support for decompressing objects with slightly malformed zlib data
 v.2.4.1 (24th September 2020)
 - Re-vendor font metrics from Adobe to clarify their license

data/README.md CHANGED Viewed

@@ -166,6 +166,19 @@ http://groups.google.com/group/pdf-reader
 The easiest way to explain how this works in practice is to show some examples.
 Check out the examples/ directory for a few files.
+# Alternate Decoder
+For PDF files containing Ascii85 streams, the [ascii85_native](https://github.com/AnomalousBit/ascii85_native) gem can be used for increased performance. If the ascii85_native gem is detected, pdf-reader will automatically use the gem.
+First, run `gem install ascii85_native` and then require the gem alongside pdf-reader:
+```ruby
+require "pdf-reader"
+require "ascii85_native"
+```
+Another way of enabling native Ascii85 decoding is to place `gem 'ascii85_native'` in your project's `Gemfile`.
 # Known Limitations
 Occasionally some text cannot be extracted properly due to the way it has been
@@ -176,7 +189,9 @@ little UTF-8 friendly box to indicate an unrecognisable character.
 * PDF::Reader Code Repository: http://github.com/yob/pdf-reader
-* PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
+* PDF Specification: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
+* Adobe PDF Developer Resources: http://www.adobe.com/devnet/pdf/pdf_reference.html
 * PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html

data/Rakefile CHANGED Viewed

@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
 Cane::RakeTask.new(:quality) do |cane|
   cane.abc_max = 20
   cane.style_measure = 100
-  cane.max_violations = 31
+  cane.max_violations = 32
   cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
 end

data/examples/extract_fonts.rb CHANGED Viewed

@@ -17,8 +17,8 @@ module ExtractFonts
       return count if page.fonts.nil? || page.fonts.empty?
       page.fonts.each do |label, font|
-        next if complete_refs[font]
-        complete_refs[font] = true
+        next if complete_refs[label]
+        complete_refs[label] = true
         process_font(page, font)
@@ -39,7 +39,7 @@ module ExtractFonts
       when :TrueType, :CIDFontType2 then
         ExtractFonts::TTF.new(page.objects, font).save("#{font[:BaseFont]}.ttf")
       else
-        $stderr.puts "unsupported font type #{font[:Subtype]}"
+        $stderr.puts "unsupported font type #{font[:Subtype]} for #{font[:BaseFont]}"
       end
     end
@@ -68,10 +68,15 @@ module ExtractFonts
   end
 end
-filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-unicode.pdf"
+if ARGV.size == 0 # default file name
+  ARGV << File.expand_path(File.join(File.dirname(__dir__), "spec", "data", "cairo-unicode.pdf"))
+end
 extractor = ExtractFonts::Extractor.new
-PDF::Reader.open(filename) do |reader|
-  page = reader.page(1)
-  extractor.page(page)
+ARGV.each do |arg|
+  PDF::Reader.open(arg) do |reader|
+    page = reader.page(1)
+    extractor.page(page)
+  end
 end

data/examples/rspec.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 #!/usr/bin/env ruby
 # coding: utf-8
+# typed: ignore
 #  Basic RSpec of a generated PDF
 #

data/lib/pdf/reader/buffer.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # coding: ASCII-8BIT
+# typed: false
 # frozen_string_literal: true
 ################################################################################
@@ -48,6 +49,15 @@ class PDF::Reader
     ID = "ID"
     FWD_SLASH = "/"
     NULL_BYTE = "\x00"
+    CR = "\r"
+    LF = "\n"
+    CRLF = "\r\n"
+    WHITE_SPACE = [LF, CR, ' ']
+    # Quite a few PDFs have trailing junk.
+    # This can be several k of nuls in some cases
+    # Allow for this here
+    TRAILING_BYTECOUNT = 5000
     attr_reader :pos
@@ -86,9 +96,12 @@ class PDF::Reader
     #
     # options:
     #
-    #   :skip_eol - if true, the IO stream is advanced past a CRLF or LF that
-    #               is sitting under the io cursor.
-    #
+    #   :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
+    #               that is sitting under the io cursor.
+    #   Note:
+    #   Skipping a bare CR is not spec-compliant.
+    #   This is because the data may start with LF.
+    #   However we check for CRLF first, so the ambiguity is avoided.
     def read(bytes, opts = {})
       reset_pos
@@ -97,9 +110,9 @@ class PDF::Reader
         str = @io.read(2)
         if str.nil?
           return nil
-        elsif str == "\r\n"
+        elsif str == CRLF # This MUST be done before checking for CR alone
           # do nothing
-        elsif str[0,1] == "\n"
+        elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
           @io.seek(-1, IO::SEEK_CUR)
         else
           @io.seek(-2, IO::SEEK_CUR)
@@ -127,8 +140,8 @@ class PDF::Reader
     #
     def find_first_xref_offset
       check_size_is_non_zero
-      @io.seek(-1024, IO::SEEK_END) rescue @io.seek(0)
-      data = @io.read(1024)
+      @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
+      data = @io.read(TRAILING_BYTECOUNT)
       # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
       lines = data.split(/[\n\r]+/).reverse
@@ -217,7 +230,9 @@ class PDF::Reader
       return if @tokens.size < 3
       return if @tokens[2] != "R"
-      if @tokens[0].match(/\d+/) && @tokens[1].match(/\d+/)
+      # must match whole tokens
+      digits_only = %r{\A\d+\z}
+      if @tokens[0].match(digits_only) && @tokens[1].match(digits_only)
         @tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
         @tokens[1] = nil
         @tokens[2] = nil
@@ -225,24 +240,51 @@ class PDF::Reader
       end
     end
+    # Extract data between ID and EI
+    # If the EI follows white-space the space is dropped from the data
+    # The EI must be followed by white-space or end of buffer
+    # This is to reduce the chance of accidentally matching an embedded EI
     def prepare_inline_token
-      str = "".dup
-      buffer = []
-      until buffer[0] =~ /\s|\0/ && buffer[1, 2] == ["E", "I"]
+      idstart = @io.pos
+      chr = prevchr = nil
+      eisize = 0 # how many chars in the end marker
+      seeking = 'E' # what are we looking for now?
+      loop do
         chr = @io.read(1)
-        buffer << chr
-        if buffer.length > 3
-          str << buffer.shift
+        break if chr.nil?
+        case seeking
+        when 'E'
+          if chr == 'E'
+            seeking = 'I'
+            if WHITE_SPACE.include? prevchr
+              eisize = 3 # include whitespace in delimiter, i.e. drop from data
+            else # assume the EI immediately follows the data
+              eisize = 2 # leave prevchr in data
+            end
+          end
+        when 'I'
+          if chr == 'I'
+            seeking = :END
+          else
+            seeking = 'E'
+          end
+        when :END
+          if WHITE_SPACE.include? chr
+            eisize += 1 # Drop trailer
+            break
+          else
+            seeking = 'E'
+          end
         end
+        prevchr = chr
       end
-      str << NULL_BYTE if buffer.first == NULL_BYTE
+      unless seeking == :END
+        raise MalformedPDFError, "EI terminator not found"
+      end
+      eiend = @io.pos
+      @io.seek(idstart, IO::SEEK_SET)
+      str = @io.read(eiend - eisize - idstart) # get the ID content
       @tokens << string_token(str)
-      @io.seek(-3, IO::SEEK_CUR) unless chr.nil?
     end
     # if we're currently inside a hex string, read hex nibbles until

data/lib/pdf/reader/cid_widths.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # coding: utf-8
+# typed: true
 # frozen_string_literal: true
 #

data/lib/pdf/reader/cmap.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # coding: utf-8
+# typed: false
 # frozen_string_literal: true
 ################################################################################
@@ -32,6 +33,7 @@ class PDF::Reader
   # extracting various useful information.
   #
   class CMap # :nodoc:
     CMAP_KEYWORDS = {
       "begincodespacerange" => 1,
       "endcodespacerange" => 1,
@@ -53,7 +55,7 @@ class PDF::Reader
     def process_data(data)
       parser = build_parser(data)
-      mode = nil
+      mode = :none
       instructions = []
       while token = parser.parse_token(CMAP_KEYWORDS)
@@ -62,13 +64,13 @@ class PDF::Reader
         elsif token == "endbfchar"
           process_bfchar_instructions(instructions)
           instructions = []
-          mode = nil
+          mode = :none
         elsif token == "beginbfrange"
           mode = :range
         elsif token == "endbfrange"
           process_bfrange_instructions(instructions)
           instructions = []
-          mode = nil
+          mode = :none
         elsif mode == :char || mode == :range
           instructions << token
         end

data/lib/pdf/reader/encoding.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # coding: utf-8
+# typed: true
 # frozen_string_literal: true
 ################################################################################
@@ -68,7 +69,7 @@ class PDF::Reader
     #
     #   [25, :A, :B]
     def differences=(diff)
-      raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)
+      PDF::Reader::Error.validate_type(diff, "diff", Array)
       @differences = {}
       byte = 0
@@ -208,7 +209,7 @@ class PDF::Reader
     def load_mapping(file)
       File.open(file, "r:BINARY") do |f|
         f.each do |l|
-          _m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
+          _m, single_byte, unicode = *l.match(/\A([0-9A-Za-z]+);([0-9A-F]{4})/)
           @mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
         end
       end

data/lib/pdf/reader/error.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # coding: utf-8
+# typed: strict
 # frozen_string_literal: true
 ################################################################################
@@ -33,19 +34,26 @@ class PDF::Reader
     def self.str_assert(lvalue, rvalue, chars=nil)
       raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
       lvalue = lvalue[0,chars] if chars
-      raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead"  if lvalue != rvalue
+      raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead"  if lvalue != rvalue
     end
     ################################################################################
     def self.str_assert_not(lvalue, rvalue, chars=nil)
       raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
       lvalue = lvalue[0,chars] if chars
-      raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead"  if lvalue == rvalue
+      raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead"  if lvalue == rvalue
     end
     ################################################################################
     def self.assert_equal(lvalue, rvalue)
-      raise MalformedPDFError, "PDF malformed, expected #{rvalue} but found #{lvalue} instead" if lvalue != rvalue
+      raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
     end
     ################################################################################
+    def self.validate_type(object, name, klass)
+      raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
+    end
+    ################################################################################
+    def self.validate_not_nil(object, name)
+      raise ArgumentError, "#{object} must not be nil" if object.nil?
+    end
   end
   ################################################################################

data/lib/pdf/reader/filter/ascii85.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # coding: utf-8
+# typed: false
 # frozen_string_literal: true
 require 'ascii85'
@@ -7,6 +8,7 @@ class PDF::Reader
   module Filter # :nodoc:
     # implementation of the Ascii85 filter
     class Ascii85
       def initialize(options = {})
         @options = options
       end
@@ -17,7 +19,11 @@ class PDF::Reader
       #
       def filter(data)
         data = "<~#{data}" unless data.to_s[0,2] == "<~"
-        ::Ascii85::decode(data)
+        if defined?(::Ascii85Native)
+          ::Ascii85Native::decode(data)
+        else
+          ::Ascii85::decode(data)
+        end
       rescue Exception => e
         # Oops, there was a problem decoding the stream
         raise MalformedPDFError,

data/lib/pdf/reader/filter/ascii_hex.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # coding: utf-8
+# typed: true
 # frozen_string_literal: true
 #
@@ -6,6 +7,7 @@ class PDF::Reader
   module Filter # :nodoc:
     # implementation of the AsciiHex stream filter
     class AsciiHex
       def initialize(options = {})
         @options = options
       end
@@ -16,9 +18,12 @@ class PDF::Reader
       def filter(data)
         data.chop! if data[-1,1] == ">"
         data = data[1,data.size] if data[0,1] == "<"
+        return "" if data.nil?
         data.gsub!(/[^A-Fa-f0-9]/,"")
         data << "0" if data.size % 2 == 1
-        data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
+        data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
       rescue Exception => e
         # Oops, there was a problem decoding the stream
         raise MalformedPDFError,

data/lib/pdf/reader/filter/depredict.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # coding: utf-8
+# typed: true
 # frozen_string_literal: true
 class PDF::Reader
@@ -6,6 +7,7 @@ class PDF::Reader
     # some filter implementations support preprocessing of the  data to
     # improve compression
     class Depredict
       def initialize(options = {})
         @options = options || {}
       end
@@ -34,7 +36,7 @@ class PDF::Reader
       ################################################################################
       def tiff_depredict(data)
         data        = data.unpack("C*")
-        unfiltered  = []
+        unfiltered  = ''
         bpc         = @options[:BitsPerComponent] || 8
         pixel_bits  = bpc * @options[:Colors]
         pixel_bytes = pixel_bits / 8
@@ -51,11 +53,11 @@ class PDF::Reader
             left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
             row_data[index] = (byte + left) % 256
           end
-          unfiltered += row_data
+          unfiltered += row_data.pack("C*")
           pos += line_len
         end
-        unfiltered.pack("C*")
+        unfiltered
       end
       ################################################################################
       def png_depredict(data)
@@ -67,7 +69,7 @@ class PDF::Reader
         scanline_length = (pixel_bytes * @options[:Columns]) + 1
         row = 0
         pixels = []
-        paeth, pa, pb, pc = nil
+        paeth, pa, pb, pc = 0, 0, 0, 0
         until data.empty? do
           row_data = data.slice! 0, scanline_length
           filter = row_data.shift
@@ -94,17 +96,17 @@ class PDF::Reader
               row_data[index] = (byte + ((left + upper)/2).floor) % 256
             end
           when 4 # Paeth
-            left = upper = upper_left = nil
+            left = upper = upper_left = 0
             row_data.each_with_index do |byte, index|
               col = index / pixel_bytes
-              left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
+              left = index < pixel_bytes ? 0 : Integer(row_data[index - pixel_bytes])
               if row.zero?
                 upper = upper_left = 0
               else
-                upper = pixels[row-1][col][index % pixel_bytes]
+                upper = Integer(pixels[row-1][col][index % pixel_bytes])
                 upper_left = col.zero? ? 0 :
-                  pixels[row-1][col-1][index % pixel_bytes]
+                  Integer(pixels[row-1][col-1][index % pixel_bytes])
               end
               p = left + upper - upper_left

data/lib/pdf/reader/filter/flate.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # coding: utf-8
+# typed: true
 # frozen_string_literal: true
@@ -8,7 +9,9 @@ class PDF::Reader
   module Filter # :nodoc:
     # implementation of the Flate (zlib) stream filter
     class Flate
       ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47  # Zlib::MAX_WBITS + 32
+      ZLIB_RAW_DEFLATE              = -15 # Zlib::MAX_WBITS * -1
       def initialize(options = {})
         @options = options
@@ -17,24 +20,34 @@ class PDF::Reader
       ################################################################################
       # Decode the specified data with the Zlib compression algorithm
       def filter(data)
-        deflated = nil
+        deflated = zlib_inflate(data) || zlib_inflate(data[0, data.bytesize-1])
+        if deflated.nil?
+          raise MalformedPDFError,
+            "Error while inflating a compressed stream (no suitable inflation algorithm found)"
+        end
+        Depredict.new(@options).filter(deflated)
+      end
+      private
+      def zlib_inflate(data)
         begin
-          deflated = Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
-        rescue Zlib::DataError => e
+          return Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
+        rescue Zlib::DataError
           # by default, Ruby's Zlib assumes the data it's inflating
           # is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
-          # fails, then use a lightly-documented 'feature' to attempt to inflate
-          # the data as a raw RFC1951 stream.
-          #
-          # See
-          # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
-          deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
+          # fails, swallow the exception and attempt to inflate the data as a raw
+          # RFC1951 stream.
         end
-        Depredict.new(@options).filter(deflated)
-      rescue Exception => e
-        # Oops, there was a problem inflating the stream
-        raise MalformedPDFError,
-          "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
+        begin
+          return Zlib::Inflate.new(ZLIB_RAW_DEFLATE).inflate(data)
+        rescue StandardError
+          # swallow this one too, so we can try some other fallback options
+        end
+        nil
       end
     end
   end

data/lib/pdf/reader/filter/lzw.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # coding: utf-8
+# typed: true
 # frozen_string_literal: true
 #
@@ -6,6 +7,7 @@ class PDF::Reader
   module Filter # :nodoc:
     # implementation of the LZW stream filter
     class Lzw
       def initialize(options = {})
         @options = options
       end

data/lib/pdf/reader/filter/null.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # coding: utf-8
+# typed: true
 # frozen_string_literal: true
 #

data/lib/pdf/reader/filter/run_length.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # coding: utf-8
+# typed: true
 # frozen_string_literal: true
 #
@@ -6,6 +7,7 @@ class PDF::Reader # :nodoc:
   module Filter # :nodoc:
     # implementation of the run length stream filter
     class RunLength
       def initialize(options = {})
         @options = options
       end
@@ -20,19 +22,23 @@ class PDF::Reader # :nodoc:
           length = data.getbyte(pos)
           pos += 1
-          case
-          when length == 128
-            break
-          when length < 128
-            # When the length is < 128, we copy the following length+1 bytes
-            # literally.
-            out << data[pos, length + 1]
-            pos += length
-          else
-            # When the length is > 128, we copy the next byte (257 - length)
-            # times; i.e., "\xFA\x00" ([250, 0]) will expand to
-            # "\x00\x00\x00\x00\x00\x00\x00".
-            out << data[pos, 1] * (257 - length)
+          unless length.nil?
+            case
+              # nothing
+            when length == 128
+              break
+            when length < 128
+              # When the length is < 128, we copy the following length+1 bytes
+              # literally.
+              out << data[pos, length + 1]
+              pos += length
+            else
+              # When the length is > 128, we copy the next byte (257 - length)
+              # times; i.e., "\xFA\x00" ([250, 0]) will expand to
+              # "\x00\x00\x00\x00\x00\x00\x00".
+              previous_byte = data[pos, 1] || ""
+              out << previous_byte * (257 - length)
+            end
           end
           pos += 1

data/lib/pdf/reader/filter.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # coding: utf-8
+# typed: strict
 # frozen_string_literal: true
 ################################################################################

data/lib/pdf/reader/font.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 # coding: utf-8
+# typed: true
 # frozen_string_literal: true
 ################################################################################