RubyGems - hexapdf - Versions diffs - 0.14.2 → 0.14.3 - Mend

hexapdf 0.14.2 → 0.14.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +18 -0
data/lib/hexapdf/dictionary.rb +3 -0
data/lib/hexapdf/dictionary_fields.rb +1 -1
data/lib/hexapdf/font/true_type/subsetter.rb +5 -1
data/lib/hexapdf/parser.rb +15 -7
data/lib/hexapdf/pdf_array.rb +3 -0
data/lib/hexapdf/serializer.rb +3 -2
data/lib/hexapdf/tokenizer.rb +22 -0
data/lib/hexapdf/type/acro_form/appearance_generator.rb +3 -0
data/lib/hexapdf/type/resources.rb +4 -0
data/lib/hexapdf/version.rb +1 -1
data/test/hexapdf/font/true_type/test_subsetter.rb +2 -0
data/test/hexapdf/test_dictionary_fields.rb +7 -0
data/test/hexapdf/test_parser.rb +36 -0
data/test/hexapdf/test_serializer.rb +7 -0
data/test/hexapdf/test_tokenizer.rb +28 -0
data/test/hexapdf/test_writer.rb +2 -2
data/test/hexapdf/type/acro_form/test_appearance_generator.rb +3 -1
data/test/hexapdf/type/test_resources.rb +6 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 22367739c2160e7a5dbc9e8b20bfefb06ef96664ccb814e4ab719a9bce7b68f3
-  data.tar.gz: 8f703fa1e8e7b9d0b966e37becbecbbb7a4c1d81d0a823d770fa3d531efb1a4a
+  metadata.gz: c43d8e9e117db1717ddfee73a54e4384743b8aa35863ab5bd19ffe57b8ce5674
+  data.tar.gz: 1020c8a3de8fcdf201500c1c0d22dfb99ed27daebac7baac92748f8127efc992
 SHA512:
-  metadata.gz: b05954e3c3890cbbc40d8171e6f1d7f6375569d69b5c90fc0e74b5ea5553ff5a45f1090a3741f50bfab7a6e0725b1e79f5c0eab7b92e6f7e518f38eb1eb6a3f8
-  data.tar.gz: 2a3441b7ee7ca89e1417ea4134c3ab7444b4f791f5cba274361e719626fe9b0b08c903425b53a7fb7b04be3cab96a4edbb5c502dc1e7b4e3cdc809f8a9ebafb6
+  metadata.gz: e19eea4e88077afb7e8532fa6fe9ab2a03ffc5588749b72277462a971ebcec877ee72868d0ab698744117d46566be98e65c10225649d3bd1b4cd6e64e9625767
+  data.tar.gz: 6626a9feba0af0b46f293c1069a0d53b458a0dc29d08b82253f14f9bb98a878b914042faccc433b73f2f0e35d4da47c58a1bdebd2f3dee2fefb24c076a4e6bb3

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,21 @@
+## 0.14.3 - 2021-02-16
+### Fixed
+* Bug in [HexaPDF::Font::TrueType::Subsetter#use_glyph] which led to corrupt
+  text output
+* [HexaPDF::Serializer] to handle infinite recursion problem
+* Cross-reference table reconstruction to avoid an O(n^2) performance problem
+* [HexaPDF::Type::Resources] validation to handle an invalid `/ProcSet` entry
+  containing a single value instead of an array
+* Processing of invalid PDF files missing a required value in appearance streams
+* Processing of invalid empty arrays that should be rectangles by converting
+  them to PDF null objects
+* Processing of invalid PDF files containing indirect objects with offset 0
+* Processing of invalid PDF files containing a space/CR or space/LF combination
+  after the 'stream' keyword
 ## 0.14.2 - 2021-01-22
 ### Fixed

data/lib/hexapdf/dictionary.rb CHANGED Viewed

@@ -156,6 +156,9 @@ module HexaPDF
     #
     # * Returns the default value if one is specified and no value is available.
     #
+    # Note: If field information is available for the entry, a Hash or Array value will always be
+    # wrapped by Dictionary or PDFArray. Otherwise, the value will be returned as-is.
+    #
     # Note: This method may throw a "can't add a new key into hash during iteration" error in
     # certain cases because it potentially modifies the underlying hash!
     def [](name)

data/lib/hexapdf/dictionary_fields.rb CHANGED Viewed

@@ -344,7 +344,7 @@ module HexaPDF
       # Wraps a given array in the Rectangle class. Otherwise returns +nil+.
       def self.convert(data, _type, document)
         return unless data.kind_of?(Array) || data.kind_of?(HexaPDF::PDFArray)
-        document.wrap(data, type: Rectangle)
+        data.empty? ? document.wrap(nil) : document.wrap(data, type: Rectangle)
       end
     end

data/lib/hexapdf/font/true_type/subsetter.rb CHANGED Viewed

@@ -67,7 +67,11 @@ module HexaPDF
           # they never appear in the output (PDF serialization would need to escape them)
           if @last_id == 13 || @last_id == 40 || @last_id == 92
             @glyph_map[:"s#{@last_id}"] = @last_id
-            @last_id += (@last_id == 40 ? 2 : 1)
+            if @last_id == 40
+              @last_id += 1
+              @glyph_map[:"s#{@last_id}"] = @last_id
+            end
+            @last_id += 1
           end
           @glyph_map[glyph_id] = @last_id
         end

data/lib/hexapdf/parser.rb CHANGED Viewed

@@ -72,7 +72,13 @@ module HexaPDF
       obj, oid, gen, stream =
         case xref_entry.type
         when :in_use
-          parse_indirect_object(xref_entry.pos)
+          if xref_entry.pos == 0 && xref_entry.oid != 0
+            # Handle seen-in-the-wild objects with invalid offset 0
+            maybe_raise("Indirect object (#{xref_entry.oid},#{xref_entry.gen}) has offset 0", pos: 0)
+            [nil, xref_entry.oid, xref_entry.gen, nil]
+          else
+            parse_indirect_object(xref_entry.pos)
+          end
         when :free
           [nil, xref_entry.oid, xref_entry.gen, nil]
         when :compressed
@@ -83,7 +89,7 @@ module HexaPDF
       if xref_entry.oid != 0 && (oid != xref_entry.oid || gen != xref_entry.gen)
         raise_malformed("The oid,gen (#{oid},#{gen}) values of the indirect object don't match " \
-          "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
+                        "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
       end
       @document.wrap(obj, oid: oid, gen: gen, stream: stream)
@@ -133,7 +139,9 @@ module HexaPDF
         tok1 = @tokenizer.next_byte
         tok2 = @tokenizer.next_byte if tok1 == 13 # 13=CR, 10=LF
         if tok1 != 10 && tok1 != 13
-          raise_malformed("Keyword stream must be followed by LF or CR/LF", pos: @tokenizer.pos)
+          tok2 = @tokenizer.next_byte
+          maybe_raise("Keyword stream must be followed by LF or CR/LF", pos: @tokenizer.pos,
+                      force: tok1 != 32 || (tok2 != 10 && tok2 != 13)) # 32=space
         elsif tok1 == 13 && tok2 != 10
           maybe_raise("Keyword stream must be followed by LF or CR/LF, not CR alone",
                       pos: @tokenizer.pos)
@@ -390,14 +398,14 @@ module HexaPDF
       while true
         @tokenizer.skip_whitespace
         pos = @tokenizer.pos
-        @tokenizer.scan_until(/(\n|\r\n?)+/)
+        @tokenizer.scan_until(/(\n|\r\n?)+|\z/)
         next_new_line_pos = @tokenizer.pos
         @tokenizer.pos = pos
-        token = @tokenizer.next_token rescue nil
+        token = @tokenizer.next_integer_or_keyword rescue nil
         if token.kind_of?(Integer)
-          gen = @tokenizer.next_token rescue nil
-          tok = @tokenizer.next_token rescue nil
+          gen = @tokenizer.next_integer_or_keyword rescue nil
+          tok = @tokenizer.next_integer_or_keyword rescue nil
           if @tokenizer.pos > next_new_line_pos
             @tokenizer.pos = next_new_line_pos
           elsif gen.kind_of?(Integer) && tok.kind_of?(Tokenizer::Token) && tok == 'obj'

data/lib/hexapdf/pdf_array.rb CHANGED Viewed

@@ -65,6 +65,9 @@ module HexaPDF
     # * Returns the native Ruby object for values with class HexaPDF::Object. However, all
     #   subclasses of HexaPDF::Object are returned as is (it makes no sense, for example, to return
     #   the hash that describes the Catalog instead of the Catalog object).
+    #
+    # Note: Hash or Array values will always be returned as-is, i.e. not wrapped with Dictionary or
+    # PDFArray.
     def [](arg1, arg2 = nil)
       data = arg2 ? value[arg1, arg2] : value[arg1]
       return if data.nil?

data/lib/hexapdf/serializer.rb CHANGED Viewed

@@ -343,6 +343,7 @@ module HexaPDF
           @io << data.freeze
         end
         @io << "\nendstream"
+        @in_object = false
         nil
       else
@@ -350,12 +351,12 @@ module HexaPDF
         obj.value[:Length] = data.size
         str = serialize_hash(obj.value)
+        @in_object = false
         str << "stream\n"
         str << data
         str << "\nendstream"
       end
-    ensure
-      @in_object = false
     end
     # Invokes the correct serialization method for the object.

data/lib/hexapdf/tokenizer.rb CHANGED Viewed

@@ -188,6 +188,28 @@ module HexaPDF
       token
     end
+    # Returns a single integer or keyword token read from the current position and advances the scan
+    # pointer. If the current position doesn't contain such a token, +nil+ is returned without
+    # advancing the scan pointer. The value +NO_MORE_TOKENS+ is returned if there are no more tokens
+    # available.
+    #
+    # Initial runs of whitespace characters are ignored.
+    #
+    # Note: This is a special method meant for use with reconstructing the cross-reference table!
+    def next_integer_or_keyword
+      skip_whitespace
+      byte = @ss.string.getbyte(@ss.pos) || -1
+      if 48 <= byte && byte <= 57
+        parse_number
+      elsif (97 <= byte && byte <= 122) || (65 <= byte && byte <= 90)
+        parse_keyword
+      elsif byte == -1 # we reached the end of the file
+        NO_MORE_TOKENS
+      else
+        nil
+      end
+    end
     # Reads the byte (an integer) at the current position and advances the scan pointer.
     def next_byte
       prepare_string_scanner(1)

data/lib/hexapdf/type/acro_form/appearance_generator.rb CHANGED Viewed

@@ -245,6 +245,9 @@ module HexaPDF
           end
           form = (@widget[:AP] ||= {})[:N] ||= @document.add({Type: :XObject, Subtype: :Form})
+          # Wrap existing object in Form class in case the PDF writer didn't include the /Subtype
+          # key; we can do this since we know this has to be a Form object
+          form = @document.wrap(form, type: :XObject, subtype: :Form) unless form[:Subtype] == :Form
           form.value.replace({Type: :XObject, Subtype: :Form, BBox: [0, 0, rect.width, rect.height]})
           form.contents = ''
           form[:Resources] = HexaPDF::Object.deep_copy(default_resources)

data/lib/hexapdf/type/resources.rb CHANGED Viewed

@@ -222,6 +222,10 @@ module HexaPDF
           yield("No procedure set specified", true)
           self[:ProcSet] = [:PDF, :Text, :ImageB, :ImageC, :ImageI]
         else
+          if val.kind_of?(Symbol)
+            yield("Procedure set is a single value instead of an Array", true)
+            val = value[:ProcSet] = [val]
+          end
           val.reject! do |name|
             case name
             when :PDF, :Text, :ImageB, :ImageC, :ImageI

data/lib/hexapdf/version.rb CHANGED Viewed

@@ -37,6 +37,6 @@
 module HexaPDF
   # The version of HexaPDF.
-  VERSION = '0.14.2'
+  VERSION = '0.14.3'
 end

data/test/hexapdf/font/true_type/test_subsetter.rb CHANGED Viewed

@@ -29,6 +29,8 @@ describe HexaPDF::Font::TrueType::Subsetter do
   it "doesn't use certain subset glyph IDs for performance reasons" do
     1.upto(93) {|i| @subsetter.use_glyph(i) }
+    # glyph 0, 93 used glyph, 4 special glyphs
+    assert_equal(1 + 93 + 4, @subsetter.instance_variable_get(:@glyph_map).size)
     1.upto(12) {|i| assert_equal(i, @subsetter.subset_glyph_id(i), "id=#{i}") }
     13.upto(38) {|i| assert_equal(i + 1, @subsetter.subset_glyph_id(i), "id=#{i}") }
     39.upto(88) {|i| assert_equal(i + 3, @subsetter.subset_glyph_id(i), "id=#{i}") }

data/test/hexapdf/test_dictionary_fields.rb CHANGED Viewed

@@ -234,5 +234,12 @@ describe HexaPDF::DictionaryFields do
       @field.convert(data, doc)
       doc.verify
     end
+    it "converts to a null value if an (invalid) empty array is given" do
+      doc = Minitest::Mock.new
+      doc.expect(:wrap, :data, [nil])
+      @field.convert([], doc)
+      doc.verify
+    end
   end
 end

data/test/hexapdf/test_parser.rb CHANGED Viewed

@@ -88,6 +88,12 @@ describe HexaPDF::Parser do
       assert_equal('12', TestHelper.collector(stream.fiber))
     end
+    it "handles keyword stream followed by space and CR or LF" do
+      create_parser("1 0 obj<</Length 2>> stream \n12\nendstream endobj")
+      *, stream = @parser.parse_indirect_object
+      assert_equal('12', TestHelper.collector(stream.fiber))
+    end
     it "handles invalid indirect object value consisting of number followed by endobj without space" do
       create_parser("1 0 obj 749endobj")
       object, * = @parser.parse_indirect_object
@@ -157,6 +163,12 @@ describe HexaPDF::Parser do
         assert_match(/not CR alone/, exp.message)
       end
+      it "fails if keyword stream is followed by space and CR or LF instead of LF or CR/LF" do
+        create_parser("1 0 obj<</Length 2>> stream \n12\nendstream endobj")
+        exp = assert_raises(HexaPDF::MalformedPDFError) { @parser.parse_indirect_object }
+        assert_match(/must be followed by LF or CR\/LF/, exp.message)
+      end
       it "fails for numbers followed by endobj without space" do
         create_parser("1 0 obj 749endobj")
         exp = assert_raises(HexaPDF::MalformedPDFError) { @parser.parse_indirect_object }
@@ -222,6 +234,23 @@ describe HexaPDF::Parser do
       assert_equal([1, 2], obj.value)
     end
+    it "handles an invalid indirect object offset of 0" do
+      obj = @parser.load_object(HexaPDF::XRefSection.in_use_entry(2, 0, 0))
+      assert(obj.null?)
+      assert_equal(2, obj.oid)
+      assert_equal(0, obj.gen)
+    end
+    describe "with strict parsing" do
+      it "raises an error if an indirect object has an offset of 0" do
+        @document.config['parser.on_correctable_error'] = proc { true }
+        exp = assert_raises(HexaPDF::MalformedPDFError) do
+          @parser.load_object(HexaPDF::XRefSection.in_use_entry(2, 0, 0))
+        end
+        assert_match(/has offset 0/, exp.message)
+      end
+    end
     it "fails if another object is found instead of an object stream" do
       def (@document).object(_oid)
         :invalid
@@ -512,6 +541,13 @@ describe HexaPDF::Parser do
       assert_equal(6, @parser.load_object(@xref).value)
     end
+    it "handles pathalogical cases which contain many opened literal strings" do
+      time = Time.now
+      create_parser("(1" << "(abc\n" * 10000 << "\n1 0 obj\n6\nendobj\ntrailer\n<</Size 1>>")
+      assert_equal(6, @parser.load_object(@xref).value)
+      assert(Time.now - time < 0.5, "Xref reconstruction takes too long")
+    end
     it "ignores invalid objects" do
       create_parser("1 x obj\n5\nendobj\n1 0 xobj\n6\nendobj\n1 0 obj 4\nendobj\ntrailer\n<</Size 1>>")
       assert_equal(4, @parser.load_object(@xref).value)

data/test/hexapdf/test_serializer.rb CHANGED Viewed

@@ -153,6 +153,13 @@ describe HexaPDF::Serializer do
       assert_equal("<</Key(value)/Length 6>>stream\nsome\nendstream", io.string)
     end
+    it "doesn't reset the internal recursion flag if the stream is serialized as part of another object" do
+      object = HexaPDF::Dictionary.new({}, oid: 5)
+      object[:Stream] = @stream
+      object[:Self] = object # needs to be the last entry so that :Stream gets serialized first!
+      assert_serialized("<</Stream 2 0 R/Self 5 0 R>>", object)
+    end
     it "fails if a stream without object identifier is serialized" do
       @stream.oid = 0
       assert_raises(HexaPDF::Error) { @serializer.serialize(@stream) }

data/test/hexapdf/test_tokenizer.rb CHANGED Viewed

@@ -27,4 +27,32 @@ describe HexaPDF::Tokenizer do
       5.times {|i| assert_equal(i, @tokenizer.next_token) }
     end
   end
+  it "has a special token scanning method for use with xref reconstruction" do
+    create_tokenizer(<<-EOF.chomp.gsub(/^ {8}/, ''))
+        % Comment
+          true
+        123 50
+        obj
+        (ignored)
+        /Ignored
+        [/Ignored]
+        <</Ignored /Values>>
+    EOF
+    scan_to_newline = proc { @tokenizer.scan_until(/(\n|\r\n?)+|\z/) }
+    assert_nil(@tokenizer.next_integer_or_keyword)
+    scan_to_newline.call
+    assert_equal(true, @tokenizer.next_integer_or_keyword)
+    assert_equal(123, @tokenizer.next_integer_or_keyword)
+    assert_equal(50, @tokenizer.next_integer_or_keyword)
+    assert_equal('obj', @tokenizer.next_integer_or_keyword)
+    4.times do
+      assert_nil(@tokenizer.next_integer_or_keyword)
+      scan_to_newline.call
+    end
+    assert_equal(HexaPDF::Tokenizer::NO_MORE_TOKENS, @tokenizer.next_integer_or_keyword)
+  end
 end

data/test/hexapdf/test_writer.rb CHANGED Viewed

@@ -40,7 +40,7 @@ describe HexaPDF::Writer do
       219
       %%EOF
       3 0 obj
-      <</Producer(HexaPDF version 0.14.2)>>
+      <</Producer(HexaPDF version 0.14.3)>>
       endobj
       xref
       3 1
@@ -72,7 +72,7 @@ describe HexaPDF::Writer do
       141
       %%EOF
       6 0 obj
-      <</Producer(HexaPDF version 0.14.2)>>
+      <</Producer(HexaPDF version 0.14.3)>>
       endobj
       2 0 obj
       <</Length 10>>stream

data/test/hexapdf/type/acro_form/test_appearance_generator.rb CHANGED Viewed

@@ -407,10 +407,12 @@ describe HexaPDF::Type::AcroForm::AppearanceGenerator do
       @generator.create_appearances
       form = @widget[:AP][:N]
       form[:key] = :value
+      form.delete(:Subtype)
+      @widget[:AP][:N] = @doc.wrap(form, type: HexaPDF::Dictionary)
       @field[:V] = 'test1'
       @generator.create_appearances
-      assert_same(form, @widget[:AP][:N])
+      assert_equal(form, @widget[:AP][:N])
       refute(form.key?(:key))
       assert_match(/test1/, form.contents)
     end

data/test/hexapdf/type/test_resources.rb CHANGED Viewed

@@ -194,6 +194,12 @@ describe HexaPDF::Type::Resources do
       assert_equal([:PDF, :Text, :ImageB, :ImageC, :ImageI], @res[:ProcSet].value)
     end
+    it "handles an invalid ProcSet containing a single value instead of an array" do
+      @res[:ProcSet] = :PDF
+      @res.validate
+      assert_equal([:PDF], @res[:ProcSet].value)
+    end
     it "removes invalid procedure set names from ProcSet" do
       @res[:ProcSet] = [:PDF, :Unknown]
       @res.validate

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: hexapdf
 version: !ruby/object:Gem::Version
-  version: 0.14.2
+  version: 0.14.3
 platform: ruby
 authors:
 - Thomas Leitner
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-01-22 00:00:00.000000000 Z
+date: 2021-02-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: cmdparse