RubyGems - hexapdf - Versions diffs - 0.22.0 → 0.23.0 - Mend

hexapdf 0.22.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +50 -0
data/lib/hexapdf/cli/form.rb +26 -3
data/lib/hexapdf/cli/inspect.rb +12 -3
data/lib/hexapdf/cli/modify.rb +23 -3
data/lib/hexapdf/composer.rb +24 -2
data/lib/hexapdf/document/destinations.rb +396 -0
data/lib/hexapdf/document.rb +38 -89
data/lib/hexapdf/layout/frame.rb +8 -9
data/lib/hexapdf/layout/style.rb +280 -7
data/lib/hexapdf/layout/text_box.rb +10 -2
data/lib/hexapdf/layout/text_layouter.rb +6 -1
data/lib/hexapdf/revision.rb +8 -1
data/lib/hexapdf/revisions.rb +151 -50
data/lib/hexapdf/task/optimize.rb +21 -11
data/lib/hexapdf/type/acro_form/text_field.rb +8 -0
data/lib/hexapdf/type/catalog.rb +9 -1
data/lib/hexapdf/type/names.rb +13 -0
data/lib/hexapdf/type/xref_stream.rb +2 -1
data/lib/hexapdf/utils/sorted_tree_node.rb +3 -1
data/lib/hexapdf/version.rb +1 -1
data/lib/hexapdf/writer.rb +15 -2
data/test/hexapdf/document/test_destinations.rb +338 -0
data/test/hexapdf/encryption/test_security_handler.rb +2 -2
data/test/hexapdf/layout/test_frame.rb +15 -1
data/test/hexapdf/layout/test_text_box.rb +16 -0
data/test/hexapdf/layout/test_text_layouter.rb +7 -0
data/test/hexapdf/task/test_optimize.rb +17 -4
data/test/hexapdf/test_composer.rb +24 -1
data/test/hexapdf/test_document.rb +30 -133
data/test/hexapdf/test_parser.rb +1 -1
data/test/hexapdf/test_revision.rb +14 -0
data/test/hexapdf/test_revisions.rb +137 -29
data/test/hexapdf/test_writer.rb +43 -14
data/test/hexapdf/type/acro_form/test_text_field.rb +17 -0
data/test/hexapdf/type/test_catalog.rb +8 -0
data/test/hexapdf/type/test_names.rb +20 -0
data/test/hexapdf/type/test_xref_stream.rb +2 -1
data/test/hexapdf/utils/test_sorted_tree_node.rb +11 -1
metadata +5 -2

data/lib/hexapdf/revisions.rb CHANGED Viewed

@@ -51,6 +51,10 @@ module HexaPDF
   # the newest revision the highest index. This is also the order in which the revisions get
   # written.
   #
+  # *Important*: It is possible to manipulate the individual revisions and their objects oneself but
+  # this should only be done if one is familiar with the inner workings of HexaPDF. Otherwise it is
+  # best to use the convenience methods of this class to create, access or delete indirect objects.
+  #
   # See: PDF1.7 s7.5.6, HexaPDF::Revision
   class Revisions
@@ -68,27 +72,26 @@ module HexaPDF
         revisions = []
         begin
-          xref_section, trailer = parser.load_revision(parser.startxref_offset)
-          revisions << Revision.new(document.wrap(trailer, type: :XXTrailer),
-                                    xref_section: xref_section, loader: object_loader)
-          seen_xref_offsets = {parser.startxref_offset => true}
+          offset = parser.startxref_offset
+          seen_xref_offsets = {}
-          while (prev = revisions[0].trailer.value[:Prev]) &&
-              !seen_xref_offsets.key?(prev)
+          while offset && !seen_xref_offsets.key?(offset)
             # PDF1.7 s7.5.5 states that :Prev needs to be indirect, Adobe's reference 3.4.4 says it
             # should be direct. Adobe's POV is followed here. Same with :XRefStm.
-            xref_section, trailer = parser.load_revision(prev)
-            seen_xref_offsets[prev] = true
+            xref_section, trailer = parser.load_revision(offset)
+            seen_xref_offsets[offset] = true
-            stm = revisions[0].trailer.value[:XRefStm]
+            stm = trailer[:XRefStm]
             if stm && !seen_xref_offsets.key?(stm)
               stm_xref_section, = parser.load_revision(stm)
-              xref_section.merge!(stm_xref_section)
+              stm_xref_section.merge!(xref_section)
+              xref_section = stm_xref_section
               seen_xref_offsets[stm] = true
             end
             revisions.unshift(Revision.new(document.wrap(trailer, type: :XXTrailer),
                                            xref_section: xref_section, loader: object_loader))
+            offset = trailer[:Prev]
           end
         rescue HexaPDF::MalformedPDFError
           reconstructed_revision = parser.reconstructed_revision
@@ -133,23 +136,154 @@ module HexaPDF
       end
     end
-    # Returns the revision at the specified index.
-    def revision(index)
-      @revisions[index]
+    # Returns the next object identifier that should be used when adding a new object.
+    def next_oid
+      @revisions.map(&:next_free_oid).max
+    end
+    # :call-seq:
+    #   revisions.object(ref)    -> obj or nil
+    #   revisions.object(oid)    -> obj or nil
+    #
+    # Returns the current version of the indirect object for the given exact reference or for the
+    # given object number.
+    #
+    # For references to unknown objects, +nil+ is returned but free objects are represented by a
+    # PDF Null object, not by +nil+!
+    #
+    # See: PDF1.7 s7.3.9
+    def object(ref)
+      i = @revisions.size - 1
+      while i >= 0
+        if (result = @revisions[i].object(ref))
+          return result
+        end
+        i -= 1
+      end
+      nil
+    end
+    # :call-seq:
+    #   revisions.object?(ref)    -> true or false
+    #   revisions.object?(oid)    -> true or false
+    #
+    # Returns +true+ if one of the revisions contains an indirect object for the given exact
+    # reference or for the given object number.
+    #
+    # Even though this method might return +true+ for some references, #object may return +nil+
+    # because this method takes *all* revisions into account.
+    def object?(ref)
+      @revisions.any? {|rev| rev.object?(ref) }
+    end
+    # :call-seq:
+    #   revisions.add_object(object)     -> object
+    #
+    # Adds the given HexaPDF::Object to the current revision and returns it.
+    #
+    # If +object+ is a direct object, an object number is automatically assigned.
+    def add_object(obj)
+      if obj.indirect? && (rev_obj = current.object(obj.oid))
+        if rev_obj.data == obj.data
+          return obj
+        else
+          raise HexaPDF::Error, "Can't add object because there is already " \
+            "an object with object number #{obj.oid}"
+        end
+      end
+      obj.oid = next_oid unless obj.indirect?
+      current.add(obj)
+    end
+    # :call-seq:
+    #   revisions.delete_object(ref)
+    #   revisions.delete_object(oid)
+    #
+    # Deletes the indirect object specified by an exact reference or by an object number.
+    def delete_object(ref)
+      @revisions.reverse_each do |rev|
+        if rev.object?(ref)
+          rev.delete(ref)
+          break
+        end
+      end
+    end
+    # :call-seq:
+    #   revisions.each_object(only_current: true, only_loaded: false) {|obj| block }      -> revisions
+    #   revisions.each_object(only_current: true, only_loaded: false) {|obj, rev| block } -> revisions
+    #   revisions.each_object(only_current: true, only_loaded: false)                     -> Enumerator
+    #
+    # Yields every object and optionally the revision it is in.
+    #
+    # If +only_loaded+ is +true+, only the already loaded objects of the PDF document are yielded.
+    # This only matters when the document instance was created from an existing PDF document.
+    #
+    # By default, only the current version of each object is returned which implies that each object
+    # number is yielded exactly once. If the +only_current+ option is +false+, all stored objects
+    # from newest to oldest are returned, not only the current version of each object.
+    #
+    # The +only_current+ option can make a difference because the document can contain multiple
+    # revisions:
+    #
+    # * Multiple revisions may contain objects with the same object and generation numbers, e.g.
+    #   two (different) objects with oid/gen [3,0].
+    #
+    # * Additionally, there may also be objects with the same object number but different
+    #   generation numbers in different revisions, e.g. one object with oid/gen [3,0] and one with
+    #   oid/gen [3,1].
+    def each_object(only_current: true, only_loaded: false, &block)
+      unless block_given?
+        return to_enum(__method__, only_current: only_current, only_loaded: only_loaded)
+      end
+      yield_rev = (block.arity == 2)
+      oids = {}
+      @revisions.reverse_each do |rev|
+        rev.each(only_loaded: only_loaded) do |obj|
+          next if only_current && oids.include?(obj.oid)
+          yield_rev ? yield(obj, rev) : yield(obj)
+          oids[obj.oid] = true
+        end
+      end
+      self
     end
-    alias [] revision
     # Returns the current revision.
+    #
+    # *Note*: This method should only be used if one is familiar with the inner workings of HexaPDF
+    # and the PDF specification.
     def current
       @revisions.last
     end
-    # Returns the number of HexaPDF::Revision objects managed by this object.
-    def size
-      @revisions.size
+    # Returns a list of all revisions.
+    #
+    # *Note*: This method should only be used if one is familiar with the inner workings of HexaPDF
+    # and the PDF specification.
+    def all
+      @revisions
+    end
+    # :call-seq:
+    #   revisions.each {|rev| block }   -> revisions
+    #   revisions.each                  -> Enumerator
+    #
+    # Iterates over all revisions from oldest to current one.
+    #
+    # *Note*: This method should only be used if one is familiar with the inner workings of HexaPDF
+    # and the PDF specification.
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+      @revisions.each(&block)
+      self
     end
     # Adds a new empty revision to the document and returns it.
+    #
+    # *Note*: This method should only be used if one is familiar with the inner workings of HexaPDF
+    # and the PDF specification.
     def add
       if @revisions.empty?
         trailer = {}
@@ -164,28 +298,6 @@ module HexaPDF
       rev
     end
-    # :call-seq:
-    #   revisions.delete(index)    -> rev or nil
-    #   revisions.delete(oid)      -> rev or nil
-    #
-    # Deletes a revision from the document, either by index or by specifying the revision object
-    # itself.
-    #
-    # Returns the deleted revision object, or +nil+ if the index was out of range or no matching
-    # revision was found.
-    #
-    # Regarding the index: The oldest revision has index 0 and the current revision the highest
-    # index!
-    def delete(index_or_rev)
-      if @revisions.length == 1
-        raise HexaPDF::Error, "A document must have a least one revision, can't delete last one"
-      elsif index_or_rev.kind_of?(Integer)
-        @revisions.delete_at(index_or_rev)
-      else
-        @revisions.delete(index_or_rev)
-      end
-    end
     # :call-seq:
     #   revisions.merge(range = 0..-1)    -> revisions
     #
@@ -206,17 +318,6 @@ module HexaPDF
       self
     end
-    # :call-seq:
-    #   revisions.each {|rev| block }   -> revisions
-    #   revisions.each                  -> Enumerator
-    #
-    # Iterates over all revisions from oldest to current one.
-    def each(&block)
-      return to_enum(__method__) unless block_given?
-      @revisions.each(&block)
-      self
-    end
   end
 end

data/lib/hexapdf/task/optimize.rb CHANGED Viewed

@@ -106,7 +106,7 @@ module HexaPDF
         rev = doc.revisions.add
         oid = 1
-        doc.revisions[0].each do |obj|
+        doc.revisions.all[0].each do |obj|
           if obj.null? || unused.include?(obj) || (obj.type == :ObjStm) ||
               (obj.type == :XRef && xref_streams != :preserve)
             obj.data.value = nil
@@ -119,7 +119,7 @@ module HexaPDF
           rev.add(obj)
           oid += 1
         end
-        doc.revisions.delete(0)
+        doc.revisions.all.delete_at(0)
         if object_streams == :generate
           process_object_streams(doc, :generate, xref_streams)
@@ -134,7 +134,7 @@ module HexaPDF
       def self.process_object_streams(doc, method, xref_streams)
         case method
         when :delete
-          doc.revisions.each_with_index do |rev, rev_index|
+          doc.revisions.each do |rev|
             xref_stream = false
             objects_to_delete = []
             rev.each do |obj|
@@ -150,11 +150,11 @@ module HexaPDF
             end
             objects_to_delete.each {|obj| rev.delete(obj) }
             if xref_streams == :generate && !xref_stream
-              doc.add({Type: :XRef}, revision: rev_index)
+              rev.add(doc.wrap({Type: :XRef}, oid: doc.revisions.next_oid))
             end
           end
         when :generate
-          doc.revisions.each_with_index do |rev, rev_index|
+          doc.revisions.each do |rev|
             xref_stream = false
             count = 0
             objstms = [doc.wrap({Type: :ObjStm})]
@@ -178,8 +178,11 @@ module HexaPDF
               end
             end
             old_objstms.each {|objstm| rev.delete(objstm) }
-            objstms.each {|objstm| doc.add(objstm, revision: rev_index) }
-            doc.add({Type: :XRef}, revision: rev_index) unless xref_stream
+            objstms.each do |objstm|
+              objstm.data.oid = doc.revisions.next_oid
+              rev.add(objstm)
+            end
+            rev.add(doc.wrap({Type: :XRef}, oid: doc.revisions.next_oid)) unless xref_stream
           end
         end
       end
@@ -198,13 +201,13 @@ module HexaPDF
             end
           end
         when :generate
-          doc.revisions.each_with_index do |rev, rev_index|
+          doc.revisions.each do |rev|
             xref_stream = false
             rev.each do |obj|
               xref_stream = true if obj.type == :XRef
               delete_fields_with_defaults(obj)
             end
-            doc.add({Type: :XRef}, revision: rev_index) unless xref_stream
+            rev.add(doc.wrap({Type: :XRef}, oid: doc.revisions.next_oid)) unless xref_stream
           end
         end
       end
@@ -229,7 +232,10 @@ module HexaPDF
       def self.compress_pages(doc)
         used_refs = {}
         doc.pages.each do |page|
-          processor = SerializationProcessor.new
+          processor = SerializationProcessor.new do |error_message|
+            doc.config['parser.on_correctable_error'].call(doc, error_message, 0) &&
+              raise(HexaPDF::Error, error_message)
+          end
           HexaPDF::Content::Parser.parse(page.contents, processor)
           page.contents = processor.result
           page[:Contents].set_filter(:FlateDecode)
@@ -269,16 +275,20 @@ module HexaPDF
         # Contains all found references
         attr_reader :used_references
-        def initialize #:nodoc:
+        def initialize(&error_block) #:nodoc:
           @result = ''.b
           @serializer = HexaPDF::Serializer.new
           @used_references = []
+          @error_block = error_block
         end
         def process(op, operands) #:nodoc:
           @result << HexaPDF::Content::Operator::DEFAULT_OPERATORS[op].
             serialize(@serializer, *operands)
           @used_references << operands[0] if op == :Do
+        rescue StandardError => e
+          @error_block.call("Invalid content stream operation found: " \
+                            "#{op}#{operands.inspect} (#{e.message})")
         end
       end

data/lib/hexapdf/type/acro_form/text_field.rb CHANGED Viewed

@@ -164,8 +164,13 @@ module HexaPDF
         def field_value=(str)
           if flagged?(:password)
             raise HexaPDF::Error, "Storing a field value for a password field is not allowed"
+          elsif comb_text_field? && !key?(:MaxLen)
+            raise HexaPDF::Error, "A comb text field need a valid /MaxLen value"
           end
           str = str.gsub(/[[:space:]]/, ' ') if str && concrete_field_type == :single_line_text_field
+          if key?(:MaxLen) && str && str.length > self[:MaxLen]
+            raise HexaPDF::Error, "Value exceeds maximum allowed length of #{self[:MaxLen]}"
+          end
           self[:V] = str
           update_widgets
         end
@@ -243,6 +248,9 @@ module HexaPDF
           if (max_len = self[:MaxLen]) && field_value && field_value.length > max_len
             yield("Text contents of field '#{full_field_name}' is too long")
           end
+          if comb_text_field? && !max_len
+            yield("Comb text field needs a value for /MaxLen")
+          end
         end
       end

data/lib/hexapdf/type/catalog.rb CHANGED Viewed

@@ -90,13 +90,21 @@ module HexaPDF
         true
       end
-      # Returns the root node of the page tree.
+      # Returns the root node of the page tree, creating it if needed.
       #
       # See: PageTreeNode
       def pages
         self[:Pages] ||= document.add({Type: :Pages})
       end
+      # Returns the name dictionary containing all name trees of the document, creating it if
+      # needed.
+      #
+      # See: Names
+      def names
+        self[:Names] ||= document.add({}, type: :XXNames)
+      end
       # Returns the main AcroForm object.
       #
       # * If an AcroForm object exists, the +create+ argument is not used.

data/lib/hexapdf/type/names.rb CHANGED Viewed

@@ -63,6 +63,19 @@ module HexaPDF
       define_field :AlternatePresentations, type: NameTreeNode, version: '1.4'
       define_field :Renditions,             type: NameTreeNode, version: '1.5'
+      # Returns the destinations name tree containing a mapping from names to destination objects.
+      #
+      # The name tree will be created if needed.
+      #
+      # Note: It is possible to use this name tree directly, but HexaPDF::Document::Destinations
+      # provides a convenience interface that makes working with destination objects much
+      # easier.
+      #
+      # See: PDF1.7 s12.3.2
+      def destinations
+        self[:Dests] ||= document.add({}, type: NameTreeNode)
+      end
     end
   end

data/lib/hexapdf/type/xref_stream.rb CHANGED Viewed

@@ -93,7 +93,8 @@ module HexaPDF
       #
       # See: Type::Trailer
       def trailer
-        Trailer.each_field.with_object({}) do |(name, _data), hash|
+        trailer = {Type: :XRef}
+        Trailer.each_field.with_object(trailer) do |(name, _data), hash|
           hash[name] = value[name] if key?(name)
         end
       end

data/lib/hexapdf/utils/sorted_tree_node.rb CHANGED Viewed

@@ -168,11 +168,13 @@ module HexaPDF
             index = find_in_leaf_node(node[container_name], key)
             if node[container_name][index] == key
               result = node[container_name][index + 1]
+            else
+              break
             end
           elsif node.key?(:Kids)
             index = find_in_intermediate_node(node[:Kids], key)
             node = node[:Kids][index]
-            break unless key >= node[:Limits][0] && key <= node[:Limits][1]
+            break unless node && key >= node[:Limits][0] && key <= node[:Limits][1]
           else
             break
           end

data/lib/hexapdf/version.rb CHANGED Viewed

@@ -37,6 +37,6 @@
 module HexaPDF
   # The version of HexaPDF.
-  VERSION = '0.22.0'
+  VERSION = '0.23.0'
 end

data/lib/hexapdf/writer.rb CHANGED Viewed

@@ -74,6 +74,7 @@ module HexaPDF
     # Writes the document to the IO object and returns the last XRefSection written.
     def write
+      move_modified_objects_into_current_revision
       write_file_header
       pos = xref_section = nil
@@ -109,7 +110,7 @@ module HexaPDF
       @document.revisions.each do |rev|
         rev.each_modified_object {|obj| revision.send(:add_without_check, obj) }
       end
-      _pos, xref_section = write_revision(revision, @document.revisions.parser.startxref_offset)
+      _pos, xref_section = write_revision(revision, parser.startxref_offset)
       xref_section
     end
@@ -123,6 +124,18 @@ module HexaPDF
       @io << "%PDF-#{@document.version}\n%\xCF\xEC\xFF\xE8\xD7\xCB\xCD\n"
     end
+    # Moves all modified objects into the current revision to avoid invalid references and such.
+    def move_modified_objects_into_current_revision
+      return if @document.revisions.count == 1
+      revision = @document.revisions.add
+      @document.revisions.all[0..-2].each do |rev|
+        rev.each_modified_object {|obj| revision.send(:add_without_check, obj) }
+        rev.reset_objects
+      end
+      @document.revisions.merge(-2..-1)
+    end
     # Writes the given revision.
     #
     # The optional +previous_xref_pos+ argument needs to contain the byte position of the previous
@@ -190,7 +203,7 @@ module HexaPDF
       end
       if (!object_streams.empty? || @use_xref_streams) && xref_stream.nil?
-        xref_stream = @document.wrap({Type: :XRef}, oid: rev.next_free_oid)
+        xref_stream = @document.wrap({Type: :XRef}, oid: @document.revisions.next_oid)
         rev.add(xref_stream)
       end