RubyGems - hexapdf - Versions diffs - 1.6.0 → 1.7.0 - Mend

hexapdf 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +28 -0
data/examples/032-acro_form_list_and_fill.rb +47 -0
data/examples/033-text_extraction.rb +34 -0
data/lib/hexapdf/cli/info.rb +2 -0
data/lib/hexapdf/configuration.rb +8 -0
data/lib/hexapdf/content/canvas.rb +1 -1
data/lib/hexapdf/content/smart_text_extractor.rb +305 -0
data/lib/hexapdf/content.rb +2 -0
data/lib/hexapdf/digital_signature/signing/default_handler.rb +1 -15
data/lib/hexapdf/digital_signature/signing/signed_data_creator.rb +21 -8
data/lib/hexapdf/document.rb +7 -3
data/lib/hexapdf/filter/brotli_decode.rb +88 -0
data/lib/hexapdf/filter.rb +1 -0
data/lib/hexapdf/font/true_type/builder.rb +1 -1
data/lib/hexapdf/font/true_type/font.rb +13 -0
data/lib/hexapdf/font/true_type/subsetter.rb +7 -2
data/lib/hexapdf/font/true_type/table/directory.rb +5 -0
data/lib/hexapdf/font/true_type.rb +1 -0
data/lib/hexapdf/layout/style.rb +6 -2
data/lib/hexapdf/task/pdfa.rb +108 -1
data/lib/hexapdf/type/acro_form/form.rb +4 -0
data/lib/hexapdf/type/acro_form/text_field.rb +4 -2
data/lib/hexapdf/type/annotations/widget.rb +9 -0
data/lib/hexapdf/type/document_security_store.rb +80 -0
data/lib/hexapdf/type/page.rb +11 -0
data/lib/hexapdf/type.rb +1 -0
data/lib/hexapdf/version.rb +1 -1
data/test/data/pdfa/mismatching_glyph_widths_cidfont_type2.pdf +0 -0
data/test/hexapdf/content/test_smart_text_extractor.rb +129 -0
data/test/hexapdf/digital_signature/common.rb +19 -5
data/test/hexapdf/digital_signature/signing/test_signed_data_creator.rb +29 -4
data/test/hexapdf/digital_signature/test_signatures.rb +3 -3
data/test/hexapdf/filter/test_brotli_decode.rb +34 -0
data/test/hexapdf/font/true_type/table/test_directory.rb +5 -3
data/test/hexapdf/font/true_type/test_builder.rb +9 -0
data/test/hexapdf/font/true_type/test_font.rb +17 -3
data/test/hexapdf/font/true_type/test_subsetter.rb +4 -3
data/test/hexapdf/task/test_pdfa.rb +72 -0
data/test/hexapdf/test_document.rb +13 -0
data/test/hexapdf/type/acro_form/test_form.rb +6 -0
data/test/hexapdf/type/acro_form/test_text_field.rb +7 -1
data/test/hexapdf/type/annotations/test_widget.rb +11 -0
data/test/hexapdf/type/test_page.rb +8 -0
metadata +25 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 35bbb5d1780d07ecf6098cc40359ff2cc02cd89231a124b6ff1a0a13c760d116
-  data.tar.gz: 8664f2ac8a6651ee83e7292d005ea10d89b7ea738de47cc62dbf219f4eae0cb4
+  metadata.gz: 04f2a87f1aaa95513275432d718996b7d598fc15e476f6999f6b6fe9f29cd0f8
+  data.tar.gz: 539d2b0e984db4ca4095bf0aad5208fbbdff5a08acc80d270a6b1c824f12c87e
 SHA512:
-  metadata.gz: 232aefc90eb4f9f9a913d27affa95a0c9eff43a72e04eeb1adc0fbe11e865033c6fd0b7779930b15a982afdd909d6ffa98640db6db668f95ce0c26332749cfae
-  data.tar.gz: e1b836a23d58e92ceb70f5b892d023edcf585288583f2254d35394688204bfdbf4401edea6562a96d1583a71a302d8d50e8a175262ff5077a3b4a2200ec922a4
+  metadata.gz: c35f8b0267ef60c6392ae99d8c001e4d6b5e18ea1f5a62132d44bbf865d52cc8a9b08436e107c35a01d3a6edaeb7be9bcede20931a255416d4ea4d07778f8fc0
+  data.tar.gz: bfdedefe99c534d62b11f406b447902ea6824758153448ebfba35d0e456850134ba36a6cb2c97d668983e8a5b5b96bf0ab0a03c6136f2478a7717a0e7bb0933b

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,31 @@
+## 1.7.0 - 2026-04-13
+### Added
+* Smart text extraction for retrieving layouted text from pages
+* Support for digitally signing with ECDSA keys
+* Support for digitally signing with DSA keys
+* Support for BrotliDecode filter
+* [HexaPDF::Type::DocumentSecurityStore] and
+  [HexaPDF::Type::ValidationRelatedInformation]
+### Changed
+* **Breaking change**: [HexaPDF::Document#unwrap] to not unwrap streams
+* Automatic detection of digital signature size to account for small deviations
+* [HexaPDF::Type::AcroForm::Form#fill] to ignore password fields
+* [HexaPDF::Type::AcroForm::TextField] validation to convert invalid Symbol
+  values to String
+* [HexaPDF::Type::Annotations::Widget] validation to also validate a widget as a
+  field if necessary
+* PDF/A task to include a fix for mismatching glyph widths for Type 2 CID fonts
+### Fixed
+* Writing of PDF documents with an invalid value for the /Info dictionary
+* Subsetting of TrueType fonts in case compound glyphs are themselves compound
 ## 1.6.0 - 2026-02-10
 ### Added

data/examples/032-acro_form_list_and_fill.rb ADDED Viewed

@@ -0,0 +1,47 @@
+# # PDF Forms - List and fill fields
+#
+# This example shows how to list the form fields of an interactive PDF form and
+# how to fill out the form.
+#
+# The output file from the [PDF forms](acro_form.html) example can be used as
+# input.
+#
+# One way to list and fill a PDF form is to use the [HexaPDF CLI with the 'form'
+# command](/documentation/hexapdf.1.html#form). Here, however, we are doing it
+# with the HexaPDF API.
+#
+# Usage:
+# : `ruby acro_form_list_and_fill.rb [INPUT.PDF]`
+#
+require 'base64'
+require 'hexapdf'
+doc = HexaPDF::Document.open(ARGV[0] || 'acro_form.pdf')
+exit unless doc.acro_form
+puts "Listing all form fields:"
+doc.acro_form.each_field do |field|
+  puts "#{field.full_field_name} (#{field.concrete_field_type})"
+end
+# We are using this to generate some values for existing text fields. In the
+# real world one would be getting the values from the user.
+puts "\nFilling in the text fields with random values:"
+values = {}
+doc.acro_form.each_field do |field|
+  next unless field.field_type == :Tx
+  value = Base64.encode64(field.full_field_name).strip
+  value = if field.key?(:MaxLen)
+            value[0, field[:MaxLen]]
+          else
+            "Value #{field.field_type} #{value}"
+          end
+  values[field.full_field_name] = value
+  puts "#{field.full_field_name}: #{value}"
+end
+# Now actually fill out the form with the values
+doc.acro_form.fill(values)
+doc.write('acro_form_list_and_fill.pdf', optimize: true)

data/examples/033-text_extraction.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# # Text Extraction
+#
+# This example shows how to extract layouted text from a page.
+#
+# It uses the provided input PDF or creates a small sample PDF as input. Then it
+# extracts the text for each page and creates new pages with the extracted text
+# in a fixed-width font.
+#
+# Usage:
+# : `ruby text_extraction.rb [INPUT.PDF]`
+#
+require 'hexapdf'
+# Use the input PDF or create a sample PDF.
+if ARGV.length > 0
+  doc = HexaPDF::Document.open(ARGV[0])
+else
+  composer = HexaPDF::Composer.new do |pdf|
+    pdf.lorem_ipsum(count: 3, padding: [0, 0, 20])
+    pdf.lorem_ipsum(padding: [0, 50, 20], text_indent: 40)
+    pdf.lorem_ipsum(count: 2)
+  end
+  doc = composer.document
+end
+# Extract the text of the existing pages and add new pages with the extracted text
+doc.pages.count.times do |index|
+  text = doc.pages[index].extract_text
+  doc.pages.add.canvas.font('/usr/share/fonts/truetype/freefont/FreeMono.ttf', size: 6).
+    text(text, at: [10, 820])
+end
+doc.write('text_extraction.pdf', optimize: true)

data/lib/hexapdf/cli/info.rb CHANGED Viewed

@@ -137,6 +137,8 @@ module HexaPDF
             end
           elsif doc.encrypted?
             output_line("Encrypted", "yes (no or wrong password given)")
+          else
+            output_line("Encrypted", "no")
           end
           if doc.revisions.parser.linearized?

data/lib/hexapdf/configuration.rb CHANGED Viewed

@@ -559,6 +559,7 @@ module HexaPDF
                         JPXDecode: 'HexaPDF::Filter::PassThrough',
                         Crypt: 'HexaPDF::Filter::Crypt',
                         Encryption: 'HexaPDF::Filter::Encryption',
+                        BrotliDecode: 'HexaPDF::Filter::BrotliDecode',
                       },
                       'font.default' => 'Times',
                       'font.fallback' => ['ZapfDingbats', 'Symbol'],
@@ -636,6 +637,11 @@ module HexaPDF
   #
   #    See PDF2.0 s8.6
   #
+  # filter.brotli.compression::
+  #    Specifies the compression level that should be used with the BrotliDecode filter. The level
+  #    can range from 0 (no compression), 1 (best speed) to 11 (best compression). The default
+  #    value is 8 which is a good compromise between speed and resulting size.
+  #
   # filter.flate.compression::
   #    Specifies the compression level that should be used with the FlateDecode filter. The level
   #    can range from 0 (no compression), 1 (best speed) to 9 (best compression, default).
@@ -754,6 +760,8 @@ module HexaPDF
                         MCR: 'HexaPDF::Type::MarkedContentReference',
                         OBJR: 'HexaPDF::Type::ObjectReference',
                         Measure: 'HexaPDF::Type::Measure',
+                        DSS: 'HexaPDF::Type::DocumentSecurityStore',
+                        VRI: 'HexaPDF::Type::DocumentSecurityStore::ValidationRelatedInformation',
                       },
                       'object.subtype_map' => {
                         nil => {

data/lib/hexapdf/content/canvas.rb CHANGED Viewed

@@ -895,7 +895,7 @@ module HexaPDF
       #
       # * Any other string is treated as a color name. HexaPDF supports CSS Color Module Level 3
       #   color names (see https://www.w3.org/TR/css-color-3/#svg-color) as well as HexaPDF design
-      #   colors.
+      #   colors. See ColorSpace::COLOR_NAMES for the list of supported names.
       #
       # * Four numeric arguments specify a CMYK color (see ColorSpace::DeviceCMYK::Color).
       #

data/lib/hexapdf/content/smart_text_extractor.rb ADDED Viewed

@@ -0,0 +1,305 @@
+# -*- encoding: utf-8; frozen_string_literal: true -*-
+#
+#--
+# This file is part of HexaPDF.
+#
+# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
+# Copyright (C) 2014-2025 Thomas Leitner
+#
+# HexaPDF is free software: you can redistribute it and/or modify it
+# under the terms of the GNU Affero General Public License version 3 as
+# published by the Free Software Foundation with the addition of the
+# following permission added to Section 15 as permitted in Section 7(a):
+# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
+# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
+# INFRINGEMENT OF THIRD PARTY RIGHTS.
+#
+# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
+#
+# The interactive user interfaces in modified source and object code
+# versions of HexaPDF must display Appropriate Legal Notices, as required
+# under Section 5 of the GNU Affero General Public License version 3.
+#
+# In accordance with Section 7(b) of the GNU Affero General Public
+# License, a covered work must retain the producer line in every PDF that
+# is created or manipulated using HexaPDF.
+#
+# If the GNU Affero General Public License doesn't fit your need,
+# commercial licenses are available at <https://gettalong.at/hexapdf/>.
+#++
+module HexaPDF
+  module Content
+    # This module converts the glyphs on a page to a single text string while preserving the layout.
+    #
+    # The general algorithm is:
+    #
+    # 1. Collect all individual glyphs with their user space coordinates in
+    #    TextRunCollector::TextRun objects.
+    #
+    # 2. Sort text runs top to bottom and then left to right.
+    #
+    # 3. Group those text runs into lines based on a "baseline" while also combining neighboring
+    #    text runs into larger runs.
+    #
+    # 4. Render each line into a string by taking into account the page size and the median glyph
+    #    width for a text run to column mapping.
+    #
+    # 5. Add blank lines between text lines based on the page's normal line spacing.
+    module SmartTextExtractor
+      # This module provides the functionality for collecting the necessary TextRun instances for
+      # layouting the text.
+      #
+      # To use this module include it in a processor class. Then invoke the #collect_text_runs
+      # method in the #show_text and #show_text_with_positioning methods.
+      #
+      # Example:
+      #
+      #   class CustomProcessor < HexaPDF::Content::Processor
+      #     include TextRunCollector
+      #
+      #     def show_text(str)
+      #       collect_text_runs(decode_text_with_positioning(str))
+      #     end
+      #     alias show_text_with_positioning show_text
+      #
+      #   end
+      #
+      # Once the processor has done its job, the collected text runs are available via the
+      # #text_runs method. Use them as input for SmartTextExtractor.layout_text_runs.
+      module TextRunCollector
+        # Represents a single run of continuous glyphs and their combined bounding box in user
+        # space.
+        TextRun = Struct.new(:string, :left, :bottom, :right, :top) do
+          # The "baseline" is approximated with the bottom of the bounding box.
+          #
+          # This works because HexaPDF uses a font's bounding box instead of the glyph's bounding
+          # box for each glyph. So while differently sized glyphs will have different "baseline"
+          # values, this is taken into account in the algorithm in the same way as subscript and
+          # superscript.
+          #
+          # Using this "fake" baseline works well enough and avoids additional calculations.
+          def baseline = bottom
+          # The height of the text run's bounding box.
+          def height = top - bottom
+          # The width of the text run's bounding box.
+          def width = right - left
+        end
+        # Array with all collected TextRun instances.
+        attr_reader :text_runs
+        def initialize # :nodoc:
+          super
+          @text_runs = []
+        end
+        private
+        # Collects all text runs from the glyphs in the +boxes+ array.
+        def collect_text_runs(boxes)
+          boxes.each do |box|
+            llx, lly, lrx, lry, urx, ury, ulx, uly = *box.points
+            x_min, x_max = [llx, lrx, ulx, urx].minmax
+            y_min, y_max = [lly, lry, uly, ury].minmax
+            @text_runs << TextRun.new(+box.string, x_min, y_min, x_max, y_max)
+          end
+        end
+      end
+      # This processor class is used when layouting the text through
+      # HexaPDF::Type::Page#extract_text.
+      class TextRunProcessor < HexaPDF::Content::Processor
+        include TextRunCollector
+        def show_text(str)
+          collect_text_runs(decode_text_with_positioning(str))
+        end
+        alias show_text_with_positioning show_text
+      end
+      # Converts an array of TextRun objects into a single string representation, preserving the
+      # visual layout.
+      #
+      # The +page_width+ and +page_height+ arguments specify the width and height of the page from
+      # which the text runs were extracted.
+      #
+      # The remaining keyword arguments can be used to fine-tune the algorithm for one's needs:
+      #
+      # +line_tolerance_factor+::
+      #     The tolerance factor is applied to the median text run height to determine the range
+      #     within which two text runs are considered to be on the same line. This ensures that
+      #     small differences in the baseline due to, for example, subscript or superscript parts
+      #     don't result in multiple lines.
+      #
+      #     The factor should not be too large to avoid forcing separate visual lines into one line
+      #     but also not too small to avoid subscript/superscript being on separate lines. The
+      #     default seems to work quite well.
+      #
+      # +paragraph_distance_threshold+::
+      #     If the number of normal line spacings between two adjacent baselines is at least this
+      #     large (but smaller than +large_distance_threshold+), the gap is interpreted as a
+      #     paragraph break and a single blank line is inserted.
+      #
+      # +large_distance_threshold+::
+      #     Works like +paragraph_distance_threshold+ and indicates if a number of normal line
+      #     spacings is too large for being a paragraph break. A proportional number of blank lines
+      #     is inserted in this case.
+      #
+      #     This is used to represent large parts with non-text content like images.
+      def self.layout_text_runs(text_runs, page_width, page_height,
+                                line_tolerance_factor: 0.4, paragraph_distance_threshold: 1.35,
+                                large_distance_threshold: 3.0)
+        return '' if text_runs.empty?
+        # Use the median height of all text runs as an approximation of the main font size used on
+        # the page. The line tolerance uses a hard floor for small fonts.
+        median_height = median(text_runs.map(&:height).sort)
+        line_tolerance = [median_height * line_tolerance_factor, 2].max
+        # Group the text runs into lines which are sorted top to bottom. Text runs are pre-sorted by
+        # baseline from top to bottom and left to right (the latter is done so that consecutive text
+        # runs can be combined).
+        sorted = text_runs.sort_by {|run| [-run.baseline, run.left] }
+        lines = group_into_lines(sorted, line_tolerance)
+        # Calculate the normal line spacing, excluding anything too small/big.
+        line_distances = lines.map {|l| l.baseline }.each_cons(2).map {|a, b| a - b }.
+          select {|d| d >= median_height * 0.5 && d <= median_height * 2 }.sort
+        normal_line_spacing = line_distances.empty? ? median_height * 1.2 : median(line_distances)
+        # Convert the lines into actual text strings. Blank lines are inserted between the lines
+        # based on the normal line spacing.
+        output_lines = []
+        left_margin = lines.map {|line| line.text_runs[0].left }.min
+        glyph_widths = lines.flat_map do |line|
+          line.text_runs.flat_map {|run| [run.width.to_f / run.string.length] * run.string.length }
+        end.sort
+        median_glyph_width = median(glyph_widths)
+        lines.each_with_index do |line, index|
+          output_lines << text_runs_to_string(line.text_runs, median_glyph_width, left_margin)
+          next if index == lines.length - 1
+          # Add blank lines as needed.
+          ratio = (line.baseline - lines[index + 1].baseline) / normal_line_spacing
+          if ratio >= large_distance_threshold
+            # Subtract 1 because the newline after the output line already counts as one
+            # newline. Also cap at a maximum of 40 to avoid huge gaps.
+            [ratio.round - 1, 40].min.times { output_lines << '' }
+          elsif ratio >= paragraph_distance_threshold
+            output_lines << ''
+          end
+        end
+        output_lines.join("\n")
+      end
+      # Holds an array of TextRun objects and their median baseline.
+      Line = Struct.new(:text_runs, :baseline)
+      # Groups a sorted list of TextRuns (sorted by baseline, then left) into lines.
+      #
+      # Since the text_runs are already sorted, a single run through +sorted_text_runs+ is
+      # sufficient. A new line is created if a text run's baseline differs by more than +tolerance+
+      # from the current line's (median) baseline.
+      #
+      # The result is a list of Line objects with their contents sorted left to right.
+      def self.group_into_lines(sorted_text_runs, tolerance)
+        lines = []
+        current_line = []
+        current_baseline = sorted_text_runs[0].baseline
+        current_baselines = [current_baseline]
+        sorted_text_runs.each do |text_run|
+          # Try to combine text_runs that share exactly the same height and are next to each
+          # other. This avoids potentially garbled output because if two text parts are above each
+          # other but end up on the same line, the text runs would be mixed up (think: centered
+          # table header where some cells contain two lines).
+          if (last = current_line[-1]) && last.bottom == text_run.bottom &&
+             last.top == text_run.top && text_run.left - last.right < 1
+            last.string << text_run.string
+            last.right = text_run.right
+          elsif (current_baseline - text_run.baseline).abs <= tolerance
+            current_line << text_run
+            current_baselines << text_run.baseline
+            current_baseline = median(current_baselines)
+          else
+            lines << Line.new(current_line.sort_by!(&:left), current_baseline)
+            current_line = [text_run]
+            current_baseline = text_run.baseline
+            current_baselines.clear
+            current_baselines << current_baseline
+          end
+        end
+        lines << Line.new(current_line.sort_by!(&:left), current_baseline)
+      end
+      private_class_method :group_into_lines
+      # Returns the median value of the given sorted array of numerics.
+      def self.median(sorted_array)
+        mid = sorted_array.length / 2
+        sorted_array.length.odd? ? sorted_array[mid] : (sorted_array[mid - 1] + sorted_array[mid]) / 2.0
+      end
+      private_class_method :median
+      # Renders an array of TextRun objects representing one line to a single string.
+      #
+      # +median_glyph_width+:: Is used to determine the column for each text run.
+      # +left_margin+:: Is removed from the left side to avoid unnecessary indentation.
+      def self.text_runs_to_string(text_runs, median_glyph_width, left_margin)
+        # Minimum gap to classify as a word boundary
+        space_threshold = median_glyph_width * 0.5
+        result = +''
+        # The column where the last text run ended. Can be different from result.size due to fitting
+        # proportional-width fonts to a fixed-column output.
+        cursor = 0
+        text_runs.each_with_index do |text_run, index|
+          target_col = ((text_run.left - left_margin) / median_glyph_width).round
+          advance = target_col - cursor
+          if advance > 0
+            result << ' ' * advance
+            cursor += advance
+          elsif index >= 1 && text_run.left - text_runs[index - 1].right > space_threshold &&
+                result[-1] != ' '
+            # Force space even if advance < 0 when the actual spacing between text runs is large
+            # enough. This might happen because we are projecting proportional-width fonts to a
+            # fixed-column output.
+            cursor = target_col
+            result << ' '
+          end
+          result << text_run.string
+          # Move cursor to the text run's right edge but at least the text run's character count
+          # from the current position. This avoids gaps when there is too much difference between
+          # the on-page position and the approximated cursor. However, a one column difference is
+          # ignored to account for rounding errors.
+          cursor += text_run.string.size
+          text_run_right_edge_cursor = ((text_run.right - left_margin) / median_glyph_width).round
+          cursor = [text_run_right_edge_cursor, cursor].max if text_run_right_edge_cursor != cursor + 1
+        end
+        result.rstrip
+      end
+      private_class_method :text_runs_to_string
+    end
+  end
+end

data/lib/hexapdf/content.rb CHANGED Viewed

@@ -44,6 +44,7 @@ module HexaPDF
   #
   # * The Canvas class which provides an interface for drawing graphics and text.
   # * The Parser and Processor classes for processing an existing content stream.
+  # * SmartTextExtractor for extracting layouted text from a page.
   module Content
     autoload(:Canvas, 'hexapdf/content/canvas')
@@ -52,6 +53,7 @@ module HexaPDF
     autoload(:ColorSpace, 'hexapdf/content/color_space')
     autoload(:Operator, 'hexapdf/content/operator')
     autoload(:CanvasComposer, 'hexapdf/content/canvas_composer')
+    autoload(:SmartTextExtractor, 'hexapdf/content/smart_text_extractor')
   end

data/lib/hexapdf/digital_signature/signing/default_handler.rb CHANGED Viewed

@@ -52,9 +52,6 @@ module HexaPDF
       # The signing handler is used by default by all methods that need a signing handler. Therefore
       # it is usually only necessary to provide the actual attribute values.
       #
-      # *Note*: Currently only RSA is supported, DSA and ECDSA are not. See the examples below for
-      # how to handle them using external signing.
-      #
       #
       # == CMS and PAdES Signatures
       #
@@ -131,17 +128,6 @@ module HexaPDF
       #   document.sign("output.pdf", certificate: my_cert, certificate_chain: my_chain,
       #                 external_signing: signing_proc)
       #
-      #   # Signing with DSA or ECDSA certificate/keys
-      #   signing_proc = lambda do |io, byte_range|
-      #     io.pos = byte_range[0]
-      #     data = io.read(byte_range[1])
-      #     io.pos = byte_range[2]
-      #     data << io.read(byte_range[3])
-      #     OpenSSL::PKCS7.sign(certificate, key, data, certificate_chain,
-      #                         OpenSSL::PKCS7::DETACHED | OpenSSL::PKCS7::BINARY).to_der
-      #   end
-      #   document.sign("output.pdf", signature_size: 10_000, external_signing: signing_proc)
-      #
       #
       # == Implementing a Signing Handler
       #
@@ -277,7 +263,7 @@ module HexaPDF
         # If a custom size is set using #signature_size=, it is used. Otherwise the size is determined
         # by using #sign to sign an empty string.
         def signature_size
-          @signature_size || sign(StringIO.new, [0, 0, 0, 0]).size
+          @signature_size || sign(StringIO.new, [0, 0, 0, 0]).size + 5
         end
         # Finalizes the signature field as well as the signature dictionary before writing.

data/lib/hexapdf/digital_signature/signing/signed_data_creator.rb CHANGED Viewed

@@ -121,7 +121,7 @@ module HexaPDF
         private
         # Creates the set of signed attributes for the signer information structure.
-        def create_signed_attrs(data, signing_time: true)
+        def create_signed_attrs(data, ess_cert_hash: 'sha256', signing_time: true)
           signing_time = (self.signing_time || Time.now).utc if signing_time
           set(
             attribute('content-type', oid('id-data')),
@@ -132,12 +132,13 @@ module HexaPDF
             ),
             attribute(
               'id-aa-signingCertificateV2',
-              sequence( # SigningCertificateV2
+              sequence( # SigningCertificateV2, see RFC5035
                 sequence( # Seq of ESSCertIDv2
                   sequence( # ESSCertIDv2
-                    #TODO: Does not validate on ETSI checker if used, doesn't matter if SHA256 or 512
-                    #oid('sha512'),
-                    binary(OpenSSL::Digest.digest('sha256', @certificate.to_der)), # certHash
+                    (sequence( # AlgorithmIdentifier RFC3280 4.1.1.2
+                      oid(ess_cert_hash) # algorithm
+                    ) unless ess_cert_hash == 'sha256'),
+                    binary(OpenSSL::Digest.digest(ess_cert_hash, @certificate.to_der)), # certHash
                     sequence(                                      # issuerSerial
                       sequence(                                    #  issuer
                         implicit(4, sequence(@certificate.issuer)) #   choice 4 directoryName
@@ -184,13 +185,19 @@ module HexaPDF
         # Creates a signer information structure containing the actual meat of the whole CMS object.
         def create_signer_info(signature, signed_attrs, unsigned_attrs = nil)
           certificate_pkey_algorithm = @certificate.public_key.oid
-          signature_algorithm = if certificate_pkey_algorithm == 'rsaEncryption'
+          signature_algorithm = case certificate_pkey_algorithm
+                                when 'rsaEncryption'
                                   sequence(               # signatureAlgorithm
                                     oid('rsaEncryption'), #   algorithmID
                                     null                  #   params
                                   )
-                                else
-                                  raise HexaPDF::Error, "Unsupported key type/signature algorithm"
+                                when 'DSA'
+                                  unless @digest_algorithm == 'sha256'
+                                    raise HexaPDF::Error, "Only SHA256 supported with DSA"
+                                  end
+                                  sequence(oid('id-dsa-with-sha256'), null)
+                                when 'id-ecPublicKey'
+                                  sequence(oid("ecdsa-with-#{@digest_algorithm.upcase}"), null)
                                 end
           sequence(
@@ -273,6 +280,12 @@ module HexaPDF
           'sha384' => '2.16.840.1.101.3.4.2.2',
           'sha512' => '2.16.840.1.101.3.4.2.3',
           'rsaEncryption' => '1.2.840.113549.1.1.1',
+          'id-dsa-with-sha1' => '1.2.840.10040.4.3',
+          'id-dsa-with-sha256' => '2.16.840.1.101.3.4.3.2',
+          'ecdsa-with-SHA1' => '1.2.840.10045.4.1',
+          'ecdsa-with-SHA256' => '1.2.840.10045.4.3.2',
+          'ecdsa-with-SHA384' => '1.2.840.10045.4.3.3',
+          'ecdsa-with-SHA512' => '1.2.840.10045.4.3.4',
           'id-aa-signingCertificate' => '1.2.840.113549.1.9.16.2.12',
           'id-aa-timeStampToken' => '1.2.840.113549.1.9.16.2.14',
           'id-aa-signingCertificateV2' => '1.2.840.113549.1.9.16.2.47',

data/lib/hexapdf/document.rb CHANGED Viewed

@@ -394,11 +394,12 @@ module HexaPDF
     # :call-seq:
     #   document.unwrap(obj)   -> unwrapped_obj
     #
-    # Recursively unwraps the object to get native Ruby objects (i.e. Hash, Array, Integer, ...
-    # instead of HexaPDF::Reference and HexaPDF::Object).
+    # Recursively unwraps the object to get native Ruby objects (i.e. Hash, Array, Integer, ...)
+    # instead of HexaPDF::Reference and HexaPDF::Object. Only HexaPDF::Stream objects are retained
+    # as they are not representable by native Ruby objects.
     def unwrap(object, seen = {})
       object = deref(object)
-      object = object.data if object.kind_of?(HexaPDF::Object)
+      object = object.data if object.kind_of?(HexaPDF::Object) && !object.kind_of?(HexaPDF::Stream)
       if seen.key?(object)
         raise HexaPDF::Error, "Can't unwrap a recursive structure"
       end
@@ -413,6 +414,8 @@ module HexaPDF
       when HexaPDF::PDFData
         seen[object] = true
         unwrap(object.value, seen.dup)
+      when HexaPDF::Stream
+        object
       else
         object
       end
@@ -790,6 +793,7 @@ module HexaPDF
         if @metadata
           metadata.modification_date(Time.now)
         else
+          trailer.delete(:Info) unless trailer.info.kind_of?(HexaPDF::Dictionary)
           trailer.info[:ModDate] = Time.now
         end
       end