RubyGems - hexapdf - Versions diffs - 1.5.0 → 1.7.0 - Mend

hexapdf 1.5.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +54 -0
data/README.md +8 -7
data/examples/022-outline.rb +5 -1
data/examples/032-acro_form_list_and_fill.rb +47 -0
data/examples/033-text_extraction.rb +34 -0
data/lib/hexapdf/cli/debug_info.rb +98 -0
data/lib/hexapdf/cli/images.rb +2 -2
data/lib/hexapdf/cli/info.rb +2 -0
data/lib/hexapdf/cli/inspect.rb +5 -1
data/lib/hexapdf/cli.rb +2 -0
data/lib/hexapdf/configuration.rb +8 -0
data/lib/hexapdf/content/canvas.rb +1 -1
data/lib/hexapdf/content/smart_text_extractor.rb +305 -0
data/lib/hexapdf/content.rb +2 -0
data/lib/hexapdf/digital_signature/signing/default_handler.rb +1 -15
data/lib/hexapdf/digital_signature/signing/signed_data_creator.rb +21 -8
data/lib/hexapdf/document.rb +7 -3
data/lib/hexapdf/encryption/security_handler.rb +3 -1
data/lib/hexapdf/filter/brotli_decode.rb +88 -0
data/lib/hexapdf/filter.rb +1 -0
data/lib/hexapdf/font/cmap.rb +10 -6
data/lib/hexapdf/font/true_type/builder.rb +1 -1
data/lib/hexapdf/font/true_type/font.rb +13 -0
data/lib/hexapdf/font/true_type/subsetter.rb +7 -2
data/lib/hexapdf/font/true_type/table/directory.rb +5 -0
data/lib/hexapdf/font/true_type.rb +1 -0
data/lib/hexapdf/layout/style.rb +6 -2
data/lib/hexapdf/parser.rb +29 -4
data/lib/hexapdf/revision.rb +6 -2
data/lib/hexapdf/task/pdfa.rb +108 -1
data/lib/hexapdf/type/acro_form/field.rb +4 -1
data/lib/hexapdf/type/acro_form/form.rb +4 -0
data/lib/hexapdf/type/acro_form/text_field.rb +4 -2
data/lib/hexapdf/type/annotations/widget.rb +9 -0
data/lib/hexapdf/type/document_security_store.rb +80 -0
data/lib/hexapdf/type/page.rb +11 -0
data/lib/hexapdf/type.rb +1 -0
data/lib/hexapdf/version.rb +1 -1
data/test/data/pdfa/mismatching_glyph_widths_cidfont_type2.pdf +0 -0
data/test/hexapdf/content/test_smart_text_extractor.rb +129 -0
data/test/hexapdf/digital_signature/common.rb +19 -5
data/test/hexapdf/digital_signature/signing/test_signed_data_creator.rb +29 -4
data/test/hexapdf/digital_signature/test_signatures.rb +3 -3
data/test/hexapdf/encryption/test_security_handler.rb +7 -5
data/test/hexapdf/filter/test_brotli_decode.rb +34 -0
data/test/hexapdf/font/true_type/table/test_directory.rb +5 -3
data/test/hexapdf/font/true_type/test_builder.rb +9 -0
data/test/hexapdf/font/true_type/test_font.rb +17 -3
data/test/hexapdf/font/true_type/test_subsetter.rb +4 -3
data/test/hexapdf/task/test_pdfa.rb +72 -0
data/test/hexapdf/test_document.rb +13 -0
data/test/hexapdf/test_parser.rb +55 -3
data/test/hexapdf/test_revision.rb +27 -6
data/test/hexapdf/type/acro_form/test_field.rb +5 -0
data/test/hexapdf/type/acro_form/test_form.rb +6 -0
data/test/hexapdf/type/acro_form/test_text_field.rb +7 -1
data/test/hexapdf/type/annotations/test_widget.rb +11 -0
data/test/hexapdf/type/test_page.rb +8 -0
data/test/test_helper.rb +6 -0
metadata +41 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: de7c1790b3c958a91f071b5c20063eafea93fed12a034b89890242fec25c3026
-  data.tar.gz: 0e201dc452930a2a81461be5bf9cd27d2749b92815498c06b37a1e2635a20d7d
+  metadata.gz: 04f2a87f1aaa95513275432d718996b7d598fc15e476f6999f6b6fe9f29cd0f8
+  data.tar.gz: 539d2b0e984db4ca4095bf0aad5208fbbdff5a08acc80d270a6b1c824f12c87e
 SHA512:
-  metadata.gz: 1d1b13a5c28c83ca8ec4730cfc0af3016ceb14831c16587b000d8b69d0c7482d166bed21542f4929a8e4614fc732208c5670451a34339776b78a66dea8374949
-  data.tar.gz: 309e3aa2a80ec92b4fd35e72e9ab0c114fe4022467be9fb6fb5805085d6616ea8301ab30345a7322b009aeeffa5d6b052abfb8f0bafbaa9ca8b2223af3a6b223
+  metadata.gz: c35f8b0267ef60c6392ae99d8c001e4d6b5e18ea1f5a62132d44bbf865d52cc8a9b08436e107c35a01d3a6edaeb7be9bcede20931a255416d4ea4d07778f8fc0
+  data.tar.gz: bfdedefe99c534d62b11f406b447902ea6824758153448ebfba35d0e456850134ba36a6cb2c97d668983e8a5b5b96bf0ab0a03c6136f2478a7717a0e7bb0933b

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,57 @@
+## 1.7.0 - 2026-04-13
+### Added
+* Smart text extraction for retrieving layouted text from pages
+* Support for digitally signing with ECDSA keys
+* Support for digitally signing with DSA keys
+* Support for BrotliDecode filter
+* [HexaPDF::Type::DocumentSecurityStore] and
+  [HexaPDF::Type::ValidationRelatedInformation]
+### Changed
+* **Breaking change**: [HexaPDF::Document#unwrap] to not unwrap streams
+* Automatic detection of digital signature size to account for small deviations
+* [HexaPDF::Type::AcroForm::Form#fill] to ignore password fields
+* [HexaPDF::Type::AcroForm::TextField] validation to convert invalid Symbol
+  values to String
+* [HexaPDF::Type::Annotations::Widget] validation to also validate a widget as a
+  field if necessary
+* PDF/A task to include a fix for mismatching glyph widths for Type 2 CID fonts
+### Fixed
+* Writing of PDF documents with an invalid value for the /Info dictionary
+* Subsetting of TrueType fonts in case compound glyphs are themselves compound
+## 1.6.0 - 2026-02-10
+### Added
+* CLI command `hexapdf debug-info` for creating debugging information,
+  especially for malformed files
+### Changed
+* Optimized decoding character codes with a CMap to drastically lower memory
+  usage
+* CLI command `hexapdf inspect rev` to show whether the cross-reference table
+  was reconstructed
+### Fixed
+* Path generation for image extraction in CLI command `hexapdf images`
+* Handling of certain invalid PDFs where the generation number of an object
+  identifier doesn't match its cross-reference section value
+* AES 256-bit encryption to include the unnecessary /Length field in the
+  encryption dictionary to work around buggy PDF libraries
+* Parsing of invalid /Filter and /DecodeParms stream keys in case they resolve
+  to a recursive structure
+* [HexaPDF::Type::AcroForm::Field#each_widget] to only yield widget objects
 ## 1.5.0 - 2025-12-08
 ### Added

data/README.md CHANGED Viewed

@@ -13,7 +13,7 @@ In short, it allows
 * **securing** PDF files by encrypting or signing them and
 * **optimizing** PDF files for smaller file size or other criteria.
-HexaPDF is available under two license, the AGPL and a commercial license, see the [License
+HexaPDF is available under two licenses, the AGPL and a commercial license, see the [License
 section](#License) for details.
@@ -93,12 +93,13 @@ with example graphics and PDF files and tightly integrated into the rest of the
 ## Requirements and Installation
-Since HexaPDF is written in Ruby, a working Ruby installation is needed - see the
-[official installation documentation][rbinstall] for details. Note that you need Ruby version 2.6 or
-higher as prior versions are not supported!
+Since HexaPDF is written in Ruby, a working Ruby installation is needed - see the [official
+installation documentation][rbinstall] for details. Note that you need Ruby version 3.0 or higher as
+prior versions are not supported!
-HexaPDF works on all Ruby implementations that are CRuby compatible, e.g. TruffleRuby, and on any
-platform supported by Ruby (Linux, macOS, Windows, ...).
+HexaPDF works on all Ruby implementations that are CRuby compatible and on any platform supported by
+Ruby (Linux, macOS, Windows, ...). Implementations like JRuby and TruffleRuby should work but
+HexaPDF is not actively tested against them.
 Apart from Ruby itself the HexaPDF library has only one external dependency `geom2d` which is
 written and provided by the HexaPDF authors. The `hexapdf` application has an additional dependency
@@ -117,7 +118,7 @@ Prawn is a **library for generating content**.
 To be more specific, it is easily possible to read an existing PDF with HexaPDF and modify parts of
 it before writing it out again. The modifications can be to the PDF object structure like removing
-superfluous annotations or the the content itself.
+superfluous annotations or the content itself.
 Prawn has no such functionality. There is basic support for using a PDF as a template using the
 `pdf-reader` and `prawn-template` gems but support is very limited. However, Prawn has a very

data/examples/022-outline.rb CHANGED Viewed

@@ -10,7 +10,11 @@
 require 'hexapdf'
 doc = HexaPDF::Document.new
-6.times { doc.pages.add }
+6.times do |i|
+  doc.pages.add.canvas.
+    font("Helvetica", size: 150).
+    text("Page #{i + 1}", at: [10, 660])
+end
 doc.outline.add_item("Main") do |main|
   main.add_item("Page 1", destination: 0)

data/examples/032-acro_form_list_and_fill.rb ADDED Viewed

@@ -0,0 +1,47 @@
+# # PDF Forms - List and fill fields
+#
+# This example shows how to list the form fields of an interactive PDF form and
+# how to fill out the form.
+#
+# The output file from the [PDF forms](acro_form.html) example can be used as
+# input.
+#
+# One way to list and fill a PDF form is to use the [HexaPDF CLI with the 'form'
+# command](/documentation/hexapdf.1.html#form). Here, however, we are doing it
+# with the HexaPDF API.
+#
+# Usage:
+# : `ruby acro_form_list_and_fill.rb [INPUT.PDF]`
+#
+require 'base64'
+require 'hexapdf'
+doc = HexaPDF::Document.open(ARGV[0] || 'acro_form.pdf')
+exit unless doc.acro_form
+puts "Listing all form fields:"
+doc.acro_form.each_field do |field|
+  puts "#{field.full_field_name} (#{field.concrete_field_type})"
+end
+# We are using this to generate some values for existing text fields. In the
+# real world one would be getting the values from the user.
+puts "\nFilling in the text fields with random values:"
+values = {}
+doc.acro_form.each_field do |field|
+  next unless field.field_type == :Tx
+  value = Base64.encode64(field.full_field_name).strip
+  value = if field.key?(:MaxLen)
+            value[0, field[:MaxLen]]
+          else
+            "Value #{field.field_type} #{value}"
+          end
+  values[field.full_field_name] = value
+  puts "#{field.full_field_name}: #{value}"
+end
+# Now actually fill out the form with the values
+doc.acro_form.fill(values)
+doc.write('acro_form_list_and_fill.pdf', optimize: true)

data/examples/033-text_extraction.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# # Text Extraction
+#
+# This example shows how to extract layouted text from a page.
+#
+# It uses the provided input PDF or creates a small sample PDF as input. Then it
+# extracts the text for each page and creates new pages with the extracted text
+# in a fixed-width font.
+#
+# Usage:
+# : `ruby text_extraction.rb [INPUT.PDF]`
+#
+require 'hexapdf'
+# Use the input PDF or create a sample PDF.
+if ARGV.length > 0
+  doc = HexaPDF::Document.open(ARGV[0])
+else
+  composer = HexaPDF::Composer.new do |pdf|
+    pdf.lorem_ipsum(count: 3, padding: [0, 0, 20])
+    pdf.lorem_ipsum(padding: [0, 50, 20], text_indent: 40)
+    pdf.lorem_ipsum(count: 2)
+  end
+  doc = composer.document
+end
+# Extract the text of the existing pages and add new ones with the extracted text
+doc.pages.count.times do |index|
+  text = doc.pages[index].extract_text
+  doc.pages.add.canvas.font('/usr/share/fonts/truetype/freefont/FreeMono.ttf', size: 6).
+    text(text, at: [10, 820])
+end
+doc.write('text_extraction.pdf', optimize: true)

data/lib/hexapdf/cli/debug_info.rb ADDED Viewed

@@ -0,0 +1,98 @@
+# -*- encoding: utf-8; frozen_string_literal: true -*-
+#
+#--
+# This file is part of HexaPDF.
+#
+# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
+# Copyright (C) 2014-2025 Thomas Leitner
+#
+# HexaPDF is free software: you can redistribute it and/or modify it
+# under the terms of the GNU Affero General Public License version 3 as
+# published by the Free Software Foundation with the addition of the
+# following permission added to Section 15 as permitted in Section 7(a):
+# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
+# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
+# INFRINGEMENT OF THIRD PARTY RIGHTS.
+#
+# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
+#
+# The interactive user interfaces in modified source and object code
+# versions of HexaPDF must display Appropriate Legal Notices, as required
+# under Section 5 of the GNU Affero General Public License version 3.
+#
+# In accordance with Section 7(b) of the GNU Affero General Public
+# License, a covered work must retain the producer line in every PDF that
+# is created or manipulated using HexaPDF.
+#
+# If the GNU Affero General Public License doesn't fit your need,
+# commercial licenses are available at <https://gettalong.at/hexapdf/>.
+#++
+require 'hexapdf/cli/command'
+module HexaPDF
+  module CLI
+    # Creates debugging information for adding to an issue.
+    class DebugInfo < Command
+      def initialize #:nodoc:
+        super('debug-info', takes_commands: false)
+        short_desc("Create debug information for a PDF file")
+        long_desc(<<~EOF)
+          Creates debug information for a possibly malformed PDF file that can be attached to an
+          issue.
+          Two files are created: anonymized-FILE where all strings are replaced with zeroes and
+          debug_info.txt with additional debug information.
+        EOF
+        options.on("--password PASSWORD", "-p", String,
+                   "The password for decryption. Use - for reading from standard input.") do |pwd|
+          @password = (pwd == '-' ? read_password : pwd)
+        end
+        @password = nil
+      end
+      def execute(file) #:nodoc:
+        output_name = "anonymized-#{file}"
+        puts "Creating anonymized file '#{output_name}'"
+        data = File.binread(file)
+        data.gsub!(/(>>\s*stream\s*)(.*?)(\s*endstream)/m) {|m| "#{$1}#{'0' * $2.length}#{$3}" }
+        data.gsub!(/([^<]<)([0-9A-Fa-f#{Tokenizer::WHITESPACE}]*?)>/m) {|m| "#{$1}#{'0' * $2.length}>" }
+        data.gsub!(/\((.*?)\)/m) {|m| "(#{'0' * $1.length})" }
+        File.binwrite(output_name, data)
+        debug_info = +''
+        puts "Collecting debug information in debug_info.txt"
+        begin
+          output = capture_output { HexaPDF::CLI::Application.new.parse(['info', '--check', file]) }
+          debug_info << "Output:\n"<< output
+        rescue
+          debug_info << "Error collecting info: #{$!.message}\n"
+        end
+        File.write('debug_info.txt', debug_info)
+      end
+      private
+      def capture_output
+        stdout, stderr = $stdout, $stderr
+        $stdout = $stderr = StringIO.new
+        yield
+        $stdout.string
+      ensure
+        $stdout, $stderr = stdout, stderr
+      end
+    end
+  end
+end

data/lib/hexapdf/cli/images.rb CHANGED Viewed

@@ -147,7 +147,7 @@ module HexaPDF
       # Extracts the images with the given indices.
       def extract_images(doc)
         FileUtils.mkdir_p(File.dirname("#{@prefix}filename"))
-        prefix = File.directory?(@prefix) ? @prefix : "@{prefix}-"
+        prefix = File.directory?(@prefix) ? @prefix : "#{@prefix}-"
         done = Set.new
         count = total = 0
@@ -157,7 +157,7 @@ module HexaPDF
           info = image.info
           if info.writable
             count += 1
-            path = "#{@prefix}#{index}.#{image.info.extension}"
+            path = "#{prefix}#{index}.#{image.info.extension}"
             maybe_raise_on_existing_file(path)
             if command_parser.verbosity_info?
               puts "Extracting image #{index} (#{image.width}x#{image.height}, " \

data/lib/hexapdf/cli/info.rb CHANGED Viewed

@@ -137,6 +137,8 @@ module HexaPDF
             end
           elsif doc.encrypted?
             output_line("Encrypted", "yes (no or wrong password given)")
+          else
+            output_line("Encrypted", "no")
           end
           if doc.revisions.parser.linearized?

data/lib/hexapdf/cli/inspect.rb CHANGED Viewed

@@ -293,6 +293,10 @@ module HexaPDF
               IO.copy_stream(@doc.revisions.parser.io, $stdout, length, 0)
             else
               puts "Document has #{@doc.revisions.count} revision#{@doc.revisions.count == 1 ? '' : 's'}"
+              if @doc.revisions.parser.reconstructed? && @doc.revisions.count == 1 &&
+                 @doc.revisions.current == @doc.revisions.parser.reconstructed_revision
+                puts "Document cross-reference table has been reconstructed"
+              end
               revision_information do |rev, index, count, signature, end_offset|
                 type = if rev.trailer[:XRefStm]
                          "xref table + stream"
@@ -415,7 +419,7 @@ module HexaPDF
           sig = signatures[rev]
           if sig
             end_index = sig[:ByteRange][-2] + sig[:ByteRange][-1]
-          else
+          elsif rev != @doc.revisions.parser.reconstructed_revision
             io.seek(startxrefs[index], IO::SEEK_SET)
             buffer = ''.b
             while io.pos < startxrefs[index + 1]

data/lib/hexapdf/cli.rb CHANGED Viewed

@@ -49,6 +49,7 @@ require 'hexapdf/cli/image2pdf'
 require 'hexapdf/cli/form'
 require 'hexapdf/cli/fonts'
 require 'hexapdf/cli/usage'
+require 'hexapdf/cli/debug_info'
 require 'hexapdf/version'
 require 'hexapdf/document'
@@ -125,6 +126,7 @@ module HexaPDF
         add_command(HexaPDF::CLI::Form.new)
         add_command(HexaPDF::CLI::Fonts.new)
         add_command(HexaPDF::CLI::Usage.new)
+        add_command(HexaPDF::CLI::DebugInfo.new)
         add_command(CmdParse::HelpCommand.new)
         version_command = CmdParse::VersionCommand.new(add_switches: false)
         add_command(version_command)

data/lib/hexapdf/configuration.rb CHANGED Viewed

@@ -559,6 +559,7 @@ module HexaPDF
                         JPXDecode: 'HexaPDF::Filter::PassThrough',
                         Crypt: 'HexaPDF::Filter::Crypt',
                         Encryption: 'HexaPDF::Filter::Encryption',
+                        BrotliDecode: 'HexaPDF::Filter::BrotliDecode',
                       },
                       'font.default' => 'Times',
                       'font.fallback' => ['ZapfDingbats', 'Symbol'],
@@ -636,6 +637,11 @@ module HexaPDF
   #
   #    See PDF2.0 s8.6
   #
+  # filter.brotli.compression::
+  #    Specifies the compression level that should be used with the BrotliDecode filter. The level
+  #    can range from 0 (no compression), 1 (best speed) to 11 (best compression). The default
+  #    value is 8 which is a good compromise between speed and resulting size.
+  #
   # filter.flate.compression::
   #    Specifies the compression level that should be used with the FlateDecode filter. The level
   #    can range from 0 (no compression), 1 (best speed) to 9 (best compression, default).
@@ -754,6 +760,8 @@ module HexaPDF
                         MCR: 'HexaPDF::Type::MarkedContentReference',
                         OBJR: 'HexaPDF::Type::ObjectReference',
                         Measure: 'HexaPDF::Type::Measure',
+                        DSS: 'HexaPDF::Type::DocumentSecurityStore',
+                        VRI: 'HexaPDF::Type::DocumentSecurityStore::ValidationRelatedInformation',
                       },
                       'object.subtype_map' => {
                         nil => {

data/lib/hexapdf/content/canvas.rb CHANGED Viewed

@@ -895,7 +895,7 @@ module HexaPDF
       #
       # * Any other string is treated as a color name. HexaPDF supports CSS Color Module Level 3
       #   color names (see https://www.w3.org/TR/css-color-3/#svg-color) as well as HexaPDF design
-      #   colors.
+      #   colors. See ColorSpace::COLOR_NAMES for the list of supported names.
       #
       # * Four numeric arguments specify a CMYK color (see ColorSpace::DeviceCMYK::Color).
       #