RubyGems - hexapdf - Versions diffs - 1.5.0 → 1.7.0 - Mend

hexapdf 1.5.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +54 -0
data/README.md +8 -7
data/examples/022-outline.rb +5 -1
data/examples/032-acro_form_list_and_fill.rb +47 -0
data/examples/033-text_extraction.rb +34 -0
data/lib/hexapdf/cli/debug_info.rb +98 -0
data/lib/hexapdf/cli/images.rb +2 -2
data/lib/hexapdf/cli/info.rb +2 -0
data/lib/hexapdf/cli/inspect.rb +5 -1
data/lib/hexapdf/cli.rb +2 -0
data/lib/hexapdf/configuration.rb +8 -0
data/lib/hexapdf/content/canvas.rb +1 -1
data/lib/hexapdf/content/smart_text_extractor.rb +305 -0
data/lib/hexapdf/content.rb +2 -0
data/lib/hexapdf/digital_signature/signing/default_handler.rb +1 -15
data/lib/hexapdf/digital_signature/signing/signed_data_creator.rb +21 -8
data/lib/hexapdf/document.rb +7 -3
data/lib/hexapdf/encryption/security_handler.rb +3 -1
data/lib/hexapdf/filter/brotli_decode.rb +88 -0
data/lib/hexapdf/filter.rb +1 -0
data/lib/hexapdf/font/cmap.rb +10 -6
data/lib/hexapdf/font/true_type/builder.rb +1 -1
data/lib/hexapdf/font/true_type/font.rb +13 -0
data/lib/hexapdf/font/true_type/subsetter.rb +7 -2
data/lib/hexapdf/font/true_type/table/directory.rb +5 -0
data/lib/hexapdf/font/true_type.rb +1 -0
data/lib/hexapdf/layout/style.rb +6 -2
data/lib/hexapdf/parser.rb +29 -4
data/lib/hexapdf/revision.rb +6 -2
data/lib/hexapdf/task/pdfa.rb +108 -1
data/lib/hexapdf/type/acro_form/field.rb +4 -1
data/lib/hexapdf/type/acro_form/form.rb +4 -0
data/lib/hexapdf/type/acro_form/text_field.rb +4 -2
data/lib/hexapdf/type/annotations/widget.rb +9 -0
data/lib/hexapdf/type/document_security_store.rb +80 -0
data/lib/hexapdf/type/page.rb +11 -0
data/lib/hexapdf/type.rb +1 -0
data/lib/hexapdf/version.rb +1 -1
data/test/data/pdfa/mismatching_glyph_widths_cidfont_type2.pdf +0 -0
data/test/hexapdf/content/test_smart_text_extractor.rb +129 -0
data/test/hexapdf/digital_signature/common.rb +19 -5
data/test/hexapdf/digital_signature/signing/test_signed_data_creator.rb +29 -4
data/test/hexapdf/digital_signature/test_signatures.rb +3 -3
data/test/hexapdf/encryption/test_security_handler.rb +7 -5
data/test/hexapdf/filter/test_brotli_decode.rb +34 -0
data/test/hexapdf/font/true_type/table/test_directory.rb +5 -3
data/test/hexapdf/font/true_type/test_builder.rb +9 -0
data/test/hexapdf/font/true_type/test_font.rb +17 -3
data/test/hexapdf/font/true_type/test_subsetter.rb +4 -3
data/test/hexapdf/task/test_pdfa.rb +72 -0
data/test/hexapdf/test_document.rb +13 -0
data/test/hexapdf/test_parser.rb +55 -3
data/test/hexapdf/test_revision.rb +27 -6
data/test/hexapdf/type/acro_form/test_field.rb +5 -0
data/test/hexapdf/type/acro_form/test_form.rb +6 -0
data/test/hexapdf/type/acro_form/test_text_field.rb +7 -1
data/test/hexapdf/type/annotations/test_widget.rb +11 -0
data/test/hexapdf/type/test_page.rb +8 -0
data/test/test_helper.rb +6 -0
metadata +41 -4

data/lib/hexapdf/content/smart_text_extractor.rb ADDED Viewed

@@ -0,0 +1,305 @@
+# -*- encoding: utf-8; frozen_string_literal: true -*-
+#
+#--
+# This file is part of HexaPDF.
+#
+# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
+# Copyright (C) 2014-2025 Thomas Leitner
+#
+# HexaPDF is free software: you can redistribute it and/or modify it
+# under the terms of the GNU Affero General Public License version 3 as
+# published by the Free Software Foundation with the addition of the
+# following permission added to Section 15 as permitted in Section 7(a):
+# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
+# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
+# INFRINGEMENT OF THIRD PARTY RIGHTS.
+#
+# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
+#
+# The interactive user interfaces in modified source and object code
+# versions of HexaPDF must display Appropriate Legal Notices, as required
+# under Section 5 of the GNU Affero General Public License version 3.
+#
+# In accordance with Section 7(b) of the GNU Affero General Public
+# License, a covered work must retain the producer line in every PDF that
+# is created or manipulated using HexaPDF.
+#
+# If the GNU Affero General Public License doesn't fit your need,
+# commercial licenses are available at <https://gettalong.at/hexapdf/>.
+#++
+module HexaPDF
+  module Content
+    # This module converts the glyphs on a page to a single text string while preserving the layout.
+    #
+    # The general algorithm is:
+    #
+    # 1. Collect all individual glyphs with their user space coordinates in
+    #    TextRunCollector::TextRun objects.
+    #
+    # 2. Sort text runs top to bottom and then left to right.
+    #
+    # 3. Group those text runs into lines based on a "baseline" while also combining neighboring
+    #    text runs into larger runs.
+    #
+    # 4. Render each line into a string by taking into account the page size and the median glyph
+    #    width for a text run to column mapping.
+    #
+    # 5. Add blank lines between text lines based on the page's normal line spacing.
+    module SmartTextExtractor
+      # This module provides the functionality for collecting the necessary TextRun instances for
+      # layouting the text.
+      #
+      # To use this module include it in a processor class. Then invoke the #collect_text_runs
+      # method in the #show_text and #show_text_with_positioning methods.
+      #
+      # Example:
+      #
+      #   class CustomProcessor < HexaPDF::Content::Processor
+      #     include TextRunCollector
+      #
+      #     def show_text(str)
+      #       collect_text_runs(decode_text_with_positioning(str))
+      #     end
+      #     alias show_text_with_positioning show_text
+      #
+      #   end
+      #
+      # Once the processor has done its job, the collected text runs are available via the
+      # #text_runs method. Use them as input for SmartTextExtractor.layout_text_runs.
+      module TextRunCollector
+        # Represents a single run of continuous glyphs and their combined bounding box in user
+        # space.
+        TextRun = Struct.new(:string, :left, :bottom, :right, :top) do
+          # The "baseline" is approximated with the bottom of the bounding box.
+          #
+          # This works because HexaPDF uses a font's bounding box instead of the glyph's bounding
+          # box for each glyph. So while differently sized glyphs will have different "baseline"
+          # values, this is taken into account in the algorithm in the same way as subscript and
+          # superscript.
+          #
+          # Using this "fake" baseline works well enough and avoids additional calculations.
+          def baseline = bottom
+          # The height of the text run's bounding box.
+          def height = top - bottom
+          # The width of the text run's bounding box.
+          def width = right - left
+        end
+        # Array with all collected TextRun instances.
+        attr_reader :text_runs
+        def initialize # :nodoc:
+          super
+          @text_runs = []
+        end
+        private
+        # Collects all text runs from the glyphs in the +boxes+ array.
+        def collect_text_runs(boxes)
+          boxes.each do |box|
+            llx, lly, lrx, lry, urx, ury, ulx, uly = *box.points
+            x_min, x_max = [llx, lrx, ulx, urx].minmax
+            y_min, y_max = [lly, lry, uly, ury].minmax
+            @text_runs << TextRun.new(+box.string, x_min, y_min, x_max, y_max)
+          end
+        end
+      end
+      # This processor class is used when layouting the text through
+      # HexaPDF::Type::Page#extract_text.
+      class TextRunProcessor < HexaPDF::Content::Processor
+        include TextRunCollector
+        def show_text(str)
+          collect_text_runs(decode_text_with_positioning(str))
+        end
+        alias show_text_with_positioning show_text
+      end
+      # Converts an array of TextRun objects into a single string representation, preserving the
+      # visual layout.
+      #
+      # The +page_width+ and +page_height+ arguments specify the width and height of the page from
+      # which the text runs were extracted.
+      #
+      # The remaining keyword arguments can be used to fine-tune the algorithm for one's needs:
+      #
+      # +line_tolerance_factor+::
+      #     The tolerance factor is applied to the median text run height to determine the range
+      #     within which two text runs are considered to be on the same line. This ensures that
+      #     small differences in the baseline due to, for example, subscript or superscript parts
+      #     don't result in multiple lines.
+      #
+      #     The factor should not be too large to avoid forcing separate visual lines into one line
+      #     but also not too small to avoid subscript/superscript being on separate lines. The
+      #     default seems to work quite well.
+      #
+      # +paragraph_distance_threshold+::
+      #     If the number of normal line spacings between two adjacent baselines is at least this
+      #     large (but smaller than +large_distance_threshold+), the gap is interpreted as a
+      #     paragraph break and a single blank line is inserted.
+      #
+      # +large_distance_threshold+::
+      #     Works like +paragraph_distance_threshold+ and indicates if a number of normal line
+      #     spacings is too large for being a paragraph break. A proportional number of blank lines
+      #     is inserted in this case.
+      #
+      #     This is used to represent large parts with non-text content like images.
+      def self.layout_text_runs(text_runs, page_width, page_height,
+                                line_tolerance_factor: 0.4, paragraph_distance_threshold: 1.35,
+                                large_distance_threshold: 3.0)
+        return '' if text_runs.empty?
+        # Use the median height of all text runs as an approximation of the main font size used on
+        # the page. The line tolerance uses a hard floor for small fonts.
+        median_height = median(text_runs.map(&:height).sort)
+        line_tolerance = [median_height * line_tolerance_factor, 2].max
+        # Group the text runs into lines which are sorted top to bottom. Text runs are pre-sorted by
+        # baseline from top to bottom and left to right (the latter is done so that consecutive text
+        # runs can be combined).
+        sorted = text_runs.sort_by {|run| [-run.baseline, run.left] }
+        lines = group_into_lines(sorted, line_tolerance)
+        # Calculate the normal line spacing, excluding anything too small/big.
+        line_distances = lines.map {|l| l.baseline }.each_cons(2).map {|a, b| a - b }.
+          select {|d| d >= median_height * 0.5 && d <= median_height * 2 }.sort
+        normal_line_spacing = line_distances.empty? ? median_height * 1.2 : median(line_distances)
+        # Convert the lines into actual text strings. Blank lines are inserted between the lines
+        # based on the normal line spacing.
+        output_lines = []
+        left_margin = lines.map {|line| line.text_runs[0].left }.min
+        glyph_widths = lines.flat_map do |line|
+          line.text_runs.flat_map {|run| [run.width.to_f / run.string.length] * run.string.length }
+        end.sort
+        median_glyph_width = median(glyph_widths)
+        lines.each_with_index do |line, index|
+          output_lines << text_runs_to_string(line.text_runs, median_glyph_width, left_margin)
+          next if index == lines.length - 1
+          # Add blank lines as needed.
+          ratio = (line.baseline - lines[index + 1].baseline) / normal_line_spacing
+          if ratio >= large_distance_threshold
+            # Subtract 1 because the newline after the output line already counts as one
+            # newline. Also cap at a maximum of 40 to avoid huge gaps.
+            [ratio.round - 1, 40].min.times { output_lines << '' }
+          elsif ratio >= paragraph_distance_threshold
+            output_lines << ''
+          end
+        end
+        output_lines.join("\n")
+      end
+      # Holds an array of TextRun objects and their median baseline.
+      Line = Struct.new(:text_runs, :baseline)
+      # Groups a sorted list of TextRuns (sorted by baseline, then left) into lines.
+      #
+      # Since the text_runs are already sorted, a single run through +sorted_text_runs+ is
+      # sufficient. A new line is created if a text run's baseline differs by more than +tolerance+
+      # from the current line's (median) baseline.
+      #
+      # The result is a list of Line objects with their contents sorted left to right.
+      def self.group_into_lines(sorted_text_runs, tolerance)
+        lines = []
+        current_line = []
+        current_baseline = sorted_text_runs[0].baseline
+        current_baselines = [current_baseline]
+        sorted_text_runs.each do |text_run|
+          # Try to combine text_runs that share exactly the same height and are next to each
+          # other. This avoids potentially garbled output because if two text parts are above each
+          # other but end up on the same line, the text runs would be mixed up (think: centered
+          # table header where some cells contain two lines).
+          if (last = current_line[-1]) && last.bottom == text_run.bottom &&
+             last.top == text_run.top && text_run.left - last.right < 1
+            last.string << text_run.string
+            last.right = text_run.right
+          elsif (current_baseline - text_run.baseline).abs <= tolerance
+            current_line << text_run
+            current_baselines << text_run.baseline
+            current_baseline = median(current_baselines)
+          else
+            lines << Line.new(current_line.sort_by!(&:left), current_baseline)
+            current_line = [text_run]
+            current_baseline = text_run.baseline
+            current_baselines.clear
+            current_baselines << current_baseline
+          end
+        end
+        lines << Line.new(current_line.sort_by!(&:left), current_baseline)
+      end
+      private_class_method :group_into_lines
+      # Returns the median value of the given sorted array of numerics.
+      def self.median(sorted_array)
+        mid = sorted_array.length / 2
+        sorted_array.length.odd? ? sorted_array[mid] : (sorted_array[mid - 1] + sorted_array[mid]) / 2.0
+      end
+      private_class_method :median
+      # Renders an array of TextRun objects representing one line to a single string.
+      #
+      # +median_glyph_width+:: Is used to determine the column for each text run.
+      # +left_margin+:: Is removed from the left side to avoid unnecessary indentation.
+      def self.text_runs_to_string(text_runs, median_glyph_width, left_margin)
+        # Minimum gap to classify as a word boundary
+        space_threshold = median_glyph_width * 0.5
+        result = +''
+        # The column where the last text run ended. Can be different from result.size due to fitting
+        # proportional-width fonts to a fixed-column output.
+        cursor = 0
+        text_runs.each_with_index do |text_run, index|
+          target_col = ((text_run.left - left_margin) / median_glyph_width).round
+          advance = target_col - cursor
+          if advance > 0
+            result << ' ' * advance
+            cursor += advance
+          elsif index >= 1 && text_run.left - text_runs[index - 1].right > space_threshold &&
+                result[-1] != ' '
+            # Force space even if advance < 0 when the actual spacing between text runs is large
+            # enough. This might happen because we are projecting proportional-width fonts to a
+            # fixed-column output.
+            cursor = target_col
+            result << ' '
+          end
+          result << text_run.string
+          # Move cursor to the text run's right edge but at least the text run's character count
+          # from the current position. This avoids gaps when there is too much difference between
+          # the on-page position and the approximated cursor. However, a one column difference is
+          # ignored to account for rounding errors.
+          cursor += text_run.string.size
+          text_run_right_edge_cursor = ((text_run.right - left_margin) / median_glyph_width).round
+          cursor = [text_run_right_edge_cursor, cursor].max if text_run_right_edge_cursor != cursor + 1
+        end
+        result.rstrip
+      end
+      private_class_method :text_runs_to_string
+    end
+  end
+end

data/lib/hexapdf/content.rb CHANGED Viewed

@@ -44,6 +44,7 @@ module HexaPDF
   #
   # * The Canvas class which provides an interface for drawing graphics and text.
   # * The Parser and Processor classes for processing an existing content stream.
+  # * SmartTextExtractor for extracting layouted text from a page.
   module Content
     autoload(:Canvas, 'hexapdf/content/canvas')
@@ -52,6 +53,7 @@ module HexaPDF
     autoload(:ColorSpace, 'hexapdf/content/color_space')
     autoload(:Operator, 'hexapdf/content/operator')
     autoload(:CanvasComposer, 'hexapdf/content/canvas_composer')
+    autoload(:SmartTextExtractor, 'hexapdf/content/smart_text_extractor')
   end

data/lib/hexapdf/digital_signature/signing/default_handler.rb CHANGED Viewed

@@ -52,9 +52,6 @@ module HexaPDF
       # The signing handler is used by default by all methods that need a signing handler. Therefore
       # it is usually only necessary to provide the actual attribute values.
       #
-      # *Note*: Currently only RSA is supported, DSA and ECDSA are not. See the examples below for
-      # how to handle them using external signing.
-      #
       #
       # == CMS and PAdES Signatures
       #
@@ -131,17 +128,6 @@ module HexaPDF
       #   document.sign("output.pdf", certificate: my_cert, certificate_chain: my_chain,
       #                 external_signing: signing_proc)
       #
-      #   # Signing with DSA or ECDSA certificate/keys
-      #   signing_proc = lambda do |io, byte_range|
-      #     io.pos = byte_range[0]
-      #     data = io.read(byte_range[1])
-      #     io.pos = byte_range[2]
-      #     data << io.read(byte_range[3])
-      #     OpenSSL::PKCS7.sign(certificate, key, data, certificate_chain,
-      #                         OpenSSL::PKCS7::DETACHED | OpenSSL::PKCS7::BINARY).to_der
-      #   end
-      #   document.sign("output.pdf", signature_size: 10_000, external_signing: signing_proc)
-      #
       #
       # == Implementing a Signing Handler
       #
@@ -277,7 +263,7 @@ module HexaPDF
         # If a custom size is set using #signature_size=, it is used. Otherwise the size is determined
         # by using #sign to sign an empty string.
         def signature_size
-          @signature_size || sign(StringIO.new, [0, 0, 0, 0]).size
+          @signature_size || sign(StringIO.new, [0, 0, 0, 0]).size + 5
         end
         # Finalizes the signature field as well as the signature dictionary before writing.

data/lib/hexapdf/digital_signature/signing/signed_data_creator.rb CHANGED Viewed

@@ -121,7 +121,7 @@ module HexaPDF
         private
         # Creates the set of signed attributes for the signer information structure.
-        def create_signed_attrs(data, signing_time: true)
+        def create_signed_attrs(data, ess_cert_hash: 'sha256', signing_time: true)
           signing_time = (self.signing_time || Time.now).utc if signing_time
           set(
             attribute('content-type', oid('id-data')),
@@ -132,12 +132,13 @@ module HexaPDF
             ),
             attribute(
               'id-aa-signingCertificateV2',
-              sequence( # SigningCertificateV2
+              sequence( # SigningCertificateV2, see RFC5035
                 sequence( # Seq of ESSCertIDv2
                   sequence( # ESSCertIDv2
-                    #TODO: Does not validate on ETSI checker if used, doesn't matter if SHA256 or 512
-                    #oid('sha512'),
-                    binary(OpenSSL::Digest.digest('sha256', @certificate.to_der)), # certHash
+                    (sequence( # AlgorithmIdentifier RFC3280 4.1.1.2
+                      oid(ess_cert_hash) # algorithm
+                    ) unless ess_cert_hash == 'sha256'),
+                    binary(OpenSSL::Digest.digest(ess_cert_hash, @certificate.to_der)), # certHash
                     sequence(                                      # issuerSerial
                       sequence(                                    #  issuer
                         implicit(4, sequence(@certificate.issuer)) #   choice 4 directoryName
@@ -184,13 +185,19 @@ module HexaPDF
         # Creates a signer information structure containing the actual meat of the whole CMS object.
         def create_signer_info(signature, signed_attrs, unsigned_attrs = nil)
           certificate_pkey_algorithm = @certificate.public_key.oid
-          signature_algorithm = if certificate_pkey_algorithm == 'rsaEncryption'
+          signature_algorithm = case certificate_pkey_algorithm
+                                when 'rsaEncryption'
                                   sequence(               # signatureAlgorithm
                                     oid('rsaEncryption'), #   algorithmID
                                     null                  #   params
                                   )
-                                else
-                                  raise HexaPDF::Error, "Unsupported key type/signature algorithm"
+                                when 'DSA'
+                                  unless @digest_algorithm == 'sha256'
+                                    raise HexaPDF::Error, "Only SHA256 supported with DSA"
+                                  end
+                                  sequence(oid('id-dsa-with-sha256'), null)
+                                when 'id-ecPublicKey'
+                                  sequence(oid("ecdsa-with-#{@digest_algorithm.upcase}"), null)
                                 end
           sequence(
@@ -273,6 +280,12 @@ module HexaPDF
           'sha384' => '2.16.840.1.101.3.4.2.2',
           'sha512' => '2.16.840.1.101.3.4.2.3',
           'rsaEncryption' => '1.2.840.113549.1.1.1',
+          'id-dsa-with-sha1' => '1.2.840.10040.4.3',
+          'id-dsa-with-sha256' => '2.16.840.1.101.3.4.3.2',
+          'ecdsa-with-SHA1' => '1.2.840.10045.4.1',
+          'ecdsa-with-SHA256' => '1.2.840.10045.4.3.2',
+          'ecdsa-with-SHA384' => '1.2.840.10045.4.3.3',
+          'ecdsa-with-SHA512' => '1.2.840.10045.4.3.4',
           'id-aa-signingCertificate' => '1.2.840.113549.1.9.16.2.12',
           'id-aa-timeStampToken' => '1.2.840.113549.1.9.16.2.14',
           'id-aa-signingCertificateV2' => '1.2.840.113549.1.9.16.2.47',

data/lib/hexapdf/document.rb CHANGED Viewed

@@ -394,11 +394,12 @@ module HexaPDF
     # :call-seq:
     #   document.unwrap(obj)   -> unwrapped_obj
     #
-    # Recursively unwraps the object to get native Ruby objects (i.e. Hash, Array, Integer, ...
-    # instead of HexaPDF::Reference and HexaPDF::Object).
+    # Recursively unwraps the object to get native Ruby objects (i.e. Hash, Array, Integer, ...)
+    # instead of HexaPDF::Reference and HexaPDF::Object. Only HexaPDF::Stream objects are retained
+    # as they are not representable by native Ruby objects.
     def unwrap(object, seen = {})
       object = deref(object)
-      object = object.data if object.kind_of?(HexaPDF::Object)
+      object = object.data if object.kind_of?(HexaPDF::Object) && !object.kind_of?(HexaPDF::Stream)
       if seen.key?(object)
         raise HexaPDF::Error, "Can't unwrap a recursive structure"
       end
@@ -413,6 +414,8 @@ module HexaPDF
       when HexaPDF::PDFData
         seen[object] = true
         unwrap(object.value, seen.dup)
+      when HexaPDF::Stream
+        object
       else
         object
       end
@@ -790,6 +793,7 @@ module HexaPDF
         if @metadata
           metadata.modification_date(Time.now)
         else
+          trailer.delete(:Info) unless trailer.info.kind_of?(HexaPDF::Dictionary)
           trailer.info[:ModDate] = Time.now
         end
       end

data/lib/hexapdf/encryption/security_handler.rb CHANGED Viewed

@@ -363,7 +363,9 @@ module HexaPDF
             raise(HexaPDF::UnsupportedEncryptionError,
                   "Invalid key length #{key_length} specified")
           end
-        dict[:Length] = key_length if dict[:V] == 4 || dict[:V] == 2
+        # /Length should only be set for V=2 as per the spec. However, software like Adobe Reader
+        # fails if this is not set for V=5 or V=4.
+        dict[:Length] = key_length if dict[:V] == 5 || dict[:V] == 4 || dict[:V] == 2
         if ![:aes, :arc4].include?(algorithm)
           raise(HexaPDF::UnsupportedEncryptionError,

data/lib/hexapdf/filter/brotli_decode.rb ADDED Viewed

@@ -0,0 +1,88 @@
+# -*- encoding: utf-8; frozen_string_literal: true -*-
+#
+#--
+# This file is part of HexaPDF.
+#
+# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
+# Copyright (C) 2014-2025 Thomas Leitner
+#
+# HexaPDF is free software: you can redistribute it and/or modify it
+# under the terms of the GNU Affero General Public License version 3 as
+# published by the Free Software Foundation with the addition of the
+# following permission added to Section 15 as permitted in Section 7(a):
+# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
+# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
+# INFRINGEMENT OF THIRD PARTY RIGHTS.
+#
+# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
+#
+# The interactive user interfaces in modified source and object code
+# versions of HexaPDF must display Appropriate Legal Notices, as required
+# under Section 5 of the GNU Affero General Public License version 3.
+#
+# In accordance with Section 7(b) of the GNU Affero General Public
+# License, a covered work must retain the producer line in every PDF that
+# is created or manipulated using HexaPDF.
+#
+# If the GNU Affero General Public License doesn't fit your need,
+# commercial licenses are available at <https://gettalong.at/hexapdf/>.
+#++
+require 'fiber'
+require 'brotli'
+require 'hexapdf/filter/predictor'
+require 'hexapdf/configuration'
+module HexaPDF
+  module Filter
+    # Implements the Brotli filter using the brotli library which must be installed manually.
+    #
+    # The BrotliDecode specification is not yet available as a standard but will be in the near
+    # future. Therefore it is recommended to wait before using it for encoding streams until most of the
+    # PDF ecosystem has support for it.
+    #
+    # See: HexaPDF::Filter
+    module BrotliDecode
+      # See HexaPDF::Filter
+      #
+      # Note that the brotli gem currently doesn't support a streaming decoder. This means that the
+      # whole source must be read and decoded at once.
+      def self.decoder(source, options = nil)
+        fib = Fiber.new do
+          data = Filter.string_from_source(source)
+          data.empty? ? data: Brotli.inflate(data)
+        end
+        if options && options[:Predictor]
+          Predictor.decoder(fib, options)
+        else
+          fib
+        end
+      end
+      # See HexaPDF::Filter
+      #
+      # As with ::decoder a usable streaming encoder is not available.
+      def self.encoder(source, options = nil)
+        if options && options[:Predictor]
+          source = Predictor.encoder(source, options)
+        end
+        Fiber.new do
+          Brotli.deflate(Filter.string_from_source(source),
+                         quality: HexaPDF::GlobalConfiguration['filter.brotli.compression'])
+        end
+      end
+    end
+  end
+end

data/lib/hexapdf/filter.rb CHANGED Viewed

@@ -134,6 +134,7 @@ module HexaPDF
     autoload(:FlateDecode, 'hexapdf/filter/flate_decode')
     autoload(:LZWDecode, 'hexapdf/filter/lzw_decode')
     autoload(:RunLengthDecode, 'hexapdf/filter/run_length_decode')
+    autoload(:BrotliDecode, 'hexapdf/filter/brotli_decode')
     autoload(:Predictor, 'hexapdf/filter/predictor')

data/lib/hexapdf/font/cmap.rb CHANGED Viewed

@@ -143,10 +143,13 @@ module HexaPDF
       # An error is raised if the string contains invalid bytes.
       def read_codes(string)
         codes = []
-        bytes = string.each_byte
+        bytes = string.bytes
+        length = bytes.length
+        i = 0
-        loop do
-          byte = bytes.next
+        while i < length
+          byte = bytes[i]
+          i += 1
           code = 0
           found = @codespace_ranges.any? do |first_byte_range, rest_ranges|
@@ -154,9 +157,10 @@ module HexaPDF
             code = (code << 8) + byte
             valid = rest_ranges.all? do |range|
-              begin
-                byte = bytes.next
-              rescue StopIteration
+              if i < length
+                byte = bytes[i]
+                i += 1
+              else
                 raise HexaPDF::Error, "Missing bytes while reading codes via CMap"
               end
               code = (code << 8) + byte

data/lib/hexapdf/font/true_type/builder.rb CHANGED Viewed

@@ -48,7 +48,7 @@ module HexaPDF
           entry_selector = tables.length.bit_length - 1
           range_shift = tables.length * 16 - search_range
-          font_data = "\x0\x1\x0\x0".b +
+          font_data = (tables.key?('glyf') ? "\x0\x1\x0\x0" : "OTTO").b +
             [tables.length, search_range, entry_selector, range_shift].pack('n4')
           offset = font_data.length + tables.length * 16

data/lib/hexapdf/font/true_type/font.rb CHANGED Viewed

@@ -35,6 +35,7 @@
 #++
 require 'hexapdf/font/true_type/table'
+require 'hexapdf/font/true_type/builder'
 require 'set'
 module HexaPDF
@@ -84,6 +85,18 @@ module HexaPDF
           @tables = {}
         end
+        # Uses Builder to build a font file for this font.
+        #
+        # The +table_overrides+ argument can be used to supply mappings from table names (in string
+        # form) to raw table data that should override the respective font's tables.
+        def build(table_overrides = {})
+          tables = directory.table_names.each_with_object({}) do |name, hash|
+            hash[name] = self[name.to_sym].raw_data
+          end
+          tables.merge!(table_overrides)
+          Builder.build(tables)
+        end
         # Returns the table instance for the given tag (a symbol), or +nil+ if no such table exists.
         def [](tag)
           return @tables[tag] if @tables.key?(tag)

data/lib/hexapdf/font/true_type/subsetter.rb CHANGED Viewed

@@ -176,9 +176,14 @@ module HexaPDF
         # Adds the components of compound glyphs to the subset.
         def add_glyph_components
           glyf = @font[:glyf]
+          process_glyph_components = lambda do |gid|
+            glyf[gid].components&.each do |cgid|
+              use_glyph(cgid)
+              process_glyph_components.call(cgid) if glyf[cgid].compound?
+            end
+          end
           @glyph_map.keys.each do |gid|
-            next if gid.kind_of?(Symbol)
-            glyf[gid].components&.each {|cgid| use_glyph(cgid) }
+            process_glyph_components.call(gid) unless gid.kind_of?(Symbol)
           end
         end

data/lib/hexapdf/font/true_type/table/directory.rb CHANGED Viewed

@@ -69,6 +69,11 @@ module HexaPDF
             @tables[tag]
           end
+          # Returns an array with all the table names (in string form) in the directory.
+          def table_names
+            @tables.keys
+          end
           private
           def load_from_io #:nodoc:

data/lib/hexapdf/font/true_type.rb CHANGED Viewed

@@ -49,6 +49,7 @@ module HexaPDF
       autoload(:Font, 'hexapdf/font/true_type/font')
       autoload(:Subsetter, 'hexapdf/font/true_type/subsetter')
       autoload(:Optimizer, 'hexapdf/font/true_type/optimizer')
+      autoload(:Builder, 'hexapdf/font/true_type/builder')
     end