RubyGems - hexapdf - Versions diffs - 1.6.0 → 1.7.0 - Mend

hexapdf 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +28 -0
data/examples/032-acro_form_list_and_fill.rb +47 -0
data/examples/033-text_extraction.rb +34 -0
data/lib/hexapdf/cli/info.rb +2 -0
data/lib/hexapdf/configuration.rb +8 -0
data/lib/hexapdf/content/canvas.rb +1 -1
data/lib/hexapdf/content/smart_text_extractor.rb +305 -0
data/lib/hexapdf/content.rb +2 -0
data/lib/hexapdf/digital_signature/signing/default_handler.rb +1 -15
data/lib/hexapdf/digital_signature/signing/signed_data_creator.rb +21 -8
data/lib/hexapdf/document.rb +7 -3
data/lib/hexapdf/filter/brotli_decode.rb +88 -0
data/lib/hexapdf/filter.rb +1 -0
data/lib/hexapdf/font/true_type/builder.rb +1 -1
data/lib/hexapdf/font/true_type/font.rb +13 -0
data/lib/hexapdf/font/true_type/subsetter.rb +7 -2
data/lib/hexapdf/font/true_type/table/directory.rb +5 -0
data/lib/hexapdf/font/true_type.rb +1 -0
data/lib/hexapdf/layout/style.rb +6 -2
data/lib/hexapdf/task/pdfa.rb +108 -1
data/lib/hexapdf/type/acro_form/form.rb +4 -0
data/lib/hexapdf/type/acro_form/text_field.rb +4 -2
data/lib/hexapdf/type/annotations/widget.rb +9 -0
data/lib/hexapdf/type/document_security_store.rb +80 -0
data/lib/hexapdf/type/page.rb +11 -0
data/lib/hexapdf/type.rb +1 -0
data/lib/hexapdf/version.rb +1 -1
data/test/data/pdfa/mismatching_glyph_widths_cidfont_type2.pdf +0 -0
data/test/hexapdf/content/test_smart_text_extractor.rb +129 -0
data/test/hexapdf/digital_signature/common.rb +19 -5
data/test/hexapdf/digital_signature/signing/test_signed_data_creator.rb +29 -4
data/test/hexapdf/digital_signature/test_signatures.rb +3 -3
data/test/hexapdf/filter/test_brotli_decode.rb +34 -0
data/test/hexapdf/font/true_type/table/test_directory.rb +5 -3
data/test/hexapdf/font/true_type/test_builder.rb +9 -0
data/test/hexapdf/font/true_type/test_font.rb +17 -3
data/test/hexapdf/font/true_type/test_subsetter.rb +4 -3
data/test/hexapdf/task/test_pdfa.rb +72 -0
data/test/hexapdf/test_document.rb +13 -0
data/test/hexapdf/type/acro_form/test_form.rb +6 -0
data/test/hexapdf/type/acro_form/test_text_field.rb +7 -1
data/test/hexapdf/type/annotations/test_widget.rb +11 -0
data/test/hexapdf/type/test_page.rb +8 -0
metadata +25 -3

data/lib/hexapdf/filter/brotli_decode.rb ADDED Viewed

@@ -0,0 +1,88 @@
+# -*- encoding: utf-8; frozen_string_literal: true -*-
+#
+#--
+# This file is part of HexaPDF.
+#
+# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
+# Copyright (C) 2014-2025 Thomas Leitner
+#
+# HexaPDF is free software: you can redistribute it and/or modify it
+# under the terms of the GNU Affero General Public License version 3 as
+# published by the Free Software Foundation with the addition of the
+# following permission added to Section 15 as permitted in Section 7(a):
+# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
+# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
+# INFRINGEMENT OF THIRD PARTY RIGHTS.
+#
+# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
+#
+# The interactive user interfaces in modified source and object code
+# versions of HexaPDF must display Appropriate Legal Notices, as required
+# under Section 5 of the GNU Affero General Public License version 3.
+#
+# In accordance with Section 7(b) of the GNU Affero General Public
+# License, a covered work must retain the producer line in every PDF that
+# is created or manipulated using HexaPDF.
+#
+# If the GNU Affero General Public License doesn't fit your need,
+# commercial licenses are available at <https://gettalong.at/hexapdf/>.
+#++
+require 'fiber'
+require 'brotli'
+require 'hexapdf/filter/predictor'
+require 'hexapdf/configuration'
+module HexaPDF
+  module Filter
+    # Implements the Brotli filter using the brotli library which must be installed manually.
+    #
+    # The BrotliDecode specification is not yet available as a standard but will be in the near
+    # future. Therefore it is recommended to wait using it for encoding streams until most of the
+    # PDF ecosystem has support for it.
+    #
+    # See: HexaPDF::Filter
+    module BrotliDecode
+      # See HexaPDF::Filter
+      #
+      # Note that the brotli gem currently doesn't support a streaming decoder. This means that the
+      # whole source must be read and decoded at once.
+      def self.decoder(source, options = nil)
+        fib = Fiber.new do
+          data = Filter.string_from_source(source)
+          data.empty? ? data: Brotli.inflate(data)
+        end
+        if options && options[:Predictor]
+          Predictor.decoder(fib, options)
+        else
+          fib
+        end
+      end
+      # See HexaPDF::Filter
+      #
+      # As with ::decoder a usable streaming encoder is not available.
+      def self.encoder(source, options = nil)
+        if options && options[:Predictor]
+          source = Predictor.encoder(source, options)
+        end
+        Fiber.new do
+          Brotli.deflate(Filter.string_from_source(source),
+                         quality: HexaPDF::GlobalConfiguration['filter.brotli.compression'])
+        end
+      end
+    end
+  end
+end

data/lib/hexapdf/filter.rb CHANGED Viewed

@@ -134,6 +134,7 @@ module HexaPDF
     autoload(:FlateDecode, 'hexapdf/filter/flate_decode')
     autoload(:LZWDecode, 'hexapdf/filter/lzw_decode')
     autoload(:RunLengthDecode, 'hexapdf/filter/run_length_decode')
+    autoload(:BrotliDecode, 'hexapdf/filter/brotli_decode')
     autoload(:Predictor, 'hexapdf/filter/predictor')

data/lib/hexapdf/font/true_type/builder.rb CHANGED Viewed

@@ -48,7 +48,7 @@ module HexaPDF
           entry_selector = tables.length.bit_length - 1
           range_shift = tables.length * 16 - search_range
-          font_data = "\x0\x1\x0\x0".b +
+          font_data = (tables.key?('glyf') ? "\x0\x1\x0\x0" : "OTTO").b +
             [tables.length, search_range, entry_selector, range_shift].pack('n4')
           offset = font_data.length + tables.length * 16

data/lib/hexapdf/font/true_type/font.rb CHANGED Viewed

@@ -35,6 +35,7 @@
 #++
 require 'hexapdf/font/true_type/table'
+require 'hexapdf/font/true_type/builder'
 require 'set'
 module HexaPDF
@@ -84,6 +85,18 @@ module HexaPDF
           @tables = {}
         end
+        # Uses Builder to build a font file for this font.
+        #
+        # The +table_overrides+ argument can be used to supply mappings from table names (in string
+        # form) to raw table data that should override the respective font's tables.
+        def build(table_overrides = {})
+          tables = directory.table_names.each_with_object({}) do |name, hash|
+            hash[name] = self[name.to_sym].raw_data
+          end
+          tables.merge!(table_overrides)
+          Builder.build(tables)
+        end
         # Returns the table instance for the given tag (a symbol), or +nil+ if no such table exists.
         def [](tag)
           return @tables[tag] if @tables.key?(tag)

data/lib/hexapdf/font/true_type/subsetter.rb CHANGED Viewed

@@ -176,9 +176,14 @@ module HexaPDF
         # Adds the components of compound glyphs to the subset.
         def add_glyph_components
           glyf = @font[:glyf]
+          process_glyph_components = lambda do |gid|
+            glyf[gid].components&.each do |cgid|
+              use_glyph(cgid)
+              process_glyph_components.call(cgid) if glyf[cgid].compound?
+            end
+          end
           @glyph_map.keys.each do |gid|
-            next if gid.kind_of?(Symbol)
-            glyf[gid].components&.each {|cgid| use_glyph(cgid) }
+            process_glyph_components.call(gid) unless gid.kind_of?(Symbol)
           end
         end

data/lib/hexapdf/font/true_type/table/directory.rb CHANGED Viewed

@@ -69,6 +69,11 @@ module HexaPDF
             @tables[tag]
           end
+          # Returns an array with all the table names (in string form) in the directory.
+          def table_names
+            @tables.keys
+          end
           private
           def load_from_io #:nodoc:

data/lib/hexapdf/font/true_type.rb CHANGED Viewed

@@ -49,6 +49,7 @@ module HexaPDF
       autoload(:Font, 'hexapdf/font/true_type/font')
       autoload(:Subsetter, 'hexapdf/font/true_type/subsetter')
       autoload(:Optimizer, 'hexapdf/font/true_type/optimizer')
+      autoload(:Builder, 'hexapdf/font/true_type/builder')
     end

data/lib/hexapdf/layout/style.rb CHANGED Viewed

@@ -211,6 +211,8 @@ module HexaPDF
         attr_reader :width
         # The colors of each edge. See Quad.
+        #
+        # See: HexaPDF::Content::ColorSpace.device_color_from_specification
         attr_reader :color
         # The styles of each edge. See Quad.
@@ -897,7 +899,7 @@ module HexaPDF
       #
       # The color used for filling (e.g. text), defaults to black.
       #
-      # See: HexaPDF::Content::Canvas#fill_color
+      # See: HexaPDF::Content::ColorSpace.device_color_from_specification
       #
       # Examples:
       #
@@ -926,7 +928,7 @@ module HexaPDF
       #
       # The color used for stroking (e.g. text outlines), defaults to black.
       #
-      # See: HexaPDF::Content::Canvas#stroke_color
+      # See: HexaPDF::Content::ColorSpace.device_color_from_specification
       #
       # Examples:
       #
@@ -1175,6 +1177,8 @@ module HexaPDF
       #
       # The color used for backgrounds, defaults to +nil+ (i.e. no background).
       #
+      # See: HexaPDF::Content::ColorSpace.device_color_from_specification
+      #
       # Examples:
       #
       #   #>pdf-composer100

data/lib/hexapdf/task/pdfa.rb CHANGED Viewed

@@ -40,6 +40,7 @@ require 'hexapdf/content/parser'
 require 'hexapdf/content/operator'
 require 'hexapdf/type/xref_stream'
 require 'hexapdf/type/object_stream'
+require 'hexapdf/font/true_type'
 module HexaPDF
   module Task
@@ -51,6 +52,13 @@ module HexaPDF
     # * prevents the Standard 14 PDF fonts to be used.
     # * adds an appropriate output intent if none is set.
     # * adds the necessary PDF/A metadata properties.
+    #
+    # Additionally, it applies fixes to the document so that the structures and content of
+    # non-conforming PDFs are corrected. See ::call for more information on the available fixes.
+    #
+    # Note that you should use a PDF/A validation tool like veraPDF (https://verapdf.org/) to ensure
+    # that the resulting files conform to the PDF/A specification because not all documents can be
+    # fixed at the moment.
     module PDFA
       # Performs the necessary tasks to make the document PDF/A compatible.
@@ -58,7 +66,22 @@ module HexaPDF
       # +level+::
       #     Specifies the PDF/A conformance level that should be used. Can be one of the following
       #     strings: 2b, 2u, 3b, 3u.
-      def self.call(doc, level: '3u')
+      #
+      # +fixes+::
+      #     Specifies the fixes that should be applied when converting a non-conforming PDF. If a
+      #     document is created with HexaPDF but also includes parts of loaded documents, this
+      #     argument has to be set to +:all+.
+      #
+      #     Can be +:default+ (which is also the default value), +:all+ or an array with one or more
+      #     fix names.
+      #
+      #     +:default+:: Applies all fixes if the document was loaded from a file. Otherwise applies
+      #         only those fixes necessary for files created with HexaPDF.
+      #
+      #     +:all+:: Applies all available fixes.
+      #
+      #     +:fix_glyph_widths+:: Corrects mismatching width information in fonts.
+      def self.call(doc, level: '3u', fixes: :default)
         unless level.match?(/\A[23][bu]\z/)
           raise ArgumentError, "The given PDF/A conformance level '#{level}' is not supported"
         end
@@ -68,6 +91,15 @@ module HexaPDF
           doc.metadata.property('pdfaid', 'part', part)
           doc.metadata.property('pdfaid', 'conformance', conformance.upcase)
           add_srgb_icc_output_intent(doc) unless doc.catalog.key?(:OutputIntents)
+          fixes = if fixes == :all || (fixes == :default && doc.revisions.parser)
+                    ALL_FIXES
+                  elsif fixes == :default
+                    ALL_FIXES - FIXES_FOR_LOADED_DOCUMENTS
+                  else
+                    fixes
+                  end
+          fixes.each {|fix| send(fix, doc) }
         end
       end
@@ -81,6 +113,81 @@ module HexaPDF
         ]
       end
+      # Names of the methods implementing the available fixes. Frozen so that the shared list
+      # cannot be modified by accident.
+      ALL_FIXES = [:fix_glyph_widths].freeze # :nodoc:
+      # Fixes that are left out when +fixes+ is +:default+ and the document was not loaded from a
+      # file. Frozen for the same reason as ALL_FIXES.
+      FIXES_FOR_LOADED_DOCUMENTS = [:fix_glyph_widths].freeze # :nodoc:
+      # Makes the glyph widths stored in the embedded fonts the same as the ones specified in the
+      # PDF font data structures.
+      #
+      # Note: Currently only handles Type 2 CIDFonts.
+      def self.fix_glyph_widths(doc) # :nodoc:
+        # Step 1: Collect all CIDs together with their respective fonts
+        processor = CIDCollector.new
+        doc.pages.each do |page|
+          page.process_contents(processor)
+          page.each_annotation do |annotation|
+            next unless (appearance = annotation.appearance)
+            appearance.process_contents(processor, original_resources: page.resources)
+          end
+        end
+        # Step 2: Process all found fonts
+        processor.map.each do |font_object, all_cids|
+          next if all_cids.empty?
+          font = HexaPDF::Font::TrueType::Font.new(StringIO.new(font_object.font_file.stream))
+          cid_to_gid = cid_to_gid_mapping(font_object)
+          # Process all found CIDs by comparing their width with the ones defined in the font and
+          # correcting the font if necessary.
+          raw_hmtx = font[:hmtx].raw_data
+          width_conversion_factor = 1000.0 / font[:head].units_per_em
+          all_cids.each do |cid|
+            cid_width = font_object.width(cid)
+            gid = cid_to_gid[cid]
+            gid_width = font[:hmtx][gid].advance_width * width_conversion_factor
+            next if (cid_width - gid_width).abs.round <= 1
+            raw_hmtx[4 * gid, 2] = [(cid_width / width_conversion_factor).round].pack('n')
+          end
+          font_object.font_file.stream = font.build('hmtx' => raw_hmtx)
+        end
+      end
+      # Processes the contents of a stream and collects the CIDs for each composite font.
+      class CIDCollector < HexaPDF::Content::Processor
+        # The mapping from the composite font's descendant font to the set of used CIDs.
+        attr_reader :map
+        def initialize(*) # :nodoc:
+          super
+          @map = Hash.new {|h, k| h[k] = Set.new }
+        end
+        def show_text(data) # :nodoc:
+          font = graphics_state.font
+          return unless font[:Subtype] == :Type0 && font.descendant_font[:Subtype] == :CIDFontType2
+          Array(data).each do |item|
+            next if item.kind_of?(Numeric)
+            @map[font.descendant_font].merge(font.decode(item))
+          end
+        end
+        alias show_text_with_positioning show_text
+      end
+      # Returns an object responding to #[] that maps CIDs to GIDs for Type 2 CIDFonts.
+      def self.cid_to_gid_mapping(font)
+        if font[:CIDToGIDMap] == :Identity
+          proc {|cid| cid }
+        else
+          font[:CIDToGIDMap].stream.unpack('n*')
+        end
+      end
+      private_class_method :cid_to_gid_mapping
     end
   end

data/lib/hexapdf/type/acro_form/form.rb CHANGED Viewed

@@ -412,6 +412,8 @@ module HexaPDF
         #
         # * For radio buttons the value needs to be a String or a Symbol representing the name of
         #   the radio button widget to select.
+        #
+        # * Values for password fields are ignored as they should not be stored in the PDF.
         def fill(data)
           data.each do |field_name, value|
             field = field_by_name(field_name)
@@ -427,6 +429,8 @@ module HexaPDF
                                   when /\A(?:n(o)?|f(alse)?)\z/ then false
                                   else value
                                   end
+            when :password_field
+              # Ignore the value
             else
               raise HexaPDF::Error, "AcroForm field type #{field.concrete_field_type} not yet supported"
             end

data/lib/hexapdf/type/acro_form/text_field.rb CHANGED Viewed

@@ -344,8 +344,10 @@ module HexaPDF
           super
           if self[:V] && !(self[:V].kind_of?(String) || self[:V].kind_of?(HexaPDF::Stream))
-            yield("Text field doesn't contain text but #{self[:V].class} object")
-            return
+            correctable = self[:V].kind_of?(Symbol)
+            yield("Text field doesn't contain text but an object of type #{self[:V].class}", correctable)
+            return unless correctable
+            self[:V] = self[:V].to_s
           end
           if (max_len = self[:MaxLen]) && field_value && field_value.length > max_len
             correctable = true

data/lib/hexapdf/type/annotations/widget.rb CHANGED Viewed

@@ -250,6 +250,15 @@ module HexaPDF
           end
         end
+        private
+        def perform_validation(&block) #:nodoc:
+          super
+          if !key?(:Parent) && (field = form_field) == self
+            field.validate(&block)
+          end
+        end
       end
     end

data/lib/hexapdf/type/document_security_store.rb ADDED Viewed

@@ -0,0 +1,80 @@
+# -*- encoding: utf-8; frozen_string_literal: true -*-
+#
+#--
+# This file is part of HexaPDF.
+#
+# HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
+# Copyright (C) 2014-2025 Thomas Leitner
+#
+# HexaPDF is free software: you can redistribute it and/or modify it
+# under the terms of the GNU Affero General Public License version 3 as
+# published by the Free Software Foundation with the addition of the
+# following permission added to Section 15 as permitted in Section 7(a):
+# FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
+# THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
+# INFRINGEMENT OF THIRD PARTY RIGHTS.
+#
+# HexaPDF is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
+#
+# The interactive user interfaces in modified source and object code
+# versions of HexaPDF must display Appropriate Legal Notices, as required
+# under Section 5 of the GNU Affero General Public License version 3.
+#
+# In accordance with Section 7(b) of the GNU Affero General Public
+# License, a covered work must retain the producer line in every PDF that
+# is created or manipulated using HexaPDF.
+#
+# If the GNU Affero General Public License doesn't fit your need,
+# commercial licenses are available at <https://gettalong.at/hexapdf/>.
+#++
+require 'hexapdf/dictionary'
+module HexaPDF
+  module Type
+    # The document security store (DSS) dictionary contains data needed for verifying digital
+    # signatures.
+    #
+    # See: PDF2.0 s12.8.4.3
+    class DocumentSecurityStore < Dictionary
+      # The validation-related information (VRI) dictionary contains validation information for one
+      # signature. It signifies that the signature has been validated using this information.
+      #
+      # See: PDF2.0 s12.8.4.4
+      class ValidationRelatedInformation < Dictionary
+        define_type :VRI
+        define_field :Type, type: Symbol, default: type
+        define_field :Cert, type: PDFArray
+        define_field :CRL,  type: PDFArray
+        define_field :OCSP, type: PDFArray
+        define_field :TU,   type: PDFDate
+        define_field :TS,   type: Stream
+      end
+      define_type :DSS
+      define_field :Type,  type: Symbol, default: type
+      define_field :VRI,   type: Dictionary
+      define_field :Certs, type: PDFArray
+      define_field :OCSPs, type: PDFArray
+      define_field :CRLs,  type: PDFArray
+      define_field :SW, type: Symbol,   default: :A, allowed_values: [:A, :B, :S, :N]
+      define_field :S,  type: Symbol,   default: :P, allowed_values: [:A, :P]
+      define_field :A,  type: PDFArray, default: [0.5, 0.5]
+      define_field :FB, type: Boolean,  default: false, version: '1.5'
+    end
+  end
+end

data/lib/hexapdf/type/page.rb CHANGED Viewed

@@ -395,6 +395,17 @@ module HexaPDF
         Content::Parser.parse(contents, processor)
       end
+      # Extracts the layouted text from the page.
+      #
+      # See HexaPDF::Content::SmartTextExtractor.layout_text_runs for the available +options+.
+      def extract_text(**options)
+        processor = Content::SmartTextExtractor::TextRunProcessor.new
+        process_contents(processor)
+        box = box(:media)
+        Content::SmartTextExtractor.layout_text_runs(processor.text_runs, box.width, box.height,
+                                                     **options)
+      end
       # Returns the index of the page in the page tree.
       def index
         idx = 0

data/lib/hexapdf/type.rb CHANGED Viewed

@@ -89,6 +89,7 @@ module HexaPDF
     autoload(:MarkedContentReference, 'hexapdf/type/marked_content_reference')
     autoload(:ObjectReference, 'hexapdf/type/object_reference')
     autoload(:Measure, 'hexapdf/type/measure')
+    autoload(:DocumentSecurityStore, 'hexapdf/type/document_security_store')
   end

data/lib/hexapdf/version.rb CHANGED Viewed

@@ -37,6 +37,6 @@
 module HexaPDF
   # The version of HexaPDF.
-  VERSION = '1.6.0'
+  VERSION = '1.7.0'
 end

data/test/data/pdfa/mismatching_glyph_widths_cidfont_type2.pdf ADDED Viewed

Binary file

data/test/hexapdf/content/test_smart_text_extractor.rb ADDED Viewed

@@ -0,0 +1,129 @@
+# -*- encoding: utf-8 -*-
+require 'test_helper'
+require 'hexapdf/content/smart_text_extractor'
+require 'hexapdf/document'
+describe HexaPDF::Content::SmartTextExtractor::TextRunCollector::TextRun do
+  it "has various accessors" do
+    text_run = HexaPDF::Content::SmartTextExtractor::TextRunCollector::TextRun.new('s', 1, 2, 3, 5)
+    assert_equal('s', text_run.string)
+    assert_equal(2, text_run.width)
+    assert_equal(3, text_run.height)
+  end
+end
+describe HexaPDF::Content::SmartTextExtractor::TextRunProcessor do
+  it "turns glyphs into TextRun objects" do
+    processor = HexaPDF::Content::SmartTextExtractor::TextRunProcessor.new
+    doc = HexaPDF::Document.new
+    page = doc.pages.add
+    page.canvas.font('Helvetica', size: 10).
+      text('Te', at: [10, 500]).
+      text_matrix(0.866, -0.5, 0.5, 0.866, 0, 0).
+      text('Te')
+    page.process_contents(processor)
+    assert_equal([['T', 10, 497.75, 16.11, 509.31], ['e', 16.11, 497.75, 21.67, 509.31],
+                  ["T", -1.125, -5.0035, 9.94626, 8.06246],
+                  ["e", 4.16626, -7.7835, 14.761220000000002, 5.00746]],
+                  processor.text_runs.map(&:to_a))
+  end
+end
+describe HexaPDF::Content::SmartTextExtractor do # layout tests using hand-built text runs
+  def text_run(str, left, bottom, right, top) # wraps the string and its box edges in a TextRun
+    HexaPDF::Content::SmartTextExtractor::TextRunCollector::TextRun.new(str, left, bottom, right, top)
+  end
+  def layout_runs(runs, width = 595, height = 842, **options) # turns argument arrays into TextRuns and lays them out
+    runs = runs.map {|args| text_run(*args) }
+    HexaPDF::Content::SmartTextExtractor.layout_text_runs(runs, width, height, **options)
+  end
+  it "works for a page with no text" do
+    assert_equal('', layout_runs([]))
+  end
+  it "works for a single run on the left side of the page" do
+    assert_equal('test', layout_runs([['test', 0, 100, 20, 110]]))
+  end
+  it "works for a single run not on the left side of the page" do
+    assert_equal('test', layout_runs([['test', 50, 100, 70, 110]]))
+  end
+  it "preserves the relative indent" do
+    assert_equal("Hello\n     World", layout_runs([['Hello', 50, 100, 70, 110],
+                                                   ['World', 70, 80, 90, 100]]))
+  end
+  it "combines text runs if they have the same top/bottom and there is less than 1pt between them" do
+    x = +'Hello' # String#+@ guarantees a mutable string
+    assert_equal('HelloWorld', layout_runs([[x, 50, 100, 60, 110],
+                                            ['World', 60, 100, 70, 110]]))
+    assert_equal('HelloWorld', x) # the first run's string object now holds the combined text
+  end
+  it "preserves the space between two runs" do
+    assert_equal('Hello World', layout_runs([['Hello', 50, 100, 70, 110],
+                                             ['World', 72, 100, 92, 110]]))
+    assert_equal('Hello   World', layout_runs([['Hello', 50, 100, 70, 110],
+                                               ['World', 80, 100, 100, 110]]))
+ end
+  it "inserts a space after very narrow text parts if necessary" do
+    assert_equal('Hello World!', layout_runs([['Hello', 50, 100, 60, 110],
+                                              ['World!', 63, 100, 87, 110]]))
+ end
+  it "preserves the visual horizontal ordering of two runs" do
+    assert_equal('Hello World', layout_runs([['World', 72, 100, 92, 110],
+                                             ['Hello', 50, 100, 70, 110]]))
+  end
+  it "preserves the visual vertical ordering of two runs" do
+    assert_equal("Hello\nWorld", layout_runs([['World', 50, 80, 70, 100],
+                                              ['Hello', 50, 100, 70, 110]]))
+  end
+  it "inserts a single blank line between paragraphs" do
+    assert_equal("Hello\nWorld\n\nHere",
+                 layout_runs([['Hello', 50, 100, 70, 110],
+                              ['World', 50, 90, 70, 100],
+                              ['Here', 50, 65, 66, 75]]))
+  end
+  it "inserts multiply lines for large gaps between paragraphs" do
+    assert_equal("Hello\nWorld\nHere\n\n\n\n\n\n\nFoot",
+                 layout_runs([['Hello', 50, 100, 70, 110],
+                              ['World', 50, 90, 70, 100],
+                              ['Here', 50, 80, 70, 90],
+                              ['Foot', 50, 10, 66, 20]]))
+  end
+  it "ignores outliers when calculating the normal line spacing" do
+    assert_equal("Hello\nWorld\n\n\n\nHere",
+                 layout_runs([['Hello', 50, 100, 70, 110],
+                              ['World', 50, 90, 70, 100],
+                              ['Here', 50, 50, 70, 60]]))
+  end
+  it "can use a different line_tolerance_factor" do
+    assert_equal("HelloWorld",
+                 layout_runs([['Hello', 50, 100, 70, 110],
+                              ['World', 50, 90, 70, 100]], line_tolerance_factor: 1))
+  end
+  it "can use a different paragraph_distance_threshold" do
+    assert_equal("Hello\n\nWorld",
+                 layout_runs([['Hello', 50, 100, 70, 110],
+                              ['World', 50, 90, 70, 100]], paragraph_distance_threshold: 1))
+  end
+  it "can use a different large_distance_threshold" do
+    assert_equal("Hello\nWorld\n\nHere",
+                 layout_runs([['Hello', 50, 100, 70, 110],
+                              ['World', 50, 90, 70, 100],
+                              ['Here', 50, 50, 66, 60]], large_distance_threshold: 8))
+  end
+end