RubyGems - hexapdf - Versions diffs - 0.12.3 → 0.14.3 - Mend

hexapdf 0.12.3 → 0.14.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (103) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +132 -0
data/examples/019-acro_form.rb +41 -4
data/lib/hexapdf/cli/command.rb +4 -2
data/lib/hexapdf/cli/image2pdf.rb +2 -1
data/lib/hexapdf/cli/info.rb +51 -2
data/lib/hexapdf/cli/inspect.rb +30 -8
data/lib/hexapdf/cli/merge.rb +1 -1
data/lib/hexapdf/cli/split.rb +74 -14
data/lib/hexapdf/configuration.rb +15 -0
data/lib/hexapdf/content/graphic_object/arc.rb +3 -3
data/lib/hexapdf/dictionary.rb +12 -6
data/lib/hexapdf/dictionary_fields.rb +2 -10
data/lib/hexapdf/document.rb +41 -16
data/lib/hexapdf/document/files.rb +0 -1
data/lib/hexapdf/encryption/fast_arc4.rb +1 -1
data/lib/hexapdf/encryption/security_handler.rb +1 -0
data/lib/hexapdf/encryption/standard_security_handler.rb +1 -0
data/lib/hexapdf/font/cmap.rb +1 -4
data/lib/hexapdf/font/true_type/subsetter.rb +16 -3
data/lib/hexapdf/font/true_type/table/head.rb +1 -0
data/lib/hexapdf/font/true_type/table/os2.rb +2 -0
data/lib/hexapdf/font/true_type/table/post.rb +15 -10
data/lib/hexapdf/font_loader/from_configuration.rb +2 -2
data/lib/hexapdf/font_loader/from_file.rb +18 -8
data/lib/hexapdf/image_loader/png.rb +3 -2
data/lib/hexapdf/importer.rb +3 -2
data/lib/hexapdf/layout/line.rb +1 -1
data/lib/hexapdf/layout/style.rb +23 -23
data/lib/hexapdf/layout/text_layouter.rb +2 -2
data/lib/hexapdf/layout/text_shaper.rb +3 -2
data/lib/hexapdf/object.rb +52 -25
data/lib/hexapdf/parser.rb +107 -7
data/lib/hexapdf/pdf_array.rb +15 -5
data/lib/hexapdf/revisions.rb +29 -21
data/lib/hexapdf/serializer.rb +37 -10
data/lib/hexapdf/task/optimize.rb +6 -4
data/lib/hexapdf/tokenizer.rb +22 -0
data/lib/hexapdf/type/acro_form/appearance_generator.rb +130 -27
data/lib/hexapdf/type/acro_form/button_field.rb +5 -2
data/lib/hexapdf/type/acro_form/choice_field.rb +68 -14
data/lib/hexapdf/type/acro_form/field.rb +35 -5
data/lib/hexapdf/type/acro_form/form.rb +139 -14
data/lib/hexapdf/type/acro_form/text_field.rb +70 -4
data/lib/hexapdf/type/actions/uri.rb +3 -2
data/lib/hexapdf/type/annotations/widget.rb +3 -4
data/lib/hexapdf/type/catalog.rb +2 -2
data/lib/hexapdf/type/cid_font.rb +1 -1
data/lib/hexapdf/type/file_specification.rb +1 -1
data/lib/hexapdf/type/font.rb +1 -1
data/lib/hexapdf/type/font_simple.rb +4 -2
data/lib/hexapdf/type/font_true_type.rb +6 -2
data/lib/hexapdf/type/font_type0.rb +4 -4
data/lib/hexapdf/type/form.rb +6 -2
data/lib/hexapdf/type/image.rb +2 -2
data/lib/hexapdf/type/page.rb +21 -12
data/lib/hexapdf/type/page_tree_node.rb +29 -5
data/lib/hexapdf/type/resources.rb +5 -0
data/lib/hexapdf/type/trailer.rb +2 -3
data/lib/hexapdf/utils/object_hash.rb +0 -1
data/lib/hexapdf/utils/sorted_tree_node.rb +18 -15
data/lib/hexapdf/version.rb +1 -1
data/test/hexapdf/common_tokenizer_tests.rb +2 -2
data/test/hexapdf/content/graphic_object/test_arc.rb +4 -4
data/test/hexapdf/content/test_canvas.rb +3 -3
data/test/hexapdf/content/test_color_space.rb +1 -1
data/test/hexapdf/encryption/test_aes.rb +4 -4
data/test/hexapdf/encryption/test_standard_security_handler.rb +11 -11
data/test/hexapdf/filter/test_ascii85_decode.rb +1 -1
data/test/hexapdf/filter/test_ascii_hex_decode.rb +1 -1
data/test/hexapdf/font/true_type/table/test_post.rb +1 -1
data/test/hexapdf/font/true_type/test_subsetter.rb +10 -0
data/test/hexapdf/font_loader/test_from_configuration.rb +7 -3
data/test/hexapdf/font_loader/test_from_file.rb +7 -0
data/test/hexapdf/layout/test_text_layouter.rb +12 -5
data/test/hexapdf/test_configuration.rb +2 -2
data/test/hexapdf/test_dictionary.rb +8 -1
data/test/hexapdf/test_dictionary_fields.rb +9 -2
data/test/hexapdf/test_document.rb +18 -10
data/test/hexapdf/test_object.rb +71 -26
data/test/hexapdf/test_parser.rb +205 -51
data/test/hexapdf/test_pdf_array.rb +8 -1
data/test/hexapdf/test_revisions.rb +35 -0
data/test/hexapdf/test_serializer.rb +7 -0
data/test/hexapdf/test_tokenizer.rb +28 -0
data/test/hexapdf/test_writer.rb +2 -2
data/test/hexapdf/type/acro_form/test_appearance_generator.rb +288 -35
data/test/hexapdf/type/acro_form/test_button_field.rb +15 -0
data/test/hexapdf/type/acro_form/test_choice_field.rb +92 -9
data/test/hexapdf/type/acro_form/test_field.rb +39 -0
data/test/hexapdf/type/acro_form/test_form.rb +87 -15
data/test/hexapdf/type/acro_form/test_text_field.rb +77 -1
data/test/hexapdf/type/test_font_simple.rb +2 -1
data/test/hexapdf/type/test_font_true_type.rb +6 -0
data/test/hexapdf/type/test_form.rb +8 -1
data/test/hexapdf/type/test_page.rb +8 -1
data/test/hexapdf/type/test_page_tree_node.rb +42 -0
data/test/hexapdf/type/test_resources.rb +6 -0
data/test/hexapdf/utils/test_bit_field.rb +2 -0
data/test/hexapdf/utils/test_object_hash.rb +5 -0
data/test/hexapdf/utils/test_sorted_tree_node.rb +10 -9
data/test/test_helper.rb +2 -0
metadata +6 -12

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 889b4bf1bc77da0a3fdfc62d2b5b09042aa1b5a567d5ed80ae382e6cdeb193f9
-  data.tar.gz: 67f217de3dbd01653e9df4e8f8af7e8dba3745cd772e6d6ab930411ff3d1cfb3
+  metadata.gz: c43d8e9e117db1717ddfee73a54e4384743b8aa35863ab5bd19ffe57b8ce5674
+  data.tar.gz: 1020c8a3de8fcdf201500c1c0d22dfb99ed27daebac7baac92748f8127efc992
 SHA512:
-  metadata.gz: 71affdceb736e0645c45b181a585b3a425135c0b22fba1daf28d89aaa6e73e5226f18a1e420fb75325653c87274f66664526d8ca55baaaa5251b4f822617b986
-  data.tar.gz: 63aceaac41dd2ea797f92e7335a381bea5d1bdd2f7388c583431323e7ac9fae0855a404a84fbed70222130bd5eded126dae5385be2291d76c91021633d03a3bb
+  metadata.gz: e19eea4e88077afb7e8532fa6fe9ab2a03ffc5588749b72277462a971ebcec877ee72868d0ab698744117d46566be98e65c10225649d3bd1b4cd6e64e9625767
+  data.tar.gz: 6626a9feba0af0b46f293c1069a0d53b458a0dc29d08b82253f14f9bb98a878b914042faccc433b73f2f0e35d4da47c58a1bdebd2f3dee2fefb24c076a4e6bb3

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,135 @@
+## 0.14.3 - 2021-02-16
+### Fixed
+* Bug in [HexaPDF::Font::TrueType::Subsetter#use_glyph] which led to corrupt
+  text output
+* [HexaPDF::Serializer] to handle an infinite recursion problem
+* Cross-reference table reconstruction to avoid an O(n^2) performance problem
+* [HexaPDF::Type::Resources] validation to handle an invalid `/ProcSet` entry
+  containing a single value instead of an array
+* Processing of invalid PDF files missing a required value in appearance streams
+* Processing of invalid empty arrays that should be rectangles by converting
+  them to PDF null objects
+* Processing of invalid PDF files containing indirect objects with offset 0
+* Processing of invalid PDF files containing a space/CR or space/LF combination
+  after the 'stream' keyword
+## 0.14.2 - 2021-01-22
+### Fixed
+* [HexaPDF::Font::TrueType::Subsetter#use_glyph] to really avoid using subset
+  glyph ID 41 (`)`)
+## 0.14.1 - 2021-01-21
+### Changed
+* Validation message when checking for allowed values to include the invalid
+  object
+* [HexaPDF::FontLoader::FromFile] to allow (re)using an existing font object
+* [HexaPDF::Importer] internals to avoid problems with retained memory
+### Fixed
+* Parsing of invalid PDF files where whitespace is missing after the integer
+  value of an indirect object
+* [HexaPDF::Dictionary] so that adding new key-value pairs during validation is
+  possible
+## 0.14.0 - 2020-12-30
+### Added
+* Support for creating AcroForm multiline text fields and their appearances
+* Support for creating AcroForm comb text fields and their appearances
+* Support for creating AcroForm password fields and their appearances
+* Support for creating AcroForm file select fields and their appearances
+* Support for creating AcroForm list box appearances
+* [HexaPDF::Type::AcroForm::ChoiceField#list_box_top_index] and its setter
+  method
+* [HexaPDF::Type::AcroForm::ChoiceField#update_widgets] to create appearances if
+  they don't exist
+* Methods for caching data to [HexaPDF::Object]
+* Support for splitting by page size to CLI command `hexapdf split`
+### Changed
+* [HexaPDF::Utils::ObjectHash#oids] to be public instead of private
+* Cross-reference table parsing to handle invalidly numbered main sections
+* [HexaPDF::Document#cache] and [HexaPDF::Object#cache] to allow updating
+  values for existing keys
+* Appearance creation methods of AcroForm objects to allow forcing the creation
+  of new appearances
+* [HexaPDF::Type::AcroForm::AppearanceGenerator#create_text_appearances] to
+  re-use existing form objects
+* AcroForm field creation methods to allow specifying often used field
+  properties
+### Fixed
+* Missing usage of `:sort` flag for AcroForm choice fields
+* Setting the `/I` field for AcroForm list boxes with multiple selection
+* [HexaPDF::Layout::TextLayouter::SimpleLineWrapping] to remove glue items
+  (whitespace) before a hard line break
+* Infinite loop when reconstructing the cross-reference table
+* [HexaPDF::Type::AcroForm::ChoiceField] to support export values for option
+  items
+* AcroForm text field appearance creation to only create a new appearance if the
+  field's value has changed
+* AcroForm choice field appearance creation to only create a new appearance if
+  the involved dictionary fields' values have changed
+* [HexaPDF::Type::AcroForm::ChoiceField#list_box_top_index=] to raise an error
+  if no option items are set
+* [HexaPDF::PDFArray#to_ary] to return an array with preprocessed values
+* [HexaPDF::Type::Form#contents=] to clear cached values to avoid returning e.g.
+  an invalid canvas object later
+* [HexaPDF::Type::AcroForm::ButtonField#update_widgets] to create appearances if
+  they don't exist
+## 0.13.0 - 2020-11-15
+### Added
+* Cross-reference table reconstruction for damaged PDFs, controllable via the
+  new 'parser.try_xref_reconstruction' option
+* Two new `hexapdf inspect` commands for showing page objects and page content
+  streams by page number
+* Flag `--check` to the CLI command `hexapdf info` for checking a file for
+  parse and validation errors
+* [HexaPDF::Type::AcroForm::Field#embedded_widget?] for checking if a widget is
+  embedded in the field object
+* [HexaPDF::Type::AcroForm::Field#delete_widget] for deleting a widget
+* [HexaPDF::PDFArray#delete] for deleting an object from a PDF array
+* [HexaPDF::Type::Page#ancestor_nodes] for retrieving all ancestor page tree
+  nodes of a page
+* [HexaPDF::Type::PageTreeNode#move_page] for moving a page to another index
+### Changed
+* **Breaking change**: Overhauled document/object validation interfaces and
+  internals to be more similar and to allow for reporting of multiple validation
+  problems
+* Validation of TrueType fonts to ignore missing fields if the font name
+  suggests that the font is one of the standard 14 PDF fonts
+* Option `-p` of CLI command `hexapdf image2pdf` to also allow lowercase page
+  size names
+### Fixed
+* Reporting of cross-reference section entry parsing error
+* PDF version used by default for dictionary fields
+* Error in CLI command `hexapdf inspect` when parsing an invalid object number
+* Output of error messages in CLI command `hexapdf inspect` to go to `$stderr`
+* Bug in [HexaPDF::Type::AcroForm::TextField] validation due to missing nil
+  handling
 ## 0.12.3 - 2020-08-22
 ### Changed

data/examples/019-acro_form.rb CHANGED Viewed

@@ -42,10 +42,47 @@ rb = form.create_radio_button("Radio")
 end
 rb.field_value = :button0
-canvas.text("Text field", at: [50, 450])
-tx = form.create_text_field("Single Line")
-widget = tx.create_widget(page, Rect: [200, 445, 500, 465])
-tx.set_default_appearance_string(font_size: 16)
+canvas.text("Text fields", at: [50, 450])
+canvas.text("Single line", at: [70, 420])
+tx = form.create_text_field("Single Line", font_size: 16)
+widget = tx.create_widget(page, Rect: [200, 415, 500, 435])
 tx.field_value = "A sample test string!"
+canvas.text("Multiline", at: [70, 390])
+tx = form.create_multiline_text_field("Multiline", font_size: 0, align: :right)
+widget = tx.create_widget(page, Rect: [200, 325, 500, 405])
+widget.border_style(color: 0, width: 1)
+tx.field_value = "A sample test string! " * 30 + "\nNew line\n\nAnother line"
+canvas.text("Password", at: [70, 300])
+tx = form.create_password_field("Password", font_size: 16)
+widget = tx.create_widget(page, Rect: [200, 295, 500, 315])
+canvas.text("File select", at: [70, 270])
+tx = form.create_file_select_field("File Select", font_size: 16)
+widget = tx.create_widget(page, Rect: [200, 265, 500, 285])
+tx.field_value = "path/to/file.pdf"
+canvas.text("Comb", at: [70, 240])
+tx = form.create_comb_text_field("Comb field", max_chars: 10, font_size: 16, align: :center)
+widget = tx.create_widget(page, Rect: [200, 220, 500, 255])
+widget.border_style(color: [30, 128, 0], width: 1)
+tx.field_value = 'Hello'
+canvas.text("Combo Box", at: [50, 170])
+cb = form.create_combo_box("Combo Box", font_size: 12, editable: true,
+                           option_items: ['Value 1', 'Another value', 'Choose me!'])
+widget = cb.create_widget(page, Rect: [200, 150, 500, 185])
+widget.border_style(width: 1)
+cb.field_value = 'Another value'
+canvas.text("List Box", at: [50, 120])
+lb = form.create_list_box("List Box", font_size: 15, align: :center, multi_select: true,
+                         option_items: 1.upto(7).map {|i| "Value #{i}" })
+widget = lb.create_widget(page, Rect: [200, 50, 500, 135])
+widget.border_style(width: 1)
+lb.list_box_top_index = 1
+lb.field_value = ['Value 6', 'Value 2']
 doc.write('acro_form.pdf', optimize: true)

data/lib/hexapdf/cli/command.rb CHANGED Viewed

@@ -100,6 +100,7 @@ module HexaPDF
       def pdf_options(password)
         hash = {decryption_opts: {password: password}, config: {}}
         HexaPDF::GlobalConfiguration['filter.predictor.strict'] = command_parser.strict
+        hash[:config]['parser.try_xref_reconstruction'] = !command_parser.strict
         hash[:config]['parser.on_correctable_error'] =
           if command_parser.strict
             proc { true }
@@ -277,14 +278,15 @@ module HexaPDF
       #
       # See: #define_encryption_options
       def apply_encryption_options(doc)
-        if @out_options.encryption == :add
+        case @out_options.encryption
+        when :add
           doc.encrypt(algorithm: @out_options.enc_algorithm,
                       key_length: @out_options.enc_key_length,
                       force_v4: @out_options.enc_force_v4,
                       permissions: @out_options.enc_permissions,
                       owner_password: @out_options.enc_owner_pwd,
                       user_password: @out_options.enc_user_pwd)
-        elsif @out_options.encryption == :remove
+        when :remove
           doc.encrypt(name: nil)
         end
       end

data/lib/hexapdf/cli/image2pdf.rb CHANGED Viewed

@@ -64,7 +64,8 @@ module HexaPDF
                            orientation = :landscape
                            page_size.delete_suffix!('-landscape')
                          end
-                         HexaPDF::Type::Page.media_box(page_size.to_sym, orientation: orientation)
+                         page_size = page_size.capitalize.to_sym
+                         HexaPDF::Type::Page.media_box(page_size, orientation: orientation)
                        end
         end
         options.on("--[no-]auto-rotate", "Automatically rotate pages based on image dimesions. " \

data/lib/hexapdf/cli/info.rb CHANGED Viewed

@@ -55,13 +55,21 @@ module HexaPDF
         long_desc(<<~EOF)
           This command extracts information from the Info dictionary of a PDF file as well
           as some other useful information like the used PDF version and encryption information.
+          If the --check option is specified, the PDF file will also be checked for parse and
+          validation errors. And if the process doesn't abort, HexaPDF is still able to handle the
+          file by correcting the errors.
         EOF
+        options.on("--check", "-c", "Check the PDF file for parse errors and validity") do |check|
+          @check_file = check
+        end
         options.on("--password PASSWORD", "-p", String,
                    "The password for decryption. Use - for reading from standard input.") do |pwd|
           @password = (pwd == '-' ? read_password : pwd)
         end
         @password = nil
         @auto_decrypt = true
+        @check_file = false
       end
       def execute(file) #:nodoc:
@@ -79,8 +87,30 @@ module HexaPDF
         options = pdf_options(@password)
         options[:config]['document.auto_decrypt'] = @auto_decrypt
         HexaPDF::Document.open(file, **options) do |doc|
+          if @check_file
+            indirect_object = nil
+            validation_block = lambda do |msg, correctable, object|
+              object = indirect_object unless object.indirect? || object.type == :XXTrailer
+              object_type = if object.type == :XXTrailer
+                              'trailer'
+                            elsif !object.type.to_s.start_with?("XX")
+                              "object type #{object.type} (#{object.oid},#{object.gen})"
+                            else
+                              "object (#{object.oid},#{object.gen})"
+                            end
+              object_type = "sub-object of #{object_type}" if object == indirect_object
+              puts "WARNING: Validation error for #{object_type}: #{msg} " \
+                "#{correctable ? '(correctable)' : ''}"
+            end
+            doc.trailer.validate(auto_correct: true, &validation_block)
+            doc.each(only_current: false, only_loaded: false) do |obj|
+              indirect_object = obj
+              obj.validate(auto_correct: true, &validation_block)
+            end
+          end
           output_line("File name", file)
-          output_line("File size", File.stat(file).size.to_s + " bytes")
+          output_line("File size", File.stat(file).size.to_s << " bytes")
           @auto_decrypt && INFO_KEYS.each do |name|
             next unless doc.trailer.info.key?(name)
             output_line(name.to_s, doc.trailer.info[name].to_s)
@@ -110,10 +140,29 @@ module HexaPDF
         else
           raise
         end
+      rescue HexaPDF::MalformedPDFError => e
+        $stderr.puts "Error: PDF file #{file} is damaged and cannot be recovered"
+        $stderr.puts "       #{e}"
+      end
+      # Use custom options if we are checking the PDF file for errors.
+      def pdf_options(password)
+        if @check_file
+          options = {decryption_opts: {password: password}, config: {}}
+          HexaPDF::GlobalConfiguration['filter.predictor.strict'] = false
+          options[:config]['parser.try_xref_reconstruction'] = true
+          options[:config]['parser.on_correctable_error'] = lambda do |_, msg, pos|
+            puts "WARNING: Parse error at position #{pos}: #{msg}"
+            false
+          end
+          options
+        else
+          super
+        end
       end
       def output_line(header, text) #:nodoc:
-        puts((header + ":").ljust(COLUMN_WIDTH) << text)
+        puts(("#{header}:").ljust(COLUMN_WIDTH) << text)
       end
     end

data/lib/hexapdf/cli/inspect.rb CHANGED Viewed

@@ -122,22 +122,22 @@ module HexaPDF
           case command
           when /^\d+(,\d+)?$/, 'o', 'object'
             arg = (command.start_with?('o') ? data.shift : command)
-            obj = pdf_object_from_string_reference(arg) rescue puts($!.message)
-            if obj.data.stream && command_parser.verbosity_info?
+            obj = pdf_object_from_string_reference(arg) rescue $stderr.puts($!.message)
+            if obj&.data&.stream && command_parser.verbosity_info?
               $stderr.puts("Note: Object also has stream data")
             end
             serialize(obj.value, recursive: false) if obj
           when 'r', 'recursive'
             obj = if (obj = data.shift)
-                    pdf_object_from_string_reference(obj) rescue puts($!.message)
+                    pdf_object_from_string_reference(obj) rescue $stderr.puts($!.message)
                   else
                     @doc.trailer
                   end
             serialize(obj.value, recursive: true) if obj
           when 's', 'stream', 'raw', 'raw-stream'
-            if (obj = pdf_object_from_string_reference(data.shift) rescue puts($!.message)) &&
+            if (obj = pdf_object_from_string_reference(data.shift) rescue $stderr.puts($!.message)) &&
                 obj.kind_of?(HexaPDF::Stream)
               source = (command.start_with?('raw') ? obj.stream_source : obj.stream_decoder)
               while source.alive? && (stream_data = source.resume)
@@ -148,7 +148,7 @@ module HexaPDF
             end
           when 'x', 'xref'
-            if (obj = pdf_object_from_string_reference(data.shift) rescue puts($!.message))
+            if (obj = pdf_object_from_string_reference(data.shift) rescue $stderr.puts($!.message))
               @doc.revisions.reverse_each do |rev|
                 if (xref = rev.xref(obj))
                   puts xref
@@ -178,6 +178,26 @@ module HexaPDF
               puts str
             end
+          when 'po', 'ps'
+            page_number_str = data.shift
+            unless page_number_str
+              $stderr.puts("Error: Missing PAGE argument to #{command}")
+              next
+            end
+            page_number = parse_pages_specification(page_number_str, @doc.pages.count).first&.first
+            unless page_number
+              $stderr.puts("Error: Invalid page number #{page_number_str}")
+              next
+            end
+            page = @doc.pages[page_number]
+            if command.start_with?('ps')
+              $stdout.write(page.contents)
+            else
+              puts "#{page.oid} #{page.gen} obj"
+              serialize(page.value, recursive: false)
+              puts "endobj"
+            end
           when 'pc', 'page-count'
             puts @doc.pages.count
@@ -217,9 +237,9 @@ module HexaPDF
         if str.nil?
           raise "Error: Missing argument object identifier OID[,GEN]"
         elsif !str.match?(/^\d+(,\d+)?$/)
-          raise "Error: Invalid argument: Must be of form OID[,GEN]"
+          raise "Error: Invalid argument: Must be of form OID[,GEN], not '#{str}'"
         elsif !(obj = @doc.object(pdf_reference_from_string(str)))
-          raise "Error: No object with the given object identifier found"
+          raise "Error: No object with the given object identifier '#{str}' found"
         else
           obj
         end
@@ -240,7 +260,7 @@ module HexaPDF
           puts "<<"
           (recursive ? val.sort : val).each do |k, v|
             next if v.nil? || (v.respond_to?(:null?) && v.null?)
-            print '  ' * (indent + 1) + @serializer.serialize_symbol(k) + " "
+            print '%s%s ' % ['  ' * (indent + 1), @serializer.serialize_symbol(k)]
             serialize(v, recursive: recursive, seen: seen, indent: indent + 1)
             puts
           end
@@ -283,6 +303,8 @@ module HexaPDF
         ["c[atalog]", "Print the catalog dictionary"],
         ["t[railer]", "Print the trailer dictionary"],
         ["p[ages] [RANGE]",  "Print information about pages"],
+        ["po PAGE", "Print the page object"],
+        ["ps PAGE", "Print the content stream of the page"],
         ["pc | page-count", "Print the number of pages"],
         ["search REGEXP", "Print objects matching the pattern"],
         ["h[elp]", "Show the help"],

data/lib/hexapdf/cli/merge.rb CHANGED Viewed

@@ -122,7 +122,7 @@ module HexaPDF
         # Assemble pages
         target = (@initial_empty ? HexaPDF::Document.new : @files.first.file)
-        page_tree = target.add(Type: :Pages)
+        page_tree = target.add({Type: :Pages})
         import_pages(page_tree)
         target.catalog[:Pages] = page_tree
         remove_unused_pages(target)

data/lib/hexapdf/cli/split.rb CHANGED Viewed

@@ -44,16 +44,28 @@ module HexaPDF
       def initialize #:nodoc:
         super('split', takes_commands: false)
-        short_desc("Split a PDF file into individual pages")
+        short_desc("Split a PDF file")
         long_desc(<<~EOF)
-          If no OUTPUT_SPEC is specified, the pages are named <PDF>_0001.pdf, <PDF>_0002.pdf, ...
-          and so on. To specify a custom name, provide the OUTPUT_SPEC argument. It can contain a
-          printf-style format definition like '%04d' to specify the place where the page number
-          should be inserted.
+          The default strategy is to split a PDF into individual pages, i.e. splitting is done by
+          page number. It is also possible to split by page size where pages with the same page size
+          get put into the same output PDF.
+          If no OUTPUT_SPEC is specified, the resulting PDF files are named <PDF>_0001.pdf,
+          <PDF>_0002.pdf, ... when splitting by page number and <PDF>_A4.pdf, <PDF>_Letter.pdf, ...
+          when splitting by page size.
+          To specify a custom name, provide the OUTPUT_SPEC argument. It can contain a printf-style
+          format definition like '%04d' to specify the place where the page number should be
+          inserted. In case of splitting by page size, the place of the format defintion is replaced
+          with the name of the page size, e.g. A4 or Letter.
           The optimization and encryption options are applied to each created output file.
         EOF
+        options.on("--strategy STRATEGY", "-s", [:page_number, :page_size], "Defines how the PDF " \
+                   "file should be split: page_number or page_size (default: page_number)") do |s|
+          @strategy = s
+        end
         options.on("--password PASSWORD", "-p", String,
                    "The password for decryption. Use - for reading from standard input.") do |pwd|
           @password = (pwd == '-' ? read_password : pwd)
@@ -62,23 +74,71 @@ module HexaPDF
         define_encryption_options
         @password = nil
+        @strategy = :page_number
       end
       def execute(pdf, output_spec = pdf.sub(/\.pdf$/i, '_%04d.pdf')) #:nodoc:
-        output_spec = output_spec.sub('%', '%<page>')
         with_document(pdf, password: @password) do |doc|
-          doc.pages.each_with_index do |page, index|
-            output_file = sprintf(output_spec, page: index + 1)
-            maybe_raise_on_existing_file(output_file)
-            out = HexaPDF::Document.new
-            out.pages.add(out.import(page))
-            apply_encryption_options(out)
-            apply_optimization_options(out)
-            write_document(out, output_file)
+          if @strategy == :page_number
+            split_by_page_number(doc, output_spec)
+          else
+            split_by_page_size(doc, output_spec)
           end
         end
       end
+      private
+      # Splits the document into individual pages.
+      def split_by_page_number(doc, output_spec)
+        doc.pages.each_with_index do |page, index|
+          output_file = sprintf(output_spec, index + 1)
+          maybe_raise_on_existing_file(output_file)
+          out = HexaPDF::Document.new
+          out.pages.add(out.import(page))
+          apply_encryption_options(out)
+          apply_optimization_options(out)
+          write_document(out, output_file)
+        end
+      end
+      # Splits the document into files based on the page sizes.
+      def split_by_page_size(doc, output_spec)
+        output_spec = output_spec.sub(/%.*?[a-zA-Z]/, '%s')
+        out_files = Hash.new do |hash, key|
+          output_file = sprintf(output_spec, key)
+          maybe_raise_on_existing_file(output_file)
+          out = HexaPDF::Document.new
+          out.config['output_file'] = output_file
+          hash[key] = out
+        end
+        doc.pages.each do |page|
+          out = out_files[page_size_name(page.box(:media).value)]
+          out.pages.add(out.import(page))
+        end
+        out_files.each_value do |out|
+          apply_encryption_options(out)
+          apply_optimization_options(out)
+          write_document(out, out.config['output_file'])
+        end
+      end
+      # Tries to retrieve a page size name based on the media box. If this is not possible, the
+      # returned page size name consists of width x height.
+      def page_size_name(media_box)
+        @page_name_cache ||= {}
+        return @page_name_cache[media_box] if @page_name_cache.key?(media_box)
+        paper_size = HexaPDF::Type::Page::PAPER_SIZE.find do |_name, box|
+          box.each_with_index.all? {|entry, index| (entry - media_box[index]).abs < 5 }
+        end
+        @page_name_cache[media_box] =
+          paper_size ? paper_size[0] : "%.0fx%.0f" % media_box.values_at(2, 3)
+      end
     end
   end