RubyGems - hexapdf - Versions diffs - 0.12.1 → 0.14.1 - Mend

hexapdf 0.12.1 → 0.14.1

Files changed (102) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +130 -0
data/examples/019-acro_form.rb +41 -4
data/lib/hexapdf/cli/command.rb +4 -2
data/lib/hexapdf/cli/image2pdf.rb +2 -1
data/lib/hexapdf/cli/info.rb +51 -2
data/lib/hexapdf/cli/inspect.rb +30 -8
data/lib/hexapdf/cli/merge.rb +1 -1
data/lib/hexapdf/cli/split.rb +74 -14
data/lib/hexapdf/configuration.rb +15 -0
data/lib/hexapdf/content/graphic_object/arc.rb +3 -3
data/lib/hexapdf/content/parser.rb +1 -1
data/lib/hexapdf/dictionary.rb +9 -6
data/lib/hexapdf/dictionary_fields.rb +1 -9
data/lib/hexapdf/document.rb +41 -16
data/lib/hexapdf/document/files.rb +0 -1
data/lib/hexapdf/encryption/fast_arc4.rb +1 -1
data/lib/hexapdf/encryption/security_handler.rb +1 -0
data/lib/hexapdf/encryption/standard_security_handler.rb +1 -0
data/lib/hexapdf/font/cmap.rb +1 -4
data/lib/hexapdf/font/true_type/subsetter.rb +12 -3
data/lib/hexapdf/font/true_type/table/head.rb +1 -0
data/lib/hexapdf/font/true_type/table/os2.rb +2 -0
data/lib/hexapdf/font/true_type/table/post.rb +15 -10
data/lib/hexapdf/font_loader/from_configuration.rb +2 -2
data/lib/hexapdf/font_loader/from_file.rb +18 -8
data/lib/hexapdf/image_loader/png.rb +3 -2
data/lib/hexapdf/importer.rb +3 -2
data/lib/hexapdf/layout/line.rb +1 -1
data/lib/hexapdf/layout/style.rb +23 -23
data/lib/hexapdf/layout/text_layouter.rb +2 -2
data/lib/hexapdf/layout/text_shaper.rb +3 -2
data/lib/hexapdf/object.rb +52 -25
data/lib/hexapdf/parser.rb +96 -4
data/lib/hexapdf/pdf_array.rb +12 -5
data/lib/hexapdf/revisions.rb +29 -21
data/lib/hexapdf/serializer.rb +34 -8
data/lib/hexapdf/task/optimize.rb +6 -4
data/lib/hexapdf/tokenizer.rb +4 -3
data/lib/hexapdf/type/acro_form/appearance_generator.rb +132 -28
data/lib/hexapdf/type/acro_form/button_field.rb +21 -13
data/lib/hexapdf/type/acro_form/choice_field.rb +68 -14
data/lib/hexapdf/type/acro_form/field.rb +35 -5
data/lib/hexapdf/type/acro_form/form.rb +139 -14
data/lib/hexapdf/type/acro_form/text_field.rb +70 -4
data/lib/hexapdf/type/actions/uri.rb +3 -2
data/lib/hexapdf/type/annotations/widget.rb +3 -4
data/lib/hexapdf/type/catalog.rb +2 -2
data/lib/hexapdf/type/cid_font.rb +1 -1
data/lib/hexapdf/type/file_specification.rb +1 -1
data/lib/hexapdf/type/font.rb +1 -1
data/lib/hexapdf/type/font_simple.rb +4 -2
data/lib/hexapdf/type/font_true_type.rb +6 -2
data/lib/hexapdf/type/font_type0.rb +4 -4
data/lib/hexapdf/type/form.rb +15 -2
data/lib/hexapdf/type/image.rb +2 -2
data/lib/hexapdf/type/page.rb +37 -13
data/lib/hexapdf/type/page_tree_node.rb +29 -5
data/lib/hexapdf/type/resources.rb +1 -0
data/lib/hexapdf/type/trailer.rb +2 -3
data/lib/hexapdf/utils/object_hash.rb +0 -1
data/lib/hexapdf/utils/sorted_tree_node.rb +18 -15
data/lib/hexapdf/version.rb +1 -1
data/test/hexapdf/common_tokenizer_tests.rb +6 -1
data/test/hexapdf/content/graphic_object/test_arc.rb +4 -4
data/test/hexapdf/content/test_canvas.rb +3 -3
data/test/hexapdf/content/test_color_space.rb +1 -1
data/test/hexapdf/encryption/test_aes.rb +4 -4
data/test/hexapdf/encryption/test_standard_security_handler.rb +11 -11
data/test/hexapdf/filter/test_ascii85_decode.rb +1 -1
data/test/hexapdf/filter/test_ascii_hex_decode.rb +1 -1
data/test/hexapdf/font/true_type/table/test_post.rb +1 -1
data/test/hexapdf/font/true_type/test_subsetter.rb +5 -0
data/test/hexapdf/font_loader/test_from_configuration.rb +7 -3
data/test/hexapdf/font_loader/test_from_file.rb +7 -0
data/test/hexapdf/layout/test_style.rb +1 -1
data/test/hexapdf/layout/test_text_layouter.rb +12 -5
data/test/hexapdf/test_configuration.rb +2 -2
data/test/hexapdf/test_dictionary.rb +8 -1
data/test/hexapdf/test_dictionary_fields.rb +2 -2
data/test/hexapdf/test_document.rb +18 -10
data/test/hexapdf/test_object.rb +71 -26
data/test/hexapdf/test_parser.rb +171 -53
data/test/hexapdf/test_pdf_array.rb +8 -1
data/test/hexapdf/test_revisions.rb +35 -0
data/test/hexapdf/test_writer.rb +2 -2
data/test/hexapdf/type/acro_form/test_appearance_generator.rb +296 -38
data/test/hexapdf/type/acro_form/test_button_field.rb +22 -2
data/test/hexapdf/type/acro_form/test_choice_field.rb +92 -9
data/test/hexapdf/type/acro_form/test_field.rb +39 -0
data/test/hexapdf/type/acro_form/test_form.rb +87 -15
data/test/hexapdf/type/acro_form/test_text_field.rb +77 -1
data/test/hexapdf/type/test_font_simple.rb +2 -1
data/test/hexapdf/type/test_font_true_type.rb +6 -0
data/test/hexapdf/type/test_form.rb +26 -1
data/test/hexapdf/type/test_page.rb +45 -7
data/test/hexapdf/type/test_page_tree_node.rb +42 -0
data/test/hexapdf/utils/test_bit_field.rb +2 -0
data/test/hexapdf/utils/test_object_hash.rb +5 -0
data/test/hexapdf/utils/test_sorted_tree_node.rb +10 -9
data/test/test_helper.rb +2 -0
metadata +6 -11

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: a0354d0b129396ae7c479e806b3115f0be7a7f7d74d47df13f9ad82e5f93df50
-  data.tar.gz: 15f8a80efbffea1724ffdd54bc2f204816def3980ba59986336b742eed3ce6b3
+  metadata.gz: e4010e277168cec5c8cc5d584ec324064461e63756d18b538cd335235fe04e6d
+  data.tar.gz: 2b7a71463082a32605adee682c81cdde6b0eb48d360ca66249b08884f82e571b
 SHA512:
-  metadata.gz: dc132b215e6d6bd2ab6684e269c559e1d55301403c2c035e32eb7fc213572b86e993f1307c219f6e0aec6bbd8da36ab6d10146c82b89e52f68ca5c90949ef819
-  data.tar.gz: 9bd6252238d418844de0d9e08250dc6b7d8a620fc1e4f232bea6870986764ced86dadc0e4794512ae6abc347a6cf95569b5af7ff35c54833d57b2c2513a0d987
+  metadata.gz: 5748273dc4dc532cd365598e25c4a9cc5872011d2eb638c2986050aeed0a68d2dc5769fda075eb60cbcb76fccbfb1a5b52c3c58581cb6e969978c17d770013e6
+  data.tar.gz: 0ab3abf80967804486fa1f50f186b508fd792acfbd8c47646fa7d0c5b0245161e2833620142b2f05a1ee73b01145016dca7bf7781d579284160c9d2dd2c78d0c

data/CHANGELOG.md CHANGED

@@ -1,3 +1,133 @@
+## 0.14.1 - 2021-01-21
+### Changed
+* Validation message when checking for allowed values to include the invalid
+  object
+* [HexaPDF::FontLoader::FromFile] to allow (re)using an existing font object
+* [HexaPDF::Importer] internals to avoid problems with retained memory
+### Fixed
+* Parsing of invalid PDF files where whitespace is missing after the integer
+  value of an indirect object
+* [HexaPDF::Dictionary] so that adding new key-value pairs during validation is
+  possible
+## 0.14.0 - 2020-12-30
+### Added
+* Support for creating AcroForm multiline text fields and their appearances
+* Support for creating AcroForm comb text fields and their appearances
+* Support for creating AcroForm password fields and their appearances
+* Support for creating AcroForm file select fields and their appearances
+* Support for creating AcroForm list box appearances
+* [HexaPDF::Type::AcroForm::ChoiceField#list_box_top_index] and its setter
+  method
+* [HexaPDF::Type::AcroForm::ChoiceField#update_widgets] to create appearances if
+  they don't exist
+* Methods for caching data to [HexaPDF::Object]
+* Support for splitting by page size to CLI command `hexapdf split`
+### Changed
+* [HexaPDF::Utils::ObjectHash#oids] to be public instead of private
+* Cross-reference table parsing to handle invalidly numbered main sections
+* [HexaPDF::Document#cache] and [HexaPDF::Object#cache] to allow updating
+  values for existing keys
+* Appearance creation methods of AcroForm objects to allow forcing the creation
+  of new appearances
+* [HexaPDF::Type::AcroForm::AppearanceGenerator#create_text_appearances] to
+  re-use existing form objects
+* AcroForm field creation methods to allow specifying often used field
+  properties
+### Fixed
+* Missing usage of `:sort` flag for AcroForm choice fields
+* Setting the `/I` field for AcroForm list boxes with multiple selection
+* [HexaPDF::Layout::TextLayouter::SimpleLineWrapping] to remove glue items
+  (whitespace) before a hard line break
+* Infinite loop when reconstructing the cross-reference table
+* [HexaPDF::Type::AcroForm::ChoiceField] to support export values for option
+  items
+* AcroForm text field appearance creation to only create a new appearance if the
+  field's value has changed
+* AcroForm choice field appearance creation to only create a new appearance if
+  the involved dictionary fields' values have changed
+* [HexaPDF::Type::AcroForm::ChoiceField#list_box_top_index=] to raise an error
+  if no option items are set
+* [HexaPDF::PDFArray#to_ary] to return an array with preprocessed values
+* [HexaPDF::Type::Form#contents=] to clear cached values to avoid returning e.g.
+  an invalid canvas object later
+* [HexaPDF::Type::AcroForm::ButtonField#update_widgets] to create appearances if
+  they don't exist
+## 0.13.0 - 2020-11-15
+### Added
+* Cross-reference table reconstruction for damaged PDFs, controllable via the
+  new 'parser.try_xref_reconstruction' option
+* Two new `hexapdf inspect` commands for showing page objects and page content
+  streams by page number
+* Flag `--check` to the CLI command `hexapdf info` for checking a file for
+  parse and validation errors
+* [HexaPDF::Type::AcroForm::Field#embedded_widget?] for checking if a widget is
+  embedded in the field object
+* [HexaPDF::Type::AcroForm::Field#delete_widget] for deleting a widget
+* [HexaPDF::PDFArray#delete] for deleting an object from a PDF array
+* [HexaPDF::Type::Page#ancestor_nodes] for retrieving all ancestor page tree
+  nodes of a page
+* [HexaPDF::Type::PageTreeNode#move_page] for moving a page to another index
+### Changed
+* **Breaking change**: Overhauled document/object validation interfaces and
+  internals to be more similar and to allow for reporting of multiple validation
+  problems
+* Validation of TrueType fonts to ignore missing fields if the font name
+  suggests that the font is one of the standard 14 PDF fonts
+* Option `-p` of CLI command `hexapdf image2pdf` to also allow lowercase page
+  size names
+### Fixed
+* Reporting of cross-reference section entry parsing error
+* PDF version used by default for dictionary fields
+* Error in CLI command `hexapdf inspect` when parsing an invalid object number
+* Output of error messages in CLI command `hexapdf inspect` to go to `$stderr`
+* Bug in [HexaPDF::Type::AcroForm::TextField] validation due to missing nil
+  handling
+## 0.12.3 - 2020-08-22
+### Changed
+* Allow any object responding to `#to_sym` when setting a radio button value
+### Fixed
+* Error in the AcroForm appearance generator for text fields when the font is
+  not found in the default resources
+* Parsing of long numbers when reading a file from IO
+* Usage of unsupported method for Ruby 2.4 so that all tests pass again
+## 0.12.2 - 2020-08-17
+### Fixed
+- Wrong origin for page canvases when bottom left corner of media box doesn't
+  coincide with origin of coordinate system
+- Wrong origin for Form XObject canvas when bottom left corner of bounding box
+  doesn't coincide with origin of coordinate system
 ## 0.12.1 - 2020-08-16
 ### Added

data/examples/019-acro_form.rb CHANGED

@@ -42,10 +42,47 @@ rb = form.create_radio_button("Radio")
 end
 rb.field_value = :button0
-canvas.text("Text field", at: [50, 450])
-tx = form.create_text_field("Single Line")
-widget = tx.create_widget(page, Rect: [200, 445, 500, 465])
-tx.set_default_appearance_string(font_size: 16)
+canvas.text("Text fields", at: [50, 450])
+canvas.text("Single line", at: [70, 420])
+tx = form.create_text_field("Single Line", font_size: 16)
+widget = tx.create_widget(page, Rect: [200, 415, 500, 435])
 tx.field_value = "A sample test string!"
+canvas.text("Multiline", at: [70, 390])
+tx = form.create_multiline_text_field("Multiline", font_size: 0, align: :right)
+widget = tx.create_widget(page, Rect: [200, 325, 500, 405])
+widget.border_style(color: 0, width: 1)
+tx.field_value = "A sample test string! " * 30 + "\nNew line\n\nAnother line"
+canvas.text("Password", at: [70, 300])
+tx = form.create_password_field("Password", font_size: 16)
+widget = tx.create_widget(page, Rect: [200, 295, 500, 315])
+canvas.text("File select", at: [70, 270])
+tx = form.create_file_select_field("File Select", font_size: 16)
+widget = tx.create_widget(page, Rect: [200, 265, 500, 285])
+tx.field_value = "path/to/file.pdf"
+canvas.text("Comb", at: [70, 240])
+tx = form.create_comb_text_field("Comb field", max_chars: 10, font_size: 16, align: :center)
+widget = tx.create_widget(page, Rect: [200, 220, 500, 255])
+widget.border_style(color: [30, 128, 0], width: 1)
+tx.field_value = 'Hello'
+canvas.text("Combo Box", at: [50, 170])
+cb = form.create_combo_box("Combo Box", font_size: 12, editable: true,
+                           option_items: ['Value 1', 'Another value', 'Choose me!'])
+widget = cb.create_widget(page, Rect: [200, 150, 500, 185])
+widget.border_style(width: 1)
+cb.field_value = 'Another value'
+canvas.text("List Box", at: [50, 120])
+lb = form.create_list_box("List Box", font_size: 15, align: :center, multi_select: true,
+                         option_items: 1.upto(7).map {|i| "Value #{i}" })
+widget = lb.create_widget(page, Rect: [200, 50, 500, 135])
+widget.border_style(width: 1)
+lb.list_box_top_index = 1
+lb.field_value = ['Value 6', 'Value 2']
 doc.write('acro_form.pdf', optimize: true)

data/lib/hexapdf/cli/command.rb CHANGED

@@ -100,6 +100,7 @@ module HexaPDF
       def pdf_options(password)
         hash = {decryption_opts: {password: password}, config: {}}
         HexaPDF::GlobalConfiguration['filter.predictor.strict'] = command_parser.strict
+        hash[:config]['parser.try_xref_reconstruction'] = !command_parser.strict
         hash[:config]['parser.on_correctable_error'] =
           if command_parser.strict
             proc { true }
@@ -277,14 +278,15 @@ module HexaPDF
       #
       # See: #define_encryption_options
       def apply_encryption_options(doc)
-        if @out_options.encryption == :add
+        case @out_options.encryption
+        when :add
           doc.encrypt(algorithm: @out_options.enc_algorithm,
                       key_length: @out_options.enc_key_length,
                       force_v4: @out_options.enc_force_v4,
                       permissions: @out_options.enc_permissions,
                       owner_password: @out_options.enc_owner_pwd,
                       user_password: @out_options.enc_user_pwd)
-        elsif @out_options.encryption == :remove
+        when :remove
           doc.encrypt(name: nil)
         end
       end

data/lib/hexapdf/cli/image2pdf.rb CHANGED

@@ -64,7 +64,8 @@ module HexaPDF
                            orientation = :landscape
                            page_size.delete_suffix!('-landscape')
                          end
-                         HexaPDF::Type::Page.media_box(page_size.to_sym, orientation: orientation)
+                         page_size = page_size.capitalize.to_sym
+                         HexaPDF::Type::Page.media_box(page_size, orientation: orientation)
                        end
         end
         options.on("--[no-]auto-rotate", "Automatically rotate pages based on image dimensions. " \

data/lib/hexapdf/cli/info.rb CHANGED

@@ -55,13 +55,21 @@ module HexaPDF
         long_desc(<<~EOF)
           This command extracts information from the Info dictionary of a PDF file as well
           as some other useful information like the used PDF version and encryption information.
+          If the --check option is specified, the PDF file will also be checked for parse and
+          validation errors. And if the process doesn't abort, HexaPDF is still able to handle the
+          file by correcting the errors.
         EOF
+        options.on("--check", "-c", "Check the PDF file for parse errors and validity") do |check|
+          @check_file = check
+        end
         options.on("--password PASSWORD", "-p", String,
                    "The password for decryption. Use - for reading from standard input.") do |pwd|
           @password = (pwd == '-' ? read_password : pwd)
         end
         @password = nil
         @auto_decrypt = true
+        @check_file = false
       end
       def execute(file) #:nodoc:
@@ -79,8 +87,30 @@ module HexaPDF
         options = pdf_options(@password)
         options[:config]['document.auto_decrypt'] = @auto_decrypt
         HexaPDF::Document.open(file, **options) do |doc|
+          if @check_file
+            indirect_object = nil
+            validation_block = lambda do |msg, correctable, object|
+              object = indirect_object unless object.indirect? || object.type == :XXTrailer
+              object_type = if object.type == :XXTrailer
+                              'trailer'
+                            elsif !object.type.to_s.start_with?("XX")
+                              "object type #{object.type} (#{object.oid},#{object.gen})"
+                            else
+                              "object (#{object.oid},#{object.gen})"
+                            end
+              object_type = "sub-object of #{object_type}" if object == indirect_object
+              puts "WARNING: Validation error for #{object_type}: #{msg} " \
+                "#{correctable ? '(correctable)' : ''}"
+            end
+            doc.trailer.validate(auto_correct: true, &validation_block)
+            doc.each(only_current: false, only_loaded: false) do |obj|
+              indirect_object = obj
+              obj.validate(auto_correct: true, &validation_block)
+            end
+          end
           output_line("File name", file)
-          output_line("File size", File.stat(file).size.to_s + " bytes")
+          output_line("File size", File.stat(file).size.to_s << " bytes")
           @auto_decrypt && INFO_KEYS.each do |name|
             next unless doc.trailer.info.key?(name)
             output_line(name.to_s, doc.trailer.info[name].to_s)
@@ -110,10 +140,29 @@ module HexaPDF
         else
           raise
         end
+      rescue HexaPDF::MalformedPDFError => e
+        $stderr.puts "Error: PDF file #{file} is damaged and cannot be recovered"
+        $stderr.puts "       #{e}"
+      end
+      # Use custom options if we are checking the PDF file for errors.
+      def pdf_options(password)
+        if @check_file
+          options = {decryption_opts: {password: password}, config: {}}
+          HexaPDF::GlobalConfiguration['filter.predictor.strict'] = false
+          options[:config]['parser.try_xref_reconstruction'] = true
+          options[:config]['parser.on_correctable_error'] = lambda do |_, msg, pos|
+            puts "WARNING: Parse error at position #{pos}: #{msg}"
+            false
+          end
+          options
+        else
+          super
+        end
       end
       def output_line(header, text) #:nodoc:
-        puts((header + ":").ljust(COLUMN_WIDTH) << text)
+        puts(("#{header}:").ljust(COLUMN_WIDTH) << text)
       end
     end

data/lib/hexapdf/cli/inspect.rb CHANGED

@@ -122,22 +122,22 @@ module HexaPDF
           case command
           when /^\d+(,\d+)?$/, 'o', 'object'
             arg = (command.start_with?('o') ? data.shift : command)
-            obj = pdf_object_from_string_reference(arg) rescue puts($!.message)
-            if obj.data.stream && command_parser.verbosity_info?
+            obj = pdf_object_from_string_reference(arg) rescue $stderr.puts($!.message)
+            if obj&.data&.stream && command_parser.verbosity_info?
               $stderr.puts("Note: Object also has stream data")
             end
             serialize(obj.value, recursive: false) if obj
           when 'r', 'recursive'
             obj = if (obj = data.shift)
-                    pdf_object_from_string_reference(obj) rescue puts($!.message)
+                    pdf_object_from_string_reference(obj) rescue $stderr.puts($!.message)
                   else
                     @doc.trailer
                   end
             serialize(obj.value, recursive: true) if obj
           when 's', 'stream', 'raw', 'raw-stream'
-            if (obj = pdf_object_from_string_reference(data.shift) rescue puts($!.message)) &&
+            if (obj = pdf_object_from_string_reference(data.shift) rescue $stderr.puts($!.message)) &&
                 obj.kind_of?(HexaPDF::Stream)
               source = (command.start_with?('raw') ? obj.stream_source : obj.stream_decoder)
               while source.alive? && (stream_data = source.resume)
@@ -148,7 +148,7 @@ module HexaPDF
             end
           when 'x', 'xref'
-            if (obj = pdf_object_from_string_reference(data.shift) rescue puts($!.message))
+            if (obj = pdf_object_from_string_reference(data.shift) rescue $stderr.puts($!.message))
               @doc.revisions.reverse_each do |rev|
                 if (xref = rev.xref(obj))
                   puts xref
@@ -178,6 +178,26 @@ module HexaPDF
               puts str
             end
+          when 'po', 'ps'
+            page_number_str = data.shift
+            unless page_number_str
+              $stderr.puts("Error: Missing PAGE argument to #{command}")
+              next
+            end
+            page_number = parse_pages_specification(page_number_str, @doc.pages.count).first&.first
+            unless page_number
+              $stderr.puts("Error: Invalid page number #{page_number_str}")
+              next
+            end
+            page = @doc.pages[page_number]
+            if command.start_with?('ps')
+              $stdout.write(page.contents)
+            else
+              puts "#{page.oid} #{page.gen} obj"
+              serialize(page.value, recursive: false)
+              puts "endobj"
+            end
           when 'pc', 'page-count'
             puts @doc.pages.count
@@ -217,9 +237,9 @@ module HexaPDF
         if str.nil?
           raise "Error: Missing argument object identifier OID[,GEN]"
         elsif !str.match?(/^\d+(,\d+)?$/)
-          raise "Error: Invalid argument: Must be of form OID[,GEN]"
+          raise "Error: Invalid argument: Must be of form OID[,GEN], not '#{str}'"
         elsif !(obj = @doc.object(pdf_reference_from_string(str)))
-          raise "Error: No object with the given object identifier found"
+          raise "Error: No object with the given object identifier '#{str}' found"
         else
           obj
         end
@@ -240,7 +260,7 @@ module HexaPDF
           puts "<<"
           (recursive ? val.sort : val).each do |k, v|
             next if v.nil? || (v.respond_to?(:null?) && v.null?)
-            print '  ' * (indent + 1) + @serializer.serialize_symbol(k) + " "
+            print '%s%s ' % ['  ' * (indent + 1), @serializer.serialize_symbol(k)]
             serialize(v, recursive: recursive, seen: seen, indent: indent + 1)
             puts
           end
@@ -283,6 +303,8 @@ module HexaPDF
         ["c[atalog]", "Print the catalog dictionary"],
         ["t[railer]", "Print the trailer dictionary"],
         ["p[ages] [RANGE]",  "Print information about pages"],
+        ["po PAGE", "Print the page object"],
+        ["ps PAGE", "Print the content stream of the page"],
         ["pc | page-count", "Print the number of pages"],
         ["search REGEXP", "Print objects matching the pattern"],
         ["h[elp]", "Show the help"],

data/lib/hexapdf/cli/merge.rb CHANGED

@@ -122,7 +122,7 @@ module HexaPDF
         # Assemble pages
         target = (@initial_empty ? HexaPDF::Document.new : @files.first.file)
-        page_tree = target.add(Type: :Pages)
+        page_tree = target.add({Type: :Pages})
         import_pages(page_tree)
         target.catalog[:Pages] = page_tree
         remove_unused_pages(target)

data/lib/hexapdf/cli/split.rb CHANGED

@@ -44,16 +44,28 @@ module HexaPDF
       def initialize #:nodoc:
         super('split', takes_commands: false)
-        short_desc("Split a PDF file into individual pages")
+        short_desc("Split a PDF file")
         long_desc(<<~EOF)
-          If no OUTPUT_SPEC is specified, the pages are named <PDF>_0001.pdf, <PDF>_0002.pdf, ...
-          and so on. To specify a custom name, provide the OUTPUT_SPEC argument. It can contain a
-          printf-style format definition like '%04d' to specify the place where the page number
-          should be inserted.
+          The default strategy is to split a PDF into individual pages, i.e. splitting is done by
+          page number. It is also possible to split by page size where pages with the same page size
+          get put into the same output PDF.
+          If no OUTPUT_SPEC is specified, the resulting PDF files are named <PDF>_0001.pdf,
+          <PDF>_0002.pdf, ... when splitting by page number and <PDF>_A4.pdf, <PDF>_Letter.pdf, ...
+          when splitting by page size.
+          To specify a custom name, provide the OUTPUT_SPEC argument. It can contain a printf-style
+          format definition like '%04d' to specify the place where the page number should be
+          inserted. In case of splitting by page size, the place of the format definition is replaced
+          with the name of the page size, e.g. A4 or Letter.
           The optimization and encryption options are applied to each created output file.
         EOF
+        options.on("--strategy STRATEGY", "-s", [:page_number, :page_size], "Defines how the PDF " \
+                   "file should be split: page_number or page_size (default: page_number)") do |s|
+          @strategy = s
+        end
         options.on("--password PASSWORD", "-p", String,
                    "The password for decryption. Use - for reading from standard input.") do |pwd|
           @password = (pwd == '-' ? read_password : pwd)
@@ -62,23 +74,71 @@ module HexaPDF
         define_encryption_options
         @password = nil
+        @strategy = :page_number
       end
       def execute(pdf, output_spec = pdf.sub(/\.pdf$/i, '_%04d.pdf')) #:nodoc:
-        output_spec = output_spec.sub('%', '%<page>')
         with_document(pdf, password: @password) do |doc|
-          doc.pages.each_with_index do |page, index|
-            output_file = sprintf(output_spec, page: index + 1)
-            maybe_raise_on_existing_file(output_file)
-            out = HexaPDF::Document.new
-            out.pages.add(out.import(page))
-            apply_encryption_options(out)
-            apply_optimization_options(out)
-            write_document(out, output_file)
+          if @strategy == :page_number
+            split_by_page_number(doc, output_spec)
+          else
+            split_by_page_size(doc, output_spec)
           end
         end
       end
+      private
+      # Splits the document into individual pages.
+      def split_by_page_number(doc, output_spec)
+        doc.pages.each_with_index do |page, index|
+          output_file = sprintf(output_spec, index + 1)
+          maybe_raise_on_existing_file(output_file)
+          out = HexaPDF::Document.new
+          out.pages.add(out.import(page))
+          apply_encryption_options(out)
+          apply_optimization_options(out)
+          write_document(out, output_file)
+        end
+      end
+      # Splits the document into files based on the page sizes.
+      def split_by_page_size(doc, output_spec)
+        output_spec = output_spec.sub(/%.*?[a-zA-Z]/, '%s')
+        out_files = Hash.new do |hash, key|
+          output_file = sprintf(output_spec, key)
+          maybe_raise_on_existing_file(output_file)
+          out = HexaPDF::Document.new
+          out.config['output_file'] = output_file
+          hash[key] = out
+        end
+        doc.pages.each do |page|
+          out = out_files[page_size_name(page.box(:media).value)]
+          out.pages.add(out.import(page))
+        end
+        out_files.each_value do |out|
+          apply_encryption_options(out)
+          apply_optimization_options(out)
+          write_document(out, out.config['output_file'])
+        end
+      end
+      # Tries to retrieve a page size name based on the media box. If this is not possible, the
+      # returned page size name consists of width x height.
+      def page_size_name(media_box)
+        @page_name_cache ||= {}
+        return @page_name_cache[media_box] if @page_name_cache.key?(media_box)
+        paper_size = HexaPDF::Type::Page::PAPER_SIZE.find do |_name, box|
+          box.each_with_index.all? {|entry, index| (entry - media_box[index]).abs < 5 }
+        end
+        @page_name_cache[media_box] =
+          paper_size ? paper_size[0] : "%.0fx%.0f" % media_box.values_at(2, 3)
+      end
     end
   end