hexapdf 0.12.1 → 0.14.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (102) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +130 -0
  3. data/examples/019-acro_form.rb +41 -4
  4. data/lib/hexapdf/cli/command.rb +4 -2
  5. data/lib/hexapdf/cli/image2pdf.rb +2 -1
  6. data/lib/hexapdf/cli/info.rb +51 -2
  7. data/lib/hexapdf/cli/inspect.rb +30 -8
  8. data/lib/hexapdf/cli/merge.rb +1 -1
  9. data/lib/hexapdf/cli/split.rb +74 -14
  10. data/lib/hexapdf/configuration.rb +15 -0
  11. data/lib/hexapdf/content/graphic_object/arc.rb +3 -3
  12. data/lib/hexapdf/content/parser.rb +1 -1
  13. data/lib/hexapdf/dictionary.rb +9 -6
  14. data/lib/hexapdf/dictionary_fields.rb +1 -9
  15. data/lib/hexapdf/document.rb +41 -16
  16. data/lib/hexapdf/document/files.rb +0 -1
  17. data/lib/hexapdf/encryption/fast_arc4.rb +1 -1
  18. data/lib/hexapdf/encryption/security_handler.rb +1 -0
  19. data/lib/hexapdf/encryption/standard_security_handler.rb +1 -0
  20. data/lib/hexapdf/font/cmap.rb +1 -4
  21. data/lib/hexapdf/font/true_type/subsetter.rb +12 -3
  22. data/lib/hexapdf/font/true_type/table/head.rb +1 -0
  23. data/lib/hexapdf/font/true_type/table/os2.rb +2 -0
  24. data/lib/hexapdf/font/true_type/table/post.rb +15 -10
  25. data/lib/hexapdf/font_loader/from_configuration.rb +2 -2
  26. data/lib/hexapdf/font_loader/from_file.rb +18 -8
  27. data/lib/hexapdf/image_loader/png.rb +3 -2
  28. data/lib/hexapdf/importer.rb +3 -2
  29. data/lib/hexapdf/layout/line.rb +1 -1
  30. data/lib/hexapdf/layout/style.rb +23 -23
  31. data/lib/hexapdf/layout/text_layouter.rb +2 -2
  32. data/lib/hexapdf/layout/text_shaper.rb +3 -2
  33. data/lib/hexapdf/object.rb +52 -25
  34. data/lib/hexapdf/parser.rb +96 -4
  35. data/lib/hexapdf/pdf_array.rb +12 -5
  36. data/lib/hexapdf/revisions.rb +29 -21
  37. data/lib/hexapdf/serializer.rb +34 -8
  38. data/lib/hexapdf/task/optimize.rb +6 -4
  39. data/lib/hexapdf/tokenizer.rb +4 -3
  40. data/lib/hexapdf/type/acro_form/appearance_generator.rb +132 -28
  41. data/lib/hexapdf/type/acro_form/button_field.rb +21 -13
  42. data/lib/hexapdf/type/acro_form/choice_field.rb +68 -14
  43. data/lib/hexapdf/type/acro_form/field.rb +35 -5
  44. data/lib/hexapdf/type/acro_form/form.rb +139 -14
  45. data/lib/hexapdf/type/acro_form/text_field.rb +70 -4
  46. data/lib/hexapdf/type/actions/uri.rb +3 -2
  47. data/lib/hexapdf/type/annotations/widget.rb +3 -4
  48. data/lib/hexapdf/type/catalog.rb +2 -2
  49. data/lib/hexapdf/type/cid_font.rb +1 -1
  50. data/lib/hexapdf/type/file_specification.rb +1 -1
  51. data/lib/hexapdf/type/font.rb +1 -1
  52. data/lib/hexapdf/type/font_simple.rb +4 -2
  53. data/lib/hexapdf/type/font_true_type.rb +6 -2
  54. data/lib/hexapdf/type/font_type0.rb +4 -4
  55. data/lib/hexapdf/type/form.rb +15 -2
  56. data/lib/hexapdf/type/image.rb +2 -2
  57. data/lib/hexapdf/type/page.rb +37 -13
  58. data/lib/hexapdf/type/page_tree_node.rb +29 -5
  59. data/lib/hexapdf/type/resources.rb +1 -0
  60. data/lib/hexapdf/type/trailer.rb +2 -3
  61. data/lib/hexapdf/utils/object_hash.rb +0 -1
  62. data/lib/hexapdf/utils/sorted_tree_node.rb +18 -15
  63. data/lib/hexapdf/version.rb +1 -1
  64. data/test/hexapdf/common_tokenizer_tests.rb +6 -1
  65. data/test/hexapdf/content/graphic_object/test_arc.rb +4 -4
  66. data/test/hexapdf/content/test_canvas.rb +3 -3
  67. data/test/hexapdf/content/test_color_space.rb +1 -1
  68. data/test/hexapdf/encryption/test_aes.rb +4 -4
  69. data/test/hexapdf/encryption/test_standard_security_handler.rb +11 -11
  70. data/test/hexapdf/filter/test_ascii85_decode.rb +1 -1
  71. data/test/hexapdf/filter/test_ascii_hex_decode.rb +1 -1
  72. data/test/hexapdf/font/true_type/table/test_post.rb +1 -1
  73. data/test/hexapdf/font/true_type/test_subsetter.rb +5 -0
  74. data/test/hexapdf/font_loader/test_from_configuration.rb +7 -3
  75. data/test/hexapdf/font_loader/test_from_file.rb +7 -0
  76. data/test/hexapdf/layout/test_style.rb +1 -1
  77. data/test/hexapdf/layout/test_text_layouter.rb +12 -5
  78. data/test/hexapdf/test_configuration.rb +2 -2
  79. data/test/hexapdf/test_dictionary.rb +8 -1
  80. data/test/hexapdf/test_dictionary_fields.rb +2 -2
  81. data/test/hexapdf/test_document.rb +18 -10
  82. data/test/hexapdf/test_object.rb +71 -26
  83. data/test/hexapdf/test_parser.rb +171 -53
  84. data/test/hexapdf/test_pdf_array.rb +8 -1
  85. data/test/hexapdf/test_revisions.rb +35 -0
  86. data/test/hexapdf/test_writer.rb +2 -2
  87. data/test/hexapdf/type/acro_form/test_appearance_generator.rb +296 -38
  88. data/test/hexapdf/type/acro_form/test_button_field.rb +22 -2
  89. data/test/hexapdf/type/acro_form/test_choice_field.rb +92 -9
  90. data/test/hexapdf/type/acro_form/test_field.rb +39 -0
  91. data/test/hexapdf/type/acro_form/test_form.rb +87 -15
  92. data/test/hexapdf/type/acro_form/test_text_field.rb +77 -1
  93. data/test/hexapdf/type/test_font_simple.rb +2 -1
  94. data/test/hexapdf/type/test_font_true_type.rb +6 -0
  95. data/test/hexapdf/type/test_form.rb +26 -1
  96. data/test/hexapdf/type/test_page.rb +45 -7
  97. data/test/hexapdf/type/test_page_tree_node.rb +42 -0
  98. data/test/hexapdf/utils/test_bit_field.rb +2 -0
  99. data/test/hexapdf/utils/test_object_hash.rb +5 -0
  100. data/test/hexapdf/utils/test_sorted_tree_node.rb +10 -9
  101. data/test/test_helper.rb +2 -0
  102. metadata +6 -11
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a0354d0b129396ae7c479e806b3115f0be7a7f7d74d47df13f9ad82e5f93df50
4
- data.tar.gz: 15f8a80efbffea1724ffdd54bc2f204816def3980ba59986336b742eed3ce6b3
3
+ metadata.gz: e4010e277168cec5c8cc5d584ec324064461e63756d18b538cd335235fe04e6d
4
+ data.tar.gz: 2b7a71463082a32605adee682c81cdde6b0eb48d360ca66249b08884f82e571b
5
5
  SHA512:
6
- metadata.gz: dc132b215e6d6bd2ab6684e269c559e1d55301403c2c035e32eb7fc213572b86e993f1307c219f6e0aec6bbd8da36ab6d10146c82b89e52f68ca5c90949ef819
7
- data.tar.gz: 9bd6252238d418844de0d9e08250dc6b7d8a620fc1e4f232bea6870986764ced86dadc0e4794512ae6abc347a6cf95569b5af7ff35c54833d57b2c2513a0d987
6
+ metadata.gz: 5748273dc4dc532cd365598e25c4a9cc5872011d2eb638c2986050aeed0a68d2dc5769fda075eb60cbcb76fccbfb1a5b52c3c58581cb6e969978c17d770013e6
7
+ data.tar.gz: 0ab3abf80967804486fa1f50f186b508fd792acfbd8c47646fa7d0c5b0245161e2833620142b2f05a1ee73b01145016dca7bf7781d579284160c9d2dd2c78d0c
@@ -1,3 +1,133 @@
1
+ ## 0.14.1 - 2021-01-21
2
+
3
+ ### Changed
4
+
5
+ * Validation message when checking for allowed values to include the invalid
6
+ object
7
+ * [HexaPDF::FontLoader::FromFile] to allow (re)using an existing font object
8
+ * [HexaPDF::Importer] internals to avoid problems with retained memory
9
+
10
+ ### Fixed
11
+
12
+ * Parsing of invalid PDF files where whitespace is missing after the integer
13
+ value of an indirect object
14
+ * [HexaPDF::Dictionary] so that adding new key-value pairs during validation is
15
+ possible
16
+
17
+
18
+ ## 0.14.0 - 2020-12-30
19
+
20
+ ### Added
21
+
22
+ * Support for creating AcroForm multiline text fields and their appearances
23
+ * Support for creating AcroForm comb text fields and their appearances
24
+ * Support for creating AcroForm password fields and their appearances
25
+ * Support for creating AcroForm file select fields and their appearances
26
+ * Support for creating AcroForm list box appearances
27
+ * [HexaPDF::Type::AcroForm::ChoiceField#list_box_top_index] and its setter
28
+ method
29
+ * [HexaPDF::Type::AcroForm::ChoiceField#update_widgets] to create appearances if
30
+ they don't exist
31
+ * Methods for caching data to [HexaPDF::Object]
32
+ * Support for splitting by page size to CLI command `hexapdf split`
33
+
34
+ ### Changed
35
+
36
+ * [HexaPDF::Utils::ObjectHash#oids] to be public instead of private
37
+ * Cross-reference table parsing to handle invalidly numbered main sections
38
+ * [HexaPDF::Document#cache] and [HexaPDF::Object#cache] to allow updating
39
+ values for existing keys
40
+ * Appearance creation methods of AcroForm objects to allow forcing the creation
41
+ of new appearances
42
+ * [HexaPDF::Type::AcroForm::AppearanceGenerator#create_text_appearances] to
43
+ re-use existing form objects
44
+ * AcroForm field creation methods to allow specifying often used field
45
+ properties
46
+
47
+ ### Fixed
48
+
49
+ * Missing usage of `:sort` flag for AcroForm choice fields
50
+ * Setting the `/I` field for AcroForm list boxes with multiple selection
51
+ * [HexaPDF::Layout::TextLayouter::SimpleLineWrapping] to remove glue items
52
+ (whitespace) before a hard line break
53
+ * Infinite loop when reconstructing the cross-reference table
54
+ * [HexaPDF::Type::AcroForm::ChoiceField] to support export values for option
55
+ items
56
+ * AcroForm text field appearance creation to only create a new appearance if the
57
+ field's value has changed
58
+ * AcroForm choice field appearance creation to only create a new appearance if
59
+ the involved dictionary fields' values have changed
60
+ * [HexaPDF::Type::AcroForm::ChoiceField#list_box_top_index=] to raise an error
61
+ if no option items are set
62
+ * [HexaPDF::PDFArray#to_ary] to return an array with preprocessed values
63
+ * [HexaPDF::Type::Form#contents=] to clear cached values to avoid returning e.g.
64
+ an invalid canvas object later
65
+ * [HexaPDF::Type::AcroForm::ButtonField#update_widgets] to create appearances if
66
+ they don't exist
67
+
68
+
69
+ ## 0.13.0 - 2020-11-15
70
+
71
+ ### Added
72
+
73
+ * Cross-reference table reconstruction for damaged PDFs, controllable via the
74
+ new 'parser.try_xref_reconstruction' option
75
+ * Two new `hexapdf inspect` commands for showing page objects and page content
76
+ streams by page number
77
+ * Flag `--check` to the CLI command `hexapdf info` for checking a file for
78
+ parse and validation errors
79
+ * [HexaPDF::Type::AcroForm::Field#embedded_widget?] for checking if a widget is
80
+ embedded in the field object
81
+ * [HexaPDF::Type::AcroForm::Field#delete_widget] for deleting a widget
82
+ * [HexaPDF::PDFArray#delete] for deleting an object from a PDF array
83
+ * [HexaPDF::Type::Page#ancestor_nodes] for retrieving all ancestor page tree
84
+ nodes of a page
85
+ * [HexaPDF::Type::PageTreeNode#move_page] for moving a page to another index
86
+
87
+ ### Changed
88
+
89
+ * **Breaking change**: Overhauled document/object validation interfaces and
90
+ internals to be more similar and to allow for reporting of multiple validation
91
+ problems
92
+ * Validation of TrueType fonts to ignore missing fields if the font name
93
+ suggests that the font is one of the standard 14 PDF fonts
94
+ * Option `-p` of CLI command `hexapdf image2pdf` to also allow lowercase page
95
+ size names
96
+
97
+ ### Fixed
98
+
99
+ * Reporting of cross-reference section entry parsing error
100
+ * PDF version used by default for dictionary fields
101
+ * Error in CLI command `hexapdf inspect` when parsing an invalid object number
102
+ * Output of error messages in CLI command `hexapdf inspect` to go to `$stderr`
103
+ * Bug in [HexaPDF::Type::AcroForm::TextField] validation due to missing nil
104
+ handling
105
+
106
+
107
+ ## 0.12.3 - 2020-08-22
108
+
109
+ ### Changed
110
+
111
+ * Allow any object responding to `#to_sym` when setting a radio button value
112
+
113
+ ### Fixed
114
+
115
+ * Error in the AcroForm appearance generator for text fields when the font is
116
+ not found in the default resources
117
+ * Parsing of long numbers when reading a file from IO
118
+ * Usage of unsupported method for Ruby 2.4 so that all tests pass again
119
+
120
+
121
+ ## 0.12.2 - 2020-08-17
122
+
123
+ ### Fixed
124
+
125
+ - Wrong origin for page canvases when bottom left corner of media box doesn't
126
+ coincide with origin of coordinate system
127
+ - Wrong origin for Form XObject canvas when bottom left corner of bounding box
128
+ doesn't coincide with origin of coordinate system
129
+
130
+
1
131
  ## 0.12.1 - 2020-08-16
2
132
 
3
133
  ### Added
@@ -42,10 +42,47 @@ rb = form.create_radio_button("Radio")
42
42
  end
43
43
  rb.field_value = :button0
44
44
 
45
- canvas.text("Text field", at: [50, 450])
46
- tx = form.create_text_field("Single Line")
47
- widget = tx.create_widget(page, Rect: [200, 445, 500, 465])
48
- tx.set_default_appearance_string(font_size: 16)
45
+ canvas.text("Text fields", at: [50, 450])
46
+
47
+ canvas.text("Single line", at: [70, 420])
48
+ tx = form.create_text_field("Single Line", font_size: 16)
49
+ widget = tx.create_widget(page, Rect: [200, 415, 500, 435])
49
50
  tx.field_value = "A sample test string!"
50
51
 
52
+ canvas.text("Multiline", at: [70, 390])
53
+ tx = form.create_multiline_text_field("Multiline", font_size: 0, align: :right)
54
+ widget = tx.create_widget(page, Rect: [200, 325, 500, 405])
55
+ widget.border_style(color: 0, width: 1)
56
+ tx.field_value = "A sample test string! " * 30 + "\nNew line\n\nAnother line"
57
+
58
+ canvas.text("Password", at: [70, 300])
59
+ tx = form.create_password_field("Password", font_size: 16)
60
+ widget = tx.create_widget(page, Rect: [200, 295, 500, 315])
61
+
62
+ canvas.text("File select", at: [70, 270])
63
+ tx = form.create_file_select_field("File Select", font_size: 16)
64
+ widget = tx.create_widget(page, Rect: [200, 265, 500, 285])
65
+ tx.field_value = "path/to/file.pdf"
66
+
67
+ canvas.text("Comb", at: [70, 240])
68
+ tx = form.create_comb_text_field("Comb field", max_chars: 10, font_size: 16, align: :center)
69
+ widget = tx.create_widget(page, Rect: [200, 220, 500, 255])
70
+ widget.border_style(color: [30, 128, 0], width: 1)
71
+ tx.field_value = 'Hello'
72
+
73
+ canvas.text("Combo Box", at: [50, 170])
74
+ cb = form.create_combo_box("Combo Box", font_size: 12, editable: true,
75
+ option_items: ['Value 1', 'Another value', 'Choose me!'])
76
+ widget = cb.create_widget(page, Rect: [200, 150, 500, 185])
77
+ widget.border_style(width: 1)
78
+ cb.field_value = 'Another value'
79
+
80
+ canvas.text("List Box", at: [50, 120])
81
+ lb = form.create_list_box("List Box", font_size: 15, align: :center, multi_select: true,
82
+ option_items: 1.upto(7).map {|i| "Value #{i}" })
83
+ widget = lb.create_widget(page, Rect: [200, 50, 500, 135])
84
+ widget.border_style(width: 1)
85
+ lb.list_box_top_index = 1
86
+ lb.field_value = ['Value 6', 'Value 2']
87
+
51
88
  doc.write('acro_form.pdf', optimize: true)
@@ -100,6 +100,7 @@ module HexaPDF
100
100
  def pdf_options(password)
101
101
  hash = {decryption_opts: {password: password}, config: {}}
102
102
  HexaPDF::GlobalConfiguration['filter.predictor.strict'] = command_parser.strict
103
+ hash[:config]['parser.try_xref_reconstruction'] = !command_parser.strict
103
104
  hash[:config]['parser.on_correctable_error'] =
104
105
  if command_parser.strict
105
106
  proc { true }
@@ -277,14 +278,15 @@ module HexaPDF
277
278
  #
278
279
  # See: #define_encryption_options
279
280
  def apply_encryption_options(doc)
280
- if @out_options.encryption == :add
281
+ case @out_options.encryption
282
+ when :add
281
283
  doc.encrypt(algorithm: @out_options.enc_algorithm,
282
284
  key_length: @out_options.enc_key_length,
283
285
  force_v4: @out_options.enc_force_v4,
284
286
  permissions: @out_options.enc_permissions,
285
287
  owner_password: @out_options.enc_owner_pwd,
286
288
  user_password: @out_options.enc_user_pwd)
287
- elsif @out_options.encryption == :remove
289
+ when :remove
288
290
  doc.encrypt(name: nil)
289
291
  end
290
292
  end
@@ -64,7 +64,8 @@ module HexaPDF
64
64
  orientation = :landscape
65
65
  page_size.delete_suffix!('-landscape')
66
66
  end
67
- HexaPDF::Type::Page.media_box(page_size.to_sym, orientation: orientation)
67
+ page_size = page_size.capitalize.to_sym
68
+ HexaPDF::Type::Page.media_box(page_size, orientation: orientation)
68
69
  end
69
70
  end
70
71
  options.on("--[no-]auto-rotate", "Automatically rotate pages based on image dimensions. " \
@@ -55,13 +55,21 @@ module HexaPDF
55
55
  long_desc(<<~EOF)
56
56
  This command extracts information from the Info dictionary of a PDF file as well
57
57
  as some other useful information like the used PDF version and encryption information.
58
+
59
+ If the --check option is specified, the PDF file will also be checked for parse and
60
+ validation errors. And if the process doesn't abort, HexaPDF is still able to handle the
61
+ file by correcting the errors.
58
62
  EOF
63
+ options.on("--check", "-c", "Check the PDF file for parse errors and validity") do |check|
64
+ @check_file = check
65
+ end
59
66
  options.on("--password PASSWORD", "-p", String,
60
67
  "The password for decryption. Use - for reading from standard input.") do |pwd|
61
68
  @password = (pwd == '-' ? read_password : pwd)
62
69
  end
63
70
  @password = nil
64
71
  @auto_decrypt = true
72
+ @check_file = false
65
73
  end
66
74
 
67
75
  def execute(file) #:nodoc:
@@ -79,8 +87,30 @@ module HexaPDF
79
87
  options = pdf_options(@password)
80
88
  options[:config]['document.auto_decrypt'] = @auto_decrypt
81
89
  HexaPDF::Document.open(file, **options) do |doc|
90
+ if @check_file
91
+ indirect_object = nil
92
+ validation_block = lambda do |msg, correctable, object|
93
+ object = indirect_object unless object.indirect? || object.type == :XXTrailer
94
+ object_type = if object.type == :XXTrailer
95
+ 'trailer'
96
+ elsif !object.type.to_s.start_with?("XX")
97
+ "object type #{object.type} (#{object.oid},#{object.gen})"
98
+ else
99
+ "object (#{object.oid},#{object.gen})"
100
+ end
101
+ object_type = "sub-object of #{object_type}" if object == indirect_object
102
+ puts "WARNING: Validation error for #{object_type}: #{msg} " \
103
+ "#{correctable ? '(correctable)' : ''}"
104
+ end
105
+ doc.trailer.validate(auto_correct: true, &validation_block)
106
+ doc.each(only_current: false, only_loaded: false) do |obj|
107
+ indirect_object = obj
108
+ obj.validate(auto_correct: true, &validation_block)
109
+ end
110
+ end
111
+
82
112
  output_line("File name", file)
83
- output_line("File size", File.stat(file).size.to_s + " bytes")
113
+ output_line("File size", File.stat(file).size.to_s << " bytes")
84
114
  @auto_decrypt && INFO_KEYS.each do |name|
85
115
  next unless doc.trailer.info.key?(name)
86
116
  output_line(name.to_s, doc.trailer.info[name].to_s)
@@ -110,10 +140,29 @@ module HexaPDF
110
140
  else
111
141
  raise
112
142
  end
143
+ rescue HexaPDF::MalformedPDFError => e
144
+ $stderr.puts "Error: PDF file #{file} is damaged and cannot be recovered"
145
+ $stderr.puts " #{e}"
146
+ end
147
+
148
+ # Use custom options if we are checking the PDF file for errors.
149
+ def pdf_options(password)
150
+ if @check_file
151
+ options = {decryption_opts: {password: password}, config: {}}
152
+ HexaPDF::GlobalConfiguration['filter.predictor.strict'] = false
153
+ options[:config]['parser.try_xref_reconstruction'] = true
154
+ options[:config]['parser.on_correctable_error'] = lambda do |_, msg, pos|
155
+ puts "WARNING: Parse error at position #{pos}: #{msg}"
156
+ false
157
+ end
158
+ options
159
+ else
160
+ super
161
+ end
113
162
  end
114
163
 
115
164
  def output_line(header, text) #:nodoc:
116
- puts((header + ":").ljust(COLUMN_WIDTH) << text)
165
+ puts(("#{header}:").ljust(COLUMN_WIDTH) << text)
117
166
  end
118
167
 
119
168
  end
@@ -122,22 +122,22 @@ module HexaPDF
122
122
  case command
123
123
  when /^\d+(,\d+)?$/, 'o', 'object'
124
124
  arg = (command.start_with?('o') ? data.shift : command)
125
- obj = pdf_object_from_string_reference(arg) rescue puts($!.message)
126
- if obj.data.stream && command_parser.verbosity_info?
125
+ obj = pdf_object_from_string_reference(arg) rescue $stderr.puts($!.message)
126
+ if obj&.data&.stream && command_parser.verbosity_info?
127
127
  $stderr.puts("Note: Object also has stream data")
128
128
  end
129
129
  serialize(obj.value, recursive: false) if obj
130
130
 
131
131
  when 'r', 'recursive'
132
132
  obj = if (obj = data.shift)
133
- pdf_object_from_string_reference(obj) rescue puts($!.message)
133
+ pdf_object_from_string_reference(obj) rescue $stderr.puts($!.message)
134
134
  else
135
135
  @doc.trailer
136
136
  end
137
137
  serialize(obj.value, recursive: true) if obj
138
138
 
139
139
  when 's', 'stream', 'raw', 'raw-stream'
140
- if (obj = pdf_object_from_string_reference(data.shift) rescue puts($!.message)) &&
140
+ if (obj = pdf_object_from_string_reference(data.shift) rescue $stderr.puts($!.message)) &&
141
141
  obj.kind_of?(HexaPDF::Stream)
142
142
  source = (command.start_with?('raw') ? obj.stream_source : obj.stream_decoder)
143
143
  while source.alive? && (stream_data = source.resume)
@@ -148,7 +148,7 @@ module HexaPDF
148
148
  end
149
149
 
150
150
  when 'x', 'xref'
151
- if (obj = pdf_object_from_string_reference(data.shift) rescue puts($!.message))
151
+ if (obj = pdf_object_from_string_reference(data.shift) rescue $stderr.puts($!.message))
152
152
  @doc.revisions.reverse_each do |rev|
153
153
  if (xref = rev.xref(obj))
154
154
  puts xref
@@ -178,6 +178,26 @@ module HexaPDF
178
178
  puts str
179
179
  end
180
180
 
181
+ when 'po', 'ps'
182
+ page_number_str = data.shift
183
+ unless page_number_str
184
+ $stderr.puts("Error: Missing PAGE argument to #{command}")
185
+ next
186
+ end
187
+ page_number = parse_pages_specification(page_number_str, @doc.pages.count).first&.first
188
+ unless page_number
189
+ $stderr.puts("Error: Invalid page number #{page_number_str}")
190
+ next
191
+ end
192
+ page = @doc.pages[page_number]
193
+ if command.start_with?('ps')
194
+ $stdout.write(page.contents)
195
+ else
196
+ puts "#{page.oid} #{page.gen} obj"
197
+ serialize(page.value, recursive: false)
198
+ puts "endobj"
199
+ end
200
+
181
201
  when 'pc', 'page-count'
182
202
  puts @doc.pages.count
183
203
 
@@ -217,9 +237,9 @@ module HexaPDF
217
237
  if str.nil?
218
238
  raise "Error: Missing argument object identifier OID[,GEN]"
219
239
  elsif !str.match?(/^\d+(,\d+)?$/)
220
- raise "Error: Invalid argument: Must be of form OID[,GEN]"
240
+ raise "Error: Invalid argument: Must be of form OID[,GEN], not '#{str}'"
221
241
  elsif !(obj = @doc.object(pdf_reference_from_string(str)))
222
- raise "Error: No object with the given object identifier found"
242
+ raise "Error: No object with the given object identifier '#{str}' found"
223
243
  else
224
244
  obj
225
245
  end
@@ -240,7 +260,7 @@ module HexaPDF
240
260
  puts "<<"
241
261
  (recursive ? val.sort : val).each do |k, v|
242
262
  next if v.nil? || (v.respond_to?(:null?) && v.null?)
243
- print ' ' * (indent + 1) + @serializer.serialize_symbol(k) + " "
263
+ print '%s%s ' % [' ' * (indent + 1), @serializer.serialize_symbol(k)]
244
264
  serialize(v, recursive: recursive, seen: seen, indent: indent + 1)
245
265
  puts
246
266
  end
@@ -283,6 +303,8 @@ module HexaPDF
283
303
  ["c[atalog]", "Print the catalog dictionary"],
284
304
  ["t[railer]", "Print the trailer dictionary"],
285
305
  ["p[ages] [RANGE]", "Print information about pages"],
306
+ ["po PAGE", "Print the page object"],
307
+ ["ps PAGE", "Print the content stream of the page"],
286
308
  ["pc | page-count", "Print the number of pages"],
287
309
  ["search REGEXP", "Print objects matching the pattern"],
288
310
  ["h[elp]", "Show the help"],
@@ -122,7 +122,7 @@ module HexaPDF
122
122
 
123
123
  # Assemble pages
124
124
  target = (@initial_empty ? HexaPDF::Document.new : @files.first.file)
125
- page_tree = target.add(Type: :Pages)
125
+ page_tree = target.add({Type: :Pages})
126
126
  import_pages(page_tree)
127
127
  target.catalog[:Pages] = page_tree
128
128
  remove_unused_pages(target)
@@ -44,16 +44,28 @@ module HexaPDF
44
44
 
45
45
  def initialize #:nodoc:
46
46
  super('split', takes_commands: false)
47
- short_desc("Split a PDF file into individual pages")
47
+ short_desc("Split a PDF file")
48
48
  long_desc(<<~EOF)
49
- If no OUTPUT_SPEC is specified, the pages are named <PDF>_0001.pdf, <PDF>_0002.pdf, ...
50
- and so on. To specify a custom name, provide the OUTPUT_SPEC argument. It can contain a
51
- printf-style format definition like '%04d' to specify the place where the page number
52
- should be inserted.
49
+ The default strategy is to split a PDF into individual pages, i.e. splitting is done by
50
+ page number. It is also possible to split by page size where pages with the same page size
51
+ get put into the same output PDF.
52
+
53
+ If no OUTPUT_SPEC is specified, the resulting PDF files are named <PDF>_0001.pdf,
54
+ <PDF>_0002.pdf, ... when splitting by page number and <PDF>_A4.pdf, <PDF>_Letter.pdf, ...
55
+ when splitting by page size.
56
+
57
+ To specify a custom name, provide the OUTPUT_SPEC argument. It can contain a printf-style
58
+ format definition like '%04d' to specify the place where the page number should be
59
+ inserted. In case of splitting by page size, the place of the format definition is replaced
60
+ with the name of the page size, e.g. A4 or Letter.
53
61
 
54
62
  The optimization and encryption options are applied to each created output file.
55
63
  EOF
56
64
 
65
+ options.on("--strategy STRATEGY", "-s", [:page_number, :page_size], "Defines how the PDF " \
66
+ "file should be split: page_number or page_size (default: page_number)") do |s|
67
+ @strategy = s
68
+ end
57
69
  options.on("--password PASSWORD", "-p", String,
58
70
  "The password for decryption. Use - for reading from standard input.") do |pwd|
59
71
  @password = (pwd == '-' ? read_password : pwd)
@@ -62,23 +74,71 @@ module HexaPDF
62
74
  define_encryption_options
63
75
 
64
76
  @password = nil
77
+ @strategy = :page_number
65
78
  end
66
79
 
67
80
  def execute(pdf, output_spec = pdf.sub(/\.pdf$/i, '_%04d.pdf')) #:nodoc:
68
- output_spec = output_spec.sub('%', '%<page>')
69
81
  with_document(pdf, password: @password) do |doc|
70
- doc.pages.each_with_index do |page, index|
71
- output_file = sprintf(output_spec, page: index + 1)
72
- maybe_raise_on_existing_file(output_file)
73
- out = HexaPDF::Document.new
74
- out.pages.add(out.import(page))
75
- apply_encryption_options(out)
76
- apply_optimization_options(out)
77
- write_document(out, output_file)
82
+ if @strategy == :page_number
83
+ split_by_page_number(doc, output_spec)
84
+ else
85
+ split_by_page_size(doc, output_spec)
78
86
  end
79
87
  end
80
88
  end
81
89
 
90
+ private
91
+
92
+ # Splits the document into individual pages.
93
+ def split_by_page_number(doc, output_spec)
94
+ doc.pages.each_with_index do |page, index|
95
+ output_file = sprintf(output_spec, index + 1)
96
+ maybe_raise_on_existing_file(output_file)
97
+ out = HexaPDF::Document.new
98
+ out.pages.add(out.import(page))
99
+ apply_encryption_options(out)
100
+ apply_optimization_options(out)
101
+ write_document(out, output_file)
102
+ end
103
+ end
104
+
105
+ # Splits the document into files based on the page sizes.
106
+ def split_by_page_size(doc, output_spec)
107
+ output_spec = output_spec.sub(/%.*?[a-zA-Z]/, '%s')
108
+ out_files = Hash.new do |hash, key|
109
+ output_file = sprintf(output_spec, key)
110
+ maybe_raise_on_existing_file(output_file)
111
+ out = HexaPDF::Document.new
112
+ out.config['output_file'] = output_file
113
+ hash[key] = out
114
+ end
115
+
116
+ doc.pages.each do |page|
117
+ out = out_files[page_size_name(page.box(:media).value)]
118
+ out.pages.add(out.import(page))
119
+ end
120
+
121
+ out_files.each_value do |out|
122
+ apply_encryption_options(out)
123
+ apply_optimization_options(out)
124
+ write_document(out, out.config['output_file'])
125
+ end
126
+ end
127
+
128
+ # Tries to retrieve a page size name based on the media box. If this is not possible, the
129
+ # returned page size name consists of width x height.
130
+ def page_size_name(media_box)
131
+ @page_name_cache ||= {}
132
+ return @page_name_cache[media_box] if @page_name_cache.key?(media_box)
133
+
134
+ paper_size = HexaPDF::Type::Page::PAPER_SIZE.find do |_name, box|
135
+ box.each_with_index.all? {|entry, index| (entry - media_box[index]).abs < 5 }
136
+ end
137
+
138
+ @page_name_cache[media_box] =
139
+ paper_size ? paper_size[0] : "%.0fx%.0f" % media_box.values_at(2, 3)
140
+ end
141
+
82
142
  end
83
143
 
84
144
  end