hexapdf 0.12.3 → 0.14.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (103) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +132 -0
  3. data/examples/019-acro_form.rb +41 -4
  4. data/lib/hexapdf/cli/command.rb +4 -2
  5. data/lib/hexapdf/cli/image2pdf.rb +2 -1
  6. data/lib/hexapdf/cli/info.rb +51 -2
  7. data/lib/hexapdf/cli/inspect.rb +30 -8
  8. data/lib/hexapdf/cli/merge.rb +1 -1
  9. data/lib/hexapdf/cli/split.rb +74 -14
  10. data/lib/hexapdf/configuration.rb +15 -0
  11. data/lib/hexapdf/content/graphic_object/arc.rb +3 -3
  12. data/lib/hexapdf/dictionary.rb +12 -6
  13. data/lib/hexapdf/dictionary_fields.rb +2 -10
  14. data/lib/hexapdf/document.rb +41 -16
  15. data/lib/hexapdf/document/files.rb +0 -1
  16. data/lib/hexapdf/encryption/fast_arc4.rb +1 -1
  17. data/lib/hexapdf/encryption/security_handler.rb +1 -0
  18. data/lib/hexapdf/encryption/standard_security_handler.rb +1 -0
  19. data/lib/hexapdf/font/cmap.rb +1 -4
  20. data/lib/hexapdf/font/true_type/subsetter.rb +16 -3
  21. data/lib/hexapdf/font/true_type/table/head.rb +1 -0
  22. data/lib/hexapdf/font/true_type/table/os2.rb +2 -0
  23. data/lib/hexapdf/font/true_type/table/post.rb +15 -10
  24. data/lib/hexapdf/font_loader/from_configuration.rb +2 -2
  25. data/lib/hexapdf/font_loader/from_file.rb +18 -8
  26. data/lib/hexapdf/image_loader/png.rb +3 -2
  27. data/lib/hexapdf/importer.rb +3 -2
  28. data/lib/hexapdf/layout/line.rb +1 -1
  29. data/lib/hexapdf/layout/style.rb +23 -23
  30. data/lib/hexapdf/layout/text_layouter.rb +2 -2
  31. data/lib/hexapdf/layout/text_shaper.rb +3 -2
  32. data/lib/hexapdf/object.rb +52 -25
  33. data/lib/hexapdf/parser.rb +107 -7
  34. data/lib/hexapdf/pdf_array.rb +15 -5
  35. data/lib/hexapdf/revisions.rb +29 -21
  36. data/lib/hexapdf/serializer.rb +37 -10
  37. data/lib/hexapdf/task/optimize.rb +6 -4
  38. data/lib/hexapdf/tokenizer.rb +22 -0
  39. data/lib/hexapdf/type/acro_form/appearance_generator.rb +130 -27
  40. data/lib/hexapdf/type/acro_form/button_field.rb +5 -2
  41. data/lib/hexapdf/type/acro_form/choice_field.rb +68 -14
  42. data/lib/hexapdf/type/acro_form/field.rb +35 -5
  43. data/lib/hexapdf/type/acro_form/form.rb +139 -14
  44. data/lib/hexapdf/type/acro_form/text_field.rb +70 -4
  45. data/lib/hexapdf/type/actions/uri.rb +3 -2
  46. data/lib/hexapdf/type/annotations/widget.rb +3 -4
  47. data/lib/hexapdf/type/catalog.rb +2 -2
  48. data/lib/hexapdf/type/cid_font.rb +1 -1
  49. data/lib/hexapdf/type/file_specification.rb +1 -1
  50. data/lib/hexapdf/type/font.rb +1 -1
  51. data/lib/hexapdf/type/font_simple.rb +4 -2
  52. data/lib/hexapdf/type/font_true_type.rb +6 -2
  53. data/lib/hexapdf/type/font_type0.rb +4 -4
  54. data/lib/hexapdf/type/form.rb +6 -2
  55. data/lib/hexapdf/type/image.rb +2 -2
  56. data/lib/hexapdf/type/page.rb +21 -12
  57. data/lib/hexapdf/type/page_tree_node.rb +29 -5
  58. data/lib/hexapdf/type/resources.rb +5 -0
  59. data/lib/hexapdf/type/trailer.rb +2 -3
  60. data/lib/hexapdf/utils/object_hash.rb +0 -1
  61. data/lib/hexapdf/utils/sorted_tree_node.rb +18 -15
  62. data/lib/hexapdf/version.rb +1 -1
  63. data/test/hexapdf/common_tokenizer_tests.rb +2 -2
  64. data/test/hexapdf/content/graphic_object/test_arc.rb +4 -4
  65. data/test/hexapdf/content/test_canvas.rb +3 -3
  66. data/test/hexapdf/content/test_color_space.rb +1 -1
  67. data/test/hexapdf/encryption/test_aes.rb +4 -4
  68. data/test/hexapdf/encryption/test_standard_security_handler.rb +11 -11
  69. data/test/hexapdf/filter/test_ascii85_decode.rb +1 -1
  70. data/test/hexapdf/filter/test_ascii_hex_decode.rb +1 -1
  71. data/test/hexapdf/font/true_type/table/test_post.rb +1 -1
  72. data/test/hexapdf/font/true_type/test_subsetter.rb +10 -0
  73. data/test/hexapdf/font_loader/test_from_configuration.rb +7 -3
  74. data/test/hexapdf/font_loader/test_from_file.rb +7 -0
  75. data/test/hexapdf/layout/test_text_layouter.rb +12 -5
  76. data/test/hexapdf/test_configuration.rb +2 -2
  77. data/test/hexapdf/test_dictionary.rb +8 -1
  78. data/test/hexapdf/test_dictionary_fields.rb +9 -2
  79. data/test/hexapdf/test_document.rb +18 -10
  80. data/test/hexapdf/test_object.rb +71 -26
  81. data/test/hexapdf/test_parser.rb +205 -51
  82. data/test/hexapdf/test_pdf_array.rb +8 -1
  83. data/test/hexapdf/test_revisions.rb +35 -0
  84. data/test/hexapdf/test_serializer.rb +7 -0
  85. data/test/hexapdf/test_tokenizer.rb +28 -0
  86. data/test/hexapdf/test_writer.rb +2 -2
  87. data/test/hexapdf/type/acro_form/test_appearance_generator.rb +288 -35
  88. data/test/hexapdf/type/acro_form/test_button_field.rb +15 -0
  89. data/test/hexapdf/type/acro_form/test_choice_field.rb +92 -9
  90. data/test/hexapdf/type/acro_form/test_field.rb +39 -0
  91. data/test/hexapdf/type/acro_form/test_form.rb +87 -15
  92. data/test/hexapdf/type/acro_form/test_text_field.rb +77 -1
  93. data/test/hexapdf/type/test_font_simple.rb +2 -1
  94. data/test/hexapdf/type/test_font_true_type.rb +6 -0
  95. data/test/hexapdf/type/test_form.rb +8 -1
  96. data/test/hexapdf/type/test_page.rb +8 -1
  97. data/test/hexapdf/type/test_page_tree_node.rb +42 -0
  98. data/test/hexapdf/type/test_resources.rb +6 -0
  99. data/test/hexapdf/utils/test_bit_field.rb +2 -0
  100. data/test/hexapdf/utils/test_object_hash.rb +5 -0
  101. data/test/hexapdf/utils/test_sorted_tree_node.rb +10 -9
  102. data/test/test_helper.rb +2 -0
  103. metadata +6 -12
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 889b4bf1bc77da0a3fdfc62d2b5b09042aa1b5a567d5ed80ae382e6cdeb193f9
4
- data.tar.gz: 67f217de3dbd01653e9df4e8f8af7e8dba3745cd772e6d6ab930411ff3d1cfb3
3
+ metadata.gz: c43d8e9e117db1717ddfee73a54e4384743b8aa35863ab5bd19ffe57b8ce5674
4
+ data.tar.gz: 1020c8a3de8fcdf201500c1c0d22dfb99ed27daebac7baac92748f8127efc992
5
5
  SHA512:
6
- metadata.gz: 71affdceb736e0645c45b181a585b3a425135c0b22fba1daf28d89aaa6e73e5226f18a1e420fb75325653c87274f66664526d8ca55baaaa5251b4f822617b986
7
- data.tar.gz: 63aceaac41dd2ea797f92e7335a381bea5d1bdd2f7388c583431323e7ac9fae0855a404a84fbed70222130bd5eded126dae5385be2291d76c91021633d03a3bb
6
+ metadata.gz: e19eea4e88077afb7e8532fa6fe9ab2a03ffc5588749b72277462a971ebcec877ee72868d0ab698744117d46566be98e65c10225649d3bd1b4cd6e64e9625767
7
+ data.tar.gz: 6626a9feba0af0b46f293c1069a0d53b458a0dc29d08b82253f14f9bb98a878b914042faccc433b73f2f0e35d4da47c58a1bdebd2f3dee2fefb24c076a4e6bb3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,135 @@
1
+ ## 0.14.3 - 2021-02-16
2
+
3
+ ### Fixed
4
+
5
+ * Bug in [HexaPDF::Font::TrueType::Subsetter#use_glyph] which led to corrupt
6
+ text output
7
+ * [HexaPDF::Serializer] to handle infinite recursion problem
8
+ * Cross-reference table reconstruction to avoid an O(n^2) performance problem
9
+ * [HexaPDF::Type::Resources] validation to handle an invalid `/ProcSet` entry
10
+ containing a single value instead of an array
11
+ * Processing of invalid PDF files missing a required value in appearance streams
12
+ * Processing of invalid empty arrays that should be rectangles by converting
13
+ them to PDF null objects
14
+ * Processing of invalid PDF files containing indirect objects with offset 0
15
+ * Processing of invalid PDF files containing a space/CR or space/LF combination
16
+ after the 'stream' keyword
17
+
18
+
19
+ ## 0.14.2 - 2021-01-22
20
+
21
+ ### Fixed
22
+
23
+ * [HexaPDF::Font::TrueType::Subsetter#use_glyph] to really avoid using subset
24
+ glyph ID 41 (`)`)
25
+
26
+
27
+ ## 0.14.1 - 2021-01-21
28
+
29
+ ### Changed
30
+
31
+ * Validation message when checking for allowed values to include the invalid
32
+ object
33
+ * [HexaPDF::FontLoader::FromFile] to allow (re)using an existing font object
34
+ * [HexaPDF::Importer] internals to avoid problems with retained memory
35
+
36
+ ### Fixed
37
+
38
+ * Parsing of invalid PDF files where whitespace is missing after the integer
39
+ value of an indirect object
40
+ * [HexaPDF::Dictionary] so that adding new key-value pairs during validation is
41
+ possible
42
+
43
+
44
+ ## 0.14.0 - 2020-12-30
45
+
46
+ ### Added
47
+
48
+ * Support for creating AcroForm multiline text fields and their appearances
49
+ * Support for creating AcroForm comb text fields and their appearances
50
+ * Support for creating AcroForm password fields and their appearances
51
+ * Support for creating AcroForm file select fields and their appearances
52
+ * Support for creating AcroForm list box appearances
53
+ * [HexaPDF::Type::AcroForm::ChoiceField#list_box_top_index] and its setter
54
+ method
55
+ * [HexaPDF::Type::AcroForm::ChoiceField#update_widgets] to create appearances if
56
+ they don't exist
57
+ * Methods for caching data to [HexaPDF::Object]
58
+ * Support for splitting by page size to CLI command `hexapdf split`
59
+
60
+ ### Changed
61
+
62
+ * [HexaPDF::Utils::ObjectHash#oids] to be public instead of private
63
+ * Cross-reference table parsing to handle invalidly numbered main sections
64
+ * [HexaPDF::Document#cache] and [HexaPDF::Object#cache] to allow updating
65
+ values for existing keys
66
+ * Appearance creation methods of AcroForm objects to allow forcing the creation
67
+ of new appearances
68
+ * [HexaPDF::Type::AcroForm::AppearanceGenerator#create_text_appearances] to
69
+ re-use existing form objects
70
+ * AcroForm field creation methods to allow specifying often used field
71
+ properties
72
+
73
+ ### Fixed
74
+
75
+ * Missing usage of `:sort` flag for AcroForm choice fields
76
+ * Setting the `/I` field for AcroForm list boxes with multiple selection
77
+ * [HexaPDF::Layout::TextLayouter::SimpleLineWrapping] to remove glue items
78
+ (whitespace) before a hard line break
79
+ * Infinite loop when reconstructing the cross-reference table
80
+ * [HexaPDF::Type::AcroForm::ChoiceField] to support export values for option
81
+ items
82
+ * AcroForm text field appearance creation to only create a new appearance if the
83
+ field's value has changed
84
+ * AcroForm choice field appearance creation to only create a new appearance if
85
+ the involved dictionary fields' values have changed
86
+ * [HexaPDF::Type::AcroForm::ChoiceField#list_box_top_index=] to raise an error
87
+ if no option items are set
88
+ * [HexaPDF::PDFArray#to_ary] to return an array with preprocessed values
89
+ * [HexaPDF::Type::Form#contents=] to clear cached values to avoid returning e.g.
90
+ an invalid canvas object later
91
+ * [HexaPDF::Type::AcroForm::ButtonField#update_widgets] to create appearances if
92
+ they don't exist
93
+
94
+
95
+ ## 0.13.0 - 2020-11-15
96
+
97
+ ### Added
98
+
99
+ * Cross-reference table reconstruction for damaged PDFs, controllable via the
100
+ new 'parser.try_xref_reconstruction' option
101
+ * Two new `hexapdf inspect` commands for showing page objects and page content
102
+ streams by page number
103
+ * Flag `--check` to the CLI command `hexapdf info` for checking a file for
104
+ parse and validation errors
105
+ * [HexaPDF::Type::AcroForm::Field#embedded_widget?] for checking if a widget is
106
+ embedded in the field object
107
+ * [HexaPDF::Type::AcroForm::Field#delete_widget] for deleting a widget
108
+ * [HexaPDF::PDFArray#delete] for deleting an object from a PDF array
109
+ * [HexaPDF::Type::Page#ancestor_nodes] for retrieving all ancestor page tree
110
+ nodes of a page
111
+ * [HexaPDF::Type::PageTreeNode#move_page] for moving a page to another index
112
+
113
+ ### Changed
114
+
115
+ * **Breaking change**: Overhauled document/object validation interfaces and
116
+ internals to be more similar and to allow for reporting of multiple validation
117
+ problems
118
+ * Validation of TrueType fonts to ignore missing fields if the font name
119
+ suggests that the font is one of the standard 14 PDF fonts
120
+ * Option `-p` of CLI command `hexapdf image2pdf` to also allow lowercase page
121
+ size names
122
+
123
+ ### Fixed
124
+
125
+ * Reporting of cross-reference section entry parsing error
126
+ * PDF version used by default for dictionary fields
127
+ * Error in CLI command `hexapdf inspect` when parsing an invalid object number
128
+ * Output of error messages in CLI command `hexapdf inspect` to go to `$stderr`
129
+ * Bug in [HexaPDF::Type::AcroForm::TextField] validation due to missing nil
130
+ handling
131
+
132
+
1
133
  ## 0.12.3 - 2020-08-22
2
134
 
3
135
  ### Changed
@@ -42,10 +42,47 @@ rb = form.create_radio_button("Radio")
42
42
  end
43
43
  rb.field_value = :button0
44
44
 
45
- canvas.text("Text field", at: [50, 450])
46
- tx = form.create_text_field("Single Line")
47
- widget = tx.create_widget(page, Rect: [200, 445, 500, 465])
48
- tx.set_default_appearance_string(font_size: 16)
45
+ canvas.text("Text fields", at: [50, 450])
46
+
47
+ canvas.text("Single line", at: [70, 420])
48
+ tx = form.create_text_field("Single Line", font_size: 16)
49
+ widget = tx.create_widget(page, Rect: [200, 415, 500, 435])
49
50
  tx.field_value = "A sample test string!"
50
51
 
52
+ canvas.text("Multiline", at: [70, 390])
53
+ tx = form.create_multiline_text_field("Multiline", font_size: 0, align: :right)
54
+ widget = tx.create_widget(page, Rect: [200, 325, 500, 405])
55
+ widget.border_style(color: 0, width: 1)
56
+ tx.field_value = "A sample test string! " * 30 + "\nNew line\n\nAnother line"
57
+
58
+ canvas.text("Password", at: [70, 300])
59
+ tx = form.create_password_field("Password", font_size: 16)
60
+ widget = tx.create_widget(page, Rect: [200, 295, 500, 315])
61
+
62
+ canvas.text("File select", at: [70, 270])
63
+ tx = form.create_file_select_field("File Select", font_size: 16)
64
+ widget = tx.create_widget(page, Rect: [200, 265, 500, 285])
65
+ tx.field_value = "path/to/file.pdf"
66
+
67
+ canvas.text("Comb", at: [70, 240])
68
+ tx = form.create_comb_text_field("Comb field", max_chars: 10, font_size: 16, align: :center)
69
+ widget = tx.create_widget(page, Rect: [200, 220, 500, 255])
70
+ widget.border_style(color: [30, 128, 0], width: 1)
71
+ tx.field_value = 'Hello'
72
+
73
+ canvas.text("Combo Box", at: [50, 170])
74
+ cb = form.create_combo_box("Combo Box", font_size: 12, editable: true,
75
+ option_items: ['Value 1', 'Another value', 'Choose me!'])
76
+ widget = cb.create_widget(page, Rect: [200, 150, 500, 185])
77
+ widget.border_style(width: 1)
78
+ cb.field_value = 'Another value'
79
+
80
+ canvas.text("List Box", at: [50, 120])
81
+ lb = form.create_list_box("List Box", font_size: 15, align: :center, multi_select: true,
82
+ option_items: 1.upto(7).map {|i| "Value #{i}" })
83
+ widget = lb.create_widget(page, Rect: [200, 50, 500, 135])
84
+ widget.border_style(width: 1)
85
+ lb.list_box_top_index = 1
86
+ lb.field_value = ['Value 6', 'Value 2']
87
+
51
88
  doc.write('acro_form.pdf', optimize: true)
@@ -100,6 +100,7 @@ module HexaPDF
100
100
  def pdf_options(password)
101
101
  hash = {decryption_opts: {password: password}, config: {}}
102
102
  HexaPDF::GlobalConfiguration['filter.predictor.strict'] = command_parser.strict
103
+ hash[:config]['parser.try_xref_reconstruction'] = !command_parser.strict
103
104
  hash[:config]['parser.on_correctable_error'] =
104
105
  if command_parser.strict
105
106
  proc { true }
@@ -277,14 +278,15 @@ module HexaPDF
277
278
  #
278
279
  # See: #define_encryption_options
279
280
  def apply_encryption_options(doc)
280
- if @out_options.encryption == :add
281
+ case @out_options.encryption
282
+ when :add
281
283
  doc.encrypt(algorithm: @out_options.enc_algorithm,
282
284
  key_length: @out_options.enc_key_length,
283
285
  force_v4: @out_options.enc_force_v4,
284
286
  permissions: @out_options.enc_permissions,
285
287
  owner_password: @out_options.enc_owner_pwd,
286
288
  user_password: @out_options.enc_user_pwd)
287
- elsif @out_options.encryption == :remove
289
+ when :remove
288
290
  doc.encrypt(name: nil)
289
291
  end
290
292
  end
@@ -64,7 +64,8 @@ module HexaPDF
64
64
  orientation = :landscape
65
65
  page_size.delete_suffix!('-landscape')
66
66
  end
67
- HexaPDF::Type::Page.media_box(page_size.to_sym, orientation: orientation)
67
+ page_size = page_size.capitalize.to_sym
68
+ HexaPDF::Type::Page.media_box(page_size, orientation: orientation)
68
69
  end
69
70
  end
70
71
  options.on("--[no-]auto-rotate", "Automatically rotate pages based on image dimesions. " \
@@ -55,13 +55,21 @@ module HexaPDF
55
55
  long_desc(<<~EOF)
56
56
  This command extracts information from the Info dictionary of a PDF file as well
57
57
  as some other useful information like the used PDF version and encryption information.
58
+
59
+ If the --check option is specified, the PDF file will also be checked for parse and
60
+ validation errors. And if the process doesn't abort, HexaPDF is still able to handle the
61
+ file by correcting the errors.
58
62
  EOF
63
+ options.on("--check", "-c", "Check the PDF file for parse errors and validity") do |check|
64
+ @check_file = check
65
+ end
59
66
  options.on("--password PASSWORD", "-p", String,
60
67
  "The password for decryption. Use - for reading from standard input.") do |pwd|
61
68
  @password = (pwd == '-' ? read_password : pwd)
62
69
  end
63
70
  @password = nil
64
71
  @auto_decrypt = true
72
+ @check_file = false
65
73
  end
66
74
 
67
75
  def execute(file) #:nodoc:
@@ -79,8 +87,30 @@ module HexaPDF
79
87
  options = pdf_options(@password)
80
88
  options[:config]['document.auto_decrypt'] = @auto_decrypt
81
89
  HexaPDF::Document.open(file, **options) do |doc|
90
+ if @check_file
91
+ indirect_object = nil
92
+ validation_block = lambda do |msg, correctable, object|
93
+ object = indirect_object unless object.indirect? || object.type == :XXTrailer
94
+ object_type = if object.type == :XXTrailer
95
+ 'trailer'
96
+ elsif !object.type.to_s.start_with?("XX")
97
+ "object type #{object.type} (#{object.oid},#{object.gen})"
98
+ else
99
+ "object (#{object.oid},#{object.gen})"
100
+ end
101
+ object_type = "sub-object of #{object_type}" if object == indirect_object
102
+ puts "WARNING: Validation error for #{object_type}: #{msg} " \
103
+ "#{correctable ? '(correctable)' : ''}"
104
+ end
105
+ doc.trailer.validate(auto_correct: true, &validation_block)
106
+ doc.each(only_current: false, only_loaded: false) do |obj|
107
+ indirect_object = obj
108
+ obj.validate(auto_correct: true, &validation_block)
109
+ end
110
+ end
111
+
82
112
  output_line("File name", file)
83
- output_line("File size", File.stat(file).size.to_s + " bytes")
113
+ output_line("File size", File.stat(file).size.to_s << " bytes")
84
114
  @auto_decrypt && INFO_KEYS.each do |name|
85
115
  next unless doc.trailer.info.key?(name)
86
116
  output_line(name.to_s, doc.trailer.info[name].to_s)
@@ -110,10 +140,29 @@ module HexaPDF
110
140
  else
111
141
  raise
112
142
  end
143
+ rescue HexaPDF::MalformedPDFError => e
144
+ $stderr.puts "Error: PDF file #{file} is damaged and cannot be recovered"
145
+ $stderr.puts " #{e}"
146
+ end
147
+
148
+ # Use custom options if we are checking the PDF file for errors.
149
+ def pdf_options(password)
150
+ if @check_file
151
+ options = {decryption_opts: {password: password}, config: {}}
152
+ HexaPDF::GlobalConfiguration['filter.predictor.strict'] = false
153
+ options[:config]['parser.try_xref_reconstruction'] = true
154
+ options[:config]['parser.on_correctable_error'] = lambda do |_, msg, pos|
155
+ puts "WARNING: Parse error at position #{pos}: #{msg}"
156
+ false
157
+ end
158
+ options
159
+ else
160
+ super
161
+ end
113
162
  end
114
163
 
115
164
  def output_line(header, text) #:nodoc:
116
- puts((header + ":").ljust(COLUMN_WIDTH) << text)
165
+ puts(("#{header}:").ljust(COLUMN_WIDTH) << text)
117
166
  end
118
167
 
119
168
  end
@@ -122,22 +122,22 @@ module HexaPDF
122
122
  case command
123
123
  when /^\d+(,\d+)?$/, 'o', 'object'
124
124
  arg = (command.start_with?('o') ? data.shift : command)
125
- obj = pdf_object_from_string_reference(arg) rescue puts($!.message)
126
- if obj.data.stream && command_parser.verbosity_info?
125
+ obj = pdf_object_from_string_reference(arg) rescue $stderr.puts($!.message)
126
+ if obj&.data&.stream && command_parser.verbosity_info?
127
127
  $stderr.puts("Note: Object also has stream data")
128
128
  end
129
129
  serialize(obj.value, recursive: false) if obj
130
130
 
131
131
  when 'r', 'recursive'
132
132
  obj = if (obj = data.shift)
133
- pdf_object_from_string_reference(obj) rescue puts($!.message)
133
+ pdf_object_from_string_reference(obj) rescue $stderr.puts($!.message)
134
134
  else
135
135
  @doc.trailer
136
136
  end
137
137
  serialize(obj.value, recursive: true) if obj
138
138
 
139
139
  when 's', 'stream', 'raw', 'raw-stream'
140
- if (obj = pdf_object_from_string_reference(data.shift) rescue puts($!.message)) &&
140
+ if (obj = pdf_object_from_string_reference(data.shift) rescue $stderr.puts($!.message)) &&
141
141
  obj.kind_of?(HexaPDF::Stream)
142
142
  source = (command.start_with?('raw') ? obj.stream_source : obj.stream_decoder)
143
143
  while source.alive? && (stream_data = source.resume)
@@ -148,7 +148,7 @@ module HexaPDF
148
148
  end
149
149
 
150
150
  when 'x', 'xref'
151
- if (obj = pdf_object_from_string_reference(data.shift) rescue puts($!.message))
151
+ if (obj = pdf_object_from_string_reference(data.shift) rescue $stderr.puts($!.message))
152
152
  @doc.revisions.reverse_each do |rev|
153
153
  if (xref = rev.xref(obj))
154
154
  puts xref
@@ -178,6 +178,26 @@ module HexaPDF
178
178
  puts str
179
179
  end
180
180
 
181
+ when 'po', 'ps'
182
+ page_number_str = data.shift
183
+ unless page_number_str
184
+ $stderr.puts("Error: Missing PAGE argument to #{command}")
185
+ next
186
+ end
187
+ page_number = parse_pages_specification(page_number_str, @doc.pages.count).first&.first
188
+ unless page_number
189
+ $stderr.puts("Error: Invalid page number #{page_number_str}")
190
+ next
191
+ end
192
+ page = @doc.pages[page_number]
193
+ if command.start_with?('ps')
194
+ $stdout.write(page.contents)
195
+ else
196
+ puts "#{page.oid} #{page.gen} obj"
197
+ serialize(page.value, recursive: false)
198
+ puts "endobj"
199
+ end
200
+
181
201
  when 'pc', 'page-count'
182
202
  puts @doc.pages.count
183
203
 
@@ -217,9 +237,9 @@ module HexaPDF
217
237
  if str.nil?
218
238
  raise "Error: Missing argument object identifier OID[,GEN]"
219
239
  elsif !str.match?(/^\d+(,\d+)?$/)
220
- raise "Error: Invalid argument: Must be of form OID[,GEN]"
240
+ raise "Error: Invalid argument: Must be of form OID[,GEN], not '#{str}'"
221
241
  elsif !(obj = @doc.object(pdf_reference_from_string(str)))
222
- raise "Error: No object with the given object identifier found"
242
+ raise "Error: No object with the given object identifier '#{str}' found"
223
243
  else
224
244
  obj
225
245
  end
@@ -240,7 +260,7 @@ module HexaPDF
240
260
  puts "<<"
241
261
  (recursive ? val.sort : val).each do |k, v|
242
262
  next if v.nil? || (v.respond_to?(:null?) && v.null?)
243
- print ' ' * (indent + 1) + @serializer.serialize_symbol(k) + " "
263
+ print '%s%s ' % [' ' * (indent + 1), @serializer.serialize_symbol(k)]
244
264
  serialize(v, recursive: recursive, seen: seen, indent: indent + 1)
245
265
  puts
246
266
  end
@@ -283,6 +303,8 @@ module HexaPDF
283
303
  ["c[atalog]", "Print the catalog dictionary"],
284
304
  ["t[railer]", "Print the trailer dictionary"],
285
305
  ["p[ages] [RANGE]", "Print information about pages"],
306
+ ["po PAGE", "Print the page object"],
307
+ ["ps PAGE", "Print the content stream of the page"],
286
308
  ["pc | page-count", "Print the number of pages"],
287
309
  ["search REGEXP", "Print objects matching the pattern"],
288
310
  ["h[elp]", "Show the help"],
@@ -122,7 +122,7 @@ module HexaPDF
122
122
 
123
123
  # Assemble pages
124
124
  target = (@initial_empty ? HexaPDF::Document.new : @files.first.file)
125
- page_tree = target.add(Type: :Pages)
125
+ page_tree = target.add({Type: :Pages})
126
126
  import_pages(page_tree)
127
127
  target.catalog[:Pages] = page_tree
128
128
  remove_unused_pages(target)
@@ -44,16 +44,28 @@ module HexaPDF
44
44
 
45
45
  def initialize #:nodoc:
46
46
  super('split', takes_commands: false)
47
- short_desc("Split a PDF file into individual pages")
47
+ short_desc("Split a PDF file")
48
48
  long_desc(<<~EOF)
49
- If no OUTPUT_SPEC is specified, the pages are named <PDF>_0001.pdf, <PDF>_0002.pdf, ...
50
- and so on. To specify a custom name, provide the OUTPUT_SPEC argument. It can contain a
51
- printf-style format definition like '%04d' to specify the place where the page number
52
- should be inserted.
49
+ The default strategy is to split a PDF into individual pages, i.e. splitting is done by
50
+ page number. It is also possible to split by page size where pages with the same page size
51
+ get put into the same output PDF.
52
+
53
+ If no OUTPUT_SPEC is specified, the resulting PDF files are named <PDF>_0001.pdf,
54
+ <PDF>_0002.pdf, ... when splitting by page number and <PDF>_A4.pdf, <PDF>_Letter.pdf, ...
55
+ when splitting by page size.
56
+
57
+ To specify a custom name, provide the OUTPUT_SPEC argument. It can contain a printf-style
58
+ format definition like '%04d' to specify the place where the page number should be
59
+ inserted. In case of splitting by page size, the place of the format defintion is replaced
60
+ with the name of the page size, e.g. A4 or Letter.
53
61
 
54
62
  The optimization and encryption options are applied to each created output file.
55
63
  EOF
56
64
 
65
+ options.on("--strategy STRATEGY", "-s", [:page_number, :page_size], "Defines how the PDF " \
66
+ "file should be split: page_number or page_size (default: page_number)") do |s|
67
+ @strategy = s
68
+ end
57
69
  options.on("--password PASSWORD", "-p", String,
58
70
  "The password for decryption. Use - for reading from standard input.") do |pwd|
59
71
  @password = (pwd == '-' ? read_password : pwd)
@@ -62,23 +74,71 @@ module HexaPDF
62
74
  define_encryption_options
63
75
 
64
76
  @password = nil
77
+ @strategy = :page_number
65
78
  end
66
79
 
67
80
  def execute(pdf, output_spec = pdf.sub(/\.pdf$/i, '_%04d.pdf')) #:nodoc:
68
- output_spec = output_spec.sub('%', '%<page>')
69
81
  with_document(pdf, password: @password) do |doc|
70
- doc.pages.each_with_index do |page, index|
71
- output_file = sprintf(output_spec, page: index + 1)
72
- maybe_raise_on_existing_file(output_file)
73
- out = HexaPDF::Document.new
74
- out.pages.add(out.import(page))
75
- apply_encryption_options(out)
76
- apply_optimization_options(out)
77
- write_document(out, output_file)
82
+ if @strategy == :page_number
83
+ split_by_page_number(doc, output_spec)
84
+ else
85
+ split_by_page_size(doc, output_spec)
78
86
  end
79
87
  end
80
88
  end
81
89
 
90
+ private
91
+
92
+ # Splits the document into individual pages.
93
+ def split_by_page_number(doc, output_spec)
94
+ doc.pages.each_with_index do |page, index|
95
+ output_file = sprintf(output_spec, index + 1)
96
+ maybe_raise_on_existing_file(output_file)
97
+ out = HexaPDF::Document.new
98
+ out.pages.add(out.import(page))
99
+ apply_encryption_options(out)
100
+ apply_optimization_options(out)
101
+ write_document(out, output_file)
102
+ end
103
+ end
104
+
105
+ # Splits the document into files based on the page sizes.
106
+ def split_by_page_size(doc, output_spec)
107
+ output_spec = output_spec.sub(/%.*?[a-zA-Z]/, '%s')
108
+ out_files = Hash.new do |hash, key|
109
+ output_file = sprintf(output_spec, key)
110
+ maybe_raise_on_existing_file(output_file)
111
+ out = HexaPDF::Document.new
112
+ out.config['output_file'] = output_file
113
+ hash[key] = out
114
+ end
115
+
116
+ doc.pages.each do |page|
117
+ out = out_files[page_size_name(page.box(:media).value)]
118
+ out.pages.add(out.import(page))
119
+ end
120
+
121
+ out_files.each_value do |out|
122
+ apply_encryption_options(out)
123
+ apply_optimization_options(out)
124
+ write_document(out, out.config['output_file'])
125
+ end
126
+ end
127
+
128
+ # Tries to retrieve a page size name based on the media box. If this is not possible, the
129
+ # returned page size name consists of width x height.
130
+ def page_size_name(media_box)
131
+ @page_name_cache ||= {}
132
+ return @page_name_cache[media_box] if @page_name_cache.key?(media_box)
133
+
134
+ paper_size = HexaPDF::Type::Page::PAPER_SIZE.find do |_name, box|
135
+ box.each_with_index.all? {|entry, index| (entry - media_box[index]).abs < 5 }
136
+ end
137
+
138
+ @page_name_cache[media_box] =
139
+ paper_size ? paper_size[0] : "%.0fx%.0f" % media_box.values_at(2, 3)
140
+ end
141
+
82
142
  end
83
143
 
84
144
  end