hexapdf 0.12.3 → 0.14.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +132 -0
  3. data/examples/019-acro_form.rb +41 -4
  4. data/lib/hexapdf/cli/command.rb +4 -2
  5. data/lib/hexapdf/cli/image2pdf.rb +2 -1
  6. data/lib/hexapdf/cli/info.rb +51 -2
  7. data/lib/hexapdf/cli/inspect.rb +30 -8
  8. data/lib/hexapdf/cli/merge.rb +1 -1
  9. data/lib/hexapdf/cli/split.rb +74 -14
  10. data/lib/hexapdf/configuration.rb +15 -0
  11. data/lib/hexapdf/content/graphic_object/arc.rb +3 -3
  12. data/lib/hexapdf/dictionary.rb +12 -6
  13. data/lib/hexapdf/dictionary_fields.rb +2 -10
  14. data/lib/hexapdf/document.rb +41 -16
  15. data/lib/hexapdf/document/files.rb +0 -1
  16. data/lib/hexapdf/encryption/fast_arc4.rb +1 -1
  17. data/lib/hexapdf/encryption/security_handler.rb +1 -0
  18. data/lib/hexapdf/encryption/standard_security_handler.rb +1 -0
  19. data/lib/hexapdf/font/cmap.rb +1 -4
  20. data/lib/hexapdf/font/true_type/subsetter.rb +16 -3
  21. data/lib/hexapdf/font/true_type/table/head.rb +1 -0
  22. data/lib/hexapdf/font/true_type/table/os2.rb +2 -0
  23. data/lib/hexapdf/font/true_type/table/post.rb +15 -10
  24. data/lib/hexapdf/font_loader/from_configuration.rb +2 -2
  25. data/lib/hexapdf/font_loader/from_file.rb +18 -8
  26. data/lib/hexapdf/image_loader/png.rb +3 -2
  27. data/lib/hexapdf/importer.rb +3 -2
  28. data/lib/hexapdf/layout/line.rb +1 -1
  29. data/lib/hexapdf/layout/style.rb +23 -23
  30. data/lib/hexapdf/layout/text_layouter.rb +2 -2
  31. data/lib/hexapdf/layout/text_shaper.rb +3 -2
  32. data/lib/hexapdf/object.rb +52 -25
  33. data/lib/hexapdf/parser.rb +107 -7
  34. data/lib/hexapdf/pdf_array.rb +15 -5
  35. data/lib/hexapdf/revisions.rb +29 -21
  36. data/lib/hexapdf/serializer.rb +37 -10
  37. data/lib/hexapdf/task/optimize.rb +6 -4
  38. data/lib/hexapdf/tokenizer.rb +22 -0
  39. data/lib/hexapdf/type/acro_form/appearance_generator.rb +130 -27
  40. data/lib/hexapdf/type/acro_form/button_field.rb +5 -2
  41. data/lib/hexapdf/type/acro_form/choice_field.rb +68 -14
  42. data/lib/hexapdf/type/acro_form/field.rb +35 -5
  43. data/lib/hexapdf/type/acro_form/form.rb +139 -14
  44. data/lib/hexapdf/type/acro_form/text_field.rb +70 -4
  45. data/lib/hexapdf/type/actions/uri.rb +3 -2
  46. data/lib/hexapdf/type/annotations/widget.rb +3 -4
  47. data/lib/hexapdf/type/catalog.rb +2 -2
  48. data/lib/hexapdf/type/cid_font.rb +1 -1
  49. data/lib/hexapdf/type/file_specification.rb +1 -1
  50. data/lib/hexapdf/type/font.rb +1 -1
  51. data/lib/hexapdf/type/font_simple.rb +4 -2
  52. data/lib/hexapdf/type/font_true_type.rb +6 -2
  53. data/lib/hexapdf/type/font_type0.rb +4 -4
  54. data/lib/hexapdf/type/form.rb +6 -2
  55. data/lib/hexapdf/type/image.rb +2 -2
  56. data/lib/hexapdf/type/page.rb +21 -12
  57. data/lib/hexapdf/type/page_tree_node.rb +29 -5
  58. data/lib/hexapdf/type/resources.rb +5 -0
  59. data/lib/hexapdf/type/trailer.rb +2 -3
  60. data/lib/hexapdf/utils/object_hash.rb +0 -1
  61. data/lib/hexapdf/utils/sorted_tree_node.rb +18 -15
  62. data/lib/hexapdf/version.rb +1 -1
  63. data/test/hexapdf/common_tokenizer_tests.rb +2 -2
  64. data/test/hexapdf/content/graphic_object/test_arc.rb +4 -4
  65. data/test/hexapdf/content/test_canvas.rb +3 -3
  66. data/test/hexapdf/content/test_color_space.rb +1 -1
  67. data/test/hexapdf/encryption/test_aes.rb +4 -4
  68. data/test/hexapdf/encryption/test_standard_security_handler.rb +11 -11
  69. data/test/hexapdf/filter/test_ascii85_decode.rb +1 -1
  70. data/test/hexapdf/filter/test_ascii_hex_decode.rb +1 -1
  71. data/test/hexapdf/font/true_type/table/test_post.rb +1 -1
  72. data/test/hexapdf/font/true_type/test_subsetter.rb +10 -0
  73. data/test/hexapdf/font_loader/test_from_configuration.rb +7 -3
  74. data/test/hexapdf/font_loader/test_from_file.rb +7 -0
  75. data/test/hexapdf/layout/test_text_layouter.rb +12 -5
  76. data/test/hexapdf/test_configuration.rb +2 -2
  77. data/test/hexapdf/test_dictionary.rb +8 -1
  78. data/test/hexapdf/test_dictionary_fields.rb +9 -2
  79. data/test/hexapdf/test_document.rb +18 -10
  80. data/test/hexapdf/test_object.rb +71 -26
  81. data/test/hexapdf/test_parser.rb +205 -51
  82. data/test/hexapdf/test_pdf_array.rb +8 -1
  83. data/test/hexapdf/test_revisions.rb +35 -0
  84. data/test/hexapdf/test_serializer.rb +7 -0
  85. data/test/hexapdf/test_tokenizer.rb +28 -0
  86. data/test/hexapdf/test_writer.rb +2 -2
  87. data/test/hexapdf/type/acro_form/test_appearance_generator.rb +288 -35
  88. data/test/hexapdf/type/acro_form/test_button_field.rb +15 -0
  89. data/test/hexapdf/type/acro_form/test_choice_field.rb +92 -9
  90. data/test/hexapdf/type/acro_form/test_field.rb +39 -0
  91. data/test/hexapdf/type/acro_form/test_form.rb +87 -15
  92. data/test/hexapdf/type/acro_form/test_text_field.rb +77 -1
  93. data/test/hexapdf/type/test_font_simple.rb +2 -1
  94. data/test/hexapdf/type/test_font_true_type.rb +6 -0
  95. data/test/hexapdf/type/test_form.rb +8 -1
  96. data/test/hexapdf/type/test_page.rb +8 -1
  97. data/test/hexapdf/type/test_page_tree_node.rb +42 -0
  98. data/test/hexapdf/type/test_resources.rb +6 -0
  99. data/test/hexapdf/utils/test_bit_field.rb +2 -0
  100. data/test/hexapdf/utils/test_object_hash.rb +5 -0
  101. data/test/hexapdf/utils/test_sorted_tree_node.rb +10 -9
  102. data/test/test_helper.rb +2 -0
  103. metadata +6 -12
@@ -162,9 +162,10 @@ module HexaPDF
162
162
  io.seek(length, IO::SEEK_CUR)
163
163
  end
164
164
  when 'tRNS' # PNG s11.3.2
165
- if @color_type == INDEXED
165
+ case @color_type
166
+ when INDEXED
166
167
  trns = io.read(length).unpack('C*')
167
- elsif @color_type == TRUECOLOR || @color_type == GREYSCALE
168
+ when TRUECOLOR, GREYSCALE
168
169
  dict[:Mask] = io.read(length).unpack('n*').map {|val| [val, val] }.flatten
169
170
  else
170
171
  io.seek(length, IO::SEEK_CUR)
@@ -90,7 +90,7 @@ module HexaPDF
90
90
  #
91
91
  # An error is raised if the object doesn't belong to the +source+ document.
92
92
  def import(object)
93
- mapped_object = @mapper[object.data] if object.kind_of?(HexaPDF::Object)
93
+ mapped_object = @mapper[object.data]&.__getobj__ if object.kind_of?(HexaPDF::Object)
94
94
  if object.kind_of?(HexaPDF::Object) && object.document? && @source != object.document
95
95
  raise HexaPDF::Error, "Import error: Incorrect document object for importer"
96
96
  elsif mapped_object && mapped_object == @destination.object(mapped_object)
@@ -118,7 +118,8 @@ module HexaPDF
118
118
  if object.type == :Catalog || object.type == :Pages
119
119
  @mapper[object.data] = nil
120
120
  else
121
- obj = @mapper[object.data] = object.dup
121
+ obj = object.dup
122
+ @mapper[object.data] = NullableWeakRef.new(obj)
122
123
  obj.document = @destination.__getobj__
123
124
  obj.instance_variable_set(:@data, obj.data.dup)
124
125
  obj.data.oid = 0
@@ -198,7 +198,7 @@ module HexaPDF
198
198
  # Note: The cache is not cleared!
199
199
  def add(item)
200
200
  last = @items.last
201
- if last.class == item.class && item.kind_of?(TextFragment) && last.style == item.style
201
+ if last.instance_of?(item.class) && item.kind_of?(TextFragment) && last.style == item.style
202
202
  if last.items.frozen?
203
203
  @items[-1] = last = last.dup
204
204
  last.items = last.items.dup
@@ -524,7 +524,7 @@ module HexaPDF
524
524
  # Style.new(font_size: 15, align: :center, valign: center)
525
525
  def initialize(**properties)
526
526
  update(**properties)
527
- @scaled_item_widths = {}
527
+ @scaled_item_widths = {}.compare_by_identity
528
528
  end
529
529
 
530
530
  # Duplicates the complex properties that can be modified, as well as the cache.
@@ -883,41 +883,41 @@ module HexaPDF
883
883
  [:text_rise, 0],
884
884
  [:font_features, {}],
885
885
  [:text_rendering_mode, "Content::TextRenderingMode::FILL",
886
- setter: "Content::TextRenderingMode.normalize(value)"],
886
+ {setter: "Content::TextRenderingMode.normalize(value)"}],
887
887
  [:subscript, false,
888
- setter: "value; superscript(false) if superscript",
889
- valid_values: [true, false]],
888
+ {setter: "value; superscript(false) if superscript",
889
+ valid_values: [true, false]}],
890
890
  [:superscript, false,
891
- setter: "value; subscript(false) if subscript",
892
- valid_values: [true, false]],
893
- [:underline, false, valid_values: [true, false]],
894
- [:strikeout, false, valid_values: [true, false]],
891
+ {setter: "value; subscript(false) if subscript",
892
+ valid_values: [true, false]}],
893
+ [:underline, false, {valid_values: [true, false]}],
894
+ [:strikeout, false, {valid_values: [true, false]}],
895
895
  [:fill_color, "default_color"],
896
896
  [:fill_alpha, 1],
897
897
  [:stroke_color, "default_color"],
898
898
  [:stroke_alpha, 1],
899
899
  [:stroke_width, 1],
900
900
  [:stroke_cap_style, "Content::LineCapStyle::BUTT_CAP",
901
- setter: "Content::LineCapStyle.normalize(value)"],
901
+ {setter: "Content::LineCapStyle.normalize(value)"}],
902
902
  [:stroke_join_style, "Content::LineJoinStyle::MITER_JOIN",
903
- setter: "Content::LineJoinStyle.normalize(value)"],
903
+ {setter: "Content::LineJoinStyle.normalize(value)"}],
904
904
  [:stroke_miter_limit, 10.0],
905
905
  [:stroke_dash_pattern, "Content::LineDashPattern.new",
906
- setter: "Content::LineDashPattern.normalize(value, phase)", extra_args: ", phase = 0"],
907
- [:align, :left, valid_values: [:left, :center, :right, :justify]],
908
- [:valign, :top, valid_values: [:top, :center, :bottom]],
906
+ {setter: "Content::LineDashPattern.normalize(value, phase)", extra_args: ", phase = 0"}],
907
+ [:align, :left, {valid_values: [:left, :center, :right, :justify]}],
908
+ [:valign, :top, {valid_values: [:top, :center, :bottom]}],
909
909
  [:text_indent, 0],
910
910
  [:line_spacing, "LineSpacing.new(type: :single)",
911
- setter: "LineSpacing.new(**(value.kind_of?(Symbol) ? {type: value, value: extra_arg} : value))",
912
- extra_args: ", extra_arg = nil"],
913
- [:last_line_gap, false, valid_values: [true, false]],
911
+ {setter: "LineSpacing.new(**(value.kind_of?(Symbol) ? {type: value, value: extra_arg} : value))",
912
+ extra_args: ", extra_arg = nil"}],
913
+ [:last_line_gap, false, {valid_values: [true, false]}],
914
914
  [:background_color, nil],
915
- [:padding, "Quad.new(0)", setter: "Quad.new(value)"],
916
- [:margin, "Quad.new(0)", setter: "Quad.new(value)"],
917
- [:border, "Border.new", setter: "Border.new(**value)"],
918
- [:overlays, "Layers.new", setter: "Layers.new(value)"],
919
- [:underlays, "Layers.new", setter: "Layers.new(value)"],
920
- [:position, :default, valid_values: [:default, :float, :flow, :absolute]],
915
+ [:padding, "Quad.new(0)", {setter: "Quad.new(value)"}],
916
+ [:margin, "Quad.new(0)", {setter: "Quad.new(value)"}],
917
+ [:border, "Border.new", {setter: "Border.new(**value)"}],
918
+ [:overlays, "Layers.new", {setter: "Layers.new(value)"}],
919
+ [:underlays, "Layers.new", {setter: "Layers.new(value)"}],
920
+ [:position, :default, {valid_values: [:default, :float, :flow, :absolute]}],
921
921
  [:position_hint, nil],
922
922
  ].each do |name, default, options = {}|
923
923
  default = default.inspect unless default.kind_of?(String)
@@ -1075,7 +1075,7 @@ module HexaPDF
1075
1075
  # The item may be a (singleton) glyph object or an integer/float, i.e. items that can appear
1076
1076
  # inside a TextFragment.
1077
1077
  def scaled_item_width(item)
1078
- @scaled_item_widths[item.object_id] ||=
1078
+ @scaled_item_widths[item] ||=
1079
1079
  begin
1080
1080
  if item.kind_of?(Numeric)
1081
1081
  -item * scaled_font_size
@@ -388,7 +388,7 @@ module HexaPDF
388
388
  end
389
389
  when :penalty
390
390
  if item.penalty <= -Penalty::INFINITY
391
- add_box_item(item.item) if item.item
391
+ add_box_item(item.item) if item.width > 0
392
392
  break unless yield(create_unjustified_line, item)
393
393
  reset_after_line_break(index + 1)
394
394
  elsif item.penalty >= Penalty::INFINITY
@@ -458,7 +458,7 @@ module HexaPDF
458
458
  end
459
459
  when :penalty
460
460
  if item.penalty <= -Penalty::INFINITY
461
- add_box_item(item.item) if item.item
461
+ add_box_item(item.item) if item.width > 0
462
462
  break unless (action = yield(create_unjustified_line, item))
463
463
  reset_after_line_break_variable_width(index + 1, true, action)
464
464
  elsif item.penalty >= Penalty::INFINITY
@@ -68,9 +68,10 @@ module HexaPDF
68
68
  text_fragment.clear_cache
69
69
  end
70
70
  if text_fragment.style.font_features[:kern] && font.wrapped_font.features.include?(:kern)
71
- if font.font_type == :TrueType
71
+ case font.font_type
72
+ when :TrueType
72
73
  process_true_type_kerning(text_fragment)
73
- elsif font.font_type == :Type1
74
+ when :Type1
74
75
  process_type1_kerning(text_fragment)
75
76
  end
76
77
  text_fragment.clear_cache
@@ -122,9 +122,6 @@ module HexaPDF
122
122
 
123
123
  include Comparable
124
124
 
125
- # A list of classes whose objects cannot be duplicated.
126
- NOT_DUPLICATABLE_CLASSES = [NilClass, FalseClass, TrueClass, Symbol, Integer, Float].freeze
127
-
128
125
  # :call-seq:
129
126
  # HexaPDF::Object.deep_copy(object) -> copy
130
127
  #
@@ -139,8 +136,6 @@ module HexaPDF
139
136
  (object.indirect? || object.must_be_indirect? ? object : deep_copy(object.value))
140
137
  when HexaPDF::Reference
141
138
  object
142
- when *NOT_DUPLICATABLE_CLASSES
143
- object
144
139
  else
145
140
  object.dup
146
141
  end
@@ -251,29 +246,31 @@ module HexaPDF
251
246
  end
252
247
 
253
248
  # :call-seq:
254
- # obj.validate(auto_correct: true) -> true or false
255
- # obj.validate(auto_correct: true) {|msg, correctable| block } -> true or false
249
+ # obj.validate(auto_correct: true) -> true or false
250
+ # obj.validate(auto_correct: true) {|msg, correctable, obj| block } -> true or false
256
251
  #
257
- # Validates the object and, optionally, corrects problems when the option +auto_correct+ is set.
258
- # The validation routine itself has to be implemented in the #perform_validation method - see
259
- # its documentation for more information.
252
+ # Validates the object, optionally corrects problems when the option +auto_correct+ is set and
253
+ # returns +true+ if the object is deemed valid and +false+ otherwise.
260
254
  #
261
255
  # If a block is given, it is called on validation problems with a problem description and
262
- # whether the problem is correctable.
256
+ # whether the problem is automatically correctable. The third argument to the block is usually
257
+ # this object but may be another object if during auto-correction a new object was created and
258
+ # validated.
263
259
  #
264
- # Returns +true+ if the object is deemed valid and +false+ otherwise.
260
+ # The validation routine itself has to be implemented in the #perform_validation method - see
261
+ # its documentation for more information.
265
262
  #
266
263
  # *Note*: Even if the return value is +true+ there may be problems since HexaPDF doesn't
267
264
  # currently implement the full PDF spec. However, if the return value is +false+, there is
268
265
  # certainly a problem!
269
266
  def validate(auto_correct: true)
270
- catch do |catch_tag|
271
- perform_validation do |msg, correctable|
272
- yield(msg, correctable) if block_given?
273
- throw(catch_tag, false) unless auto_correct && correctable
274
- end
275
- true
267
+ result = true
268
+ perform_validation do |msg, correctable, object|
269
+ yield(msg, correctable, object || self) if block_given?
270
+ result = false unless correctable
271
+ return false unless auto_correct
276
272
  end
273
+ result
277
274
  end
278
275
 
279
276
  # Makes a deep copy of the source PDF object and resets the object identifier.
@@ -287,6 +284,28 @@ module HexaPDF
287
284
  obj
288
285
  end
289
286
 
287
+ # Caches and returns the given +value+ or the value of the block under the given cache key. If
288
+ # there is already a cached value for the key and +update+ is +false+, it is just returned.
289
+ #
290
+ # Set +update+ to +true+ to force an update of the cached value.
291
+ #
292
+ # This uses Document#cache internally.
293
+ def cache(key, value = Document::UNSET, update: false, &block)
294
+ document.cache(@data, key, value, update: update, &block)
295
+ end
296
+
297
+ # Returns +true+ if there is a cached value for the given key.
298
+ #
299
+ # This uses Document#cached? internally.
300
+ def cached?(key)
301
+ document.cached?(@data, key)
302
+ end
303
+
304
+ # Clears the cache for this object.
305
+ def clear_cache
306
+ document.clear_cache(@data)
307
+ end
308
+
290
309
  # Compares this object to another object.
291
310
  #
292
311
  # If the other object does not respond to +oid+ or +gen+, +nil+ is returned. Otherwise objects
@@ -339,17 +358,25 @@ module HexaPDF
339
358
  # are also performed!
340
359
  #
341
360
  # When the validation routine finds that the object is invalid, it has to yield a problem
342
- # description and whether the problem can be corrected. After yielding, the problem has to be
343
- # corrected which poses no problem because the #validate method makes sure that the yield only
344
- # returns if the problem is actually correctable and if it should be corrected.
361
+ # description and whether the problem can be corrected. An optional third argument may contain
362
+ # the object that gets validated if it is different from this object (may happen when
363
+ # auto-correction is used).
345
364
  #
346
- # Here is a sample validation routine for stream objects:
365
+ # After yielding, the problem has to be corrected if it is correctable. If it is not correctable
366
+ # and not correcting would lead to exceptions the method has to return early.
367
+ #
368
+ # Here is a sample validation routine for a dictionary object type:
347
369
  #
348
370
  # def perform_validation
349
371
  # super
350
- # unless value.kind_of?(Hash)
351
- # yield("A stream object needs a Hash as value")
352
- # self.value = {}
372
+ #
373
+ # if value[:SomeKey].length != 7
374
+ # yield("Length of /SomeKey is invalid")
375
+ # # No need to return early here because following check doesn't rely on /SomeKey
376
+ # end
377
+ #
378
+ # if value[:OtherKey] % 2 == 0
379
+ # yield("/OtherKey needs to contain an odd number of elements")
353
380
  # end
354
381
  # end
355
382
  def perform_validation(&block)
@@ -59,6 +59,7 @@ module HexaPDF
59
59
  @tokenizer = Tokenizer.new(io)
60
60
  @document = document
61
61
  @object_stream_data = {}
62
+ @reconstructed_revision = nil
62
63
  retrieve_pdf_header_offset_and_version
63
64
  end
64
65
 
@@ -71,7 +72,13 @@ module HexaPDF
71
72
  obj, oid, gen, stream =
72
73
  case xref_entry.type
73
74
  when :in_use
74
- parse_indirect_object(xref_entry.pos)
75
+ if xref_entry.pos == 0 && xref_entry.oid != 0
76
+ # Handle seen-in-the-wild objects with invalid offset 0
77
+ maybe_raise("Indirect object (#{xref_entry.oid},#{xref_entry.gen}) has offset 0", pos: 0)
78
+ [nil, xref_entry.oid, xref_entry.gen, nil]
79
+ else
80
+ parse_indirect_object(xref_entry.pos)
81
+ end
75
82
  when :free
76
83
  [nil, xref_entry.oid, xref_entry.gen, nil]
77
84
  when :compressed
@@ -82,10 +89,12 @@ module HexaPDF
82
89
 
83
90
  if xref_entry.oid != 0 && (oid != xref_entry.oid || gen != xref_entry.gen)
84
91
  raise_malformed("The oid,gen (#{oid},#{gen}) values of the indirect object don't match " \
85
- "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
92
+ "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
86
93
  end
87
94
 
88
95
  @document.wrap(obj, oid: oid, gen: gen, stream: stream)
96
+ rescue HexaPDF::MalformedPDFError
97
+ reconstructed_revision.object(xref_entry)
89
98
  end
90
99
 
91
100
  # Parses the indirect object at the specified offset.
@@ -110,7 +119,15 @@ module HexaPDF
110
119
  maybe_raise("No indirect object value between 'obj' and 'endobj'", pos: @tokenizer.pos)
111
120
  object = nil
112
121
  else
113
- object = @tokenizer.next_object
122
+ begin
123
+ object = @tokenizer.next_object
124
+ rescue MalformedPDFError
125
+ # Handle often found invalid indirect object with missing whitespace after number
126
+ maybe_raise("Invalid object value after 'obj'", pos: @tokenizer.pos,
127
+ force: !(tok.kind_of?(Tokenizer::Token) && tok =~ /\A\d+endobj\z/))
128
+ object = tok.to_i
129
+ @tokenizer.pos -= 6
130
+ end
114
131
  end
115
132
 
116
133
  tok = @tokenizer.next_token
@@ -122,7 +139,9 @@ module HexaPDF
122
139
  tok1 = @tokenizer.next_byte
123
140
  tok2 = @tokenizer.next_byte if tok1 == 13 # 13=CR, 10=LF
124
141
  if tok1 != 10 && tok1 != 13
125
- raise_malformed("Keyword stream must be followed by LF or CR/LF", pos: @tokenizer.pos)
142
+ tok2 = @tokenizer.next_byte
143
+ maybe_raise("Keyword stream must be followed by LF or CR/LF", pos: @tokenizer.pos,
144
+ force: tok1 != 32 || (tok2 != 10 && tok2 != 13)) # 32=space
126
145
  elsif tok1 == 13 && tok2 != 10
127
146
  maybe_raise("Keyword stream must be followed by LF or CR/LF, not CR alone",
128
147
  pos: @tokenizer.pos)
@@ -235,14 +254,14 @@ module HexaPDF
235
254
  @tokenizer.skip_whitespace
236
255
  start.upto(start + number_of_entries - 1) do |oid|
237
256
  pos, gen, type = @tokenizer.next_xref_entry do |matched_size|
238
- maybe_raise("Invalid cross-reference subsection entry", pos: @tokenizer.pos,
239
- force: matched_size == 20)
257
+ maybe_raise("Invalid cross-reference entry", pos: @tokenizer.pos,
258
+ force: !matched_size)
240
259
  end
241
260
  if xref.entry?(oid)
242
261
  next
243
262
  elsif type == 'n'
244
263
  if pos == 0 || gen > 65535
245
- maybe_raise("Invalid in use cross-reference entry in cross-reference section",
264
+ maybe_raise("Invalid in use cross-reference entry",
246
265
  pos: @tokenizer.pos)
247
266
  xref.add_free_entry(oid, gen)
248
267
  else
@@ -264,6 +283,27 @@ module HexaPDF
264
283
  raise_malformed("Trailer is #{trailer.class} instead of dictionary ", pos: @tokenizer.pos)
265
284
  end
266
285
 
286
+ unless trailer[:Prev] || xref.max_oid == 0 || xref.entry?(0)
287
+ first_entry = xref[xref.oids[0]]
288
+ test_entry = xref[xref.oids[-1]]
289
+ @tokenizer.pos = test_entry.pos + @header_offset
290
+ test_oid = @tokenizer.next_token
291
+ first_oid = first_entry.oid
292
+
293
+ force_failure = !first_entry.free? || first_entry.gen != 65535 ||
294
+ !test_oid.kind_of?(Integer) || xref.oids[-1] - test_oid != first_oid
295
+ maybe_raise("Main cross-reference section has invalid numbering",
296
+ pos: offset + @header_offset, force: force_failure)
297
+
298
+ new_xref = XRefSection.new
299
+ xref.oids.each do |oid|
300
+ entry = xref[oid]
301
+ entry.oid -= first_oid
302
+ new_xref.send(:[]=, entry.oid, entry.gen, entry)
303
+ end
304
+ xref = new_xref
305
+ end
306
+
267
307
  [xref, trailer]
268
308
  end
269
309
 
@@ -313,6 +353,11 @@ module HexaPDF
313
353
  @startxref_offset = lines[eof_index - 1].to_i
314
354
  end
315
355
 
356
+ # Returns the reconstructed revision.
357
+ def reconstructed_revision
358
+ @reconstructed_revision ||= reconstruct_revision
359
+ end
360
+
316
361
  # Returns the PDF version number that is stored in the file header.
317
362
  #
318
363
  # See: PDF1.7 s7.5.2
@@ -338,6 +383,61 @@ module HexaPDF
338
383
  @header_version = $1
339
384
  end
340
385
 
386
+ # Tries to reconstruct the PDF document's main cross-reference table by serially parsing the
387
+ # file and returning a Revision object for loading the found objects.
388
+ #
389
+ # If the file contains multiple cross-reference sections, all objects will be put into a single
390
+ # cross-reference table, later objects overwriting prior ones.
391
+ def reconstruct_revision
392
+ raise unless @document.config['parser.try_xref_reconstruction']
393
+ msg = "#{$!} - trying cross-reference table reconstruction"
394
+ @document.config['parser.on_correctable_error'].call(@document, msg, @tokenizer.pos)
395
+
396
+ xref = XRefSection.new
397
+ @tokenizer.pos = 0
398
+ while true
399
+ @tokenizer.skip_whitespace
400
+ pos = @tokenizer.pos
401
+ @tokenizer.scan_until(/(\n|\r\n?)+|\z/)
402
+ next_new_line_pos = @tokenizer.pos
403
+ @tokenizer.pos = pos
404
+
405
+ token = @tokenizer.next_integer_or_keyword rescue nil
406
+ if token.kind_of?(Integer)
407
+ gen = @tokenizer.next_integer_or_keyword rescue nil
408
+ tok = @tokenizer.next_integer_or_keyword rescue nil
409
+ if @tokenizer.pos > next_new_line_pos
410
+ @tokenizer.pos = next_new_line_pos
411
+ elsif gen.kind_of?(Integer) && tok.kind_of?(Tokenizer::Token) && tok == 'obj'
412
+ xref.add_in_use_entry(token, gen, pos)
413
+ @tokenizer.scan_until(/(?:\n|\r\n?)endobj\b/)
414
+ end
415
+ elsif token.kind_of?(Tokenizer::Token) && token == 'trailer'
416
+ obj = @tokenizer.next_object rescue nil
417
+ # Use last trailer found in case of multiple revisions but use first trailer in case of
418
+ # linearized file.
419
+ trailer = obj if obj.kind_of?(Hash) && (obj.key?(:Prev) || trailer.nil?)
420
+ elsif token == Tokenizer::NO_MORE_TOKENS
421
+ break
422
+ else
423
+ @tokenizer.pos = next_new_line_pos
424
+ end
425
+ end
426
+
427
+ trailer&.delete(:Prev) # no need for this and may wreak havoc
428
+ if !trailer || trailer.empty?
429
+ raise_malformed("Could not reconstruct malformed PDF because trailer was not found", pos: 0)
430
+ end
431
+
432
+ loader = lambda do |xref_entry|
433
+ obj, oid, gen, stream = parse_indirect_object(xref_entry.pos)
434
+ @document.wrap(obj, oid: oid, gen: gen, stream: stream)
435
+ end
436
+
437
+ Revision.new(@document.wrap(trailer, type: :XXTrailer), xref_section: xref,
438
+ loader: loader)
439
+ end
440
+
341
441
  # Raises a HexaPDF::MalformedPDFError with the given message and source position.
342
442
  def raise_malformed(msg, pos: nil)
343
443
  raise HexaPDF::MalformedPDFError.new(msg, pos: pos)