hexapdf 0.12.3 → 0.14.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (103) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +132 -0
  3. data/examples/019-acro_form.rb +41 -4
  4. data/lib/hexapdf/cli/command.rb +4 -2
  5. data/lib/hexapdf/cli/image2pdf.rb +2 -1
  6. data/lib/hexapdf/cli/info.rb +51 -2
  7. data/lib/hexapdf/cli/inspect.rb +30 -8
  8. data/lib/hexapdf/cli/merge.rb +1 -1
  9. data/lib/hexapdf/cli/split.rb +74 -14
  10. data/lib/hexapdf/configuration.rb +15 -0
  11. data/lib/hexapdf/content/graphic_object/arc.rb +3 -3
  12. data/lib/hexapdf/dictionary.rb +12 -6
  13. data/lib/hexapdf/dictionary_fields.rb +2 -10
  14. data/lib/hexapdf/document.rb +41 -16
  15. data/lib/hexapdf/document/files.rb +0 -1
  16. data/lib/hexapdf/encryption/fast_arc4.rb +1 -1
  17. data/lib/hexapdf/encryption/security_handler.rb +1 -0
  18. data/lib/hexapdf/encryption/standard_security_handler.rb +1 -0
  19. data/lib/hexapdf/font/cmap.rb +1 -4
  20. data/lib/hexapdf/font/true_type/subsetter.rb +16 -3
  21. data/lib/hexapdf/font/true_type/table/head.rb +1 -0
  22. data/lib/hexapdf/font/true_type/table/os2.rb +2 -0
  23. data/lib/hexapdf/font/true_type/table/post.rb +15 -10
  24. data/lib/hexapdf/font_loader/from_configuration.rb +2 -2
  25. data/lib/hexapdf/font_loader/from_file.rb +18 -8
  26. data/lib/hexapdf/image_loader/png.rb +3 -2
  27. data/lib/hexapdf/importer.rb +3 -2
  28. data/lib/hexapdf/layout/line.rb +1 -1
  29. data/lib/hexapdf/layout/style.rb +23 -23
  30. data/lib/hexapdf/layout/text_layouter.rb +2 -2
  31. data/lib/hexapdf/layout/text_shaper.rb +3 -2
  32. data/lib/hexapdf/object.rb +52 -25
  33. data/lib/hexapdf/parser.rb +107 -7
  34. data/lib/hexapdf/pdf_array.rb +15 -5
  35. data/lib/hexapdf/revisions.rb +29 -21
  36. data/lib/hexapdf/serializer.rb +37 -10
  37. data/lib/hexapdf/task/optimize.rb +6 -4
  38. data/lib/hexapdf/tokenizer.rb +22 -0
  39. data/lib/hexapdf/type/acro_form/appearance_generator.rb +130 -27
  40. data/lib/hexapdf/type/acro_form/button_field.rb +5 -2
  41. data/lib/hexapdf/type/acro_form/choice_field.rb +68 -14
  42. data/lib/hexapdf/type/acro_form/field.rb +35 -5
  43. data/lib/hexapdf/type/acro_form/form.rb +139 -14
  44. data/lib/hexapdf/type/acro_form/text_field.rb +70 -4
  45. data/lib/hexapdf/type/actions/uri.rb +3 -2
  46. data/lib/hexapdf/type/annotations/widget.rb +3 -4
  47. data/lib/hexapdf/type/catalog.rb +2 -2
  48. data/lib/hexapdf/type/cid_font.rb +1 -1
  49. data/lib/hexapdf/type/file_specification.rb +1 -1
  50. data/lib/hexapdf/type/font.rb +1 -1
  51. data/lib/hexapdf/type/font_simple.rb +4 -2
  52. data/lib/hexapdf/type/font_true_type.rb +6 -2
  53. data/lib/hexapdf/type/font_type0.rb +4 -4
  54. data/lib/hexapdf/type/form.rb +6 -2
  55. data/lib/hexapdf/type/image.rb +2 -2
  56. data/lib/hexapdf/type/page.rb +21 -12
  57. data/lib/hexapdf/type/page_tree_node.rb +29 -5
  58. data/lib/hexapdf/type/resources.rb +5 -0
  59. data/lib/hexapdf/type/trailer.rb +2 -3
  60. data/lib/hexapdf/utils/object_hash.rb +0 -1
  61. data/lib/hexapdf/utils/sorted_tree_node.rb +18 -15
  62. data/lib/hexapdf/version.rb +1 -1
  63. data/test/hexapdf/common_tokenizer_tests.rb +2 -2
  64. data/test/hexapdf/content/graphic_object/test_arc.rb +4 -4
  65. data/test/hexapdf/content/test_canvas.rb +3 -3
  66. data/test/hexapdf/content/test_color_space.rb +1 -1
  67. data/test/hexapdf/encryption/test_aes.rb +4 -4
  68. data/test/hexapdf/encryption/test_standard_security_handler.rb +11 -11
  69. data/test/hexapdf/filter/test_ascii85_decode.rb +1 -1
  70. data/test/hexapdf/filter/test_ascii_hex_decode.rb +1 -1
  71. data/test/hexapdf/font/true_type/table/test_post.rb +1 -1
  72. data/test/hexapdf/font/true_type/test_subsetter.rb +10 -0
  73. data/test/hexapdf/font_loader/test_from_configuration.rb +7 -3
  74. data/test/hexapdf/font_loader/test_from_file.rb +7 -0
  75. data/test/hexapdf/layout/test_text_layouter.rb +12 -5
  76. data/test/hexapdf/test_configuration.rb +2 -2
  77. data/test/hexapdf/test_dictionary.rb +8 -1
  78. data/test/hexapdf/test_dictionary_fields.rb +9 -2
  79. data/test/hexapdf/test_document.rb +18 -10
  80. data/test/hexapdf/test_object.rb +71 -26
  81. data/test/hexapdf/test_parser.rb +205 -51
  82. data/test/hexapdf/test_pdf_array.rb +8 -1
  83. data/test/hexapdf/test_revisions.rb +35 -0
  84. data/test/hexapdf/test_serializer.rb +7 -0
  85. data/test/hexapdf/test_tokenizer.rb +28 -0
  86. data/test/hexapdf/test_writer.rb +2 -2
  87. data/test/hexapdf/type/acro_form/test_appearance_generator.rb +288 -35
  88. data/test/hexapdf/type/acro_form/test_button_field.rb +15 -0
  89. data/test/hexapdf/type/acro_form/test_choice_field.rb +92 -9
  90. data/test/hexapdf/type/acro_form/test_field.rb +39 -0
  91. data/test/hexapdf/type/acro_form/test_form.rb +87 -15
  92. data/test/hexapdf/type/acro_form/test_text_field.rb +77 -1
  93. data/test/hexapdf/type/test_font_simple.rb +2 -1
  94. data/test/hexapdf/type/test_font_true_type.rb +6 -0
  95. data/test/hexapdf/type/test_form.rb +8 -1
  96. data/test/hexapdf/type/test_page.rb +8 -1
  97. data/test/hexapdf/type/test_page_tree_node.rb +42 -0
  98. data/test/hexapdf/type/test_resources.rb +6 -0
  99. data/test/hexapdf/utils/test_bit_field.rb +2 -0
  100. data/test/hexapdf/utils/test_object_hash.rb +5 -0
  101. data/test/hexapdf/utils/test_sorted_tree_node.rb +10 -9
  102. data/test/test_helper.rb +2 -0
  103. metadata +6 -12
@@ -162,9 +162,10 @@ module HexaPDF
162
162
  io.seek(length, IO::SEEK_CUR)
163
163
  end
164
164
  when 'tRNS' # PNG s11.3.2
165
- if @color_type == INDEXED
165
+ case @color_type
166
+ when INDEXED
166
167
  trns = io.read(length).unpack('C*')
167
- elsif @color_type == TRUECOLOR || @color_type == GREYSCALE
168
+ when TRUECOLOR, GREYSCALE
168
169
  dict[:Mask] = io.read(length).unpack('n*').map {|val| [val, val] }.flatten
169
170
  else
170
171
  io.seek(length, IO::SEEK_CUR)
@@ -90,7 +90,7 @@ module HexaPDF
90
90
  #
91
91
  # An error is raised if the object doesn't belong to the +source+ document.
92
92
  def import(object)
93
- mapped_object = @mapper[object.data] if object.kind_of?(HexaPDF::Object)
93
+ mapped_object = @mapper[object.data]&.__getobj__ if object.kind_of?(HexaPDF::Object)
94
94
  if object.kind_of?(HexaPDF::Object) && object.document? && @source != object.document
95
95
  raise HexaPDF::Error, "Import error: Incorrect document object for importer"
96
96
  elsif mapped_object && mapped_object == @destination.object(mapped_object)
@@ -118,7 +118,8 @@ module HexaPDF
118
118
  if object.type == :Catalog || object.type == :Pages
119
119
  @mapper[object.data] = nil
120
120
  else
121
- obj = @mapper[object.data] = object.dup
121
+ obj = object.dup
122
+ @mapper[object.data] = NullableWeakRef.new(obj)
122
123
  obj.document = @destination.__getobj__
123
124
  obj.instance_variable_set(:@data, obj.data.dup)
124
125
  obj.data.oid = 0
@@ -198,7 +198,7 @@ module HexaPDF
198
198
  # Note: The cache is not cleared!
199
199
  def add(item)
200
200
  last = @items.last
201
- if last.class == item.class && item.kind_of?(TextFragment) && last.style == item.style
201
+ if last.instance_of?(item.class) && item.kind_of?(TextFragment) && last.style == item.style
202
202
  if last.items.frozen?
203
203
  @items[-1] = last = last.dup
204
204
  last.items = last.items.dup
@@ -524,7 +524,7 @@ module HexaPDF
524
524
  # Style.new(font_size: 15, align: :center, valign: center)
525
525
  def initialize(**properties)
526
526
  update(**properties)
527
- @scaled_item_widths = {}
527
+ @scaled_item_widths = {}.compare_by_identity
528
528
  end
529
529
 
530
530
  # Duplicates the complex properties that can be modified, as well as the cache.
@@ -883,41 +883,41 @@ module HexaPDF
883
883
  [:text_rise, 0],
884
884
  [:font_features, {}],
885
885
  [:text_rendering_mode, "Content::TextRenderingMode::FILL",
886
- setter: "Content::TextRenderingMode.normalize(value)"],
886
+ {setter: "Content::TextRenderingMode.normalize(value)"}],
887
887
  [:subscript, false,
888
- setter: "value; superscript(false) if superscript",
889
- valid_values: [true, false]],
888
+ {setter: "value; superscript(false) if superscript",
889
+ valid_values: [true, false]}],
890
890
  [:superscript, false,
891
- setter: "value; subscript(false) if subscript",
892
- valid_values: [true, false]],
893
- [:underline, false, valid_values: [true, false]],
894
- [:strikeout, false, valid_values: [true, false]],
891
+ {setter: "value; subscript(false) if subscript",
892
+ valid_values: [true, false]}],
893
+ [:underline, false, {valid_values: [true, false]}],
894
+ [:strikeout, false, {valid_values: [true, false]}],
895
895
  [:fill_color, "default_color"],
896
896
  [:fill_alpha, 1],
897
897
  [:stroke_color, "default_color"],
898
898
  [:stroke_alpha, 1],
899
899
  [:stroke_width, 1],
900
900
  [:stroke_cap_style, "Content::LineCapStyle::BUTT_CAP",
901
- setter: "Content::LineCapStyle.normalize(value)"],
901
+ {setter: "Content::LineCapStyle.normalize(value)"}],
902
902
  [:stroke_join_style, "Content::LineJoinStyle::MITER_JOIN",
903
- setter: "Content::LineJoinStyle.normalize(value)"],
903
+ {setter: "Content::LineJoinStyle.normalize(value)"}],
904
904
  [:stroke_miter_limit, 10.0],
905
905
  [:stroke_dash_pattern, "Content::LineDashPattern.new",
906
- setter: "Content::LineDashPattern.normalize(value, phase)", extra_args: ", phase = 0"],
907
- [:align, :left, valid_values: [:left, :center, :right, :justify]],
908
- [:valign, :top, valid_values: [:top, :center, :bottom]],
906
+ {setter: "Content::LineDashPattern.normalize(value, phase)", extra_args: ", phase = 0"}],
907
+ [:align, :left, {valid_values: [:left, :center, :right, :justify]}],
908
+ [:valign, :top, {valid_values: [:top, :center, :bottom]}],
909
909
  [:text_indent, 0],
910
910
  [:line_spacing, "LineSpacing.new(type: :single)",
911
- setter: "LineSpacing.new(**(value.kind_of?(Symbol) ? {type: value, value: extra_arg} : value))",
912
- extra_args: ", extra_arg = nil"],
913
- [:last_line_gap, false, valid_values: [true, false]],
911
+ {setter: "LineSpacing.new(**(value.kind_of?(Symbol) ? {type: value, value: extra_arg} : value))",
912
+ extra_args: ", extra_arg = nil"}],
913
+ [:last_line_gap, false, {valid_values: [true, false]}],
914
914
  [:background_color, nil],
915
- [:padding, "Quad.new(0)", setter: "Quad.new(value)"],
916
- [:margin, "Quad.new(0)", setter: "Quad.new(value)"],
917
- [:border, "Border.new", setter: "Border.new(**value)"],
918
- [:overlays, "Layers.new", setter: "Layers.new(value)"],
919
- [:underlays, "Layers.new", setter: "Layers.new(value)"],
920
- [:position, :default, valid_values: [:default, :float, :flow, :absolute]],
915
+ [:padding, "Quad.new(0)", {setter: "Quad.new(value)"}],
916
+ [:margin, "Quad.new(0)", {setter: "Quad.new(value)"}],
917
+ [:border, "Border.new", {setter: "Border.new(**value)"}],
918
+ [:overlays, "Layers.new", {setter: "Layers.new(value)"}],
919
+ [:underlays, "Layers.new", {setter: "Layers.new(value)"}],
920
+ [:position, :default, {valid_values: [:default, :float, :flow, :absolute]}],
921
921
  [:position_hint, nil],
922
922
  ].each do |name, default, options = {}|
923
923
  default = default.inspect unless default.kind_of?(String)
@@ -1075,7 +1075,7 @@ module HexaPDF
1075
1075
  # The item may be a (singleton) glyph object or an integer/float, i.e. items that can appear
1076
1076
  # inside a TextFragment.
1077
1077
  def scaled_item_width(item)
1078
- @scaled_item_widths[item.object_id] ||=
1078
+ @scaled_item_widths[item] ||=
1079
1079
  begin
1080
1080
  if item.kind_of?(Numeric)
1081
1081
  -item * scaled_font_size
@@ -388,7 +388,7 @@ module HexaPDF
388
388
  end
389
389
  when :penalty
390
390
  if item.penalty <= -Penalty::INFINITY
391
- add_box_item(item.item) if item.item
391
+ add_box_item(item.item) if item.width > 0
392
392
  break unless yield(create_unjustified_line, item)
393
393
  reset_after_line_break(index + 1)
394
394
  elsif item.penalty >= Penalty::INFINITY
@@ -458,7 +458,7 @@ module HexaPDF
458
458
  end
459
459
  when :penalty
460
460
  if item.penalty <= -Penalty::INFINITY
461
- add_box_item(item.item) if item.item
461
+ add_box_item(item.item) if item.width > 0
462
462
  break unless (action = yield(create_unjustified_line, item))
463
463
  reset_after_line_break_variable_width(index + 1, true, action)
464
464
  elsif item.penalty >= Penalty::INFINITY
@@ -68,9 +68,10 @@ module HexaPDF
68
68
  text_fragment.clear_cache
69
69
  end
70
70
  if text_fragment.style.font_features[:kern] && font.wrapped_font.features.include?(:kern)
71
- if font.font_type == :TrueType
71
+ case font.font_type
72
+ when :TrueType
72
73
  process_true_type_kerning(text_fragment)
73
- elsif font.font_type == :Type1
74
+ when :Type1
74
75
  process_type1_kerning(text_fragment)
75
76
  end
76
77
  text_fragment.clear_cache
@@ -122,9 +122,6 @@ module HexaPDF
122
122
 
123
123
  include Comparable
124
124
 
125
- # A list of classes whose objects cannot be duplicated.
126
- NOT_DUPLICATABLE_CLASSES = [NilClass, FalseClass, TrueClass, Symbol, Integer, Float].freeze
127
-
128
125
  # :call-seq:
129
126
  # HexaPDF::Object.deep_copy(object) -> copy
130
127
  #
@@ -139,8 +136,6 @@ module HexaPDF
139
136
  (object.indirect? || object.must_be_indirect? ? object : deep_copy(object.value))
140
137
  when HexaPDF::Reference
141
138
  object
142
- when *NOT_DUPLICATABLE_CLASSES
143
- object
144
139
  else
145
140
  object.dup
146
141
  end
@@ -251,29 +246,31 @@ module HexaPDF
251
246
  end
252
247
 
253
248
  # :call-seq:
254
- # obj.validate(auto_correct: true) -> true or false
255
- # obj.validate(auto_correct: true) {|msg, correctable| block } -> true or false
249
+ # obj.validate(auto_correct: true) -> true or false
250
+ # obj.validate(auto_correct: true) {|msg, correctable, obj| block } -> true or false
256
251
  #
257
- # Validates the object and, optionally, corrects problems when the option +auto_correct+ is set.
258
- # The validation routine itself has to be implemented in the #perform_validation method - see
259
- # its documentation for more information.
252
+ # Validates the object, optionally corrects problems when the option +auto_correct+ is set and
253
+ # returns +true+ if the object is deemed valid and +false+ otherwise.
260
254
  #
261
255
  # If a block is given, it is called on validation problems with a problem description and
262
- # whether the problem is correctable.
256
+ # whether the problem is automatically correctable. The third argument to the block is usually
257
+ # this object but may be another object if during auto-correction a new object was created and
258
+ # validated.
263
259
  #
264
- # Returns +true+ if the object is deemed valid and +false+ otherwise.
260
+ # The validation routine itself has to be implemented in the #perform_validation method - see
261
+ # its documentation for more information.
265
262
  #
266
263
  # *Note*: Even if the return value is +true+ there may be problems since HexaPDF doesn't
267
264
  # currently implement the full PDF spec. However, if the return value is +false+, there is
268
265
  # certainly a problem!
269
266
  def validate(auto_correct: true)
270
- catch do |catch_tag|
271
- perform_validation do |msg, correctable|
272
- yield(msg, correctable) if block_given?
273
- throw(catch_tag, false) unless auto_correct && correctable
274
- end
275
- true
267
+ result = true
268
+ perform_validation do |msg, correctable, object|
269
+ yield(msg, correctable, object || self) if block_given?
270
+ result = false unless correctable
271
+ return false unless auto_correct
276
272
  end
273
+ result
277
274
  end
278
275
 
279
276
  # Makes a deep copy of the source PDF object and resets the object identifier.
@@ -287,6 +284,28 @@ module HexaPDF
287
284
  obj
288
285
  end
289
286
 
287
+ # Caches and returns the given +value+ or the value of the block under the given cache key. If
288
+ # there is already a cached value for the key and +update+ is +false+, it is just returned.
289
+ #
290
+ # Set +update+ to +true+ to force an update of the cached value.
291
+ #
292
+ # This uses Document#cache internally.
293
+ def cache(key, value = Document::UNSET, update: false, &block)
294
+ document.cache(@data, key, value, update: update, &block)
295
+ end
296
+
297
+ # Returns +true+ if there is a cached value for the given key.
298
+ #
299
+ # This uses Document#cached? internally.
300
+ def cached?(key)
301
+ document.cached?(@data, key)
302
+ end
303
+
304
+ # Clears the cache for this object.
305
+ def clear_cache
306
+ document.clear_cache(@data)
307
+ end
308
+
290
309
  # Compares this object to another object.
291
310
  #
292
311
  # If the other object does not respond to +oid+ or +gen+, +nil+ is returned. Otherwise objects
@@ -339,17 +358,25 @@ module HexaPDF
339
358
  # are also performed!
340
359
  #
341
360
  # When the validation routine finds that the object is invalid, it has to yield a problem
342
- # description and whether the problem can be corrected. After yielding, the problem has to be
343
- # corrected which poses no problem because the #validate method makes sure that the yield only
344
- # returns if the problem is actually correctable and if it should be corrected.
361
+ # description and whether the problem can be corrected. An optional third argument may contain
362
+ # the object that gets validated if it is different from this object (may happen when
363
+ # auto-correction is used).
345
364
  #
346
- # Here is a sample validation routine for stream objects:
365
+ # After yielding, the problem has to be corrected if it is correctable. If it is not correctable
366
+ # and not correcting would lead to exceptions the method has to return early.
367
+ #
368
+ # Here is a sample validation routine for a dictionary object type:
347
369
  #
348
370
  # def perform_validation
349
371
  # super
350
- # unless value.kind_of?(Hash)
351
- # yield("A stream object needs a Hash as value")
352
- # self.value = {}
372
+ #
373
+ # if value[:SomeKey].length != 7
374
+ # yield("Length of /SomeKey is invalid")
375
+ # # No need to return early here because following check doesn't rely on /SomeKey
376
+ # end
377
+ #
378
+ # if value[:OtherKey] % 2 == 0
379
+ # yield("/OtherKey needs to contain an odd number of elements")
353
380
  # end
354
381
  # end
355
382
  def perform_validation(&block)
@@ -59,6 +59,7 @@ module HexaPDF
59
59
  @tokenizer = Tokenizer.new(io)
60
60
  @document = document
61
61
  @object_stream_data = {}
62
+ @reconstructed_revision = nil
62
63
  retrieve_pdf_header_offset_and_version
63
64
  end
64
65
 
@@ -71,7 +72,13 @@ module HexaPDF
71
72
  obj, oid, gen, stream =
72
73
  case xref_entry.type
73
74
  when :in_use
74
- parse_indirect_object(xref_entry.pos)
75
+ if xref_entry.pos == 0 && xref_entry.oid != 0
76
+ # Handle seen-in-the-wild objects with invalid offset 0
77
+ maybe_raise("Indirect object (#{xref_entry.oid},#{xref_entry.gen}) has offset 0", pos: 0)
78
+ [nil, xref_entry.oid, xref_entry.gen, nil]
79
+ else
80
+ parse_indirect_object(xref_entry.pos)
81
+ end
75
82
  when :free
76
83
  [nil, xref_entry.oid, xref_entry.gen, nil]
77
84
  when :compressed
@@ -82,10 +89,12 @@ module HexaPDF
82
89
 
83
90
  if xref_entry.oid != 0 && (oid != xref_entry.oid || gen != xref_entry.gen)
84
91
  raise_malformed("The oid,gen (#{oid},#{gen}) values of the indirect object don't match " \
85
- "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
92
+ "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
86
93
  end
87
94
 
88
95
  @document.wrap(obj, oid: oid, gen: gen, stream: stream)
96
+ rescue HexaPDF::MalformedPDFError
97
+ reconstructed_revision.object(xref_entry)
89
98
  end
90
99
 
91
100
  # Parses the indirect object at the specified offset.
@@ -110,7 +119,15 @@ module HexaPDF
110
119
  maybe_raise("No indirect object value between 'obj' and 'endobj'", pos: @tokenizer.pos)
111
120
  object = nil
112
121
  else
113
- object = @tokenizer.next_object
122
+ begin
123
+ object = @tokenizer.next_object
124
+ rescue MalformedPDFError
125
+ # Handle often found invalid indirect object with missing whitespace after number
126
+ maybe_raise("Invalid object value after 'obj'", pos: @tokenizer.pos,
127
+ force: !(tok.kind_of?(Tokenizer::Token) && tok =~ /\A\d+endobj\z/))
128
+ object = tok.to_i
129
+ @tokenizer.pos -= 6
130
+ end
114
131
  end
115
132
 
116
133
  tok = @tokenizer.next_token
@@ -122,7 +139,9 @@ module HexaPDF
122
139
  tok1 = @tokenizer.next_byte
123
140
  tok2 = @tokenizer.next_byte if tok1 == 13 # 13=CR, 10=LF
124
141
  if tok1 != 10 && tok1 != 13
125
- raise_malformed("Keyword stream must be followed by LF or CR/LF", pos: @tokenizer.pos)
142
+ tok2 = @tokenizer.next_byte
143
+ maybe_raise("Keyword stream must be followed by LF or CR/LF", pos: @tokenizer.pos,
144
+ force: tok1 != 32 || (tok2 != 10 && tok2 != 13)) # 32=space
126
145
  elsif tok1 == 13 && tok2 != 10
127
146
  maybe_raise("Keyword stream must be followed by LF or CR/LF, not CR alone",
128
147
  pos: @tokenizer.pos)
@@ -235,14 +254,14 @@ module HexaPDF
235
254
  @tokenizer.skip_whitespace
236
255
  start.upto(start + number_of_entries - 1) do |oid|
237
256
  pos, gen, type = @tokenizer.next_xref_entry do |matched_size|
238
- maybe_raise("Invalid cross-reference subsection entry", pos: @tokenizer.pos,
239
- force: matched_size == 20)
257
+ maybe_raise("Invalid cross-reference entry", pos: @tokenizer.pos,
258
+ force: !matched_size)
240
259
  end
241
260
  if xref.entry?(oid)
242
261
  next
243
262
  elsif type == 'n'
244
263
  if pos == 0 || gen > 65535
245
- maybe_raise("Invalid in use cross-reference entry in cross-reference section",
264
+ maybe_raise("Invalid in use cross-reference entry",
246
265
  pos: @tokenizer.pos)
247
266
  xref.add_free_entry(oid, gen)
248
267
  else
@@ -264,6 +283,27 @@ module HexaPDF
264
283
  raise_malformed("Trailer is #{trailer.class} instead of dictionary ", pos: @tokenizer.pos)
265
284
  end
266
285
 
286
+ unless trailer[:Prev] || xref.max_oid == 0 || xref.entry?(0)
287
+ first_entry = xref[xref.oids[0]]
288
+ test_entry = xref[xref.oids[-1]]
289
+ @tokenizer.pos = test_entry.pos + @header_offset
290
+ test_oid = @tokenizer.next_token
291
+ first_oid = first_entry.oid
292
+
293
+ force_failure = !first_entry.free? || first_entry.gen != 65535 ||
294
+ !test_oid.kind_of?(Integer) || xref.oids[-1] - test_oid != first_oid
295
+ maybe_raise("Main cross-reference section has invalid numbering",
296
+ pos: offset + @header_offset, force: force_failure)
297
+
298
+ new_xref = XRefSection.new
299
+ xref.oids.each do |oid|
300
+ entry = xref[oid]
301
+ entry.oid -= first_oid
302
+ new_xref.send(:[]=, entry.oid, entry.gen, entry)
303
+ end
304
+ xref = new_xref
305
+ end
306
+
267
307
  [xref, trailer]
268
308
  end
269
309
 
@@ -313,6 +353,11 @@ module HexaPDF
313
353
  @startxref_offset = lines[eof_index - 1].to_i
314
354
  end
315
355
 
356
+ # Returns the reconstructed revision.
357
+ def reconstructed_revision
358
+ @reconstructed_revision ||= reconstruct_revision
359
+ end
360
+
316
361
  # Returns the PDF version number that is stored in the file header.
317
362
  #
318
363
  # See: PDF1.7 s7.5.2
@@ -338,6 +383,61 @@ module HexaPDF
338
383
  @header_version = $1
339
384
  end
340
385
 
386
+ # Tries to reconstruct the PDF document's main cross-reference table by serially parsing the
387
+ # file and returning a Revision object for loading the found objects.
388
+ #
389
+ # If the file contains multiple cross-reference sections, all objects will be put into a single
390
+ # cross-reference table, later objects overwriting prior ones.
391
+ def reconstruct_revision
392
+ raise unless @document.config['parser.try_xref_reconstruction']
393
+ msg = "#{$!} - trying cross-reference table reconstruction"
394
+ @document.config['parser.on_correctable_error'].call(@document, msg, @tokenizer.pos)
395
+
396
+ xref = XRefSection.new
397
+ @tokenizer.pos = 0
398
+ while true
399
+ @tokenizer.skip_whitespace
400
+ pos = @tokenizer.pos
401
+ @tokenizer.scan_until(/(\n|\r\n?)+|\z/)
402
+ next_new_line_pos = @tokenizer.pos
403
+ @tokenizer.pos = pos
404
+
405
+ token = @tokenizer.next_integer_or_keyword rescue nil
406
+ if token.kind_of?(Integer)
407
+ gen = @tokenizer.next_integer_or_keyword rescue nil
408
+ tok = @tokenizer.next_integer_or_keyword rescue nil
409
+ if @tokenizer.pos > next_new_line_pos
410
+ @tokenizer.pos = next_new_line_pos
411
+ elsif gen.kind_of?(Integer) && tok.kind_of?(Tokenizer::Token) && tok == 'obj'
412
+ xref.add_in_use_entry(token, gen, pos)
413
+ @tokenizer.scan_until(/(?:\n|\r\n?)endobj\b/)
414
+ end
415
+ elsif token.kind_of?(Tokenizer::Token) && token == 'trailer'
416
+ obj = @tokenizer.next_object rescue nil
417
+ # Use last trailer found in case of multiple revisions but use first trailer in case of
418
+ # linearized file.
419
+ trailer = obj if obj.kind_of?(Hash) && (obj.key?(:Prev) || trailer.nil?)
420
+ elsif token == Tokenizer::NO_MORE_TOKENS
421
+ break
422
+ else
423
+ @tokenizer.pos = next_new_line_pos
424
+ end
425
+ end
426
+
427
+ trailer&.delete(:Prev) # no need for this and may wreak havoc
428
+ if !trailer || trailer.empty?
429
+ raise_malformed("Could not reconstruct malformed PDF because trailer was not found", pos: 0)
430
+ end
431
+
432
+ loader = lambda do |xref_entry|
433
+ obj, oid, gen, stream = parse_indirect_object(xref_entry.pos)
434
+ @document.wrap(obj, oid: oid, gen: gen, stream: stream)
435
+ end
436
+
437
+ Revision.new(@document.wrap(trailer, type: :XXTrailer), xref_section: xref,
438
+ loader: loader)
439
+ end
440
+
341
441
  # Raises a HexaPDF::MalformedPDFError with the given message and source position.
342
442
  def raise_malformed(msg, pos: nil)
343
443
  raise HexaPDF::MalformedPDFError.new(msg, pos: pos)