hexapdf 0.14.2 → 0.15.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +96 -0
  3. data/lib/hexapdf/cli/form.rb +30 -8
  4. data/lib/hexapdf/configuration.rb +19 -4
  5. data/lib/hexapdf/content/canvas.rb +1 -0
  6. data/lib/hexapdf/dictionary.rb +3 -0
  7. data/lib/hexapdf/dictionary_fields.rb +1 -1
  8. data/lib/hexapdf/encryption/security_handler.rb +7 -2
  9. data/lib/hexapdf/encryption/standard_security_handler.rb +12 -0
  10. data/lib/hexapdf/error.rb +4 -3
  11. data/lib/hexapdf/filter.rb +1 -0
  12. data/lib/hexapdf/filter/crypt.rb +60 -0
  13. data/lib/hexapdf/font/true_type/subsetter.rb +5 -1
  14. data/lib/hexapdf/font/type1/afm_parser.rb +2 -1
  15. data/lib/hexapdf/parser.rb +46 -14
  16. data/lib/hexapdf/pdf_array.rb +3 -0
  17. data/lib/hexapdf/revision.rb +16 -0
  18. data/lib/hexapdf/serializer.rb +10 -3
  19. data/lib/hexapdf/tokenizer.rb +44 -3
  20. data/lib/hexapdf/type/acro_form.rb +1 -0
  21. data/lib/hexapdf/type/acro_form/appearance_generator.rb +32 -17
  22. data/lib/hexapdf/type/acro_form/button_field.rb +8 -4
  23. data/lib/hexapdf/type/acro_form/field.rb +1 -0
  24. data/lib/hexapdf/type/acro_form/form.rb +37 -0
  25. data/lib/hexapdf/type/acro_form/signature_field.rb +223 -0
  26. data/lib/hexapdf/type/annotation.rb +13 -9
  27. data/lib/hexapdf/type/annotations/widget.rb +3 -1
  28. data/lib/hexapdf/type/font_descriptor.rb +9 -2
  29. data/lib/hexapdf/type/page.rb +81 -0
  30. data/lib/hexapdf/type/resources.rb +4 -0
  31. data/lib/hexapdf/type/xref_stream.rb +7 -0
  32. data/lib/hexapdf/utils/graphics_helpers.rb +4 -4
  33. data/lib/hexapdf/version.rb +1 -1
  34. data/test/hexapdf/content/test_canvas.rb +21 -0
  35. data/test/hexapdf/encryption/test_security_handler.rb +15 -0
  36. data/test/hexapdf/encryption/test_standard_security_handler.rb +26 -0
  37. data/test/hexapdf/filter/test_crypt.rb +21 -0
  38. data/test/hexapdf/font/true_type/test_subsetter.rb +2 -0
  39. data/test/hexapdf/font/type1/test_afm_parser.rb +5 -0
  40. data/test/hexapdf/test_dictionary_fields.rb +7 -0
  41. data/test/hexapdf/test_parser.rb +82 -2
  42. data/test/hexapdf/test_revision.rb +21 -0
  43. data/test/hexapdf/test_serializer.rb +10 -0
  44. data/test/hexapdf/test_tokenizer.rb +50 -0
  45. data/test/hexapdf/test_writer.rb +2 -2
  46. data/test/hexapdf/type/acro_form/test_appearance_generator.rb +24 -3
  47. data/test/hexapdf/type/acro_form/test_button_field.rb +13 -7
  48. data/test/hexapdf/type/acro_form/test_field.rb +5 -0
  49. data/test/hexapdf/type/acro_form/test_form.rb +46 -2
  50. data/test/hexapdf/type/acro_form/test_signature_field.rb +38 -0
  51. data/test/hexapdf/type/annotations/test_widget.rb +2 -0
  52. data/test/hexapdf/type/test_annotation.rb +20 -10
  53. data/test/hexapdf/type/test_font_descriptor.rb +7 -0
  54. data/test/hexapdf/type/test_page.rb +187 -49
  55. data/test/hexapdf/type/test_resources.rb +6 -0
  56. data/test/hexapdf/type/test_xref_stream.rb +7 -0
  57. data/test/hexapdf/utils/test_graphics_helpers.rb +8 -0
  58. metadata +6 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 22367739c2160e7a5dbc9e8b20bfefb06ef96664ccb814e4ab719a9bce7b68f3
4
- data.tar.gz: 8f703fa1e8e7b9d0b966e37becbecbbb7a4c1d81d0a823d770fa3d531efb1a4a
3
+ metadata.gz: 1a6ef1bdd17664ef0b9474b931d31ad69c57c77928ce2b69a3bbd5dadda0dce6
4
+ data.tar.gz: 11b87436d19cc5498fd6a77f0b6c410e717809264153964668be0f5199b9354d
5
5
  SHA512:
6
- metadata.gz: b05954e3c3890cbbc40d8171e6f1d7f6375569d69b5c90fc0e74b5ea5553ff5a45f1090a3741f50bfab7a6e0725b1e79f5c0eab7b92e6f7e518f38eb1eb6a3f8
7
- data.tar.gz: 2a3441b7ee7ca89e1417ea4134c3ab7444b4f791f5cba274361e719626fe9b0b08c903425b53a7fb7b04be3cab96a4edbb5c502dc1e7b4e3cdc809f8a9ebafb6
6
+ metadata.gz: 525a55832758b5eecd1a7f2daf5f220e1afa7ff8e88ca2d65998e658585f290ff2018450e50423f2331b7f195865eab8b1c62562ecdbf3671b46d4da770aed12
7
+ data.tar.gz: 0b0e18c7f79f0e2a54080fefad1dd4d94e15157f72e5360a3ebd827fc0cc2037ae6e06302155426e7f0900e97ee0cee678e069bd8ef05a9333d684c50e1343a5
data/CHANGELOG.md CHANGED
@@ -1,3 +1,99 @@
1
+ ## 0.15.2 - 2021-05-01
2
+
3
+ ### Fixed
4
+
5
+ * Handling of unencrypted metadata streams
6
+
7
+
8
+ ## 0.15.1 - 2021-04-15
9
+
10
+ ### Fixed
11
+
12
+ * Potential division by zero when calculating the scaling for XObjects
13
+ * Handling of XObjects with a width or height of zero when drawing on canvas
14
+
15
+
16
+ ## 0.15.0 - 2021-04-12
17
+
18
+ ### Added
19
+
20
+ * [HexaPDF::Type::Page#flatten_annotations] for flattening the annotations of a
21
+ page
22
+ * [HexaPDF::Type::AcroForm::Form#flatten] for flattening interactive forms
23
+ * [HexaPDF::Revision#update] for updating the stored wrapper class of a PDF
24
+ object
25
+ * [HexaPDF::Type::AcroForm::SignatureField] for working with AcroForm signature
26
+ fields
27
+ * Support for form field flattening to the `hexapdf form` CLI command
28
+
29
+ ### Changed
30
+
31
+ * **Breaking change**: Overhauled the interface for accessing appearances of
32
+ annotations to make it more convenient
33
+ * Validation of [HexaPDF::Type::FontDescriptor] to delete invalid `/FontWeight`
34
+ value
35
+ * [HexaPDF::MalformedPDFError#pos] an accessor instead of a reader and update
36
+ the exception message
37
+ * Configuration option 'acro_form.fallback_font' to allow a callable object for
38
+ more advanced fallback font handling
39
+
40
+ ### Fixed
41
+
42
+ * [HexaPDF::Type::Annotations::Widget#background_color] to correctly handle
43
+ empty background color arrays
44
+ * [HexaPDF::Type::AcroForm::Field#delete_widget] to update the wrapper object
45
+ stored in the document in case the widget is embedded
46
+ * Processing of invalid PDF files containing a space,CR,LF combination after
47
+ the 'stream' keyword
48
+ * Cross-reference stream reconstruction with respect to detection of linearized
49
+ files
50
+ * Detection of existing appearances for AcroForm push button fields when
51
+ creating appearances
52
+
53
+
54
+ ## 0.14.4 - 2021-02-27
55
+
56
+ ### Added
57
+
58
+ * Support for the Crypt filters
59
+
60
+ ### Changed
61
+
62
+ * [HexaPDF::MalformedPDFError] to make the `pos` argument optional
63
+
64
+ ### Fixed
65
+
66
+ * Handling of invalid floating point numbers NaN, Inf and -Inf when serializing
67
+ * Processing of invalid PDF files containing NaN and Inf instead of numbers
68
+ * Bug in Type1 font AFM parser that occurred if the file doesn't end with a new
69
+ line character
70
+ * Cross-reference table reconstruction to handle the case of an entry specifying
71
+ a non-existent indirect object
72
+ * Cross-reference table reconstruction to handle trailers specified by cross-
73
+ reference streams
74
+ * Cross-reference table reconstruction to use the set security handler for
75
+ decrypting indirect objects
76
+ * Parsing of cross-reference streams where data is missing
77
+
78
+
79
+ ## 0.14.3 - 2021-02-16
80
+
81
+ ### Fixed
82
+
83
+ * Bug in [HexaPDF::Font::TrueType::Subsetter#use_glyph] which led to corrupt
84
+ text output
85
+ * [HexaPDF::Serializer] to handle infinite recursion problem
86
+ * Cross-reference table reconstruction to avoid an O(n^2) performance problem
87
+ * [HexaPDF::Type::Resources] validation to handle an invalid `/ProcSet` entry
88
+ containing a single value instead of an array
89
+ * Processing of invalid PDF files missing a required value in appearance streams
90
+ * Processing of invalid empty arrays that should be rectangles by converting
91
+ them to PDF null objects
92
+ * Processing of invalid PDF files containing indirect objects with offset 0
93
+ * Processing of invalid PDF files containing a space/CR or space/LF combination
94
+ after the 'stream' keyword
95
+
96
+
1
97
  ## 0.14.2 - 2021-01-22
2
98
 
3
99
  ### Fixed
@@ -52,18 +52,26 @@ module HexaPDF
52
52
  If the the output file name is not given, all form fields are listed in page order. Use
53
53
  the global --verbose option to show additional information like field type and location.
54
54
 
55
- If the output file name is given, the fields can be interactively filled out. By
56
- additionally using the --template option, the data for the fields is read from the given
57
- template file instead of the standard input.
55
+ If the output file name is given, the fields can be filled out interactively, via a
56
+ template or just flattened by using the respective options. Form field flattening can also
57
+ be activated in addition to filling out the form. If neither --fill, --template nor
58
+ --flatten is specified, --fill is implied.
58
59
  EOF
59
60
 
60
61
  options.on("--password PASSWORD", "-p", String,
61
62
  "The password for decryption. Use - for reading from standard input.") do |pwd|
62
63
  @password = (pwd == '-' ? read_password : pwd)
63
64
  end
65
+ options.on("--fill", "Fill out the form") do
66
+ @fill = true
67
+ end
64
68
  options.on("--template TEMPLATE_FILE", "-t TEMPLATE_FILE",
65
- "Use the template file for the field values") do |template|
69
+ "Use the template file for the field values (implies --fill)") do |template|
66
70
  @template = template
71
+ @fill = true
72
+ end
73
+ options.on('--flatten', 'Flatten the form fields') do
74
+ @flatten = true
67
75
  end
68
76
  options.on("--[no-]viewer-override", "Let the PDF viewer override the visual " \
69
77
  "appearance. Default: use setting from input PDF") do |need_appearances|
@@ -75,6 +83,8 @@ module HexaPDF
75
83
  end
76
84
 
77
85
  @password = nil
86
+ @fill = false
87
+ @flatten = false
78
88
  @template = nil
79
89
  @need_appearances = nil
80
90
  @incremental = true
@@ -82,16 +92,28 @@ module HexaPDF
82
92
 
83
93
  def execute(in_file, out_file = nil) #:nodoc:
84
94
  maybe_raise_on_existing_file(out_file) if out_file
95
+ if (@fill || @flatten) && !out_file
96
+ raise "Output file missing"
97
+ end
85
98
  with_document(in_file, password: @password, out_file: out_file,
86
99
  incremental: @incremental) do |doc|
87
100
  if !doc.acro_form
88
101
  raise "This PDF doesn't contain an interactive form"
89
102
  elsif out_file
90
103
  doc.acro_form[:NeedAppearances] = @need_appearances unless @need_appearances.nil?
91
- if @template
92
- fill_form_with_template(doc)
93
- else
94
- fill_form(doc)
104
+ if @fill || !@flatten
105
+ if @template
106
+ fill_form_with_template(doc)
107
+ else
108
+ fill_form(doc)
109
+ end
110
+ end
111
+ if @flatten
112
+ unless doc.acro_form.flatten.empty?
113
+ $stderr.puts "Warning: Not all form fields could be flattened"
114
+ doc.catalog.delete(:AcroForm)
115
+ doc.delete(doc.acro_form)
116
+ end
95
117
  end
96
118
  else
97
119
  list_form_fields(doc)
@@ -164,9 +164,20 @@ module HexaPDF
164
164
  # acro_form.fallback_font::
165
165
  # The font that should be used when a variable text field references a font that cannot be used.
166
166
  #
167
- # Can either be the name of a font, like 'Helvetica', or an array consisting of the font name
168
- # and a hash of font options, like ['Helvetica', variant: :italic]. If set to +nil+, the use of
169
- # the fallback font is disabled.
167
+ # Can be one of the following:
168
+ #
169
+ # * The name of a font, like 'Helvetica'.
170
+ #
171
+ # * An array consisting of the font name and a hash of font options, like ['Helvetica',
172
+ # variant: :italic].
173
+ #
174
+ # * A callable object receiving the field and the font object (or +nil+ if no valid font object
175
+ # was found) and which has to return either a font name or an array consisting of the font
176
+ # name and a hash of font options. This way the response can be different depending on the
177
+ # original font and it would also allow e.g. modifying the configured fonts to add custom
178
+ # ones.
179
+ #
180
+ # If set to +nil+, the use of the fallback font is disabled.
170
181
  #
171
182
  # Default is 'Helvetica'.
172
183
  #
@@ -393,7 +404,7 @@ module HexaPDF
393
404
  DCTDecode: 'HexaPDF::Filter::PassThrough',
394
405
  DCT: 'HexaPDF::Filter::PassThrough',
395
406
  JPXDecode: 'HexaPDF::Filter::PassThrough',
396
- Crypt: nil,
407
+ Crypt: 'HexaPDF::Filter::Crypt',
397
408
  Encryption: 'HexaPDF::Filter::Encryption',
398
409
  },
399
410
  'font.map' => {},
@@ -516,6 +527,9 @@ module HexaPDF
516
527
  XXAcroFormField: 'HexaPDF::Type::AcroForm::Field',
517
528
  XXAppearanceDictionary: 'HexaPDF::Type::Annotation::AppearanceDictionary',
518
529
  Border: 'HexaPDF::Type::Annotation::Border',
530
+ SigFieldLock: 'HexaPDF::Type::AcroForm::SignatureField::LockDictionary',
531
+ SV: 'HexaPDF::Type::AcroForm::SignatureField::SeedValueDictionary',
532
+ SVCert: 'HexaPDF::Type::AcroForm::SignatureField::CertificateSeedValueDictionary',
519
533
  },
520
534
  'object.subtype_map' => {
521
535
  nil => {
@@ -561,6 +575,7 @@ module HexaPDF
561
575
  Tx: 'HexaPDF::Type::AcroForm::TextField',
562
576
  Btn: 'HexaPDF::Type::AcroForm::ButtonField',
563
577
  Ch: 'HexaPDF::Type::AcroForm::ChoiceField',
578
+ Sig: 'HexaPDF::Type::AcroForm::SignatureField',
564
579
  },
565
580
  })
566
581
 
@@ -1260,6 +1260,7 @@ module HexaPDF
1260
1260
  unless obj.kind_of?(HexaPDF::Stream)
1261
1261
  obj = context.document.images.add(obj)
1262
1262
  end
1263
+ return obj if obj.width == 0 || obj.height == 0
1263
1264
 
1264
1265
  width, height = calculate_dimensions(obj.width, obj.height,
1265
1266
  rwidth: width, rheight: height)
@@ -156,6 +156,9 @@ module HexaPDF
156
156
  #
157
157
  # * Returns the default value if one is specified and no value is available.
158
158
  #
159
+ # Note: If field information is available for the entry, a Hash or Array value will always be
160
+ # wrapped by Dictionary or PDFArray. Otherwise, the value will be returned as-is.
161
+ #
159
162
  # Note: This method may throw a "can't add a new key into hash during iteration" error in
160
163
  # certain cases because it potentially modifies the underlying hash!
161
164
  def [](name)
@@ -344,7 +344,7 @@ module HexaPDF
344
344
  # Wraps a given array in the Rectangle class. Otherwise returns +nil+.
345
345
  def self.convert(data, _type, document)
346
346
  return unless data.kind_of?(Array) || data.kind_of?(HexaPDF::PDFArray)
347
- document.wrap(data, type: Rectangle)
347
+ data.empty? ? document.wrap(nil) : document.wrap(data, type: Rectangle)
348
348
  end
349
349
 
350
350
  end
@@ -268,7 +268,7 @@ module HexaPDF
268
268
  str.replace(string_algorithm.decrypt(key, str))
269
269
  end
270
270
 
271
- if obj.kind_of?(HexaPDF::Stream)
271
+ if obj.kind_of?(HexaPDF::Stream) && obj.raw_stream.filter[0] != :Crypt
272
272
  unless string_algorithm == stream_algorithm
273
273
  key = object_key(obj.oid, obj.gen, stream_algorithm)
274
274
  end
@@ -300,7 +300,12 @@ module HexaPDF
300
300
  obj.raw_stream.key == key && obj.raw_stream.algorithm == stream_algorithm
301
301
  obj.raw_stream.undecrypted_fiber
302
302
  else
303
- stream_algorithm.encryption_fiber(key, result)
303
+ filter = obj[:Filter]
304
+ if filter == :Crypt || (filter.kind_of?(PDFArray) && filter[0] == :Crypt)
305
+ result
306
+ else
307
+ stream_algorithm.encryption_fiber(key, result)
308
+ end
304
309
  end
305
310
  end
306
311
 
@@ -240,6 +240,18 @@ module HexaPDF
240
240
  end
241
241
  end
242
242
 
243
+ def decrypt(obj) #:nodoc:
244
+ if obj.type == :Metadata && obj == document.catalog.value[:Metadata] && !dict[:EncryptMetadata]
245
+ obj
246
+ else
247
+ super
248
+ end
249
+ end
250
+
251
+ def encrypt_stream(obj) #:nodoc
252
+ obj == document.catalog.value[:Metadata] && !dict[:EncryptMetadata] ? obj.stream_encoder : super
253
+ end
254
+
243
255
  private
244
256
 
245
257
  # Prepares the security handler for use in encrypting the document.
data/lib/hexapdf/error.rb CHANGED
@@ -43,12 +43,13 @@ module HexaPDF
43
43
  class MalformedPDFError < Error
44
44
 
45
45
  # The byte position in the PDF file where the error occurred.
46
- attr_reader :pos
46
+ attr_accessor :pos
47
47
 
48
48
  # Creates a new malformed PDF error object for the given exception message.
49
49
  #
50
- # The byte position where the error occured has to be given via the +pos+ argument.
51
- def initialize(message, pos:)
50
+ # The byte position where the error occurred can either be given via the +pos+ argument or later
51
+ # via the #pos accessor but must be set before the exception message is retrieved.
52
+ def initialize(message, pos: nil)
52
53
  super(message)
53
54
  @pos = pos
54
55
  end
@@ -95,6 +95,7 @@ module HexaPDF
95
95
  autoload(:Predictor, 'hexapdf/filter/predictor')
96
96
 
97
97
  autoload(:Encryption, 'hexapdf/filter/encryption')
98
+ autoload(:Crypt, 'hexapdf/filter/crypt')
98
99
 
99
100
  autoload(:PassThrough, 'hexapdf/filter/pass_through')
100
101
 
@@ -0,0 +1,60 @@
1
+ # -*- encoding: utf-8; frozen_string_literal: true -*-
2
+ #
3
+ #--
4
+ # This file is part of HexaPDF.
5
+ #
6
+ # HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
7
+ # Copyright (C) 2014-2020 Thomas Leitner
8
+ #
9
+ # HexaPDF is free software: you can redistribute it and/or modify it
10
+ # under the terms of the GNU Affero General Public License version 3 as
11
+ # published by the Free Software Foundation with the addition of the
12
+ # following permission added to Section 15 as permitted in Section 7(a):
13
+ # FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
14
+ # THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
15
+ # INFRINGEMENT OF THIRD PARTY RIGHTS.
16
+ #
17
+ # HexaPDF is distributed in the hope that it will be useful, but WITHOUT
18
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
20
+ # License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Affero General Public License
23
+ # along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
24
+ #
25
+ # The interactive user interfaces in modified source and object code
26
+ # versions of HexaPDF must display Appropriate Legal Notices, as required
27
+ # under Section 5 of the GNU Affero General Public License version 3.
28
+ #
29
+ # In accordance with Section 7(b) of the GNU Affero General Public
30
+ # License, a covered work must retain the producer line in every PDF that
31
+ # is created or manipulated using HexaPDF.
32
+ #
33
+ # If the GNU Affero General Public License doesn't fit your need,
34
+ # commercial licenses are available at <https://gettalong.at/hexapdf/>.
35
+ #++
36
+
37
+ require 'hexapdf/error'
38
+
39
+ module HexaPDF
40
+ module Filter
41
+
42
+ # This filter module implements the Crypt filter. The only supported part is using the Identity
43
+ # filter.
44
+ module Crypt
45
+
46
+ # See HexaPDF::Filter
47
+ def self.decoder(source, options)
48
+ if !options || !options.key?(:Name) || options[:Name] == :Identity
49
+ source
50
+ else
51
+ raise FilterError, "Handling of Crypt filters besides Identity is not implemented"
52
+ end
53
+ end
54
+
55
+ singleton_class.send(:alias_method, :encoder, :decoder)
56
+
57
+ end
58
+
59
+ end
60
+ end
@@ -67,7 +67,11 @@ module HexaPDF
67
67
  # they never appear in the output (PDF serialization would need to escape them)
68
68
  if @last_id == 13 || @last_id == 40 || @last_id == 92
69
69
  @glyph_map[:"s#{@last_id}"] = @last_id
70
- @last_id += (@last_id == 40 ? 2 : 1)
70
+ if @last_id == 40
71
+ @last_id += 1
72
+ @glyph_map[:"s#{@last_id}"] = @last_id
73
+ end
74
+ @last_id += 1
71
75
  end
72
76
  @glyph_map[glyph_id] = @last_id
73
77
  end
@@ -207,7 +207,8 @@ module HexaPDF
207
207
 
208
208
  # Returns the rest of the line, with whitespace stripped.
209
209
  def parse_string
210
- line = @line.strip!
210
+ @line.strip!
211
+ line = @line
211
212
  @line = ''
212
213
  line
213
214
  end
@@ -56,10 +56,12 @@ module HexaPDF
56
56
  # PDF references are resolved using the associated Document object.
57
57
  def initialize(io, document)
58
58
  @io = io
59
- @tokenizer = Tokenizer.new(io)
59
+ on_correctable_error = document.config['parser.on_correctable_error'].curry[document]
60
+ @tokenizer = Tokenizer.new(io, on_correctable_error: on_correctable_error)
60
61
  @document = document
61
62
  @object_stream_data = {}
62
63
  @reconstructed_revision = nil
64
+ @in_reconstruct_revision = false
63
65
  retrieve_pdf_header_offset_and_version
64
66
  end
65
67
 
@@ -72,7 +74,13 @@ module HexaPDF
72
74
  obj, oid, gen, stream =
73
75
  case xref_entry.type
74
76
  when :in_use
75
- parse_indirect_object(xref_entry.pos)
77
+ if xref_entry.pos == 0 && xref_entry.oid != 0
78
+ # Handle seen-in-the-wild objects with invalid offset 0
79
+ maybe_raise("Indirect object (#{xref_entry.oid},#{xref_entry.gen}) has offset 0", pos: 0)
80
+ [nil, xref_entry.oid, xref_entry.gen, nil]
81
+ else
82
+ parse_indirect_object(xref_entry.pos)
83
+ end
76
84
  when :free
77
85
  [nil, xref_entry.oid, xref_entry.gen, nil]
78
86
  when :compressed
@@ -83,12 +91,13 @@ module HexaPDF
83
91
 
84
92
  if xref_entry.oid != 0 && (oid != xref_entry.oid || gen != xref_entry.gen)
85
93
  raise_malformed("The oid,gen (#{oid},#{gen}) values of the indirect object don't match " \
86
- "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
94
+ "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
87
95
  end
88
96
 
89
97
  @document.wrap(obj, oid: oid, gen: gen, stream: stream)
90
98
  rescue HexaPDF::MalformedPDFError
91
- reconstructed_revision.object(xref_entry)
99
+ reconstructed_revision.object(xref_entry) ||
100
+ @document.wrap(nil, oid: xref_entry.oid, gen: xref_entry.gen)
92
101
  end
93
102
 
94
103
  # Parses the indirect object at the specified offset.
@@ -131,7 +140,11 @@ module HexaPDF
131
140
  raise_malformed("A stream needs a dictionary, not a(n) #{object.class}", pos: offset)
132
141
  end
133
142
  tok1 = @tokenizer.next_byte
134
- tok2 = @tokenizer.next_byte if tok1 == 13 # 13=CR, 10=LF
143
+ if tok1 == 32 # space
144
+ maybe_raise("Keyword stream followed by space instead of LF or CR/LF", pos: @tokenizer.pos)
145
+ tok1 = @tokenizer.next_byte
146
+ end
147
+ tok2 = @tokenizer.next_byte if tok1 == 13 # CR
135
148
  if tok1 != 10 && tok1 != 13
136
149
  raise_malformed("Keyword stream must be followed by LF or CR/LF", pos: @tokenizer.pos)
137
150
  elsif tok1 == 13 && tok2 != 10
@@ -203,7 +216,12 @@ module HexaPDF
203
216
  unless obj.respond_to?(:xref_section)
204
217
  raise_malformed("Object is not a cross-reference stream", pos: pos)
205
218
  end
206
- xref_section = obj.xref_section
219
+ begin
220
+ xref_section = obj.xref_section
221
+ rescue MalformedPDFError => e
222
+ e.pos = pos
223
+ raise
224
+ end
207
225
  trailer = obj.trailer
208
226
  unless xref_section.entry?(obj.oid, obj.gen)
209
227
  maybe_raise("Cross-reference stream doesn't contain entry for itself", pos: pos)
@@ -381,34 +399,42 @@ module HexaPDF
381
399
  # If the file contains multiple cross-reference sections, all objects will be put into a single
382
400
  # cross-reference table, later objects overwriting prior ones.
383
401
  def reconstruct_revision
402
+ return if @in_reconstruct_revision
403
+ @in_reconstruct_revision = true
404
+
384
405
  raise unless @document.config['parser.try_xref_reconstruction']
385
406
  msg = "#{$!} - trying cross-reference table reconstruction"
386
407
  @document.config['parser.on_correctable_error'].call(@document, msg, @tokenizer.pos)
387
408
 
388
409
  xref = XRefSection.new
389
410
  @tokenizer.pos = 0
411
+ linearized = nil
390
412
  while true
391
413
  @tokenizer.skip_whitespace
392
414
  pos = @tokenizer.pos
393
- @tokenizer.scan_until(/(\n|\r\n?)+/)
415
+ @tokenizer.scan_until(/(\n|\r\n?)+|\z/)
394
416
  next_new_line_pos = @tokenizer.pos
395
417
  @tokenizer.pos = pos
396
418
 
397
- token = @tokenizer.next_token rescue nil
419
+ token = @tokenizer.next_integer_or_keyword rescue nil
398
420
  if token.kind_of?(Integer)
399
- gen = @tokenizer.next_token rescue nil
400
- tok = @tokenizer.next_token rescue nil
421
+ gen = @tokenizer.next_integer_or_keyword rescue nil
422
+ tok = @tokenizer.next_integer_or_keyword rescue nil
401
423
  if @tokenizer.pos > next_new_line_pos
402
424
  @tokenizer.pos = next_new_line_pos
403
425
  elsif gen.kind_of?(Integer) && tok.kind_of?(Tokenizer::Token) && tok == 'obj'
404
426
  xref.add_in_use_entry(token, gen, pos)
427
+ if linearized.nil?
428
+ obj = @tokenizer.next_object rescue nil
429
+ linearized = obj.kind_of?(Hash) && obj.key?(:Linearized)
430
+ end
405
431
  @tokenizer.scan_until(/(?:\n|\r\n?)endobj\b/)
406
432
  end
407
433
  elsif token.kind_of?(Tokenizer::Token) && token == 'trailer'
408
434
  obj = @tokenizer.next_object rescue nil
409
435
  # Use last trailer found in case of multiple revisions but use first trailer in case of
410
436
  # linearized file.
411
- trailer = obj if obj.kind_of?(Hash) && (obj.key?(:Prev) || trailer.nil?)
437
+ trailer = obj if obj.kind_of?(Hash) && (!linearized || trailer.nil?)
412
438
  elsif token == Tokenizer::NO_MORE_TOKENS
413
439
  break
414
440
  else
@@ -416,16 +442,22 @@ module HexaPDF
416
442
  end
417
443
  end
418
444
 
419
- trailer&.delete(:Prev) # no need for this and may wreak havoc
420
445
  if !trailer || trailer.empty?
421
- raise_malformed("Could not reconstruct malformed PDF because trailer was not found", pos: 0)
446
+ _, trailer = load_revision(startxref_offset) rescue nil
447
+ unless trailer
448
+ @in_reconstruct_revision = false
449
+ raise_malformed("Could not reconstruct malformed PDF because trailer was not found", pos: 0)
450
+ end
422
451
  end
452
+ trailer&.delete(:Prev) # no need for this and may wreak havoc
423
453
 
424
454
  loader = lambda do |xref_entry|
425
455
  obj, oid, gen, stream = parse_indirect_object(xref_entry.pos)
426
- @document.wrap(obj, oid: oid, gen: gen, stream: stream)
456
+ obj = @document.wrap(obj, oid: oid, gen: gen, stream: stream)
457
+ @document.security_handler ? @document.security_handler.decrypt(obj) : obj
427
458
  end
428
459
 
460
+ @in_reconstruct_revision = false
429
461
  Revision.new(@document.wrap(trailer, type: :XXTrailer), xref_section: xref,
430
462
  loader: loader)
431
463
  end