hexapdf 0.14.1 → 0.15.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +97 -0
  3. data/lib/hexapdf/cli/form.rb +30 -8
  4. data/lib/hexapdf/configuration.rb +19 -4
  5. data/lib/hexapdf/content/canvas.rb +1 -0
  6. data/lib/hexapdf/dictionary.rb +3 -0
  7. data/lib/hexapdf/dictionary_fields.rb +1 -1
  8. data/lib/hexapdf/encryption/security_handler.rb +7 -2
  9. data/lib/hexapdf/error.rb +4 -3
  10. data/lib/hexapdf/filter.rb +1 -0
  11. data/lib/hexapdf/filter/crypt.rb +60 -0
  12. data/lib/hexapdf/font/true_type/subsetter.rb +7 -3
  13. data/lib/hexapdf/font/type1/afm_parser.rb +2 -1
  14. data/lib/hexapdf/parser.rb +46 -14
  15. data/lib/hexapdf/pdf_array.rb +3 -0
  16. data/lib/hexapdf/revision.rb +16 -0
  17. data/lib/hexapdf/serializer.rb +10 -3
  18. data/lib/hexapdf/tokenizer.rb +44 -3
  19. data/lib/hexapdf/type/acro_form.rb +1 -0
  20. data/lib/hexapdf/type/acro_form/appearance_generator.rb +32 -17
  21. data/lib/hexapdf/type/acro_form/button_field.rb +8 -4
  22. data/lib/hexapdf/type/acro_form/field.rb +1 -0
  23. data/lib/hexapdf/type/acro_form/form.rb +37 -0
  24. data/lib/hexapdf/type/acro_form/signature_field.rb +223 -0
  25. data/lib/hexapdf/type/annotation.rb +13 -9
  26. data/lib/hexapdf/type/annotations/widget.rb +3 -1
  27. data/lib/hexapdf/type/font_descriptor.rb +9 -2
  28. data/lib/hexapdf/type/page.rb +81 -0
  29. data/lib/hexapdf/type/resources.rb +4 -0
  30. data/lib/hexapdf/type/xref_stream.rb +7 -0
  31. data/lib/hexapdf/utils/graphics_helpers.rb +4 -4
  32. data/lib/hexapdf/version.rb +1 -1
  33. data/test/hexapdf/content/test_canvas.rb +21 -0
  34. data/test/hexapdf/encryption/test_security_handler.rb +15 -0
  35. data/test/hexapdf/filter/test_crypt.rb +21 -0
  36. data/test/hexapdf/font/true_type/test_subsetter.rb +7 -2
  37. data/test/hexapdf/font/type1/test_afm_parser.rb +5 -0
  38. data/test/hexapdf/test_dictionary_fields.rb +7 -0
  39. data/test/hexapdf/test_parser.rb +82 -2
  40. data/test/hexapdf/test_revision.rb +21 -0
  41. data/test/hexapdf/test_serializer.rb +10 -0
  42. data/test/hexapdf/test_tokenizer.rb +50 -0
  43. data/test/hexapdf/test_writer.rb +2 -2
  44. data/test/hexapdf/type/acro_form/test_appearance_generator.rb +24 -3
  45. data/test/hexapdf/type/acro_form/test_button_field.rb +13 -7
  46. data/test/hexapdf/type/acro_form/test_field.rb +5 -0
  47. data/test/hexapdf/type/acro_form/test_form.rb +46 -2
  48. data/test/hexapdf/type/acro_form/test_signature_field.rb +38 -0
  49. data/test/hexapdf/type/annotations/test_widget.rb +2 -0
  50. data/test/hexapdf/type/test_annotation.rb +20 -10
  51. data/test/hexapdf/type/test_font_descriptor.rb +7 -0
  52. data/test/hexapdf/type/test_page.rb +187 -49
  53. data/test/hexapdf/type/test_resources.rb +6 -0
  54. data/test/hexapdf/type/test_xref_stream.rb +7 -0
  55. data/test/hexapdf/utils/test_graphics_helpers.rb +8 -0
  56. metadata +6 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e4010e277168cec5c8cc5d584ec324064461e63756d18b538cd335235fe04e6d
4
- data.tar.gz: 2b7a71463082a32605adee682c81cdde6b0eb48d360ca66249b08884f82e571b
3
+ metadata.gz: 310d9fc74134cb2840118b3637c35a3037909d707532116680ef2fe6f42c43d3
4
+ data.tar.gz: 76f05b220e101114d4a4136d8c07520cfe35e1e532652ea4e43593a4b812284c
5
5
  SHA512:
6
- metadata.gz: 5748273dc4dc532cd365598e25c4a9cc5872011d2eb638c2986050aeed0a68d2dc5769fda075eb60cbcb76fccbfb1a5b52c3c58581cb6e969978c17d770013e6
7
- data.tar.gz: 0ab3abf80967804486fa1f50f186b508fd792acfbd8c47646fa7d0c5b0245161e2833620142b2f05a1ee73b01145016dca7bf7781d579284160c9d2dd2c78d0c
6
+ metadata.gz: 6e811e637b859f3e327ece6af28174bd6602cec0585af596c9b174127ab7276752c3e519e668757933945f2906f4a6856a96e68575eb9139f32365bfa6b8a36b
7
+ data.tar.gz: 54e94c7d6704a340d1e5a5ad7b35e0715b06b6604ecec4671a079cea6049f786308258a44378280597d59e66c17e4eefd6317e94fe9a465671c2d093a7395ad3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,100 @@
1
+ ## 0.15.1 - 2021-04-15
2
+
3
+ ### Fixed
4
+
5
+ * Potential division by zero when calculating the scaling for XObjects
6
+ * Handling of XObjects with a width or height of zero when drawing on canvas
7
+
8
+
9
+ ## 0.15.0 - 2021-04-12
10
+
11
+ ### Added
12
+
13
+ * [HexaPDF::Type::Page#flatten_annotations] for flattening the annotations of a
14
+ page
15
+ * [HexaPDF::Type::AcroForm::Form#flatten] for flattening interactive forms
16
+ * [HexaPDF::Revision#update] for updating the stored wrapper class of a PDF
17
+ object
18
+ * [HexaPDF::Type::AcroForm::SignatureField] for working with AcroForm signature
19
+ fields
20
+ * Support for form field flattening to the `hexapdf form` CLI command
21
+
22
+ ### Changed
23
+
24
+ * **Breaking change**: Overhauled the interface for accessing appearances of
25
+ annotations to make it more convenient
26
+ * Validation of [HexaPDF::Type::FontDescriptor] to delete invalid `/FontWeight`
27
+ value
28
+ * [HexaPDF::MalformedPDFError#pos] an accessor instead of a reader and update
29
+ the exception message
30
+ * Configuration option 'acro_form.fallback_font' to allow a callable object for
31
+ more advanced fallback font handling
32
+
33
+ ### Fixed
34
+
35
+ * [HexaPDF::Type::Annotations::Widget#background_color] to correctly handle
36
+ empty background color arrays
37
+ * [HexaPDF::Type::AcroForm::Field#delete_widget] to update the wrapper object
38
+ stored in the document in case the widget is embedded
39
+ * Processing of invalid PDF files containing a space,CR,LF combination after
40
+ the 'stream' keyword
41
+ * Cross-reference stream reconstruction with respect to detection of linearized
42
+ files
43
+ * Detection of existing appearances for AcroForm push button fields when
44
+ creating appearances
45
+
46
+
47
+ ## 0.14.4 - 2021-02-27
48
+
49
+ ### Added
50
+
51
+ * Support for the Crypt filters
52
+
53
+ ### Changed
54
+
55
+ * [HexaPDF::MalformedPDFError] to make the `pos` argument optional
56
+
57
+ ### Fixed
58
+
59
+ * Handling of invalid floating point numbers NaN, Inf and -Inf when serializing
60
+ * Processing of invalid PDF files containing NaN and Inf instead of numbers
61
+ * Bug in Type1 font AFM parser that occurred if the file doesn't end with a new
62
+ line character
63
+ * Cross-reference table reconstruction to handle the case of an entry specifying
64
+ a non-existent indirect object
65
+ * Cross-reference table reconstruction to handle trailers specified by cross-
66
+ reference streams
67
+ * Cross-reference table reconstruction to use the set security handler for
68
+ decrypting indirect objects
69
+ * Parsing of cross-reference streams where data is missing
70
+
71
+
72
+ ## 0.14.3 - 2021-02-16
73
+
74
+ ### Fixed
75
+
76
+ * Bug in [HexaPDF::Font::TrueType::Subsetter#use_glyph] which led to corrupt
77
+ text output
78
+ * [HexaPDF::Serializer] to handle infinite recursion problem
79
+ * Cross-reference table reconstruction to avoid an O(n^2) performance problem
80
+ * [HexaPDF::Type::Resources] validation to handle an invalid `/ProcSet` entry
81
+ containing a single value instead of an array
82
+ * Processing of invalid PDF files missing a required value in appearance streams
83
+ * Processing of invalid empty arrays that should be rectangles by converting
84
+ them to PDF null objects
85
+ * Processing of invalid PDF files containing indirect objects with offset 0
86
+ * Processing of invalid PDF files containing a space/CR or space/LF combination
87
+ after the 'stream' keyword
88
+
89
+
90
+ ## 0.14.2 - 2021-01-22
91
+
92
+ ### Fixed
93
+
94
+ * [HexaPDF::Font::TrueType::Subsetter#use_glyph] to really avoid using subset
95
+ glyph ID 41 (`)`)
96
+
97
+
1
98
  ## 0.14.1 - 2021-01-21
2
99
 
3
100
  ### Changed
@@ -52,18 +52,26 @@ module HexaPDF
52
52
  If the the output file name is not given, all form fields are listed in page order. Use
53
53
  the global --verbose option to show additional information like field type and location.
54
54
 
55
- If the output file name is given, the fields can be interactively filled out. By
56
- additionally using the --template option, the data for the fields is read from the given
57
- template file instead of the standard input.
55
+ If the output file name is given, the fields can be filled out interactively, via a
56
+ template or just flattened by using the respective options. Form field flattening can also
57
+ be activated in addition to filling out the form. If neither --fill, --template nor
58
+ --flatten is specified, --fill is implied.
58
59
  EOF
59
60
 
60
61
  options.on("--password PASSWORD", "-p", String,
61
62
  "The password for decryption. Use - for reading from standard input.") do |pwd|
62
63
  @password = (pwd == '-' ? read_password : pwd)
63
64
  end
65
+ options.on("--fill", "Fill out the form") do
66
+ @fill = true
67
+ end
64
68
  options.on("--template TEMPLATE_FILE", "-t TEMPLATE_FILE",
65
- "Use the template file for the field values") do |template|
69
+ "Use the template file for the field values (implies --fill)") do |template|
66
70
  @template = template
71
+ @fill = true
72
+ end
73
+ options.on('--flatten', 'Flatten the form fields') do
74
+ @flatten = true
67
75
  end
68
76
  options.on("--[no-]viewer-override", "Let the PDF viewer override the visual " \
69
77
  "appearance. Default: use setting from input PDF") do |need_appearances|
@@ -75,6 +83,8 @@ module HexaPDF
75
83
  end
76
84
 
77
85
  @password = nil
86
+ @fill = false
87
+ @flatten = false
78
88
  @template = nil
79
89
  @need_appearances = nil
80
90
  @incremental = true
@@ -82,16 +92,28 @@ module HexaPDF
82
92
 
83
93
  def execute(in_file, out_file = nil) #:nodoc:
84
94
  maybe_raise_on_existing_file(out_file) if out_file
95
+ if (@fill || @flatten) && !out_file
96
+ raise "Output file missing"
97
+ end
85
98
  with_document(in_file, password: @password, out_file: out_file,
86
99
  incremental: @incremental) do |doc|
87
100
  if !doc.acro_form
88
101
  raise "This PDF doesn't contain an interactive form"
89
102
  elsif out_file
90
103
  doc.acro_form[:NeedAppearances] = @need_appearances unless @need_appearances.nil?
91
- if @template
92
- fill_form_with_template(doc)
93
- else
94
- fill_form(doc)
104
+ if @fill || !@flatten
105
+ if @template
106
+ fill_form_with_template(doc)
107
+ else
108
+ fill_form(doc)
109
+ end
110
+ end
111
+ if @flatten
112
+ unless doc.acro_form.flatten.empty?
113
+ $stderr.puts "Warning: Not all form fields could be flattened"
114
+ doc.catalog.delete(:AcroForm)
115
+ doc.delete(doc.acro_form)
116
+ end
95
117
  end
96
118
  else
97
119
  list_form_fields(doc)
@@ -164,9 +164,20 @@ module HexaPDF
164
164
  # acro_form.fallback_font::
165
165
  # The font that should be used when a variable text field references a font that cannot be used.
166
166
  #
167
- # Can either be the name of a font, like 'Helvetica', or an array consisting of the font name
168
- # and a hash of font options, like ['Helvetica', variant: :italic]. If set to +nil+, the use of
169
- # the fallback font is disabled.
167
+ # Can be one of the following:
168
+ #
169
+ # * The name of a font, like 'Helvetica'.
170
+ #
171
+ # * An array consisting of the font name and a hash of font options, like ['Helvetica',
172
+ # variant: :italic].
173
+ #
174
+ # * A callable object receiving the field and the font object (or +nil+ if no valid font object
175
+ # was found) and which has to return either a font name or an array consisting of the font
176
+ # name and a hash of font options. This way the response can be different depending on the
177
+ # original font and it would also allow e.g. modifying the configured fonts to add custom
178
+ # ones.
179
+ #
180
+ # If set to +nil+, the use of the fallback font is disabled.
170
181
  #
171
182
  # Default is 'Helvetica'.
172
183
  #
@@ -393,7 +404,7 @@ module HexaPDF
393
404
  DCTDecode: 'HexaPDF::Filter::PassThrough',
394
405
  DCT: 'HexaPDF::Filter::PassThrough',
395
406
  JPXDecode: 'HexaPDF::Filter::PassThrough',
396
- Crypt: nil,
407
+ Crypt: 'HexaPDF::Filter::Crypt',
397
408
  Encryption: 'HexaPDF::Filter::Encryption',
398
409
  },
399
410
  'font.map' => {},
@@ -516,6 +527,9 @@ module HexaPDF
516
527
  XXAcroFormField: 'HexaPDF::Type::AcroForm::Field',
517
528
  XXAppearanceDictionary: 'HexaPDF::Type::Annotation::AppearanceDictionary',
518
529
  Border: 'HexaPDF::Type::Annotation::Border',
530
+ SigFieldLock: 'HexaPDF::Type::AcroForm::SignatureField::LockDictionary',
531
+ SV: 'HexaPDF::Type::AcroForm::SignatureField::SeedValueDictionary',
532
+ SVCert: 'HexaPDF::Type::AcroForm::SignatureField::CertificateSeedValueDictionary',
519
533
  },
520
534
  'object.subtype_map' => {
521
535
  nil => {
@@ -561,6 +575,7 @@ module HexaPDF
561
575
  Tx: 'HexaPDF::Type::AcroForm::TextField',
562
576
  Btn: 'HexaPDF::Type::AcroForm::ButtonField',
563
577
  Ch: 'HexaPDF::Type::AcroForm::ChoiceField',
578
+ Sig: 'HexaPDF::Type::AcroForm::SignatureField',
564
579
  },
565
580
  })
566
581
 
@@ -1260,6 +1260,7 @@ module HexaPDF
1260
1260
  unless obj.kind_of?(HexaPDF::Stream)
1261
1261
  obj = context.document.images.add(obj)
1262
1262
  end
1263
+ return obj if obj.width == 0 || obj.height == 0
1263
1264
 
1264
1265
  width, height = calculate_dimensions(obj.width, obj.height,
1265
1266
  rwidth: width, rheight: height)
@@ -156,6 +156,9 @@ module HexaPDF
156
156
  #
157
157
  # * Returns the default value if one is specified and no value is available.
158
158
  #
159
+ # Note: If field information is available for the entry, a Hash or Array value will always be
160
+ # wrapped by Dictionary or PDFArray. Otherwise, the value will be returned as-is.
161
+ #
159
162
  # Note: This method may throw a "can't add a new key into hash during iteration" error in
160
163
  # certain cases because it potentially modifies the underlying hash!
161
164
  def [](name)
@@ -344,7 +344,7 @@ module HexaPDF
344
344
  # Wraps a given array in the Rectangle class. Otherwise returns +nil+.
345
345
  def self.convert(data, _type, document)
346
346
  return unless data.kind_of?(Array) || data.kind_of?(HexaPDF::PDFArray)
347
- document.wrap(data, type: Rectangle)
347
+ data.empty? ? document.wrap(nil) : document.wrap(data, type: Rectangle)
348
348
  end
349
349
 
350
350
  end
@@ -268,7 +268,7 @@ module HexaPDF
268
268
  str.replace(string_algorithm.decrypt(key, str))
269
269
  end
270
270
 
271
- if obj.kind_of?(HexaPDF::Stream)
271
+ if obj.kind_of?(HexaPDF::Stream) && obj.raw_stream.filter[0] != :Crypt
272
272
  unless string_algorithm == stream_algorithm
273
273
  key = object_key(obj.oid, obj.gen, stream_algorithm)
274
274
  end
@@ -300,7 +300,12 @@ module HexaPDF
300
300
  obj.raw_stream.key == key && obj.raw_stream.algorithm == stream_algorithm
301
301
  obj.raw_stream.undecrypted_fiber
302
302
  else
303
- stream_algorithm.encryption_fiber(key, result)
303
+ filter = obj[:Filter]
304
+ if filter == :Crypt || (filter.kind_of?(PDFArray) && filter[0] == :Crypt)
305
+ result
306
+ else
307
+ stream_algorithm.encryption_fiber(key, result)
308
+ end
304
309
  end
305
310
  end
306
311
 
data/lib/hexapdf/error.rb CHANGED
@@ -43,12 +43,13 @@ module HexaPDF
43
43
  class MalformedPDFError < Error
44
44
 
45
45
  # The byte position in the PDF file where the error occurred.
46
- attr_reader :pos
46
+ attr_accessor :pos
47
47
 
48
48
  # Creates a new malformed PDF error object for the given exception message.
49
49
  #
50
- # The byte position where the error occured has to be given via the +pos+ argument.
51
- def initialize(message, pos:)
50
+ # The byte position where the error occurred can either be given via the +pos+ argument or later
51
+ # via the #pos accessor but must be set before the exception message is retrieved.
52
+ def initialize(message, pos: nil)
52
53
  super(message)
53
54
  @pos = pos
54
55
  end
@@ -95,6 +95,7 @@ module HexaPDF
95
95
  autoload(:Predictor, 'hexapdf/filter/predictor')
96
96
 
97
97
  autoload(:Encryption, 'hexapdf/filter/encryption')
98
+ autoload(:Crypt, 'hexapdf/filter/crypt')
98
99
 
99
100
  autoload(:PassThrough, 'hexapdf/filter/pass_through')
100
101
 
@@ -0,0 +1,60 @@
1
+ # -*- encoding: utf-8; frozen_string_literal: true -*-
2
+ #
3
+ #--
4
+ # This file is part of HexaPDF.
5
+ #
6
+ # HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
7
+ # Copyright (C) 2014-2020 Thomas Leitner
8
+ #
9
+ # HexaPDF is free software: you can redistribute it and/or modify it
10
+ # under the terms of the GNU Affero General Public License version 3 as
11
+ # published by the Free Software Foundation with the addition of the
12
+ # following permission added to Section 15 as permitted in Section 7(a):
13
+ # FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
14
+ # THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
15
+ # INFRINGEMENT OF THIRD PARTY RIGHTS.
16
+ #
17
+ # HexaPDF is distributed in the hope that it will be useful, but WITHOUT
18
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
20
+ # License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Affero General Public License
23
+ # along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
24
+ #
25
+ # The interactive user interfaces in modified source and object code
26
+ # versions of HexaPDF must display Appropriate Legal Notices, as required
27
+ # under Section 5 of the GNU Affero General Public License version 3.
28
+ #
29
+ # In accordance with Section 7(b) of the GNU Affero General Public
30
+ # License, a covered work must retain the producer line in every PDF that
31
+ # is created or manipulated using HexaPDF.
32
+ #
33
+ # If the GNU Affero General Public License doesn't fit your need,
34
+ # commercial licenses are available at <https://gettalong.at/hexapdf/>.
35
+ #++
36
+
37
+ require 'hexapdf/error'
38
+
39
+ module HexaPDF
40
+ module Filter
41
+
42
+ # This filter module implements the Crypt filter. The only supported part is using the Identity
43
+ # filter.
44
+ module Crypt
45
+
46
+ # See HexaPDF::Filter
47
+ def self.decoder(source, options)
48
+ if !options || !options.key?(:Name) || options[:Name] == :Identity
49
+ source
50
+ else
51
+ raise FilterError, "Handling of Crypt filters besides Identity is not implemented"
52
+ end
53
+ end
54
+
55
+ singleton_class.send(:alias_method, :encoder, :decoder)
56
+
57
+ end
58
+
59
+ end
60
+ end
@@ -63,10 +63,14 @@ module HexaPDF
63
63
  def use_glyph(glyph_id)
64
64
  return @glyph_map[glyph_id] if @glyph_map.key?(glyph_id)
65
65
  @last_id += 1
66
- # Handle codes for ASCII characters \r, (, ) and \ specially so that they never appear in
67
- # the output (PDF serialization would need to escape them)
68
- if @last_id == 13 || @last_id == 40 || @last_id == 41 || @last_id == 92
66
+ # Handle codes for ASCII characters \r (13), (, ) (40, 41) and \ (92) specially so that
67
+ # they never appear in the output (PDF serialization would need to escape them)
68
+ if @last_id == 13 || @last_id == 40 || @last_id == 92
69
69
  @glyph_map[:"s#{@last_id}"] = @last_id
70
+ if @last_id == 40
71
+ @last_id += 1
72
+ @glyph_map[:"s#{@last_id}"] = @last_id
73
+ end
70
74
  @last_id += 1
71
75
  end
72
76
  @glyph_map[glyph_id] = @last_id
@@ -207,7 +207,8 @@ module HexaPDF
207
207
 
208
208
  # Returns the rest of the line, with whitespace stripped.
209
209
  def parse_string
210
- line = @line.strip!
210
+ @line.strip!
211
+ line = @line
211
212
  @line = ''
212
213
  line
213
214
  end
@@ -56,10 +56,12 @@ module HexaPDF
56
56
  # PDF references are resolved using the associated Document object.
57
57
  def initialize(io, document)
58
58
  @io = io
59
- @tokenizer = Tokenizer.new(io)
59
+ on_correctable_error = document.config['parser.on_correctable_error'].curry[document]
60
+ @tokenizer = Tokenizer.new(io, on_correctable_error: on_correctable_error)
60
61
  @document = document
61
62
  @object_stream_data = {}
62
63
  @reconstructed_revision = nil
64
+ @in_reconstruct_revision = false
63
65
  retrieve_pdf_header_offset_and_version
64
66
  end
65
67
 
@@ -72,7 +74,13 @@ module HexaPDF
72
74
  obj, oid, gen, stream =
73
75
  case xref_entry.type
74
76
  when :in_use
75
- parse_indirect_object(xref_entry.pos)
77
+ if xref_entry.pos == 0 && xref_entry.oid != 0
78
+ # Handle seen-in-the-wild objects with invalid offset 0
79
+ maybe_raise("Indirect object (#{xref_entry.oid},#{xref_entry.gen}) has offset 0", pos: 0)
80
+ [nil, xref_entry.oid, xref_entry.gen, nil]
81
+ else
82
+ parse_indirect_object(xref_entry.pos)
83
+ end
76
84
  when :free
77
85
  [nil, xref_entry.oid, xref_entry.gen, nil]
78
86
  when :compressed
@@ -83,12 +91,13 @@ module HexaPDF
83
91
 
84
92
  if xref_entry.oid != 0 && (oid != xref_entry.oid || gen != xref_entry.gen)
85
93
  raise_malformed("The oid,gen (#{oid},#{gen}) values of the indirect object don't match " \
86
- "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
94
+ "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
87
95
  end
88
96
 
89
97
  @document.wrap(obj, oid: oid, gen: gen, stream: stream)
90
98
  rescue HexaPDF::MalformedPDFError
91
- reconstructed_revision.object(xref_entry)
99
+ reconstructed_revision.object(xref_entry) ||
100
+ @document.wrap(nil, oid: xref_entry.oid, gen: xref_entry.gen)
92
101
  end
93
102
 
94
103
  # Parses the indirect object at the specified offset.
@@ -131,7 +140,11 @@ module HexaPDF
131
140
  raise_malformed("A stream needs a dictionary, not a(n) #{object.class}", pos: offset)
132
141
  end
133
142
  tok1 = @tokenizer.next_byte
134
- tok2 = @tokenizer.next_byte if tok1 == 13 # 13=CR, 10=LF
143
+ if tok1 == 32 # space
144
+ maybe_raise("Keyword stream followed by space instead of LF or CR/LF", pos: @tokenizer.pos)
145
+ tok1 = @tokenizer.next_byte
146
+ end
147
+ tok2 = @tokenizer.next_byte if tok1 == 13 # CR
135
148
  if tok1 != 10 && tok1 != 13
136
149
  raise_malformed("Keyword stream must be followed by LF or CR/LF", pos: @tokenizer.pos)
137
150
  elsif tok1 == 13 && tok2 != 10
@@ -203,7 +216,12 @@ module HexaPDF
203
216
  unless obj.respond_to?(:xref_section)
204
217
  raise_malformed("Object is not a cross-reference stream", pos: pos)
205
218
  end
206
- xref_section = obj.xref_section
219
+ begin
220
+ xref_section = obj.xref_section
221
+ rescue MalformedPDFError => e
222
+ e.pos = pos
223
+ raise
224
+ end
207
225
  trailer = obj.trailer
208
226
  unless xref_section.entry?(obj.oid, obj.gen)
209
227
  maybe_raise("Cross-reference stream doesn't contain entry for itself", pos: pos)
@@ -381,34 +399,42 @@ module HexaPDF
381
399
  # If the file contains multiple cross-reference sections, all objects will be put into a single
382
400
  # cross-reference table, later objects overwriting prior ones.
383
401
  def reconstruct_revision
402
+ return if @in_reconstruct_revision
403
+ @in_reconstruct_revision = true
404
+
384
405
  raise unless @document.config['parser.try_xref_reconstruction']
385
406
  msg = "#{$!} - trying cross-reference table reconstruction"
386
407
  @document.config['parser.on_correctable_error'].call(@document, msg, @tokenizer.pos)
387
408
 
388
409
  xref = XRefSection.new
389
410
  @tokenizer.pos = 0
411
+ linearized = nil
390
412
  while true
391
413
  @tokenizer.skip_whitespace
392
414
  pos = @tokenizer.pos
393
- @tokenizer.scan_until(/(\n|\r\n?)+/)
415
+ @tokenizer.scan_until(/(\n|\r\n?)+|\z/)
394
416
  next_new_line_pos = @tokenizer.pos
395
417
  @tokenizer.pos = pos
396
418
 
397
- token = @tokenizer.next_token rescue nil
419
+ token = @tokenizer.next_integer_or_keyword rescue nil
398
420
  if token.kind_of?(Integer)
399
- gen = @tokenizer.next_token rescue nil
400
- tok = @tokenizer.next_token rescue nil
421
+ gen = @tokenizer.next_integer_or_keyword rescue nil
422
+ tok = @tokenizer.next_integer_or_keyword rescue nil
401
423
  if @tokenizer.pos > next_new_line_pos
402
424
  @tokenizer.pos = next_new_line_pos
403
425
  elsif gen.kind_of?(Integer) && tok.kind_of?(Tokenizer::Token) && tok == 'obj'
404
426
  xref.add_in_use_entry(token, gen, pos)
427
+ if linearized.nil?
428
+ obj = @tokenizer.next_object rescue nil
429
+ linearized = obj.kind_of?(Hash) && obj.key?(:Linearized)
430
+ end
405
431
  @tokenizer.scan_until(/(?:\n|\r\n?)endobj\b/)
406
432
  end
407
433
  elsif token.kind_of?(Tokenizer::Token) && token == 'trailer'
408
434
  obj = @tokenizer.next_object rescue nil
409
435
  # Use last trailer found in case of multiple revisions but use first trailer in case of
410
436
  # linearized file.
411
- trailer = obj if obj.kind_of?(Hash) && (obj.key?(:Prev) || trailer.nil?)
437
+ trailer = obj if obj.kind_of?(Hash) && (!linearized || trailer.nil?)
412
438
  elsif token == Tokenizer::NO_MORE_TOKENS
413
439
  break
414
440
  else
@@ -416,16 +442,22 @@ module HexaPDF
416
442
  end
417
443
  end
418
444
 
419
- trailer&.delete(:Prev) # no need for this and may wreak havoc
420
445
  if !trailer || trailer.empty?
421
- raise_malformed("Could not reconstruct malformed PDF because trailer was not found", pos: 0)
446
+ _, trailer = load_revision(startxref_offset) rescue nil
447
+ unless trailer
448
+ @in_reconstruct_revision = false
449
+ raise_malformed("Could not reconstruct malformed PDF because trailer was not found", pos: 0)
450
+ end
422
451
  end
452
+ trailer&.delete(:Prev) # no need for this and may wreak havoc
423
453
 
424
454
  loader = lambda do |xref_entry|
425
455
  obj, oid, gen, stream = parse_indirect_object(xref_entry.pos)
426
- @document.wrap(obj, oid: oid, gen: gen, stream: stream)
456
+ obj = @document.wrap(obj, oid: oid, gen: gen, stream: stream)
457
+ @document.security_handler ? @document.security_handler.decrypt(obj) : obj
427
458
  end
428
459
 
460
+ @in_reconstruct_revision = false
429
461
  Revision.new(@document.wrap(trailer, type: :XXTrailer), xref_section: xref,
430
462
  loader: loader)
431
463
  end