hexapdf 0.14.2 → 0.14.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 22367739c2160e7a5dbc9e8b20bfefb06ef96664ccb814e4ab719a9bce7b68f3
4
- data.tar.gz: 8f703fa1e8e7b9d0b966e37becbecbbb7a4c1d81d0a823d770fa3d531efb1a4a
3
+ metadata.gz: c43d8e9e117db1717ddfee73a54e4384743b8aa35863ab5bd19ffe57b8ce5674
4
+ data.tar.gz: 1020c8a3de8fcdf201500c1c0d22dfb99ed27daebac7baac92748f8127efc992
5
5
  SHA512:
6
- metadata.gz: b05954e3c3890cbbc40d8171e6f1d7f6375569d69b5c90fc0e74b5ea5553ff5a45f1090a3741f50bfab7a6e0725b1e79f5c0eab7b92e6f7e518f38eb1eb6a3f8
7
- data.tar.gz: 2a3441b7ee7ca89e1417ea4134c3ab7444b4f791f5cba274361e719626fe9b0b08c903425b53a7fb7b04be3cab96a4edbb5c502dc1e7b4e3cdc809f8a9ebafb6
6
+ metadata.gz: e19eea4e88077afb7e8532fa6fe9ab2a03ffc5588749b72277462a971ebcec877ee72868d0ab698744117d46566be98e65c10225649d3bd1b4cd6e64e9625767
7
+ data.tar.gz: 6626a9feba0af0b46f293c1069a0d53b458a0dc29d08b82253f14f9bb98a878b914042faccc433b73f2f0e35d4da47c58a1bdebd2f3dee2fefb24c076a4e6bb3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,21 @@
1
+ ## 0.14.3 - 2021-02-16
2
+
3
+ ### Fixed
4
+
5
+ * Bug in [HexaPDF::Font::TrueType::Subsetter#use_glyph] which led to corrupt
6
+ text output
7
+ * [HexaPDF::Serializer] to handle infinite recursion problem
8
+ * Cross-reference table reconstruction to avoid an O(n^2) performance problem
9
+ * [HexaPDF::Type::Resources] validation to handle an invalid `/ProcSet` entry
10
+ containing a single value instead of an array
11
+ * Processing of invalid PDF files missing a required value in appearance streams
12
+ * Processing of invalid empty arrays that should be rectangles by converting
13
+ them to PDF null objects
14
+ * Processing of invalid PDF files containing indirect objects with offset 0
15
+ * Processing of invalid PDF files containing a space/CR or space/LF combination
16
+ after the 'stream' keyword
17
+
18
+
1
19
  ## 0.14.2 - 2021-01-22
2
20
 
3
21
  ### Fixed
@@ -156,6 +156,9 @@ module HexaPDF
156
156
  #
157
157
  # * Returns the default value if one is specified and no value is available.
158
158
  #
159
+ # Note: If field information is available for the entry, a Hash or Array value will always be
160
+ # wrapped by Dictionary or PDFArray. Otherwise, the value will be returned as-is.
161
+ #
159
162
  # Note: This method may throw a "can't add a new key into hash during iteration" error in
160
163
  # certain cases because it potentially modifies the underlying hash!
161
164
  def [](name)
@@ -344,7 +344,7 @@ module HexaPDF
344
344
  # Wraps a given array in the Rectangle class. Otherwise returns +nil+.
345
345
  def self.convert(data, _type, document)
346
346
  return unless data.kind_of?(Array) || data.kind_of?(HexaPDF::PDFArray)
347
- document.wrap(data, type: Rectangle)
347
+ data.empty? ? document.wrap(nil) : document.wrap(data, type: Rectangle)
348
348
  end
349
349
 
350
350
  end
@@ -67,7 +67,11 @@ module HexaPDF
67
67
  # they never appear in the output (PDF serialization would need to escape them)
68
68
  if @last_id == 13 || @last_id == 40 || @last_id == 92
69
69
  @glyph_map[:"s#{@last_id}"] = @last_id
70
- @last_id += (@last_id == 40 ? 2 : 1)
70
+ if @last_id == 40
71
+ @last_id += 1
72
+ @glyph_map[:"s#{@last_id}"] = @last_id
73
+ end
74
+ @last_id += 1
71
75
  end
72
76
  @glyph_map[glyph_id] = @last_id
73
77
  end
@@ -72,7 +72,13 @@ module HexaPDF
72
72
  obj, oid, gen, stream =
73
73
  case xref_entry.type
74
74
  when :in_use
75
- parse_indirect_object(xref_entry.pos)
75
+ if xref_entry.pos == 0 && xref_entry.oid != 0
76
+ # Handle seen-in-the-wild objects with invalid offset 0
77
+ maybe_raise("Indirect object (#{xref_entry.oid},#{xref_entry.gen}) has offset 0", pos: 0)
78
+ [nil, xref_entry.oid, xref_entry.gen, nil]
79
+ else
80
+ parse_indirect_object(xref_entry.pos)
81
+ end
76
82
  when :free
77
83
  [nil, xref_entry.oid, xref_entry.gen, nil]
78
84
  when :compressed
@@ -83,7 +89,7 @@ module HexaPDF
83
89
 
84
90
  if xref_entry.oid != 0 && (oid != xref_entry.oid || gen != xref_entry.gen)
85
91
  raise_malformed("The oid,gen (#{oid},#{gen}) values of the indirect object don't match " \
86
- "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
92
+ "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
87
93
  end
88
94
 
89
95
  @document.wrap(obj, oid: oid, gen: gen, stream: stream)
@@ -133,7 +139,9 @@ module HexaPDF
133
139
  tok1 = @tokenizer.next_byte
134
140
  tok2 = @tokenizer.next_byte if tok1 == 13 # 13=CR, 10=LF
135
141
  if tok1 != 10 && tok1 != 13
136
- raise_malformed("Keyword stream must be followed by LF or CR/LF", pos: @tokenizer.pos)
142
+ tok2 = @tokenizer.next_byte
143
+ maybe_raise("Keyword stream must be followed by LF or CR/LF", pos: @tokenizer.pos,
144
+ force: tok1 != 32 || (tok2 != 10 && tok2 != 13)) # 32=space
137
145
  elsif tok1 == 13 && tok2 != 10
138
146
  maybe_raise("Keyword stream must be followed by LF or CR/LF, not CR alone",
139
147
  pos: @tokenizer.pos)
@@ -390,14 +398,14 @@ module HexaPDF
390
398
  while true
391
399
  @tokenizer.skip_whitespace
392
400
  pos = @tokenizer.pos
393
- @tokenizer.scan_until(/(\n|\r\n?)+/)
401
+ @tokenizer.scan_until(/(\n|\r\n?)+|\z/)
394
402
  next_new_line_pos = @tokenizer.pos
395
403
  @tokenizer.pos = pos
396
404
 
397
- token = @tokenizer.next_token rescue nil
405
+ token = @tokenizer.next_integer_or_keyword rescue nil
398
406
  if token.kind_of?(Integer)
399
- gen = @tokenizer.next_token rescue nil
400
- tok = @tokenizer.next_token rescue nil
407
+ gen = @tokenizer.next_integer_or_keyword rescue nil
408
+ tok = @tokenizer.next_integer_or_keyword rescue nil
401
409
  if @tokenizer.pos > next_new_line_pos
402
410
  @tokenizer.pos = next_new_line_pos
403
411
  elsif gen.kind_of?(Integer) && tok.kind_of?(Tokenizer::Token) && tok == 'obj'
@@ -65,6 +65,9 @@ module HexaPDF
65
65
  # * Returns the native Ruby object for values with class HexaPDF::Object. However, all
66
66
  # subclasses of HexaPDF::Object are returned as is (it makes no sense, for example, to return
67
67
  # the hash that describes the Catalog instead of the Catalog object).
68
+ #
69
+ # Note: Hash or Array values will always be returned as-is, i.e. not wrapped with Dictionary or
70
+ # PDFArray.
68
71
  def [](arg1, arg2 = nil)
69
72
  data = arg2 ? value[arg1, arg2] : value[arg1]
70
73
  return if data.nil?
@@ -343,6 +343,7 @@ module HexaPDF
343
343
  @io << data.freeze
344
344
  end
345
345
  @io << "\nendstream"
346
+ @in_object = false
346
347
 
347
348
  nil
348
349
  else
@@ -350,12 +351,12 @@ module HexaPDF
350
351
  obj.value[:Length] = data.size
351
352
 
352
353
  str = serialize_hash(obj.value)
354
+ @in_object = false
355
+
353
356
  str << "stream\n"
354
357
  str << data
355
358
  str << "\nendstream"
356
359
  end
357
- ensure
358
- @in_object = false
359
360
  end
360
361
 
361
362
  # Invokes the correct serialization method for the object.
@@ -188,6 +188,28 @@ module HexaPDF
188
188
  token
189
189
  end
190
190
 
191
+ # Returns a single integer or keyword token read from the current position and advances the scan
192
+ # pointer. If the current position doesn't contain such a token, +nil+ is returned without
193
+ # advancing the scan pointer. The value +NO_MORE_TOKENS+ is returned if there are no more tokens
194
+ # available.
195
+ #
196
+ # Initial runs of whitespace characters are ignored.
197
+ #
198
+ # Note: This is a special method meant for use with reconstructing the cross-reference table!
199
+ def next_integer_or_keyword
200
+ skip_whitespace
201
+ byte = @ss.string.getbyte(@ss.pos) || -1
202
+ if 48 <= byte && byte <= 57
203
+ parse_number
204
+ elsif (97 <= byte && byte <= 122) || (65 <= byte && byte <= 90)
205
+ parse_keyword
206
+ elsif byte == -1 # we reached the end of the file
207
+ NO_MORE_TOKENS
208
+ else
209
+ nil
210
+ end
211
+ end
212
+
191
213
  # Reads the byte (an integer) at the current position and advances the scan pointer.
192
214
  def next_byte
193
215
  prepare_string_scanner(1)
@@ -245,6 +245,9 @@ module HexaPDF
245
245
  end
246
246
 
247
247
  form = (@widget[:AP] ||= {})[:N] ||= @document.add({Type: :XObject, Subtype: :Form})
248
+ # Wrap existing object in Form class in case the PDF writer didn't include the /Subtype
249
+ # key; we can do this since we know this has to be a Form object
250
+ form = @document.wrap(form, type: :XObject, subtype: :Form) unless form[:Subtype] == :Form
248
251
  form.value.replace({Type: :XObject, Subtype: :Form, BBox: [0, 0, rect.width, rect.height]})
249
252
  form.contents = ''
250
253
  form[:Resources] = HexaPDF::Object.deep_copy(default_resources)
@@ -222,6 +222,10 @@ module HexaPDF
222
222
  yield("No procedure set specified", true)
223
223
  self[:ProcSet] = [:PDF, :Text, :ImageB, :ImageC, :ImageI]
224
224
  else
225
+ if val.kind_of?(Symbol)
226
+ yield("Procedure set is a single value instead of an Array", true)
227
+ val = value[:ProcSet] = [val]
228
+ end
225
229
  val.reject! do |name|
226
230
  case name
227
231
  when :PDF, :Text, :ImageB, :ImageC, :ImageI
@@ -37,6 +37,6 @@
37
37
  module HexaPDF
38
38
 
39
39
  # The version of HexaPDF.
40
- VERSION = '0.14.2'
40
+ VERSION = '0.14.3'
41
41
 
42
42
  end
@@ -29,6 +29,8 @@ describe HexaPDF::Font::TrueType::Subsetter do
29
29
 
30
30
  it "doesn't use certain subset glyph IDs for performance reasons" do
31
31
  1.upto(93) {|i| @subsetter.use_glyph(i) }
32
+ # glyph 0, 93 used glyphs, 4 special glyphs
33
+ assert_equal(1 + 93 + 4, @subsetter.instance_variable_get(:@glyph_map).size)
32
34
  1.upto(12) {|i| assert_equal(i, @subsetter.subset_glyph_id(i), "id=#{i}") }
33
35
  13.upto(38) {|i| assert_equal(i + 1, @subsetter.subset_glyph_id(i), "id=#{i}") }
34
36
  39.upto(88) {|i| assert_equal(i + 3, @subsetter.subset_glyph_id(i), "id=#{i}") }
@@ -234,5 +234,12 @@ describe HexaPDF::DictionaryFields do
234
234
  @field.convert(data, doc)
235
235
  doc.verify
236
236
  end
237
+
238
+ it "converts to a null value if an (invalid) empty array is given" do
239
+ doc = Minitest::Mock.new
240
+ doc.expect(:wrap, :data, [nil])
241
+ @field.convert([], doc)
242
+ doc.verify
243
+ end
237
244
  end
238
245
  end
@@ -88,6 +88,12 @@ describe HexaPDF::Parser do
88
88
  assert_equal('12', TestHelper.collector(stream.fiber))
89
89
  end
90
90
 
91
+ it "handles keyword stream followed by space and CR or LF" do
92
+ create_parser("1 0 obj<</Length 2>> stream \n12\nendstream endobj")
93
+ *, stream = @parser.parse_indirect_object
94
+ assert_equal('12', TestHelper.collector(stream.fiber))
95
+ end
96
+
91
97
  it "handles invalid indirect object value consisting of number followed by endobj without space" do
92
98
  create_parser("1 0 obj 749endobj")
93
99
  object, * = @parser.parse_indirect_object
@@ -157,6 +163,12 @@ describe HexaPDF::Parser do
157
163
  assert_match(/not CR alone/, exp.message)
158
164
  end
159
165
 
166
+ it "fails if keyword stream is followed by space and CR or LF instead of LF or CR/LF" do
167
+ create_parser("1 0 obj<</Length 2>> stream \n12\nendstream endobj")
168
+ exp = assert_raises(HexaPDF::MalformedPDFError) { @parser.parse_indirect_object }
169
+ assert_match(/must be followed by LF or CR\/LF/, exp.message)
170
+ end
171
+
160
172
  it "fails for numbers followed by endobj without space" do
161
173
  create_parser("1 0 obj 749endobj")
162
174
  exp = assert_raises(HexaPDF::MalformedPDFError) { @parser.parse_indirect_object }
@@ -222,6 +234,23 @@ describe HexaPDF::Parser do
222
234
  assert_equal([1, 2], obj.value)
223
235
  end
224
236
 
237
+ it "handles an invalid indirect object offset of 0" do
238
+ obj = @parser.load_object(HexaPDF::XRefSection.in_use_entry(2, 0, 0))
239
+ assert(obj.null?)
240
+ assert_equal(2, obj.oid)
241
+ assert_equal(0, obj.gen)
242
+ end
243
+
244
+ describe "with strict parsing" do
245
+ it "raises an error if an indirect object has an offset of 0" do
246
+ @document.config['parser.on_correctable_error'] = proc { true }
247
+ exp = assert_raises(HexaPDF::MalformedPDFError) do
248
+ @parser.load_object(HexaPDF::XRefSection.in_use_entry(2, 0, 0))
249
+ end
250
+ assert_match(/has offset 0/, exp.message)
251
+ end
252
+ end
253
+
225
254
  it "fails if another object is found instead of an object stream" do
226
255
  def (@document).object(_oid)
227
256
  :invalid
@@ -512,6 +541,13 @@ describe HexaPDF::Parser do
512
541
  assert_equal(6, @parser.load_object(@xref).value)
513
542
  end
514
543
 
544
+ it "handles pathalogical cases which contain many opened literal strings" do
545
+ time = Time.now
546
+ create_parser("(1" << "(abc\n" * 10000 << "\n1 0 obj\n6\nendobj\ntrailer\n<</Size 1>>")
547
+ assert_equal(6, @parser.load_object(@xref).value)
548
+ assert(Time.now - time < 0.5, "Xref reconstruction takes too long")
549
+ end
550
+
515
551
  it "ignores invalid objects" do
516
552
  create_parser("1 x obj\n5\nendobj\n1 0 xobj\n6\nendobj\n1 0 obj 4\nendobj\ntrailer\n<</Size 1>>")
517
553
  assert_equal(4, @parser.load_object(@xref).value)
@@ -153,6 +153,13 @@ describe HexaPDF::Serializer do
153
153
  assert_equal("<</Key(value)/Length 6>>stream\nsome\nendstream", io.string)
154
154
  end
155
155
 
156
+ it "doesn't reset the internal recursion flag if the stream is serialized as part of another object" do
157
+ object = HexaPDF::Dictionary.new({}, oid: 5)
158
+ object[:Stream] = @stream
159
+ object[:Self] = object # needs to be the last entry so that :Stream gets serialized first!
160
+ assert_serialized("<</Stream 2 0 R/Self 5 0 R>>", object)
161
+ end
162
+
156
163
  it "fails if a stream without object identifier is serialized" do
157
164
  @stream.oid = 0
158
165
  assert_raises(HexaPDF::Error) { @serializer.serialize(@stream) }
@@ -27,4 +27,32 @@ describe HexaPDF::Tokenizer do
27
27
  5.times {|i| assert_equal(i, @tokenizer.next_token) }
28
28
  end
29
29
  end
30
+
31
+ it "has a special token scanning method for use with xref reconstruction" do
32
+ create_tokenizer(<<-EOF.chomp.gsub(/^ {8}/, ''))
33
+ % Comment
34
+ true
35
+ 123 50
36
+ obj
37
+ (ignored)
38
+ /Ignored
39
+ [/Ignored]
40
+ <</Ignored /Values>>
41
+ EOF
42
+
43
+ scan_to_newline = proc { @tokenizer.scan_until(/(\n|\r\n?)+|\z/) }
44
+
45
+ assert_nil(@tokenizer.next_integer_or_keyword)
46
+ scan_to_newline.call
47
+ assert_equal(true, @tokenizer.next_integer_or_keyword)
48
+ assert_equal(123, @tokenizer.next_integer_or_keyword)
49
+ assert_equal(50, @tokenizer.next_integer_or_keyword)
50
+ assert_equal('obj', @tokenizer.next_integer_or_keyword)
51
+ 4.times do
52
+ assert_nil(@tokenizer.next_integer_or_keyword)
53
+ scan_to_newline.call
54
+ end
55
+ assert_equal(HexaPDF::Tokenizer::NO_MORE_TOKENS, @tokenizer.next_integer_or_keyword)
56
+ end
57
+
30
58
  end
@@ -40,7 +40,7 @@ describe HexaPDF::Writer do
40
40
  219
41
41
  %%EOF
42
42
  3 0 obj
43
- <</Producer(HexaPDF version 0.14.2)>>
43
+ <</Producer(HexaPDF version 0.14.3)>>
44
44
  endobj
45
45
  xref
46
46
  3 1
@@ -72,7 +72,7 @@ describe HexaPDF::Writer do
72
72
  141
73
73
  %%EOF
74
74
  6 0 obj
75
- <</Producer(HexaPDF version 0.14.2)>>
75
+ <</Producer(HexaPDF version 0.14.3)>>
76
76
  endobj
77
77
  2 0 obj
78
78
  <</Length 10>>stream
@@ -407,10 +407,12 @@ describe HexaPDF::Type::AcroForm::AppearanceGenerator do
407
407
  @generator.create_appearances
408
408
  form = @widget[:AP][:N]
409
409
  form[:key] = :value
410
+ form.delete(:Subtype)
411
+ @widget[:AP][:N] = @doc.wrap(form, type: HexaPDF::Dictionary)
410
412
 
411
413
  @field[:V] = 'test1'
412
414
  @generator.create_appearances
413
- assert_same(form, @widget[:AP][:N])
415
+ assert_equal(form, @widget[:AP][:N])
414
416
  refute(form.key?(:key))
415
417
  assert_match(/test1/, form.contents)
416
418
  end
@@ -194,6 +194,12 @@ describe HexaPDF::Type::Resources do
194
194
  assert_equal([:PDF, :Text, :ImageB, :ImageC, :ImageI], @res[:ProcSet].value)
195
195
  end
196
196
 
197
+ it "handles an invalid ProcSet containing a single value instead of an array" do
198
+ @res[:ProcSet] = :PDF
199
+ @res.validate
200
+ assert_equal([:PDF], @res[:ProcSet].value)
201
+ end
202
+
197
203
  it "removes invalid procedure set names from ProcSet" do
198
204
  @res[:ProcSet] = [:PDF, :Unknown]
199
205
  @res.validate
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hexapdf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.2
4
+ version: 0.14.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Thomas Leitner
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-22 00:00:00.000000000 Z
11
+ date: 2021-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: cmdparse