hexapdf 0.14.2 → 0.14.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 22367739c2160e7a5dbc9e8b20bfefb06ef96664ccb814e4ab719a9bce7b68f3
4
- data.tar.gz: 8f703fa1e8e7b9d0b966e37becbecbbb7a4c1d81d0a823d770fa3d531efb1a4a
3
+ metadata.gz: c43d8e9e117db1717ddfee73a54e4384743b8aa35863ab5bd19ffe57b8ce5674
4
+ data.tar.gz: 1020c8a3de8fcdf201500c1c0d22dfb99ed27daebac7baac92748f8127efc992
5
5
  SHA512:
6
- metadata.gz: b05954e3c3890cbbc40d8171e6f1d7f6375569d69b5c90fc0e74b5ea5553ff5a45f1090a3741f50bfab7a6e0725b1e79f5c0eab7b92e6f7e518f38eb1eb6a3f8
7
- data.tar.gz: 2a3441b7ee7ca89e1417ea4134c3ab7444b4f791f5cba274361e719626fe9b0b08c903425b53a7fb7b04be3cab96a4edbb5c502dc1e7b4e3cdc809f8a9ebafb6
6
+ metadata.gz: e19eea4e88077afb7e8532fa6fe9ab2a03ffc5588749b72277462a971ebcec877ee72868d0ab698744117d46566be98e65c10225649d3bd1b4cd6e64e9625767
7
+ data.tar.gz: 6626a9feba0af0b46f293c1069a0d53b458a0dc29d08b82253f14f9bb98a878b914042faccc433b73f2f0e35d4da47c58a1bdebd2f3dee2fefb24c076a4e6bb3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,21 @@
1
+ ## 0.14.3 - 2021-02-16
2
+
3
+ ### Fixed
4
+
5
+ * Bug in [HexaPDF::Font::TrueType::Subsetter#use_glyph] which led to corrupt
6
+ text output
7
+ * [HexaPDF::Serializer] to handle infinite recursion problem
8
+ * Cross-reference table reconstruction to avoid an O(n^2) performance problem
9
+ * [HexaPDF::Type::Resources] validation to handle an invalid `/ProcSet` entry
10
+ containing a single value instead of an array
11
+ * Processing of invalid PDF files missing a required value in appearance streams
12
+ * Processing of invalid empty arrays that should be rectangles by converting
13
+ them to PDF null objects
14
+ * Processing of invalid PDF files containing indirect objects with offset 0
15
+ * Processing of invalid PDF files containing a space/CR or space/LF combination
16
+ after the 'stream' keyword
17
+
18
+
1
19
  ## 0.14.2 - 2021-01-22
2
20
 
3
21
  ### Fixed
@@ -156,6 +156,9 @@ module HexaPDF
156
156
  #
157
157
  # * Returns the default value if one is specified and no value is available.
158
158
  #
159
+ # Note: If field information is available for the entry, a Hash or Array value will always be
160
+ # wrapped by Dictionary or PDFArray. Otherwise, the value will be returned as-is.
161
+ #
159
162
  # Note: This method may throw a "can't add a new key into hash during iteration" error in
160
163
  # certain cases because it potentially modifies the underlying hash!
161
164
  def [](name)
@@ -344,7 +344,7 @@ module HexaPDF
344
344
  # Wraps a given array in the Rectangle class. Otherwise returns +nil+.
345
345
  def self.convert(data, _type, document)
346
346
  return unless data.kind_of?(Array) || data.kind_of?(HexaPDF::PDFArray)
347
- document.wrap(data, type: Rectangle)
347
+ data.empty? ? document.wrap(nil) : document.wrap(data, type: Rectangle)
348
348
  end
349
349
 
350
350
  end
@@ -67,7 +67,11 @@ module HexaPDF
67
67
  # they never appear in the output (PDF serialization would need to escape them)
68
68
  if @last_id == 13 || @last_id == 40 || @last_id == 92
69
69
  @glyph_map[:"s#{@last_id}"] = @last_id
70
- @last_id += (@last_id == 40 ? 2 : 1)
70
+ if @last_id == 40
71
+ @last_id += 1
72
+ @glyph_map[:"s#{@last_id}"] = @last_id
73
+ end
74
+ @last_id += 1
71
75
  end
72
76
  @glyph_map[glyph_id] = @last_id
73
77
  end
@@ -72,7 +72,13 @@ module HexaPDF
72
72
  obj, oid, gen, stream =
73
73
  case xref_entry.type
74
74
  when :in_use
75
- parse_indirect_object(xref_entry.pos)
75
+ if xref_entry.pos == 0 && xref_entry.oid != 0
76
+ # Handle seen-in-the-wild objects with invalid offset 0
77
+ maybe_raise("Indirect object (#{xref_entry.oid},#{xref_entry.gen}) has offset 0", pos: 0)
78
+ [nil, xref_entry.oid, xref_entry.gen, nil]
79
+ else
80
+ parse_indirect_object(xref_entry.pos)
81
+ end
76
82
  when :free
77
83
  [nil, xref_entry.oid, xref_entry.gen, nil]
78
84
  when :compressed
@@ -83,7 +89,7 @@ module HexaPDF
83
89
 
84
90
  if xref_entry.oid != 0 && (oid != xref_entry.oid || gen != xref_entry.gen)
85
91
  raise_malformed("The oid,gen (#{oid},#{gen}) values of the indirect object don't match " \
86
- "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
92
+ "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
87
93
  end
88
94
 
89
95
  @document.wrap(obj, oid: oid, gen: gen, stream: stream)
@@ -133,7 +139,9 @@ module HexaPDF
133
139
  tok1 = @tokenizer.next_byte
134
140
  tok2 = @tokenizer.next_byte if tok1 == 13 # 13=CR, 10=LF
135
141
  if tok1 != 10 && tok1 != 13
136
- raise_malformed("Keyword stream must be followed by LF or CR/LF", pos: @tokenizer.pos)
142
+ tok2 = @tokenizer.next_byte
143
+ maybe_raise("Keyword stream must be followed by LF or CR/LF", pos: @tokenizer.pos,
144
+ force: tok1 != 32 || (tok2 != 10 && tok2 != 13)) # 32=space
137
145
  elsif tok1 == 13 && tok2 != 10
138
146
  maybe_raise("Keyword stream must be followed by LF or CR/LF, not CR alone",
139
147
  pos: @tokenizer.pos)
@@ -390,14 +398,14 @@ module HexaPDF
390
398
  while true
391
399
  @tokenizer.skip_whitespace
392
400
  pos = @tokenizer.pos
393
- @tokenizer.scan_until(/(\n|\r\n?)+/)
401
+ @tokenizer.scan_until(/(\n|\r\n?)+|\z/)
394
402
  next_new_line_pos = @tokenizer.pos
395
403
  @tokenizer.pos = pos
396
404
 
397
- token = @tokenizer.next_token rescue nil
405
+ token = @tokenizer.next_integer_or_keyword rescue nil
398
406
  if token.kind_of?(Integer)
399
- gen = @tokenizer.next_token rescue nil
400
- tok = @tokenizer.next_token rescue nil
407
+ gen = @tokenizer.next_integer_or_keyword rescue nil
408
+ tok = @tokenizer.next_integer_or_keyword rescue nil
401
409
  if @tokenizer.pos > next_new_line_pos
402
410
  @tokenizer.pos = next_new_line_pos
403
411
  elsif gen.kind_of?(Integer) && tok.kind_of?(Tokenizer::Token) && tok == 'obj'
@@ -65,6 +65,9 @@ module HexaPDF
65
65
  # * Returns the native Ruby object for values with class HexaPDF::Object. However, all
66
66
  # subclasses of HexaPDF::Object are returned as is (it makes no sense, for example, to return
67
67
  # the hash that describes the Catalog instead of the Catalog object).
68
+ #
69
+ # Note: Hash or Array values will always be returned as-is, i.e. not wrapped with Dictionary or
70
+ # PDFArray.
68
71
  def [](arg1, arg2 = nil)
69
72
  data = arg2 ? value[arg1, arg2] : value[arg1]
70
73
  return if data.nil?
@@ -343,6 +343,7 @@ module HexaPDF
343
343
  @io << data.freeze
344
344
  end
345
345
  @io << "\nendstream"
346
+ @in_object = false
346
347
 
347
348
  nil
348
349
  else
@@ -350,12 +351,12 @@ module HexaPDF
350
351
  obj.value[:Length] = data.size
351
352
 
352
353
  str = serialize_hash(obj.value)
354
+ @in_object = false
355
+
353
356
  str << "stream\n"
354
357
  str << data
355
358
  str << "\nendstream"
356
359
  end
357
- ensure
358
- @in_object = false
359
360
  end
360
361
 
361
362
  # Invokes the correct serialization method for the object.
@@ -188,6 +188,28 @@ module HexaPDF
188
188
  token
189
189
  end
190
190
 
191
+ # Returns a single integer or keyword token read from the current position and advances the scan
192
+ # pointer. If the current position doesn't contain such a token, +nil+ is returned without
193
+ # advancing the scan pointer. The value +NO_MORE_TOKENS+ is returned if there are no more tokens
194
+ # available.
195
+ #
196
+ # Initial runs of whitespace characters are ignored.
197
+ #
198
+ # Note: This is a special method meant for use with reconstructing the cross-reference table!
199
+ def next_integer_or_keyword
200
+ skip_whitespace
201
+ byte = @ss.string.getbyte(@ss.pos) || -1
202
+ if 48 <= byte && byte <= 57
203
+ parse_number
204
+ elsif (97 <= byte && byte <= 122) || (65 <= byte && byte <= 90)
205
+ parse_keyword
206
+ elsif byte == -1 # we reached the end of the file
207
+ NO_MORE_TOKENS
208
+ else
209
+ nil
210
+ end
211
+ end
212
+
191
213
  # Reads the byte (an integer) at the current position and advances the scan pointer.
192
214
  def next_byte
193
215
  prepare_string_scanner(1)
@@ -245,6 +245,9 @@ module HexaPDF
245
245
  end
246
246
 
247
247
  form = (@widget[:AP] ||= {})[:N] ||= @document.add({Type: :XObject, Subtype: :Form})
248
+ # Wrap existing object in Form class in case the PDF writer didn't include the /Subtype
249
+ # key; we can do this since we know this has to be a Form object
250
+ form = @document.wrap(form, type: :XObject, subtype: :Form) unless form[:Subtype] == :Form
248
251
  form.value.replace({Type: :XObject, Subtype: :Form, BBox: [0, 0, rect.width, rect.height]})
249
252
  form.contents = ''
250
253
  form[:Resources] = HexaPDF::Object.deep_copy(default_resources)
@@ -222,6 +222,10 @@ module HexaPDF
222
222
  yield("No procedure set specified", true)
223
223
  self[:ProcSet] = [:PDF, :Text, :ImageB, :ImageC, :ImageI]
224
224
  else
225
+ if val.kind_of?(Symbol)
226
+ yield("Procedure set is a single value instead of an Array", true)
227
+ val = value[:ProcSet] = [val]
228
+ end
225
229
  val.reject! do |name|
226
230
  case name
227
231
  when :PDF, :Text, :ImageB, :ImageC, :ImageI
@@ -37,6 +37,6 @@
37
37
  module HexaPDF
38
38
 
39
39
  # The version of HexaPDF.
40
- VERSION = '0.14.2'
40
+ VERSION = '0.14.3'
41
41
 
42
42
  end
@@ -29,6 +29,8 @@ describe HexaPDF::Font::TrueType::Subsetter do
29
29
 
30
30
  it "doesn't use certain subset glyph IDs for performance reasons" do
31
31
  1.upto(93) {|i| @subsetter.use_glyph(i) }
32
+ # glyph 0, 93 used glyphs, 4 special glyphs
33
+ assert_equal(1 + 93 + 4, @subsetter.instance_variable_get(:@glyph_map).size)
32
34
  1.upto(12) {|i| assert_equal(i, @subsetter.subset_glyph_id(i), "id=#{i}") }
33
35
  13.upto(38) {|i| assert_equal(i + 1, @subsetter.subset_glyph_id(i), "id=#{i}") }
34
36
  39.upto(88) {|i| assert_equal(i + 3, @subsetter.subset_glyph_id(i), "id=#{i}") }
@@ -234,5 +234,12 @@ describe HexaPDF::DictionaryFields do
234
234
  @field.convert(data, doc)
235
235
  doc.verify
236
236
  end
237
+
238
+ it "converts to a null value if an (invalid) empty array is given" do
239
+ doc = Minitest::Mock.new
240
+ doc.expect(:wrap, :data, [nil])
241
+ @field.convert([], doc)
242
+ doc.verify
243
+ end
237
244
  end
238
245
  end
@@ -88,6 +88,12 @@ describe HexaPDF::Parser do
88
88
  assert_equal('12', TestHelper.collector(stream.fiber))
89
89
  end
90
90
 
91
+ it "handles keyword stream followed by space and CR or LF" do
92
+ create_parser("1 0 obj<</Length 2>> stream \n12\nendstream endobj")
93
+ *, stream = @parser.parse_indirect_object
94
+ assert_equal('12', TestHelper.collector(stream.fiber))
95
+ end
96
+
91
97
  it "handles invalid indirect object value consisting of number followed by endobj without space" do
92
98
  create_parser("1 0 obj 749endobj")
93
99
  object, * = @parser.parse_indirect_object
@@ -157,6 +163,12 @@ describe HexaPDF::Parser do
157
163
  assert_match(/not CR alone/, exp.message)
158
164
  end
159
165
 
166
+ it "fails if keyword stream is followed by space and CR or LF instead of LF or CR/LF" do
167
+ create_parser("1 0 obj<</Length 2>> stream \n12\nendstream endobj")
168
+ exp = assert_raises(HexaPDF::MalformedPDFError) { @parser.parse_indirect_object }
169
+ assert_match(/must be followed by LF or CR\/LF/, exp.message)
170
+ end
171
+
160
172
  it "fails for numbers followed by endobj without space" do
161
173
  create_parser("1 0 obj 749endobj")
162
174
  exp = assert_raises(HexaPDF::MalformedPDFError) { @parser.parse_indirect_object }
@@ -222,6 +234,23 @@ describe HexaPDF::Parser do
222
234
  assert_equal([1, 2], obj.value)
223
235
  end
224
236
 
237
+ it "handles an invalid indirect object offset of 0" do
238
+ obj = @parser.load_object(HexaPDF::XRefSection.in_use_entry(2, 0, 0))
239
+ assert(obj.null?)
240
+ assert_equal(2, obj.oid)
241
+ assert_equal(0, obj.gen)
242
+ end
243
+
244
+ describe "with strict parsing" do
245
+ it "raises an error if an indirect object has an offset of 0" do
246
+ @document.config['parser.on_correctable_error'] = proc { true }
247
+ exp = assert_raises(HexaPDF::MalformedPDFError) do
248
+ @parser.load_object(HexaPDF::XRefSection.in_use_entry(2, 0, 0))
249
+ end
250
+ assert_match(/has offset 0/, exp.message)
251
+ end
252
+ end
253
+
225
254
  it "fails if another object is found instead of an object stream" do
226
255
  def (@document).object(_oid)
227
256
  :invalid
@@ -512,6 +541,13 @@ describe HexaPDF::Parser do
512
541
  assert_equal(6, @parser.load_object(@xref).value)
513
542
  end
514
543
 
544
+ it "handles pathalogical cases which contain many opened literal strings" do
545
+ time = Time.now
546
+ create_parser("(1" << "(abc\n" * 10000 << "\n1 0 obj\n6\nendobj\ntrailer\n<</Size 1>>")
547
+ assert_equal(6, @parser.load_object(@xref).value)
548
+ assert(Time.now - time < 0.5, "Xref reconstruction takes too long")
549
+ end
550
+
515
551
  it "ignores invalid objects" do
516
552
  create_parser("1 x obj\n5\nendobj\n1 0 xobj\n6\nendobj\n1 0 obj 4\nendobj\ntrailer\n<</Size 1>>")
517
553
  assert_equal(4, @parser.load_object(@xref).value)
@@ -153,6 +153,13 @@ describe HexaPDF::Serializer do
153
153
  assert_equal("<</Key(value)/Length 6>>stream\nsome\nendstream", io.string)
154
154
  end
155
155
 
156
+ it "doesn't reset the internal recursion flag if the stream is serialized as part of another object" do
157
+ object = HexaPDF::Dictionary.new({}, oid: 5)
158
+ object[:Stream] = @stream
159
+ object[:Self] = object # needs to be the last entry so that :Stream gets serialized first!
160
+ assert_serialized("<</Stream 2 0 R/Self 5 0 R>>", object)
161
+ end
162
+
156
163
  it "fails if a stream without object identifier is serialized" do
157
164
  @stream.oid = 0
158
165
  assert_raises(HexaPDF::Error) { @serializer.serialize(@stream) }
@@ -27,4 +27,32 @@ describe HexaPDF::Tokenizer do
27
27
  5.times {|i| assert_equal(i, @tokenizer.next_token) }
28
28
  end
29
29
  end
30
+
31
+ it "has a special token scanning method for use with xref reconstruction" do
32
+ create_tokenizer(<<-EOF.chomp.gsub(/^ {8}/, ''))
33
+ % Comment
34
+ true
35
+ 123 50
36
+ obj
37
+ (ignored)
38
+ /Ignored
39
+ [/Ignored]
40
+ <</Ignored /Values>>
41
+ EOF
42
+
43
+ scan_to_newline = proc { @tokenizer.scan_until(/(\n|\r\n?)+|\z/) }
44
+
45
+ assert_nil(@tokenizer.next_integer_or_keyword)
46
+ scan_to_newline.call
47
+ assert_equal(true, @tokenizer.next_integer_or_keyword)
48
+ assert_equal(123, @tokenizer.next_integer_or_keyword)
49
+ assert_equal(50, @tokenizer.next_integer_or_keyword)
50
+ assert_equal('obj', @tokenizer.next_integer_or_keyword)
51
+ 4.times do
52
+ assert_nil(@tokenizer.next_integer_or_keyword)
53
+ scan_to_newline.call
54
+ end
55
+ assert_equal(HexaPDF::Tokenizer::NO_MORE_TOKENS, @tokenizer.next_integer_or_keyword)
56
+ end
57
+
30
58
  end
@@ -40,7 +40,7 @@ describe HexaPDF::Writer do
40
40
  219
41
41
  %%EOF
42
42
  3 0 obj
43
- <</Producer(HexaPDF version 0.14.2)>>
43
+ <</Producer(HexaPDF version 0.14.3)>>
44
44
  endobj
45
45
  xref
46
46
  3 1
@@ -72,7 +72,7 @@ describe HexaPDF::Writer do
72
72
  141
73
73
  %%EOF
74
74
  6 0 obj
75
- <</Producer(HexaPDF version 0.14.2)>>
75
+ <</Producer(HexaPDF version 0.14.3)>>
76
76
  endobj
77
77
  2 0 obj
78
78
  <</Length 10>>stream
@@ -407,10 +407,12 @@ describe HexaPDF::Type::AcroForm::AppearanceGenerator do
407
407
  @generator.create_appearances
408
408
  form = @widget[:AP][:N]
409
409
  form[:key] = :value
410
+ form.delete(:Subtype)
411
+ @widget[:AP][:N] = @doc.wrap(form, type: HexaPDF::Dictionary)
410
412
 
411
413
  @field[:V] = 'test1'
412
414
  @generator.create_appearances
413
- assert_same(form, @widget[:AP][:N])
415
+ assert_equal(form, @widget[:AP][:N])
414
416
  refute(form.key?(:key))
415
417
  assert_match(/test1/, form.contents)
416
418
  end
@@ -194,6 +194,12 @@ describe HexaPDF::Type::Resources do
194
194
  assert_equal([:PDF, :Text, :ImageB, :ImageC, :ImageI], @res[:ProcSet].value)
195
195
  end
196
196
 
197
+ it "handles an invalid ProcSet containing a single value instead of an array" do
198
+ @res[:ProcSet] = :PDF
199
+ @res.validate
200
+ assert_equal([:PDF], @res[:ProcSet].value)
201
+ end
202
+
197
203
  it "removes invalid procedure set names from ProcSet" do
198
204
  @res[:ProcSet] = [:PDF, :Unknown]
199
205
  @res.validate
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hexapdf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.2
4
+ version: 0.14.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Thomas Leitner
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-22 00:00:00.000000000 Z
11
+ date: 2021-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: cmdparse