pdf-reader 2.8.0 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +5 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/buffer.rb +36 -34
- data/lib/pdf/reader/cmap.rb +64 -51
- data/lib/pdf/reader/error.rb +8 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +1 -1
- data/lib/pdf/reader/filter/depredict.rb +1 -1
- data/lib/pdf/reader/filter/flate.rb +3 -3
- data/lib/pdf/reader/filter/lzw.rb +1 -1
- data/lib/pdf/reader/filter/null.rb +1 -2
- data/lib/pdf/reader/filter/run_length.rb +1 -1
- data/lib/pdf/reader/filter.rb +1 -1
- data/lib/pdf/reader/font.rb +29 -17
- data/lib/pdf/reader/font_descriptor.rb +18 -17
- data/lib/pdf/reader/form_xobject.rb +14 -5
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/null_security_handler.rb +0 -4
- data/lib/pdf/reader/object_hash.rb +247 -42
- data/lib/pdf/reader/page.rb +38 -20
- data/lib/pdf/reader/page_state.rb +1 -1
- data/lib/pdf/reader/page_text_receiver.rb +4 -1
- data/lib/pdf/reader/parser.rb +9 -6
- data/lib/pdf/reader/point.rb +1 -1
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +2 -2
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +15 -13
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -95
- data/lib/pdf/reader/stream.rb +2 -2
- data/lib/pdf/reader/type_check.rb +52 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -1
- data/lib/pdf/reader/xref.rb +20 -3
- data/lib/pdf/reader.rb +17 -9
- data/rbi/pdf-reader.rbi +388 -173
- metadata +15 -9
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -92
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2b4616131d0ad73c4ef2c4992ae79d4fde420d6857aba60e8dfac9b088a0b915
|
4
|
+
data.tar.gz: f93f481d7f76af426420dbf507a88e8ecead8ec84690781f42de3b7b5ffbd1bd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 86dbe3450a11e0deb3f5db98625375b252cc25f289d76c98b5de48342d1b4957de81c1c2b6cce53d7d09738e9576bd48213c92166d48911c1f45ad6a77f195a5
|
7
|
+
data.tar.gz: ee852ff644a095bae93eb7cc30c6d070c8c6adda4f9bfadecf938bf3ba2723fed08c75a3bf15ba30fcf8fded7ad6a5b74dad8a3f512823798686350f24b912eb
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
v2.9.0 (24th January 2022)
|
2
|
+
- Support additional encryption standards (http://github.com/yob/pdf-reader/pull/419)
|
3
|
+
- Return CropBox correctly from Page#rectangles (https://github.com/yob/pdf-reader/pull/420)
|
4
|
+
- For sorbet users, additional type annotations are included in the gem
|
5
|
+
|
1
6
|
v2.8.0 (28th December 2021)
|
2
7
|
- Add PDF::Reader::Page#runs for extracting text from a page with positioning metadata (http://github.com/yob/pdf-reader/pull/411)
|
3
8
|
- Add options to PDF::Reader::Page#text to make some behaviour configurable (http://github.com/yob/pdf-reader/pull/411)
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
require 'digest/md5'
|
6
|
+
|
7
|
+
class PDF::Reader
|
8
|
+
|
9
|
+
# Decrypts data using the AESV2 algorithm defined in the PDF spec. Requires
|
10
|
+
# a decryption key, which is usually generated by PDF::Reader::StandardKeyBuilder
|
11
|
+
#
|
12
|
+
class AesV2SecurityHandler
|
13
|
+
|
14
|
+
def initialize(key)
|
15
|
+
@encrypt_key = key
|
16
|
+
end
|
17
|
+
|
18
|
+
##7.6.2 General Encryption Algorithm
|
19
|
+
#
|
20
|
+
# Algorithm 1: Encryption of data using the AES-128-CBC algorithm
|
21
|
+
#
|
22
|
+
# version == 4 and CFM == AESV2
|
23
|
+
#
|
24
|
+
# buf - a string to decrypt
|
25
|
+
# ref - a PDF::Reader::Reference for the object to decrypt
|
26
|
+
#
|
27
|
+
def decrypt( buf, ref )
|
28
|
+
objKey = @encrypt_key.dup
|
29
|
+
(0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
|
30
|
+
(0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
|
31
|
+
objKey << 'sAlT' # Algorithm 1, b)
|
32
|
+
length = objKey.length < 16 ? objKey.length : 16
|
33
|
+
cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
|
34
|
+
cipher.decrypt
|
35
|
+
cipher.key = Digest::MD5.digest(objKey)[0,length]
|
36
|
+
cipher.iv = buf[0..15]
|
37
|
+
cipher.update(buf[16..-1]) + cipher.final
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
require 'digest'
|
6
|
+
require 'openssl'
|
7
|
+
|
8
|
+
class PDF::Reader
|
9
|
+
|
10
|
+
# Decrypts data using the AESV3 algorithm defined in the PDF 1.7, Extension Level 3 spec.
|
11
|
+
# Requires a decryption key, which is usually generated by PDF::Reader::KeyBuilderV5
|
12
|
+
#
|
13
|
+
class AesV3SecurityHandler
|
14
|
+
|
15
|
+
def initialize(key)
|
16
|
+
@encrypt_key = key
|
17
|
+
@cipher = "AES-256-CBC"
|
18
|
+
end
|
19
|
+
|
20
|
+
##7.6.2 General Encryption Algorithm
|
21
|
+
#
|
22
|
+
# Algorithm 1: Encryption of data using the RC4 or AES algorithms
|
23
|
+
#
|
24
|
+
# used to decrypt RC4/AES encrypted PDF streams (buf)
|
25
|
+
#
|
26
|
+
# buf - a string to decrypt
|
27
|
+
# ref - a PDF::Reader::Reference for the object to decrypt
|
28
|
+
#
|
29
|
+
def decrypt( buf, ref )
|
30
|
+
cipher = OpenSSL::Cipher.new(@cipher)
|
31
|
+
cipher.decrypt
|
32
|
+
cipher.key = @encrypt_key.dup
|
33
|
+
cipher.iv = buf[0..15]
|
34
|
+
cipher.update(buf[16..-1]) + cipher.final
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
end
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: ASCII-8BIT
|
2
|
-
# typed:
|
2
|
+
# typed: true
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -59,6 +59,9 @@ class PDF::Reader
|
|
59
59
|
# Allow for this here
|
60
60
|
TRAILING_BYTECOUNT = 5000
|
61
61
|
|
62
|
+
# must match whole tokens
|
63
|
+
DIGITS_ONLY = %r{\A\d+\z}
|
64
|
+
|
62
65
|
attr_reader :pos
|
63
66
|
|
64
67
|
# Creates a new buffer.
|
@@ -143,13 +146,20 @@ class PDF::Reader
|
|
143
146
|
@io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
|
144
147
|
data = @io.read(TRAILING_BYTECOUNT)
|
145
148
|
|
149
|
+
raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?
|
150
|
+
|
146
151
|
# the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
|
147
152
|
lines = data.split(/[\n\r]+/).reverse
|
148
153
|
eof_index = lines.index { |l| l.strip[/^%%EOF/] }
|
149
154
|
|
150
155
|
raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
|
151
156
|
raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
|
152
|
-
lines[eof_index+1].to_i
|
157
|
+
offset = lines[eof_index+1].to_i
|
158
|
+
|
159
|
+
# a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
|
160
|
+
# corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
|
161
|
+
raise MalformedPDFError, "invalid xref offset" if offset < 0
|
162
|
+
offset
|
153
163
|
end
|
154
164
|
|
155
165
|
private
|
@@ -230,13 +240,12 @@ class PDF::Reader
|
|
230
240
|
return if @tokens.size < 3
|
231
241
|
return if @tokens[2] != "R"
|
232
242
|
|
233
|
-
|
234
|
-
|
235
|
-
if
|
236
|
-
@tokens[0] = PDF::Reader::Reference.new(
|
237
|
-
@tokens
|
238
|
-
@tokens
|
239
|
-
@tokens.compact!
|
243
|
+
token_one = @tokens[0]
|
244
|
+
token_two = @tokens[1]
|
245
|
+
if token_one.is_a?(String) && token_two.is_a?(String) && token_one.match(DIGITS_ONLY) && token_two.match(DIGITS_ONLY)
|
246
|
+
@tokens[0] = PDF::Reader::Reference.new(token_one.to_i, token_two.to_i)
|
247
|
+
@tokens.delete_at(2)
|
248
|
+
@tokens.delete_at(1)
|
240
249
|
end
|
241
250
|
end
|
242
251
|
|
@@ -246,7 +255,7 @@ class PDF::Reader
|
|
246
255
|
# This is to reduce the chance of accidentally matching an embedded EI
|
247
256
|
def prepare_inline_token
|
248
257
|
idstart = @io.pos
|
249
|
-
|
258
|
+
prevchr = ''
|
250
259
|
eisize = 0 # how many chars in the end marker
|
251
260
|
seeking = 'E' # what are we looking for now?
|
252
261
|
loop do
|
@@ -264,11 +273,11 @@ class PDF::Reader
|
|
264
273
|
end
|
265
274
|
when 'I'
|
266
275
|
if chr == 'I'
|
267
|
-
seeking =
|
276
|
+
seeking = ''
|
268
277
|
else
|
269
278
|
seeking = 'E'
|
270
279
|
end
|
271
|
-
when
|
280
|
+
when ''
|
272
281
|
if WHITE_SPACE.include? chr
|
273
282
|
eisize += 1 # Drop trailer
|
274
283
|
break
|
@@ -276,28 +285,28 @@ class PDF::Reader
|
|
276
285
|
seeking = 'E'
|
277
286
|
end
|
278
287
|
end
|
279
|
-
prevchr = chr
|
288
|
+
prevchr = chr.is_a?(String) ? chr : ''
|
280
289
|
end
|
281
|
-
unless seeking ==
|
290
|
+
unless seeking == ''
|
282
291
|
raise MalformedPDFError, "EI terminator not found"
|
283
292
|
end
|
284
293
|
eiend = @io.pos
|
285
294
|
@io.seek(idstart, IO::SEEK_SET)
|
286
295
|
str = @io.read(eiend - eisize - idstart) # get the ID content
|
287
|
-
@tokens <<
|
296
|
+
@tokens << str.freeze if str
|
288
297
|
end
|
289
298
|
|
290
299
|
# if we're currently inside a hex string, read hex nibbles until
|
291
300
|
# we find a closing >
|
292
301
|
#
|
293
302
|
def prepare_hex_token
|
303
|
+
finished = :false
|
294
304
|
str = "".dup
|
295
|
-
finished = false
|
296
305
|
|
297
|
-
|
306
|
+
until finished == :true
|
298
307
|
byte = @io.getbyte
|
299
308
|
if byte.nil?
|
300
|
-
finished = true # unbalanced params
|
309
|
+
finished = :true # unbalanced params
|
301
310
|
elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
|
302
311
|
str << byte
|
303
312
|
elsif byte <= 32
|
@@ -306,7 +315,7 @@ class PDF::Reader
|
|
306
315
|
@tokens << str if str.size > 0
|
307
316
|
@tokens << ">" if byte != 0x3E # '>'
|
308
317
|
@tokens << byte.chr
|
309
|
-
finished = true
|
318
|
+
finished = :true
|
310
319
|
end
|
311
320
|
end
|
312
321
|
end
|
@@ -353,14 +362,17 @@ class PDF::Reader
|
|
353
362
|
def prepare_regular_token
|
354
363
|
tok = "".dup
|
355
364
|
|
356
|
-
|
365
|
+
loop do
|
366
|
+
byte = @io.getbyte
|
367
|
+
|
357
368
|
case byte
|
369
|
+
when nil
|
370
|
+
break
|
358
371
|
when 0x25
|
359
372
|
# comment, ignore everything until the next EOL char
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
done = true if byte.nil? || byte == 0x0A || byte == 0x0D
|
373
|
+
loop do
|
374
|
+
commentbyte = @io.getbyte
|
375
|
+
break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
|
364
376
|
end
|
365
377
|
when *TOKEN_WHITESPACE
|
366
378
|
# white space, token finished
|
@@ -430,15 +442,5 @@ class PDF::Reader
|
|
430
442
|
byte
|
431
443
|
end
|
432
444
|
|
433
|
-
# for a handful of tokens we want to tell the parser how to convert them
|
434
|
-
# into higher level tokens. This methods adds a to_token() method
|
435
|
-
# to tokens that should remain as strings.
|
436
|
-
#
|
437
|
-
def string_token(token)
|
438
|
-
def token.to_token
|
439
|
-
to_s
|
440
|
-
end
|
441
|
-
token
|
442
|
-
end
|
443
445
|
end
|
444
446
|
end
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
-
# typed:
|
2
|
+
# typed: true
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -35,15 +35,15 @@ class PDF::Reader
|
|
35
35
|
class CMap # :nodoc:
|
36
36
|
|
37
37
|
CMAP_KEYWORDS = {
|
38
|
-
"begincodespacerange" =>
|
39
|
-
"endcodespacerange" =>
|
40
|
-
"beginbfchar" =>
|
41
|
-
"endbfchar" =>
|
42
|
-
"beginbfrange" =>
|
43
|
-
"endbfrange" =>
|
44
|
-
"begin" =>
|
45
|
-
"begincmap" =>
|
46
|
-
"def" =>
|
38
|
+
"begincodespacerange" => :noop,
|
39
|
+
"endcodespacerange" => :noop,
|
40
|
+
"beginbfchar" => :noop,
|
41
|
+
"endbfchar" => :noop,
|
42
|
+
"beginbfrange" => :noop,
|
43
|
+
"endbfrange" => :noop,
|
44
|
+
"begin" => :noop,
|
45
|
+
"begincmap" => :noop,
|
46
|
+
"def" => :noop
|
47
47
|
}
|
48
48
|
|
49
49
|
attr_reader :map
|
@@ -53,30 +53,6 @@ class PDF::Reader
|
|
53
53
|
process_data(data)
|
54
54
|
end
|
55
55
|
|
56
|
-
def process_data(data)
|
57
|
-
parser = build_parser(data)
|
58
|
-
mode = :none
|
59
|
-
instructions = []
|
60
|
-
|
61
|
-
while token = parser.parse_token(CMAP_KEYWORDS)
|
62
|
-
if token == "beginbfchar"
|
63
|
-
mode = :char
|
64
|
-
elsif token == "endbfchar"
|
65
|
-
process_bfchar_instructions(instructions)
|
66
|
-
instructions = []
|
67
|
-
mode = :none
|
68
|
-
elsif token == "beginbfrange"
|
69
|
-
mode = :range
|
70
|
-
elsif token == "endbfrange"
|
71
|
-
process_bfrange_instructions(instructions)
|
72
|
-
instructions = []
|
73
|
-
mode = :none
|
74
|
-
elsif mode == :char || mode == :range
|
75
|
-
instructions << token
|
76
|
-
end
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
56
|
def size
|
81
57
|
@map.size
|
82
58
|
end
|
@@ -86,13 +62,40 @@ class PDF::Reader
|
|
86
62
|
# Returns an array of Integers.
|
87
63
|
#
|
88
64
|
def decode(c)
|
89
|
-
|
90
|
-
return c unless Integer === c
|
91
|
-
@map[c]
|
65
|
+
@map.fetch(c, [])
|
92
66
|
end
|
93
67
|
|
94
68
|
private
|
95
69
|
|
70
|
+
def process_data(data, initial_mode = :none)
|
71
|
+
parser = build_parser(data)
|
72
|
+
mode = initial_mode
|
73
|
+
instructions = []
|
74
|
+
|
75
|
+
while token = parser.parse_token(CMAP_KEYWORDS)
|
76
|
+
if token.is_a?(String) || token.is_a?(Array)
|
77
|
+
if token == "beginbfchar"
|
78
|
+
mode = :char
|
79
|
+
elsif token == "endbfchar"
|
80
|
+
process_bfchar_instructions(instructions)
|
81
|
+
instructions = []
|
82
|
+
mode = :none
|
83
|
+
elsif token == "beginbfrange"
|
84
|
+
mode = :range
|
85
|
+
elsif token == "endbfrange"
|
86
|
+
process_bfrange_instructions(instructions)
|
87
|
+
instructions = []
|
88
|
+
mode = :none
|
89
|
+
elsif mode == :char
|
90
|
+
instructions << token.to_s
|
91
|
+
elsif mode == :range
|
92
|
+
instructions << token
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
|
96
99
|
def build_parser(instructions)
|
97
100
|
buffer = Buffer.new(StringIO.new(instructions))
|
98
101
|
Parser.new(buffer)
|
@@ -107,7 +110,6 @@ class PDF::Reader
|
|
107
110
|
# exception when we try converting broken UTF-16 to UTF-8
|
108
111
|
#
|
109
112
|
def str_to_int(str)
|
110
|
-
return nil if str.nil? || str.size == 0
|
111
113
|
unpacked_string = if str.bytesize == 1 # UTF-8
|
112
114
|
str.unpack("C*")
|
113
115
|
else # UTF-16
|
@@ -115,12 +117,15 @@ class PDF::Reader
|
|
115
117
|
end
|
116
118
|
result = []
|
117
119
|
while unpacked_string.any? do
|
118
|
-
if unpacked_string.size >= 2 &&
|
120
|
+
if unpacked_string.size >= 2 &&
|
121
|
+
unpacked_string.first.to_i > 0xD800 &&
|
122
|
+
unpacked_string.first.to_i < 0xDBFF
|
119
123
|
# this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
|
120
124
|
# let's convert to UTF-32. (the high surrogate is between 0xD800-0xDBFF, the
|
121
125
|
# low surrogate is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
|
122
|
-
|
123
|
-
|
126
|
+
point_one = unpacked_string.shift.to_i
|
127
|
+
point_two = unpacked_string.shift.to_i
|
128
|
+
result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
|
124
129
|
else
|
125
130
|
result << unpacked_string.shift
|
126
131
|
end
|
@@ -130,9 +135,11 @@ class PDF::Reader
|
|
130
135
|
|
131
136
|
def process_bfchar_instructions(instructions)
|
132
137
|
instructions.each_slice(2) do |one, two|
|
133
|
-
find = str_to_int(one)
|
134
|
-
replace = str_to_int(two)
|
135
|
-
|
138
|
+
find = str_to_int(one.to_s)
|
139
|
+
replace = str_to_int(two.to_s)
|
140
|
+
if find.any? && replace.any?
|
141
|
+
@map[find.first.to_i] = replace
|
142
|
+
end
|
136
143
|
end
|
137
144
|
end
|
138
145
|
|
@@ -143,30 +150,36 @@ class PDF::Reader
|
|
143
150
|
elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
|
144
151
|
bfrange_type_two(start, finish, to)
|
145
152
|
else
|
146
|
-
raise "invalid bfrange section"
|
153
|
+
raise MalformedPDFError, "invalid bfrange section"
|
147
154
|
end
|
148
155
|
end
|
149
156
|
end
|
150
157
|
|
151
158
|
def bfrange_type_one(start_code, end_code, dst)
|
152
|
-
start_code = str_to_int(start_code)
|
153
|
-
end_code = str_to_int(end_code)
|
159
|
+
start_code = str_to_int(start_code).first
|
160
|
+
end_code = str_to_int(end_code).first
|
154
161
|
dst = str_to_int(dst)
|
155
162
|
|
163
|
+
return if start_code.nil? || end_code.nil?
|
164
|
+
|
156
165
|
# add all values in the range to our mapping
|
157
166
|
(start_code..end_code).each_with_index do |val, idx|
|
158
|
-
@map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
|
167
|
+
@map[val] = dst.length == 1 ? [dst[0].to_i + idx] : [dst[0].to_i, dst[1].to_i + 1]
|
159
168
|
end
|
160
169
|
end
|
161
170
|
|
162
171
|
def bfrange_type_two(start_code, end_code, dst)
|
163
|
-
start_code = str_to_int(start_code)
|
164
|
-
end_code = str_to_int(end_code)
|
172
|
+
start_code = str_to_int(start_code).first
|
173
|
+
end_code = str_to_int(end_code).first
|
174
|
+
|
175
|
+
return if start_code.nil? || end_code.nil?
|
176
|
+
|
165
177
|
from_range = (start_code..end_code)
|
166
178
|
|
167
179
|
# add all values in the range to our mapping
|
168
180
|
from_range.each_with_index do |val, idx|
|
169
|
-
|
181
|
+
dst_char = dst[idx]
|
182
|
+
@map[val.to_i] = str_to_int(dst_char) if dst_char
|
170
183
|
end
|
171
184
|
end
|
172
185
|
end
|
data/lib/pdf/reader/error.rb
CHANGED
@@ -51,9 +51,17 @@ class PDF::Reader
|
|
51
51
|
raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
|
52
52
|
end
|
53
53
|
################################################################################
|
54
|
+
def self.validate_type_as_malformed(object, name, klass)
|
55
|
+
raise MalformedPDFError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
|
56
|
+
end
|
57
|
+
################################################################################
|
54
58
|
def self.validate_not_nil(object, name)
|
55
59
|
raise ArgumentError, "#{object} must not be nil" if object.nil?
|
56
60
|
end
|
61
|
+
################################################################################
|
62
|
+
def self.validate_not_nil_as_malformed(object, name)
|
63
|
+
raise MalformedPDFError, "#{object} must not be nil" if object.nil?
|
64
|
+
end
|
57
65
|
end
|
58
66
|
|
59
67
|
################################################################################
|
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
-
# typed:
|
2
|
+
# typed: strict
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
|
@@ -34,7 +34,7 @@ class PDF::Reader
|
|
34
34
|
def zlib_inflate(data)
|
35
35
|
begin
|
36
36
|
return Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
|
37
|
-
rescue Zlib::
|
37
|
+
rescue Zlib::Error
|
38
38
|
# by default, Ruby's Zlib assumes the data it's inflating
|
39
39
|
# is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
|
40
40
|
# fails, swallow the exception and attempt to inflate the data as a raw
|
@@ -43,7 +43,7 @@ class PDF::Reader
|
|
43
43
|
|
44
44
|
begin
|
45
45
|
return Zlib::Inflate.new(ZLIB_RAW_DEFLATE).inflate(data)
|
46
|
-
rescue
|
46
|
+
rescue Zlib::Error
|
47
47
|
# swallow this one too, so we can try some other fallback options
|
48
48
|
end
|
49
49
|
|
data/lib/pdf/reader/filter.rb
CHANGED
@@ -42,7 +42,7 @@ class PDF::Reader
|
|
42
42
|
# returned untouched. At this stage PDF::Reader has no need to decode images.
|
43
43
|
#
|
44
44
|
def self.with(name, options = {})
|
45
|
-
case name
|
45
|
+
case name
|
46
46
|
when :ASCII85Decode then PDF::Reader::Filter::Ascii85.new(options)
|
47
47
|
when :ASCIIHexDecode then PDF::Reader::Filter::AsciiHex.new(options)
|
48
48
|
when :CCITTFaxDecode then PDF::Reader::Filter::Null.new(options)
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -149,27 +149,37 @@ class PDF::Reader
|
|
149
149
|
end
|
150
150
|
end
|
151
151
|
|
152
|
-
def
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
152
|
+
def build_encoding(obj)
|
153
|
+
if obj[:Encoding].is_a?(Symbol)
|
154
|
+
# one of the standard encodings, referenced by name
|
155
|
+
# TODO pass in a standard shape, always a Hash
|
156
|
+
PDF::Reader::Encoding.new(obj[:Encoding])
|
157
|
+
elsif obj[:Encoding].is_a?(Hash) || obj[:Encoding].is_a?(PDF::Reader::Stream)
|
158
|
+
PDF::Reader::Encoding.new(obj[:Encoding])
|
159
|
+
elsif obj[:Encoding].nil?
|
160
|
+
default_encoding(@basefont)
|
157
161
|
else
|
158
|
-
|
162
|
+
raise MalformedPDFError, "Unexpected type for Encoding (#{obj[:Encoding].class})"
|
159
163
|
end
|
160
|
-
|
161
|
-
|
162
|
-
|
164
|
+
end
|
165
|
+
|
166
|
+
def extract_base_info(obj)
|
167
|
+
@subtype = @ohash.deref_name(obj[:Subtype])
|
168
|
+
@basefont = @ohash.deref_name(obj[:BaseFont])
|
169
|
+
@encoding = build_encoding(obj)
|
170
|
+
@widths = @ohash.deref_array_of_numbers(obj[:Widths]) || []
|
171
|
+
@first_char = @ohash.deref_integer(obj[:FirstChar])
|
172
|
+
@last_char = @ohash.deref_integer(obj[:LastChar])
|
163
173
|
|
164
174
|
# CID Fonts are not required to have a W or DW entry, if they don't exist,
|
165
175
|
# the default cid width = 1000, see Section 9.7.4.1 PDF 32000-1:2008 pp 269
|
166
|
-
@cid_widths = @ohash.
|
167
|
-
@cid_default_width = @ohash.
|
176
|
+
@cid_widths = @ohash.deref_array(obj[:W]) || []
|
177
|
+
@cid_default_width = @ohash.deref_number(obj[:DW]) || 1000
|
168
178
|
|
169
179
|
if obj[:ToUnicode]
|
170
180
|
# ToUnicode is optional for Type1 and Type3
|
171
|
-
stream = @ohash.
|
172
|
-
if stream
|
181
|
+
stream = @ohash.deref_stream(obj[:ToUnicode])
|
182
|
+
if stream
|
173
183
|
@tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
174
184
|
end
|
175
185
|
end
|
@@ -177,7 +187,9 @@ class PDF::Reader
|
|
177
187
|
|
178
188
|
def extract_type3_info(obj)
|
179
189
|
if @subtype == :Type3
|
180
|
-
@font_matrix = @ohash.
|
190
|
+
@font_matrix = @ohash.deref_array_of_numbers(obj[:FontMatrix]) || [
|
191
|
+
0.001, 0, 0, 0.001, 0, 0
|
192
|
+
]
|
181
193
|
end
|
182
194
|
end
|
183
195
|
|
@@ -185,7 +197,7 @@ class PDF::Reader
|
|
185
197
|
if obj[:FontDescriptor]
|
186
198
|
# create a font descriptor object if we can, in other words, unless this is
|
187
199
|
# a CID Font
|
188
|
-
fd = @ohash.
|
200
|
+
fd = @ohash.deref_hash(obj[:FontDescriptor])
|
189
201
|
@font_descriptor = PDF::Reader::FontDescriptor.new(@ohash, fd)
|
190
202
|
else
|
191
203
|
@font_descriptor = nil
|
@@ -197,9 +209,9 @@ class PDF::Reader
|
|
197
209
|
# per PDF 32000-1:2008 pp. 280 :DescendentFonts is:
|
198
210
|
# A one-element array specifying the CIDFont dictionary that is the
|
199
211
|
# descendant of this Type 0 font.
|
200
|
-
descendants = @ohash.
|
212
|
+
descendants = @ohash.deref_array(obj[:DescendantFonts])
|
201
213
|
@descendantfonts = descendants.map { |desc|
|
202
|
-
PDF::Reader::Font.new(@ohash, @ohash.
|
214
|
+
PDF::Reader::Font.new(@ohash, @ohash.deref_hash(desc))
|
203
215
|
}
|
204
216
|
end
|
205
217
|
|