pdf-reader 2.8.0 → 2.9.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +9 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/buffer.rb +36 -34
- data/lib/pdf/reader/cmap.rb +64 -51
- data/lib/pdf/reader/error.rb +8 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +1 -1
- data/lib/pdf/reader/filter/depredict.rb +1 -1
- data/lib/pdf/reader/filter/flate.rb +3 -3
- data/lib/pdf/reader/filter/lzw.rb +1 -1
- data/lib/pdf/reader/filter/null.rb +1 -2
- data/lib/pdf/reader/filter/run_length.rb +1 -1
- data/lib/pdf/reader/filter.rb +10 -11
- data/lib/pdf/reader/font.rb +29 -17
- data/lib/pdf/reader/font_descriptor.rb +18 -17
- data/lib/pdf/reader/form_xobject.rb +14 -5
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/null_security_handler.rb +0 -4
- data/lib/pdf/reader/object_hash.rb +247 -42
- data/lib/pdf/reader/page.rb +38 -20
- data/lib/pdf/reader/page_state.rb +1 -1
- data/lib/pdf/reader/page_text_receiver.rb +4 -1
- data/lib/pdf/reader/parser.rb +20 -8
- data/lib/pdf/reader/point.rb +1 -1
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +2 -2
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +15 -13
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -95
- data/lib/pdf/reader/stream.rb +2 -2
- data/lib/pdf/reader/type_check.rb +52 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -1
- data/lib/pdf/reader/xref.rb +20 -3
- data/lib/pdf/reader.rb +17 -9
- data/rbi/pdf-reader.rbi +388 -173
- metadata +15 -9
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -92
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 07c734cf3cfc0abf1102f813976d4936d33b57815f114ce92224bbd605fe16a2
|
4
|
+
data.tar.gz: f52b1751f83717a7bc96c56e8d830559d387fb430cfa6fa2a78604d98c7476f4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 72fda8f6b32c20782adca6cca44d291c7cbe4ac9d858da5ed1c815af2a7d6680e3906cac47a8414923c8db639fd51365d9da8612c1c7f79a674b22448bb35cae
|
7
|
+
data.tar.gz: fa79a29d80a36d37e1188769bf7991d5108bbe08b11711a7c9bb1741cedd3682b77afe219a24ae7844fdbf10b23ca3eb5434f4b9418d7002f07fb8edf9dd6e26
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
v2.9.1 (4th February 2022)
|
2
|
+
- Fix exception in Page#walk introduced in 2.9.0 (http://github.com/yob/pdf-reader/pull/442)
|
3
|
+
- Other small bug fixes
|
4
|
+
|
5
|
+
v2.9.0 (24th January 2022)
|
6
|
+
- Support additional encryption standards (http://github.com/yob/pdf-reader/pull/419)
|
7
|
+
- Return CropBox correctly from Page#rectangles (https://github.com/yob/pdf-reader/pull/420)
|
8
|
+
- For sorbet users, additional type annotations are included in the gem
|
9
|
+
|
1
10
|
v2.8.0 (28th December 2021)
|
2
11
|
- Add PDF::Reader::Page#runs for extracting text from a page with positioning metadata (http://github.com/yob/pdf-reader/pull/411)
|
3
12
|
- Add options to PDF::Reader::Page#text to make some behaviour configurable (http://github.com/yob/pdf-reader/pull/411)
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
require 'digest/md5'
|
6
|
+
|
7
|
+
class PDF::Reader
|
8
|
+
|
9
|
+
# Decrypts data using the AESV2 algorithm defined in the PDF spec. Requires
|
10
|
+
# a decryption key, which is usually generated by PDF::Reader::StandardKeyBuilder
|
11
|
+
#
|
12
|
+
class AesV2SecurityHandler
|
13
|
+
|
14
|
+
def initialize(key)
|
15
|
+
@encrypt_key = key
|
16
|
+
end
|
17
|
+
|
18
|
+
##7.6.2 General Encryption Algorithm
|
19
|
+
#
|
20
|
+
# Algorithm 1: Encryption of data using the AES-128-CBC algorithm
|
21
|
+
#
|
22
|
+
# version == 4 and CFM == AESV2
|
23
|
+
#
|
24
|
+
# buf - a string to decrypt
|
25
|
+
# ref - a PDF::Reader::Reference for the object to decrypt
|
26
|
+
#
|
27
|
+
def decrypt( buf, ref )
|
28
|
+
objKey = @encrypt_key.dup
|
29
|
+
(0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
|
30
|
+
(0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
|
31
|
+
objKey << 'sAlT' # Algorithm 1, b)
|
32
|
+
length = objKey.length < 16 ? objKey.length : 16
|
33
|
+
cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
|
34
|
+
cipher.decrypt
|
35
|
+
cipher.key = Digest::MD5.digest(objKey)[0,length]
|
36
|
+
cipher.iv = buf[0..15]
|
37
|
+
cipher.update(buf[16..-1]) + cipher.final
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
require 'digest'
|
6
|
+
require 'openssl'
|
7
|
+
|
8
|
+
class PDF::Reader
|
9
|
+
|
10
|
+
# Decrypts data using the AESV3 algorithm defined in the PDF 1.7, Extension Level 3 spec.
|
11
|
+
# Requires a decryption key, which is usually generated by PDF::Reader::KeyBuilderV5
|
12
|
+
#
|
13
|
+
class AesV3SecurityHandler
|
14
|
+
|
15
|
+
def initialize(key)
|
16
|
+
@encrypt_key = key
|
17
|
+
@cipher = "AES-256-CBC"
|
18
|
+
end
|
19
|
+
|
20
|
+
##7.6.2 General Encryption Algorithm
|
21
|
+
#
|
22
|
+
# Algorithm 1: Encryption of data using the RC4 or AES algorithms
|
23
|
+
#
|
24
|
+
# used to decrypt RC4/AES encrypted PDF streams (buf)
|
25
|
+
#
|
26
|
+
# buf - a string to decrypt
|
27
|
+
# ref - a PDF::Reader::Reference for the object to decrypt
|
28
|
+
#
|
29
|
+
def decrypt( buf, ref )
|
30
|
+
cipher = OpenSSL::Cipher.new(@cipher)
|
31
|
+
cipher.decrypt
|
32
|
+
cipher.key = @encrypt_key.dup
|
33
|
+
cipher.iv = buf[0..15]
|
34
|
+
cipher.update(buf[16..-1]) + cipher.final
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
end
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: ASCII-8BIT
|
2
|
-
# typed:
|
2
|
+
# typed: true
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -59,6 +59,9 @@ class PDF::Reader
|
|
59
59
|
# Allow for this here
|
60
60
|
TRAILING_BYTECOUNT = 5000
|
61
61
|
|
62
|
+
# must match whole tokens
|
63
|
+
DIGITS_ONLY = %r{\A\d+\z}
|
64
|
+
|
62
65
|
attr_reader :pos
|
63
66
|
|
64
67
|
# Creates a new buffer.
|
@@ -143,13 +146,20 @@ class PDF::Reader
|
|
143
146
|
@io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
|
144
147
|
data = @io.read(TRAILING_BYTECOUNT)
|
145
148
|
|
149
|
+
raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?
|
150
|
+
|
146
151
|
# the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
|
147
152
|
lines = data.split(/[\n\r]+/).reverse
|
148
153
|
eof_index = lines.index { |l| l.strip[/^%%EOF/] }
|
149
154
|
|
150
155
|
raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
|
151
156
|
raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
|
152
|
-
lines[eof_index+1].to_i
|
157
|
+
offset = lines[eof_index+1].to_i
|
158
|
+
|
159
|
+
# a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
|
160
|
+
# corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
|
161
|
+
raise MalformedPDFError, "invalid xref offset" if offset < 0
|
162
|
+
offset
|
153
163
|
end
|
154
164
|
|
155
165
|
private
|
@@ -230,13 +240,12 @@ class PDF::Reader
|
|
230
240
|
return if @tokens.size < 3
|
231
241
|
return if @tokens[2] != "R"
|
232
242
|
|
233
|
-
|
234
|
-
|
235
|
-
if
|
236
|
-
@tokens[0] = PDF::Reader::Reference.new(
|
237
|
-
@tokens
|
238
|
-
@tokens
|
239
|
-
@tokens.compact!
|
243
|
+
token_one = @tokens[0]
|
244
|
+
token_two = @tokens[1]
|
245
|
+
if token_one.is_a?(String) && token_two.is_a?(String) && token_one.match(DIGITS_ONLY) && token_two.match(DIGITS_ONLY)
|
246
|
+
@tokens[0] = PDF::Reader::Reference.new(token_one.to_i, token_two.to_i)
|
247
|
+
@tokens.delete_at(2)
|
248
|
+
@tokens.delete_at(1)
|
240
249
|
end
|
241
250
|
end
|
242
251
|
|
@@ -246,7 +255,7 @@ class PDF::Reader
|
|
246
255
|
# This is to reduce the chance of accidentally matching an embedded EI
|
247
256
|
def prepare_inline_token
|
248
257
|
idstart = @io.pos
|
249
|
-
|
258
|
+
prevchr = ''
|
250
259
|
eisize = 0 # how many chars in the end marker
|
251
260
|
seeking = 'E' # what are we looking for now?
|
252
261
|
loop do
|
@@ -264,11 +273,11 @@ class PDF::Reader
|
|
264
273
|
end
|
265
274
|
when 'I'
|
266
275
|
if chr == 'I'
|
267
|
-
seeking =
|
276
|
+
seeking = ''
|
268
277
|
else
|
269
278
|
seeking = 'E'
|
270
279
|
end
|
271
|
-
when
|
280
|
+
when ''
|
272
281
|
if WHITE_SPACE.include? chr
|
273
282
|
eisize += 1 # Drop trailer
|
274
283
|
break
|
@@ -276,28 +285,28 @@ class PDF::Reader
|
|
276
285
|
seeking = 'E'
|
277
286
|
end
|
278
287
|
end
|
279
|
-
prevchr = chr
|
288
|
+
prevchr = chr.is_a?(String) ? chr : ''
|
280
289
|
end
|
281
|
-
unless seeking ==
|
290
|
+
unless seeking == ''
|
282
291
|
raise MalformedPDFError, "EI terminator not found"
|
283
292
|
end
|
284
293
|
eiend = @io.pos
|
285
294
|
@io.seek(idstart, IO::SEEK_SET)
|
286
295
|
str = @io.read(eiend - eisize - idstart) # get the ID content
|
287
|
-
@tokens <<
|
296
|
+
@tokens << str.freeze if str
|
288
297
|
end
|
289
298
|
|
290
299
|
# if we're currently inside a hex string, read hex nibbles until
|
291
300
|
# we find a closing >
|
292
301
|
#
|
293
302
|
def prepare_hex_token
|
303
|
+
finished = :false
|
294
304
|
str = "".dup
|
295
|
-
finished = false
|
296
305
|
|
297
|
-
|
306
|
+
until finished == :true
|
298
307
|
byte = @io.getbyte
|
299
308
|
if byte.nil?
|
300
|
-
finished = true # unbalanced params
|
309
|
+
finished = :true # unbalanced params
|
301
310
|
elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
|
302
311
|
str << byte
|
303
312
|
elsif byte <= 32
|
@@ -306,7 +315,7 @@ class PDF::Reader
|
|
306
315
|
@tokens << str if str.size > 0
|
307
316
|
@tokens << ">" if byte != 0x3E # '>'
|
308
317
|
@tokens << byte.chr
|
309
|
-
finished = true
|
318
|
+
finished = :true
|
310
319
|
end
|
311
320
|
end
|
312
321
|
end
|
@@ -353,14 +362,17 @@ class PDF::Reader
|
|
353
362
|
def prepare_regular_token
|
354
363
|
tok = "".dup
|
355
364
|
|
356
|
-
|
365
|
+
loop do
|
366
|
+
byte = @io.getbyte
|
367
|
+
|
357
368
|
case byte
|
369
|
+
when nil
|
370
|
+
break
|
358
371
|
when 0x25
|
359
372
|
# comment, ignore everything until the next EOL char
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
done = true if byte.nil? || byte == 0x0A || byte == 0x0D
|
373
|
+
loop do
|
374
|
+
commentbyte = @io.getbyte
|
375
|
+
break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
|
364
376
|
end
|
365
377
|
when *TOKEN_WHITESPACE
|
366
378
|
# white space, token finished
|
@@ -430,15 +442,5 @@ class PDF::Reader
|
|
430
442
|
byte
|
431
443
|
end
|
432
444
|
|
433
|
-
# for a handful of tokens we want to tell the parser how to convert them
|
434
|
-
# into higher level tokens. This method adds a to_token() method
|
435
|
-
# to tokens that should remain as strings.
|
436
|
-
#
|
437
|
-
def string_token(token)
|
438
|
-
def token.to_token
|
439
|
-
to_s
|
440
|
-
end
|
441
|
-
token
|
442
|
-
end
|
443
445
|
end
|
444
446
|
end
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
-
# typed:
|
2
|
+
# typed: true
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -35,15 +35,15 @@ class PDF::Reader
|
|
35
35
|
class CMap # :nodoc:
|
36
36
|
|
37
37
|
CMAP_KEYWORDS = {
|
38
|
-
"begincodespacerange" =>
|
39
|
-
"endcodespacerange" =>
|
40
|
-
"beginbfchar" =>
|
41
|
-
"endbfchar" =>
|
42
|
-
"beginbfrange" =>
|
43
|
-
"endbfrange" =>
|
44
|
-
"begin" =>
|
45
|
-
"begincmap" =>
|
46
|
-
"def" =>
|
38
|
+
"begincodespacerange" => :noop,
|
39
|
+
"endcodespacerange" => :noop,
|
40
|
+
"beginbfchar" => :noop,
|
41
|
+
"endbfchar" => :noop,
|
42
|
+
"beginbfrange" => :noop,
|
43
|
+
"endbfrange" => :noop,
|
44
|
+
"begin" => :noop,
|
45
|
+
"begincmap" => :noop,
|
46
|
+
"def" => :noop
|
47
47
|
}
|
48
48
|
|
49
49
|
attr_reader :map
|
@@ -53,30 +53,6 @@ class PDF::Reader
|
|
53
53
|
process_data(data)
|
54
54
|
end
|
55
55
|
|
56
|
-
def process_data(data)
|
57
|
-
parser = build_parser(data)
|
58
|
-
mode = :none
|
59
|
-
instructions = []
|
60
|
-
|
61
|
-
while token = parser.parse_token(CMAP_KEYWORDS)
|
62
|
-
if token == "beginbfchar"
|
63
|
-
mode = :char
|
64
|
-
elsif token == "endbfchar"
|
65
|
-
process_bfchar_instructions(instructions)
|
66
|
-
instructions = []
|
67
|
-
mode = :none
|
68
|
-
elsif token == "beginbfrange"
|
69
|
-
mode = :range
|
70
|
-
elsif token == "endbfrange"
|
71
|
-
process_bfrange_instructions(instructions)
|
72
|
-
instructions = []
|
73
|
-
mode = :none
|
74
|
-
elsif mode == :char || mode == :range
|
75
|
-
instructions << token
|
76
|
-
end
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
56
|
def size
|
81
57
|
@map.size
|
82
58
|
end
|
@@ -86,13 +62,40 @@ class PDF::Reader
|
|
86
62
|
# Returns an array of Integers.
|
87
63
|
#
|
88
64
|
def decode(c)
|
89
|
-
|
90
|
-
return c unless Integer === c
|
91
|
-
@map[c]
|
65
|
+
@map.fetch(c, [])
|
92
66
|
end
|
93
67
|
|
94
68
|
private
|
95
69
|
|
70
|
+
def process_data(data, initial_mode = :none)
|
71
|
+
parser = build_parser(data)
|
72
|
+
mode = initial_mode
|
73
|
+
instructions = []
|
74
|
+
|
75
|
+
while token = parser.parse_token(CMAP_KEYWORDS)
|
76
|
+
if token.is_a?(String) || token.is_a?(Array)
|
77
|
+
if token == "beginbfchar"
|
78
|
+
mode = :char
|
79
|
+
elsif token == "endbfchar"
|
80
|
+
process_bfchar_instructions(instructions)
|
81
|
+
instructions = []
|
82
|
+
mode = :none
|
83
|
+
elsif token == "beginbfrange"
|
84
|
+
mode = :range
|
85
|
+
elsif token == "endbfrange"
|
86
|
+
process_bfrange_instructions(instructions)
|
87
|
+
instructions = []
|
88
|
+
mode = :none
|
89
|
+
elsif mode == :char
|
90
|
+
instructions << token.to_s
|
91
|
+
elsif mode == :range
|
92
|
+
instructions << token
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
|
96
99
|
def build_parser(instructions)
|
97
100
|
buffer = Buffer.new(StringIO.new(instructions))
|
98
101
|
Parser.new(buffer)
|
@@ -107,7 +110,6 @@ class PDF::Reader
|
|
107
110
|
# exception when we try converting broken UTF-16 to UTF-8
|
108
111
|
#
|
109
112
|
def str_to_int(str)
|
110
|
-
return nil if str.nil? || str.size == 0
|
111
113
|
unpacked_string = if str.bytesize == 1 # UTF-8
|
112
114
|
str.unpack("C*")
|
113
115
|
else # UTF-16
|
@@ -115,12 +117,15 @@ class PDF::Reader
|
|
115
117
|
end
|
116
118
|
result = []
|
117
119
|
while unpacked_string.any? do
|
118
|
-
if unpacked_string.size >= 2 &&
|
120
|
+
if unpacked_string.size >= 2 &&
|
121
|
+
unpacked_string.first.to_i > 0xD800 &&
|
122
|
+
unpacked_string.first.to_i < 0xDBFF
|
119
123
|
# this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
|
120
124
|
# let's convert to UTF-32. (the high surrogate is between 0xD800-0xDBFF, the
|
121
125
|
# low surrogate is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
|
122
|
-
|
123
|
-
|
126
|
+
point_one = unpacked_string.shift.to_i
|
127
|
+
point_two = unpacked_string.shift.to_i
|
128
|
+
result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
|
124
129
|
else
|
125
130
|
result << unpacked_string.shift
|
126
131
|
end
|
@@ -130,9 +135,11 @@ class PDF::Reader
|
|
130
135
|
|
131
136
|
def process_bfchar_instructions(instructions)
|
132
137
|
instructions.each_slice(2) do |one, two|
|
133
|
-
find = str_to_int(one)
|
134
|
-
replace = str_to_int(two)
|
135
|
-
|
138
|
+
find = str_to_int(one.to_s)
|
139
|
+
replace = str_to_int(two.to_s)
|
140
|
+
if find.any? && replace.any?
|
141
|
+
@map[find.first.to_i] = replace
|
142
|
+
end
|
136
143
|
end
|
137
144
|
end
|
138
145
|
|
@@ -143,30 +150,36 @@ class PDF::Reader
|
|
143
150
|
elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
|
144
151
|
bfrange_type_two(start, finish, to)
|
145
152
|
else
|
146
|
-
raise "invalid bfrange section"
|
153
|
+
raise MalformedPDFError, "invalid bfrange section"
|
147
154
|
end
|
148
155
|
end
|
149
156
|
end
|
150
157
|
|
151
158
|
def bfrange_type_one(start_code, end_code, dst)
|
152
|
-
start_code = str_to_int(start_code)
|
153
|
-
end_code = str_to_int(end_code)
|
159
|
+
start_code = str_to_int(start_code).first
|
160
|
+
end_code = str_to_int(end_code).first
|
154
161
|
dst = str_to_int(dst)
|
155
162
|
|
163
|
+
return if start_code.nil? || end_code.nil?
|
164
|
+
|
156
165
|
# add all values in the range to our mapping
|
157
166
|
(start_code..end_code).each_with_index do |val, idx|
|
158
|
-
@map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
|
167
|
+
@map[val] = dst.length == 1 ? [dst[0].to_i + idx] : [dst[0].to_i, dst[1].to_i + 1]
|
159
168
|
end
|
160
169
|
end
|
161
170
|
|
162
171
|
def bfrange_type_two(start_code, end_code, dst)
|
163
|
-
start_code = str_to_int(start_code)
|
164
|
-
end_code = str_to_int(end_code)
|
172
|
+
start_code = str_to_int(start_code).first
|
173
|
+
end_code = str_to_int(end_code).first
|
174
|
+
|
175
|
+
return if start_code.nil? || end_code.nil?
|
176
|
+
|
165
177
|
from_range = (start_code..end_code)
|
166
178
|
|
167
179
|
# add all values in the range to our mapping
|
168
180
|
from_range.each_with_index do |val, idx|
|
169
|
-
|
181
|
+
dst_char = dst[idx]
|
182
|
+
@map[val.to_i] = str_to_int(dst_char) if dst_char
|
170
183
|
end
|
171
184
|
end
|
172
185
|
end
|
data/lib/pdf/reader/error.rb
CHANGED
@@ -51,9 +51,17 @@ class PDF::Reader
|
|
51
51
|
raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
|
52
52
|
end
|
53
53
|
################################################################################
|
54
|
+
def self.validate_type_as_malformed(object, name, klass)
|
55
|
+
raise MalformedPDFError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
|
56
|
+
end
|
57
|
+
################################################################################
|
54
58
|
def self.validate_not_nil(object, name)
|
55
59
|
raise ArgumentError, "#{object} must not be nil" if object.nil?
|
56
60
|
end
|
61
|
+
################################################################################
|
62
|
+
def self.validate_not_nil_as_malformed(object, name)
|
63
|
+
raise MalformedPDFError, "#{object} must not be nil" if object.nil?
|
64
|
+
end
|
57
65
|
end
|
58
66
|
|
59
67
|
################################################################################
|
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
-
# typed:
|
2
|
+
# typed: strict
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
|
@@ -34,7 +34,7 @@ class PDF::Reader
|
|
34
34
|
def zlib_inflate(data)
|
35
35
|
begin
|
36
36
|
return Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
|
37
|
-
rescue Zlib::
|
37
|
+
rescue Zlib::Error
|
38
38
|
# by default, Ruby's Zlib assumes the data it's inflating
|
39
39
|
# is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
|
40
40
|
# fails, swallow the exception and attempt to inflate the data as a raw
|
@@ -43,7 +43,7 @@ class PDF::Reader
|
|
43
43
|
|
44
44
|
begin
|
45
45
|
return Zlib::Inflate.new(ZLIB_RAW_DEFLATE).inflate(data)
|
46
|
-
rescue
|
46
|
+
rescue Zlib::Error
|
47
47
|
# swallow this one too, so we can try some other fallback options
|
48
48
|
end
|
49
49
|
|
data/lib/pdf/reader/filter.rb
CHANGED
@@ -42,17 +42,16 @@ class PDF::Reader
|
|
42
42
|
# returned untouched. At this stage PDF::Reader has no need to decode images.
|
43
43
|
#
|
44
44
|
def self.with(name, options = {})
|
45
|
-
case name
|
46
|
-
when :ASCII85Decode then PDF::Reader::Filter::Ascii85.new(options)
|
47
|
-
when :ASCIIHexDecode then PDF::Reader::Filter::AsciiHex.new(options)
|
48
|
-
when :CCITTFaxDecode then PDF::Reader::Filter::Null.new(options)
|
49
|
-
when :DCTDecode then PDF::Reader::Filter::Null.new(options)
|
50
|
-
when :FlateDecode
|
51
|
-
when :
|
52
|
-
when :
|
53
|
-
when :
|
54
|
-
when :
|
55
|
-
when :RunLengthDecode then PDF::Reader::Filter::RunLength.new(options)
|
45
|
+
case name
|
46
|
+
when :ASCII85Decode, :A85 then PDF::Reader::Filter::Ascii85.new(options)
|
47
|
+
when :ASCIIHexDecode, :AHx then PDF::Reader::Filter::AsciiHex.new(options)
|
48
|
+
when :CCITTFaxDecode, :CCF then PDF::Reader::Filter::Null.new(options)
|
49
|
+
when :DCTDecode, :DCT then PDF::Reader::Filter::Null.new(options)
|
50
|
+
when :FlateDecode, :Fl then PDF::Reader::Filter::Flate.new(options)
|
51
|
+
when :JBIG2Decode then PDF::Reader::Filter::Null.new(options)
|
52
|
+
when :JPXDecode then PDF::Reader::Filter::Null.new(options)
|
53
|
+
when :LZWDecode, :LZW then PDF::Reader::Filter::Lzw.new(options)
|
54
|
+
when :RunLengthDecode, :RL then PDF::Reader::Filter::RunLength.new(options)
|
56
55
|
else
|
57
56
|
raise UnsupportedFeatureError, "Unknown filter: #{name}"
|
58
57
|
end
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -149,27 +149,37 @@ class PDF::Reader
|
|
149
149
|
end
|
150
150
|
end
|
151
151
|
|
152
|
-
def
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
152
|
+
def build_encoding(obj)
|
153
|
+
if obj[:Encoding].is_a?(Symbol)
|
154
|
+
# one of the standard encodings, referenced by name
|
155
|
+
# TODO pass in a standard shape, always a Hash
|
156
|
+
PDF::Reader::Encoding.new(obj[:Encoding])
|
157
|
+
elsif obj[:Encoding].is_a?(Hash) || obj[:Encoding].is_a?(PDF::Reader::Stream)
|
158
|
+
PDF::Reader::Encoding.new(obj[:Encoding])
|
159
|
+
elsif obj[:Encoding].nil?
|
160
|
+
default_encoding(@basefont)
|
157
161
|
else
|
158
|
-
|
162
|
+
raise MalformedPDFError, "Unexpected type for Encoding (#{obj[:Encoding].class})"
|
159
163
|
end
|
160
|
-
|
161
|
-
|
162
|
-
|
164
|
+
end
|
165
|
+
|
166
|
+
def extract_base_info(obj)
|
167
|
+
@subtype = @ohash.deref_name(obj[:Subtype])
|
168
|
+
@basefont = @ohash.deref_name(obj[:BaseFont])
|
169
|
+
@encoding = build_encoding(obj)
|
170
|
+
@widths = @ohash.deref_array_of_numbers(obj[:Widths]) || []
|
171
|
+
@first_char = @ohash.deref_integer(obj[:FirstChar])
|
172
|
+
@last_char = @ohash.deref_integer(obj[:LastChar])
|
163
173
|
|
164
174
|
# CID Fonts are not required to have a W or DW entry, if they don't exist,
|
165
175
|
# the default cid width = 1000, see Section 9.7.4.1 PDF 32000-1:2008 pp 269
|
166
|
-
@cid_widths = @ohash.
|
167
|
-
@cid_default_width = @ohash.
|
176
|
+
@cid_widths = @ohash.deref_array(obj[:W]) || []
|
177
|
+
@cid_default_width = @ohash.deref_number(obj[:DW]) || 1000
|
168
178
|
|
169
179
|
if obj[:ToUnicode]
|
170
180
|
# ToUnicode is optional for Type1 and Type3
|
171
|
-
stream = @ohash.
|
172
|
-
if stream
|
181
|
+
stream = @ohash.deref_stream(obj[:ToUnicode])
|
182
|
+
if stream
|
173
183
|
@tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
174
184
|
end
|
175
185
|
end
|
@@ -177,7 +187,9 @@ class PDF::Reader
|
|
177
187
|
|
178
188
|
def extract_type3_info(obj)
|
179
189
|
if @subtype == :Type3
|
180
|
-
@font_matrix = @ohash.
|
190
|
+
@font_matrix = @ohash.deref_array_of_numbers(obj[:FontMatrix]) || [
|
191
|
+
0.001, 0, 0, 0.001, 0, 0
|
192
|
+
]
|
181
193
|
end
|
182
194
|
end
|
183
195
|
|
@@ -185,7 +197,7 @@ class PDF::Reader
|
|
185
197
|
if obj[:FontDescriptor]
|
186
198
|
# create a font descriptor object if we can, in other words, unless this is
|
187
199
|
# a CID Font
|
188
|
-
fd = @ohash.
|
200
|
+
fd = @ohash.deref_hash(obj[:FontDescriptor])
|
189
201
|
@font_descriptor = PDF::Reader::FontDescriptor.new(@ohash, fd)
|
190
202
|
else
|
191
203
|
@font_descriptor = nil
|
@@ -197,9 +209,9 @@ class PDF::Reader
|
|
197
209
|
# per PDF 32000-1:2008 pp. 280 :DescendantFonts is:
|
198
210
|
# A one-element array specifying the CIDFont dictionary that is the
|
199
211
|
# descendant of this Type 0 font.
|
200
|
-
descendants = @ohash.
|
212
|
+
descendants = @ohash.deref_array(obj[:DescendantFonts])
|
201
213
|
@descendantfonts = descendants.map { |desc|
|
202
|
-
PDF::Reader::Font.new(@ohash, @ohash.
|
214
|
+
PDF::Reader::Font.new(@ohash, @ohash.deref_hash(desc))
|
203
215
|
}
|
204
216
|
end
|
205
217
|
|