pdf-reader 2.7.0 → 2.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +20 -0
- data/Rakefile +1 -1
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +36 -34
- data/lib/pdf/reader/cmap.rb +64 -51
- data/lib/pdf/reader/error.rb +8 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +1 -1
- data/lib/pdf/reader/filter/depredict.rb +1 -1
- data/lib/pdf/reader/filter/flate.rb +3 -3
- data/lib/pdf/reader/filter/lzw.rb +1 -1
- data/lib/pdf/reader/filter/null.rb +1 -2
- data/lib/pdf/reader/filter/run_length.rb +1 -1
- data/lib/pdf/reader/filter.rb +10 -11
- data/lib/pdf/reader/font.rb +71 -16
- data/lib/pdf/reader/font_descriptor.rb +18 -17
- data/lib/pdf/reader/form_xobject.rb +14 -5
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/null_security_handler.rb +0 -4
- data/lib/pdf/reader/object_hash.rb +251 -44
- data/lib/pdf/reader/page.rb +51 -22
- data/lib/pdf/reader/page_layout.rb +14 -28
- data/lib/pdf/reader/page_state.rb +1 -1
- data/lib/pdf/reader/page_text_receiver.rb +52 -10
- data/lib/pdf/reader/parser.rb +22 -7
- data/lib/pdf/reader/point.rb +1 -1
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +20 -2
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +15 -13
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -95
- data/lib/pdf/reader/stream.rb +2 -2
- data/lib/pdf/reader/text_run.rb +13 -6
- data/lib/pdf/reader/type_check.rb +52 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -1
- data/lib/pdf/reader/xref.rb +20 -3
- data/lib/pdf/reader.rb +32 -11
- data/rbi/pdf-reader.rbi +408 -174
- metadata +16 -9
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -92
data/lib/pdf/reader/font.rb
CHANGED
@@ -43,6 +43,7 @@ class PDF::Reader
|
|
43
43
|
@tounicode = nil
|
44
44
|
|
45
45
|
extract_base_info(obj)
|
46
|
+
extract_type3_info(obj)
|
46
47
|
extract_descriptor(obj)
|
47
48
|
extract_descendants(obj)
|
48
49
|
@width_calc = build_width_calculator
|
@@ -73,8 +74,44 @@ class PDF::Reader
|
|
73
74
|
@cached_widths[code_point] ||= @width_calc.glyph_width(code_point)
|
74
75
|
end
|
75
76
|
|
77
|
+
# In most cases glyph width is converted into text space with a simple divide by 1000.
|
78
|
+
#
|
79
|
+
# However, Type3 fonts provide their own FontMatrix that's used for the transformation.
|
80
|
+
#
|
81
|
+
def glyph_width_in_text_space(code_point)
|
82
|
+
glyph_width_in_glyph_space = glyph_width(code_point)
|
83
|
+
|
84
|
+
if @subtype == :Type3
|
85
|
+
x1, y1 = font_matrix_transform(0,0)
|
86
|
+
x2, y2 = font_matrix_transform(glyph_width_in_glyph_space, 0)
|
87
|
+
(x2 - x1).abs.round(2)
|
88
|
+
else
|
89
|
+
glyph_width_in_glyph_space / 1000.0
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
76
93
|
private
|
77
94
|
|
95
|
+
# Only valid for Type3 fonts
|
96
|
+
def font_matrix_transform(x, y)
|
97
|
+
return x, y if @font_matrix.nil?
|
98
|
+
|
99
|
+
matrix = TransformationMatrix.new(
|
100
|
+
@font_matrix[0], @font_matrix[1],
|
101
|
+
@font_matrix[2], @font_matrix[3],
|
102
|
+
@font_matrix[4], @font_matrix[5],
|
103
|
+
)
|
104
|
+
|
105
|
+
if x == 0 && y == 0
|
106
|
+
[matrix.e, matrix.f]
|
107
|
+
else
|
108
|
+
[
|
109
|
+
(matrix.a * x) + (matrix.c * y) + (matrix.e),
|
110
|
+
(matrix.b * x) + (matrix.d * y) + (matrix.f)
|
111
|
+
]
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
78
115
|
def default_encoding(font_name)
|
79
116
|
case font_name.to_s
|
80
117
|
when "Symbol" then
|
@@ -112,37 +149,55 @@ class PDF::Reader
|
|
112
149
|
end
|
113
150
|
end
|
114
151
|
|
115
|
-
def
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
152
|
+
def build_encoding(obj)
|
153
|
+
if obj[:Encoding].is_a?(Symbol)
|
154
|
+
# one of the standard encodings, referenced by name
|
155
|
+
# TODO pass in a standard shape, always a Hash
|
156
|
+
PDF::Reader::Encoding.new(obj[:Encoding])
|
157
|
+
elsif obj[:Encoding].is_a?(Hash) || obj[:Encoding].is_a?(PDF::Reader::Stream)
|
158
|
+
PDF::Reader::Encoding.new(obj[:Encoding])
|
159
|
+
elsif obj[:Encoding].nil?
|
160
|
+
default_encoding(@basefont)
|
120
161
|
else
|
121
|
-
|
162
|
+
raise MalformedPDFError, "Unexpected type for Encoding (#{obj[:Encoding].class})"
|
122
163
|
end
|
123
|
-
|
124
|
-
|
125
|
-
|
164
|
+
end
|
165
|
+
|
166
|
+
def extract_base_info(obj)
|
167
|
+
@subtype = @ohash.deref_name(obj[:Subtype])
|
168
|
+
@basefont = @ohash.deref_name(obj[:BaseFont])
|
169
|
+
@encoding = build_encoding(obj)
|
170
|
+
@widths = @ohash.deref_array_of_numbers(obj[:Widths]) || []
|
171
|
+
@first_char = @ohash.deref_integer(obj[:FirstChar])
|
172
|
+
@last_char = @ohash.deref_integer(obj[:LastChar])
|
126
173
|
|
127
174
|
# CID Fonts are not required to have a W or DW entry, if they don't exist,
|
128
175
|
# the default cid width = 1000, see Section 9.7.4.1 PDF 32000-1:2008 pp 269
|
129
|
-
@cid_widths = @ohash.
|
130
|
-
@cid_default_width = @ohash.
|
176
|
+
@cid_widths = @ohash.deref_array(obj[:W]) || []
|
177
|
+
@cid_default_width = @ohash.deref_number(obj[:DW]) || 1000
|
131
178
|
|
132
179
|
if obj[:ToUnicode]
|
133
180
|
# ToUnicode is optional for Type1 and Type3
|
134
|
-
stream = @ohash.
|
135
|
-
if stream
|
181
|
+
stream = @ohash.deref_stream(obj[:ToUnicode])
|
182
|
+
if stream
|
136
183
|
@tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
137
184
|
end
|
138
185
|
end
|
139
186
|
end
|
140
187
|
|
188
|
+
def extract_type3_info(obj)
|
189
|
+
if @subtype == :Type3
|
190
|
+
@font_matrix = @ohash.deref_array_of_numbers(obj[:FontMatrix]) || [
|
191
|
+
0.001, 0, 0, 0.001, 0, 0
|
192
|
+
]
|
193
|
+
end
|
194
|
+
end
|
195
|
+
|
141
196
|
def extract_descriptor(obj)
|
142
197
|
if obj[:FontDescriptor]
|
143
198
|
# create a font descriptor object if we can, in other words, unless this is
|
144
199
|
# a CID Font
|
145
|
-
fd = @ohash.
|
200
|
+
fd = @ohash.deref_hash(obj[:FontDescriptor])
|
146
201
|
@font_descriptor = PDF::Reader::FontDescriptor.new(@ohash, fd)
|
147
202
|
else
|
148
203
|
@font_descriptor = nil
|
@@ -154,9 +209,9 @@ class PDF::Reader
|
|
154
209
|
# per PDF 32000-1:2008 pp. 280 :DescendantFonts is:
|
155
210
|
# A one-element array specifying the CIDFont dictionary that is the
|
156
211
|
# descendant of this Type 0 font.
|
157
|
-
descendants = @ohash.
|
212
|
+
descendants = @ohash.deref_array(obj[:DescendantFonts])
|
158
213
|
@descendantfonts = descendants.map { |desc|
|
159
|
-
PDF::Reader::Font.new(@ohash, @ohash.
|
214
|
+
PDF::Reader::Font.new(@ohash, @ohash.deref_hash(desc))
|
160
215
|
}
|
161
216
|
end
|
162
217
|
|
@@ -15,22 +15,23 @@ class PDF::Reader
|
|
15
15
|
:x_height, :font_flags
|
16
16
|
|
17
17
|
def initialize(ohash, fd_hash)
|
18
|
-
|
19
|
-
@
|
20
|
-
@
|
21
|
-
@
|
22
|
-
@
|
23
|
-
@
|
24
|
-
@
|
25
|
-
@
|
26
|
-
@
|
27
|
-
@
|
28
|
-
@
|
29
|
-
@
|
30
|
-
@
|
31
|
-
@
|
32
|
-
@
|
33
|
-
@
|
18
|
+
# TODO change these to typed derefs
|
19
|
+
@ascent = ohash.deref_number(fd_hash[:Ascent]) || 0
|
20
|
+
@descent = ohash.deref_number(fd_hash[:Descent]) || 0
|
21
|
+
@missing_width = ohash.deref_number(fd_hash[:MissingWidth]) || 0
|
22
|
+
@font_bounding_box = ohash.deref_array_of_numbers(fd_hash[:FontBBox]) || [0,0,0,0]
|
23
|
+
@avg_width = ohash.deref_number(fd_hash[:AvgWidth]) || 0
|
24
|
+
@cap_height = ohash.deref_number(fd_hash[:CapHeight]) || 0
|
25
|
+
@font_flags = ohash.deref_integer(fd_hash[:Flags]) || 0
|
26
|
+
@italic_angle = ohash.deref_number(fd_hash[:ItalicAngle])
|
27
|
+
@font_name = ohash.deref_name(fd_hash[:FontName]).to_s
|
28
|
+
@leading = ohash.deref_number(fd_hash[:Leading]) || 0
|
29
|
+
@max_width = ohash.deref_number(fd_hash[:MaxWidth]) || 0
|
30
|
+
@stem_v = ohash.deref_number(fd_hash[:StemV])
|
31
|
+
@x_height = ohash.deref_number(fd_hash[:XHeight])
|
32
|
+
@font_stretch = ohash.deref_name(fd_hash[:FontStretch]) || :Normal
|
33
|
+
@font_weight = ohash.deref_number(fd_hash[:FontWeight]) || 400
|
34
|
+
@font_family = ohash.deref_string(fd_hash[:FontFamily])
|
34
35
|
|
35
36
|
# A FontDescriptor may have an embedded font program in FontFile
|
36
37
|
# (Type 1 Font Program), FontFile2 (TrueType font program), or
|
@@ -40,7 +41,7 @@ class PDF::Reader
|
|
40
41
|
# 2) CIDFontType0C: Type 0 Font Program in Compact Font Format
|
41
42
|
# 3) OpenType: OpenType Font Program
|
42
43
|
# see Section 9.9, PDF 32000-1:2008, pp 288-292
|
43
|
-
@font_program_stream = ohash.
|
44
|
+
@font_program_stream = ohash.deref_stream(fd_hash[:FontFile2])
|
44
45
|
#TODO handle FontFile and FontFile3
|
45
46
|
|
46
47
|
@is_ttf = true if @font_program_stream
|
@@ -15,15 +15,24 @@ module PDF
|
|
15
15
|
# This behaves and looks much like a limited PDF::Reader::Page class.
|
16
16
|
#
|
17
17
|
class FormXObject
|
18
|
-
|
18
|
+
extend Forwardable
|
19
19
|
|
20
20
|
attr_reader :xobject
|
21
21
|
|
22
|
+
def_delegators :resources, :color_spaces
|
23
|
+
def_delegators :resources, :fonts
|
24
|
+
def_delegators :resources, :graphic_states
|
25
|
+
def_delegators :resources, :patterns
|
26
|
+
def_delegators :resources, :procedure_sets
|
27
|
+
def_delegators :resources, :properties
|
28
|
+
def_delegators :resources, :shadings
|
29
|
+
def_delegators :resources, :xobjects
|
30
|
+
|
22
31
|
def initialize(page, xobject, options = {})
|
23
32
|
@page = page
|
24
33
|
@objects = page.objects
|
25
34
|
@cache = options[:cache] || {}
|
26
|
-
@xobject = @objects.
|
35
|
+
@xobject = @objects.deref_stream(xobject)
|
27
36
|
end
|
28
37
|
|
29
38
|
# return a hash of fonts used on this form.
|
@@ -34,9 +43,9 @@ module PDF
|
|
34
43
|
# to most available metrics for each font.
|
35
44
|
#
|
36
45
|
def font_objects
|
37
|
-
raw_fonts = @objects.
|
46
|
+
raw_fonts = @objects.deref_hash(fonts)
|
38
47
|
::Hash[raw_fonts.map { |label, font|
|
39
|
-
[label, PDF::Reader::Font.new(@objects, @objects.
|
48
|
+
[label, PDF::Reader::Font.new(@objects, @objects.deref_hash(font))]
|
40
49
|
}]
|
41
50
|
end
|
42
51
|
|
@@ -61,7 +70,7 @@ module PDF
|
|
61
70
|
# Returns the resources that accompany this form.
|
62
71
|
#
|
63
72
|
def resources
|
64
|
-
@resources ||= @objects.
|
73
|
+
@resources ||= Resources.new(@objects, @objects.deref_hash(@xobject.hash[:Resources]) || {})
|
65
74
|
end
|
66
75
|
|
67
76
|
def callback(receivers, name, params=[])
|
@@ -0,0 +1,138 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
require 'digest/md5'
|
6
|
+
require 'rc4'
|
7
|
+
|
8
|
+
class PDF::Reader
|
9
|
+
|
10
|
+
# Processes the Encrypt dict from an encrypted PDF and a user provided
|
11
|
+
# password and returns a key that can decrypt the file.
|
12
|
+
#
|
13
|
+
# This can generate a decryption key compatible with the following standard encryption algorithms:
|
14
|
+
#
|
15
|
+
# * Version 5 (AESV3)
|
16
|
+
#
|
17
|
+
class KeyBuilderV5
|
18
|
+
|
19
|
+
def initialize(opts = {})
|
20
|
+
@key_length = 256
|
21
|
+
|
22
|
+
# hash(32B) + validation salt(8B) + key salt(8B)
|
23
|
+
@owner_key = opts[:owner_key] || ""
|
24
|
+
|
25
|
+
# hash(32B) + validation salt(8B) + key salt(8B)
|
26
|
+
@user_key = opts[:user_key] || ""
|
27
|
+
|
28
|
+
# decryption key, encrypted w/ owner password
|
29
|
+
@owner_encryption_key = opts[:owner_encryption_key] || ""
|
30
|
+
|
31
|
+
# decryption key, encrypted w/ user password
|
32
|
+
@user_encryption_key = opts[:user_encryption_key] || ""
|
33
|
+
end
|
34
|
+
|
35
|
+
# Takes a string containing a user provided password.
|
36
|
+
#
|
37
|
+
# If the password matches the file, then a string containing a key suitable for
|
38
|
+
# decrypting the file will be returned. If the password doesn't match the file,
|
39
|
+
# an exception will be raised.
|
40
|
+
#
|
41
|
+
def key(pass)
|
42
|
+
pass = pass.byteslice(0...127).to_s # UTF-8 encoded password. first 127 bytes
|
43
|
+
|
44
|
+
encrypt_key = auth_owner_pass(pass)
|
45
|
+
encrypt_key ||= auth_user_pass(pass)
|
46
|
+
encrypt_key ||= auth_owner_pass_r6(pass)
|
47
|
+
encrypt_key ||= auth_user_pass_r6(pass)
|
48
|
+
|
49
|
+
raise PDF::Reader::EncryptedPDFError, "Invalid password (#{pass})" if encrypt_key.nil?
|
50
|
+
encrypt_key
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
# Algorithm 3.2a - Computing an encryption key
|
56
|
+
#
|
57
|
+
# Defined in PDF 1.7 Extension Level 3
|
58
|
+
#
|
59
|
+
# if the string is a valid user/owner password, this will return the decryption key
|
60
|
+
#
|
61
|
+
def auth_owner_pass(password)
|
62
|
+
if Digest::SHA256.digest(password + @owner_key[32..39] + @user_key) == @owner_key[0..31]
|
63
|
+
cipher = OpenSSL::Cipher.new('AES-256-CBC')
|
64
|
+
cipher.decrypt
|
65
|
+
cipher.key = Digest::SHA256.digest(password + @owner_key[40..-1] + @user_key)
|
66
|
+
cipher.iv = "\x00" * 16
|
67
|
+
cipher.padding = 0
|
68
|
+
cipher.update(@owner_encryption_key) + cipher.final
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
def auth_user_pass(password)
|
73
|
+
if Digest::SHA256.digest(password + @user_key[32..39]) == @user_key[0..31]
|
74
|
+
cipher = OpenSSL::Cipher.new('AES-256-CBC')
|
75
|
+
cipher.decrypt
|
76
|
+
cipher.key = Digest::SHA256.digest(password + @user_key[40..-1])
|
77
|
+
cipher.iv = "\x00" * 16
|
78
|
+
cipher.padding = 0
|
79
|
+
cipher.update(@user_encryption_key) + cipher.final
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def auth_owner_pass_r6(password)
|
84
|
+
if r6_digest(password, @owner_key[32..39].to_s, @user_key[0,48].to_s) == @owner_key[0..31]
|
85
|
+
cipher = OpenSSL::Cipher.new('AES-256-CBC')
|
86
|
+
cipher.decrypt
|
87
|
+
cipher.key = r6_digest(password, @owner_key[40,8].to_s, @user_key[0, 48].to_s)
|
88
|
+
cipher.iv = "\x00" * 16
|
89
|
+
cipher.padding = 0
|
90
|
+
cipher.update(@owner_encryption_key) + cipher.final
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
def auth_user_pass_r6(password)
|
95
|
+
if r6_digest(password, @user_key[32..39].to_s) == @user_key[0..31]
|
96
|
+
cipher = OpenSSL::Cipher.new('AES-256-CBC')
|
97
|
+
cipher.decrypt
|
98
|
+
cipher.key = r6_digest(password, @user_key[40,8].to_s)
|
99
|
+
cipher.iv = "\x00" * 16
|
100
|
+
cipher.padding = 0
|
101
|
+
cipher.update(@user_encryption_key) + cipher.final
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
# PDF 2.0 spec, 7.6.4.3.4
|
106
|
+
# Algorithm 2.B: Computing a hash (revision 6 and later)
|
107
|
+
def r6_digest(password, salt, user_key = '')
|
108
|
+
k = Digest::SHA256.digest(password + salt + user_key)
|
109
|
+
e = ''
|
110
|
+
|
111
|
+
i = 0
|
112
|
+
while i < 64 or e.getbyte(-1).to_i > i - 32
|
113
|
+
k1 = (password + k + user_key) * 64
|
114
|
+
|
115
|
+
aes = OpenSSL::Cipher.new("aes-128-cbc").encrypt
|
116
|
+
aes.key = k[0, 16].to_s
|
117
|
+
aes.iv = k[16, 16].to_s
|
118
|
+
aes.padding = 0
|
119
|
+
e = String.new(aes.update(k1))
|
120
|
+
k = case unpack_128bit_bigendian_int(e) % 3
|
121
|
+
when 0 then Digest::SHA256.digest(e)
|
122
|
+
when 1 then Digest::SHA384.digest(e)
|
123
|
+
when 2 then Digest::SHA512.digest(e)
|
124
|
+
end
|
125
|
+
i = i + 1
|
126
|
+
end
|
127
|
+
|
128
|
+
k[0, 32].to_s
|
129
|
+
end
|
130
|
+
|
131
|
+
def unpack_128bit_bigendian_int(str)
|
132
|
+
ints = str[0,16].to_s.unpack("N*")
|
133
|
+
(ints[0].to_i << 96) + (ints[1].to_i << 64) + (ints[2].to_i << 32) + ints[3].to_i
|
134
|
+
end
|
135
|
+
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|