pdf-reader 2.7.0 → 2.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +20 -0
  3. data/Rakefile +1 -1
  4. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  5. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  6. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  7. data/lib/pdf/reader/buffer.rb +36 -34
  8. data/lib/pdf/reader/cmap.rb +64 -51
  9. data/lib/pdf/reader/error.rb +8 -0
  10. data/lib/pdf/reader/filter/ascii85.rb +1 -1
  11. data/lib/pdf/reader/filter/ascii_hex.rb +1 -1
  12. data/lib/pdf/reader/filter/depredict.rb +1 -1
  13. data/lib/pdf/reader/filter/flate.rb +3 -3
  14. data/lib/pdf/reader/filter/lzw.rb +1 -1
  15. data/lib/pdf/reader/filter/null.rb +1 -2
  16. data/lib/pdf/reader/filter/run_length.rb +1 -1
  17. data/lib/pdf/reader/filter.rb +10 -11
  18. data/lib/pdf/reader/font.rb +71 -16
  19. data/lib/pdf/reader/font_descriptor.rb +18 -17
  20. data/lib/pdf/reader/form_xobject.rb +14 -5
  21. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  22. data/lib/pdf/reader/null_security_handler.rb +0 -4
  23. data/lib/pdf/reader/object_hash.rb +251 -44
  24. data/lib/pdf/reader/page.rb +51 -22
  25. data/lib/pdf/reader/page_layout.rb +14 -28
  26. data/lib/pdf/reader/page_state.rb +1 -1
  27. data/lib/pdf/reader/page_text_receiver.rb +52 -10
  28. data/lib/pdf/reader/parser.rb +22 -7
  29. data/lib/pdf/reader/point.rb +1 -1
  30. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  31. data/lib/pdf/reader/rectangle.rb +20 -2
  32. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +15 -13
  33. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  34. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -95
  35. data/lib/pdf/reader/stream.rb +2 -2
  36. data/lib/pdf/reader/text_run.rb +13 -6
  37. data/lib/pdf/reader/type_check.rb +52 -0
  38. data/lib/pdf/reader/validating_receiver.rb +262 -0
  39. data/lib/pdf/reader/width_calculator/true_type.rb +1 -1
  40. data/lib/pdf/reader/xref.rb +20 -3
  41. data/lib/pdf/reader.rb +32 -11
  42. data/rbi/pdf-reader.rbi +408 -174
  43. metadata +16 -9
  44. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -92
@@ -43,6 +43,7 @@ class PDF::Reader
43
43
  @tounicode = nil
44
44
 
45
45
  extract_base_info(obj)
46
+ extract_type3_info(obj)
46
47
  extract_descriptor(obj)
47
48
  extract_descendants(obj)
48
49
  @width_calc = build_width_calculator
@@ -73,8 +74,44 @@ class PDF::Reader
73
74
  @cached_widths[code_point] ||= @width_calc.glyph_width(code_point)
74
75
  end
75
76
 
77
+ # In most cases glyph width is converted into text space with a simple divide by 1000.
78
+ #
79
+ # However, Type3 fonts provide their own FontMatrix that's used for the transformation.
80
+ #
81
# Returns the width of the glyph for code_point converted to text space.
#
# Non-Type3 fonts use the fixed 1/1000 scale. Type3 fonts are measured by
# pushing the glyph-space origin and the point (width, 0) through
# font_matrix_transform and taking the horizontal distance between the two
# results, rounded to two decimal places.
def glyph_width_in_text_space(code_point)
  raw_width = glyph_width(code_point)
  return raw_width / 1000.0 unless @subtype == :Type3

  # only the x components matter for a horizontal advance
  origin_x, = font_matrix_transform(0, 0)
  advance_x, = font_matrix_transform(raw_width, 0)
  (advance_x - origin_x).abs.round(2)
end
92
+
76
93
  private
77
94
 
95
+ # Only valid for Type3 fonts
96
# Maps the glyph-space point (x, y) through the font's FontMatrix and
# returns the resulting pair as a two element array.
#
# When no font matrix has been recorded the point is returned unchanged.
def font_matrix_transform(x, y)
  return x, y if @font_matrix.nil?

  # the first six entries of the array are a, b, c, d, e, f
  matrix = TransformationMatrix.new(*@font_matrix.values_at(0, 1, 2, 3, 4, 5))

  # the origin maps straight onto the translation components
  return [matrix.e, matrix.f] if x == 0 && y == 0

  transformed_x = (matrix.a * x) + (matrix.c * y) + (matrix.e)
  transformed_y = (matrix.b * x) + (matrix.d * y) + (matrix.f)
  [transformed_x, transformed_y]
end
114
+
78
115
  def default_encoding(font_name)
79
116
  case font_name.to_s
80
117
  when "Symbol" then
@@ -112,37 +149,55 @@ class PDF::Reader
112
149
  end
113
150
  end
114
151
 
115
- def extract_base_info(obj)
116
- @subtype = @ohash.object(obj[:Subtype])
117
- @basefont = @ohash.object(obj[:BaseFont])
118
- if @ohash.object(obj[:Encoding])
119
- @encoding = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
152
+ def build_encoding(obj)
153
+ if obj[:Encoding].is_a?(Symbol)
154
+ # one of the standard encodings, referenced by name
155
+ # TODO pass in a standard shape, always a Hash
156
+ PDF::Reader::Encoding.new(obj[:Encoding])
157
+ elsif obj[:Encoding].is_a?(Hash) || obj[:Encoding].is_a?(PDF::Reader::Stream)
158
+ PDF::Reader::Encoding.new(obj[:Encoding])
159
+ elsif obj[:Encoding].nil?
160
+ default_encoding(@basefont)
120
161
  else
121
- @encoding = default_encoding(@basefont)
162
+ raise MalformedPDFError, "Unexpected type for Encoding (#{obj[:Encoding].class})"
122
163
  end
123
- @widths = @ohash.object(obj[:Widths]) || []
124
- @first_char = @ohash.object(obj[:FirstChar])
125
- @last_char = @ohash.object(obj[:LastChar])
164
+ end
165
+
166
+ def extract_base_info(obj)
167
+ @subtype = @ohash.deref_name(obj[:Subtype])
168
+ @basefont = @ohash.deref_name(obj[:BaseFont])
169
+ @encoding = build_encoding(obj)
170
+ @widths = @ohash.deref_array_of_numbers(obj[:Widths]) || []
171
+ @first_char = @ohash.deref_integer(obj[:FirstChar])
172
+ @last_char = @ohash.deref_integer(obj[:LastChar])
126
173
 
127
174
  # CID Fonts are not required to have a W or DW entry, if they don't exist,
128
175
  # the default cid width = 1000, see Section 9.7.4.1 PDF 32000-1:2008 pp 269
129
- @cid_widths = @ohash.object(obj[:W]) || []
130
- @cid_default_width = @ohash.object(obj[:DW]) || 1000
176
+ @cid_widths = @ohash.deref_array(obj[:W]) || []
177
+ @cid_default_width = @ohash.deref_number(obj[:DW]) || 1000
131
178
 
132
179
  if obj[:ToUnicode]
133
180
  # ToUnicode is optional for Type1 and Type3
134
- stream = @ohash.object(obj[:ToUnicode])
135
- if stream.is_a?(PDF::Reader::Stream)
181
+ stream = @ohash.deref_stream(obj[:ToUnicode])
182
+ if stream
136
183
  @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
137
184
  end
138
185
  end
139
186
  end
140
187
 
188
# Records the FontMatrix for Type3 fonts; does nothing for any other subtype.
#
# Falls back to the 1/1000 scaling matrix when the font dictionary has no
# FontMatrix entry.
def extract_type3_info(obj)
  return unless @subtype == :Type3

  declared_matrix = @ohash.deref_array_of_numbers(obj[:FontMatrix])
  @font_matrix = declared_matrix || [
    0.001, 0, 0, 0.001, 0, 0
  ]
end
195
+
141
196
  def extract_descriptor(obj)
142
197
  if obj[:FontDescriptor]
143
198
  # create a font descriptor object if we can, in other words, unless this is
144
199
  # a CID Font
145
- fd = @ohash.object(obj[:FontDescriptor])
200
+ fd = @ohash.deref_hash(obj[:FontDescriptor])
146
201
  @font_descriptor = PDF::Reader::FontDescriptor.new(@ohash, fd)
147
202
  else
148
203
  @font_descriptor = nil
@@ -154,9 +209,9 @@ class PDF::Reader
154
209
  # per PDF 32000-1:2008 pp. 280 :DescendentFonts is:
155
210
  # A one-element array specifying the CIDFont dictionary that is the
156
211
  # descendant of this Type 0 font.
157
- descendants = @ohash.object(obj[:DescendantFonts])
212
+ descendants = @ohash.deref_array(obj[:DescendantFonts])
158
213
  @descendantfonts = descendants.map { |desc|
159
- PDF::Reader::Font.new(@ohash, @ohash.object(desc))
214
+ PDF::Reader::Font.new(@ohash, @ohash.deref_hash(desc))
160
215
  }
161
216
  end
162
217
 
@@ -15,22 +15,23 @@ class PDF::Reader
15
15
  :x_height, :font_flags
16
16
 
17
17
  def initialize(ohash, fd_hash)
18
- @ascent = ohash.object(fd_hash[:Ascent]) || 0
19
- @descent = ohash.object(fd_hash[:Descent]) || 0
20
- @missing_width = ohash.object(fd_hash[:MissingWidth]) || 0
21
- @font_bounding_box = ohash.object(fd_hash[:FontBBox]) || [0,0,0,0]
22
- @avg_width = ohash.object(fd_hash[:AvgWidth]) || 0
23
- @cap_height = ohash.object(fd_hash[:CapHeight]) || 0
24
- @font_flags = ohash.object(fd_hash[:Flags]) || 0
25
- @italic_angle = ohash.object(fd_hash[:ItalicAngle])
26
- @font_name = ohash.object(fd_hash[:FontName]).to_s
27
- @leading = ohash.object(fd_hash[:Leading]) || 0
28
- @max_width = ohash.object(fd_hash[:MaxWidth]) || 0
29
- @stem_v = ohash.object(fd_hash[:StemV])
30
- @x_height = ohash.object(fd_hash[:XHeight])
31
- @font_stretch = ohash.object(fd_hash[:FontStretch]) || :Normal
32
- @font_weight = ohash.object(fd_hash[:FontWeight]) || 400
33
- @font_family = ohash.object(fd_hash[:FontFamily])
18
+ # TODO change these to typed derefs
19
+ @ascent = ohash.deref_number(fd_hash[:Ascent]) || 0
20
+ @descent = ohash.deref_number(fd_hash[:Descent]) || 0
21
+ @missing_width = ohash.deref_number(fd_hash[:MissingWidth]) || 0
22
+ @font_bounding_box = ohash.deref_array_of_numbers(fd_hash[:FontBBox]) || [0,0,0,0]
23
+ @avg_width = ohash.deref_number(fd_hash[:AvgWidth]) || 0
24
+ @cap_height = ohash.deref_number(fd_hash[:CapHeight]) || 0
25
+ @font_flags = ohash.deref_integer(fd_hash[:Flags]) || 0
26
+ @italic_angle = ohash.deref_number(fd_hash[:ItalicAngle])
27
+ @font_name = ohash.deref_name(fd_hash[:FontName]).to_s
28
+ @leading = ohash.deref_number(fd_hash[:Leading]) || 0
29
+ @max_width = ohash.deref_number(fd_hash[:MaxWidth]) || 0
30
+ @stem_v = ohash.deref_number(fd_hash[:StemV])
31
+ @x_height = ohash.deref_number(fd_hash[:XHeight])
32
+ @font_stretch = ohash.deref_name(fd_hash[:FontStretch]) || :Normal
33
+ @font_weight = ohash.deref_number(fd_hash[:FontWeight]) || 400
34
+ @font_family = ohash.deref_string(fd_hash[:FontFamily])
34
35
 
35
36
  # A FontDescriptor may have an embedded font program in FontFile
36
37
  # (Type 1 Font Program), FontFile2 (TrueType font program), or
@@ -40,7 +41,7 @@ class PDF::Reader
40
41
  # 2) CIDFontType0C: Type 0 Font Program in Compact Font Format
41
42
  # 3) OpenType: OpenType Font Program
42
43
  # see Section 9.9, PDF 32000-1:2008, pp 288-292
43
- @font_program_stream = ohash.object(fd_hash[:FontFile2])
44
+ @font_program_stream = ohash.deref_stream(fd_hash[:FontFile2])
44
45
  #TODO handle FontFile and FontFile3
45
46
 
46
47
  @is_ttf = true if @font_program_stream
@@ -15,15 +15,24 @@ module PDF
15
15
  # This behaves and looks much like a limited PDF::Reader::Page class.
16
16
  #
17
17
  class FormXObject
18
- include ResourceMethods
18
+ extend Forwardable
19
19
 
20
20
  attr_reader :xobject
21
21
 
22
+ def_delegators :resources, :color_spaces
23
+ def_delegators :resources, :fonts
24
+ def_delegators :resources, :graphic_states
25
+ def_delegators :resources, :patterns
26
+ def_delegators :resources, :procedure_sets
27
+ def_delegators :resources, :properties
28
+ def_delegators :resources, :shadings
29
+ def_delegators :resources, :xobjects
30
+
22
31
  def initialize(page, xobject, options = {})
23
32
  @page = page
24
33
  @objects = page.objects
25
34
  @cache = options[:cache] || {}
26
- @xobject = @objects.deref(xobject)
35
+ @xobject = @objects.deref_stream(xobject)
27
36
  end
28
37
 
29
38
  # return a hash of fonts used on this form.
@@ -34,9 +43,9 @@ module PDF
34
43
  # to most available metrics for each font.
35
44
  #
36
45
  def font_objects
37
- raw_fonts = @objects.deref(resources[:Font] || {})
46
+ raw_fonts = @objects.deref_hash(fonts)
38
47
  ::Hash[raw_fonts.map { |label, font|
39
- [label, PDF::Reader::Font.new(@objects, @objects.deref(font))]
48
+ [label, PDF::Reader::Font.new(@objects, @objects.deref_hash(font))]
40
49
  }]
41
50
  end
42
51
 
@@ -61,7 +70,7 @@ module PDF
61
70
  # Returns the resources that accompany this form.
62
71
  #
63
72
  def resources
64
- @resources ||= @objects.deref(@xobject.hash[:Resources]) || {}
73
+ @resources ||= Resources.new(@objects, @objects.deref_hash(@xobject.hash[:Resources]) || {})
65
74
  end
66
75
 
67
76
  def callback(receivers, name, params=[])
@@ -0,0 +1,138 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ require 'digest/md5'
6
+ require 'rc4'
7
+
8
+ class PDF::Reader
9
+
10
+ # Processes the Encrypt dict from an encrypted PDF and a user provided
11
+ # password and returns a key that can decrypt the file.
12
+ #
13
+ # This can generate a decryption key compatible with the following standard encryption algorithms:
14
+ #
15
+ # * Version 5 (AESV3)
16
+ #
17
class KeyBuilderV5

  # opts may contain the following String entries (each defaults to ""):
  #
  # :owner_key            - hash(32B) + validation salt(8B) + key salt(8B)
  # :user_key             - hash(32B) + validation salt(8B) + key salt(8B)
  # :owner_encryption_key - decryption key, encrypted w/ owner password
  # :user_encryption_key  - decryption key, encrypted w/ user password
  #
  # Every value is stored as a binary (ASCII-8BIT) copy. All of the work in
  # this class is byte oriented (slicing, hashing, AES), and mixing a UTF-8
  # string with a binary one raises Encoding::CompatibilityError as soon as
  # both contain bytes above 0x7F.
  #
  def initialize(opts = {})
    @key_length = 256

    @owner_key = (opts[:owner_key] || "").b
    @user_key = (opts[:user_key] || "").b
    @owner_encryption_key = (opts[:owner_encryption_key] || "").b
    @user_encryption_key = (opts[:user_encryption_key] || "").b
  end

  # Takes a string containing a user provided password.
  #
  # If the password matches the file, then a string containing a key suitable for
  # decrypting the file will be returned. If the password doesn't match the file,
  # a PDF::Reader::EncryptedPDFError will be raised.
  #
  # The rejected password is deliberately left out of the exception message so
  # that it can't end up in logs or error reports.
  #
  def key(pass)
    # UTF-8 encoded password, first 127 bytes, treated as raw bytes from here on
    pass = pass.byteslice(0...127).to_s.b

    encrypt_key = auth_owner_pass(pass) ||
      auth_user_pass(pass) ||
      auth_owner_pass_r6(pass) ||
      auth_user_pass_r6(pass)

    raise PDF::Reader::EncryptedPDFError, "Invalid password" if encrypt_key.nil?
    encrypt_key
  end

  private

  # Algorithm 3.2a - Computing an encryption key
  #
  # Defined in PDF 1.7 Extension Level 3
  #
  # if the string is a valid owner password, this will return the decryption key,
  # otherwise nil. The slices are passed through to_s so that a missing or
  # truncated owner key is a failed match rather than a TypeError.
  #
  def auth_owner_pass(password)
    validation_salt = @owner_key[32..39].to_s
    key_salt = @owner_key[40..-1].to_s
    return unless Digest::SHA256.digest(password + validation_salt + @user_key) == @owner_key[0..31]

    decrypt_file_key(Digest::SHA256.digest(password + key_salt + @user_key), @owner_encryption_key)
  end

  # Same check as auth_owner_pass, against the user key. Returns the decryption
  # key when the password matches, otherwise nil.
  def auth_user_pass(password)
    validation_salt = @user_key[32..39].to_s
    key_salt = @user_key[40..-1].to_s
    return unless Digest::SHA256.digest(password + validation_salt) == @user_key[0..31]

    decrypt_file_key(Digest::SHA256.digest(password + key_salt), @user_encryption_key)
  end

  # Owner password check using the r6_digest hash in place of plain SHA-256.
  # Returns the decryption key when the password matches, otherwise nil.
  def auth_owner_pass_r6(password)
    user_key = @user_key[0, 48].to_s
    return unless r6_digest(password, @owner_key[32..39].to_s, user_key) == @owner_key[0..31]

    decrypt_file_key(r6_digest(password, @owner_key[40, 8].to_s, user_key), @owner_encryption_key)
  end

  # User password check using the r6_digest hash in place of plain SHA-256.
  # Returns the decryption key when the password matches, otherwise nil.
  def auth_user_pass_r6(password)
    return unless r6_digest(password, @user_key[32..39].to_s) == @user_key[0..31]

    decrypt_file_key(r6_digest(password, @user_key[40, 8].to_s), @user_encryption_key)
  end

  # Decrypts encrypted_key with AES-256-CBC using cipher_key, an all-zero IV
  # and padding disabled, and returns the plaintext.
  def decrypt_file_key(cipher_key, encrypted_key)
    cipher = OpenSSL::Cipher.new('AES-256-CBC')
    cipher.decrypt
    cipher.key = cipher_key
    cipher.iv = "\x00" * 16
    cipher.padding = 0
    cipher.update(encrypted_key) + cipher.final
  end

  # PDF 2.0 spec, 7.6.4.3.4
  # Algorithm 2.B: Computing a hash (revision 6 and later)
  #
  # Returns a 32 byte string.
  def r6_digest(password, salt, user_key = '')
    k = Digest::SHA256.digest(password + salt + user_key)
    e = ''

    i = 0
    # at least 64 rounds, then keep going while the last byte of the most
    # recent ciphertext is greater than (rounds completed - 32)
    while i < 64 || e.getbyte(-1).to_i > i - 32
      k1 = (password + k + user_key) * 64

      aes = OpenSSL::Cipher.new("aes-128-cbc").encrypt
      aes.key = k[0, 16].to_s
      aes.iv = k[16, 16].to_s
      aes.padding = 0
      e = String.new(aes.update(k1))
      # the first 16 bytes of the ciphertext, mod 3, pick the next hash function
      k = case unpack_128bit_bigendian_int(e) % 3
          when 0 then Digest::SHA256.digest(e)
          when 1 then Digest::SHA384.digest(e)
          when 2 then Digest::SHA512.digest(e)
          end
      i = i + 1
    end

    k[0, 32].to_s
  end

  # Interprets the first 16 bytes of str as one unsigned big-endian integer.
  def unpack_128bit_bigendian_int(str)
    ints = str[0,16].to_s.unpack("N*")
    (ints[0].to_i << 96) + (ints[1].to_i << 64) + (ints[2].to_i << 32) + ints[3].to_i
  end

end
137
+ end
138
+
@@ -7,10 +7,6 @@ class PDF::Reader
7
7
  # A null object security handler. Used when a PDF is unencrypted.
8
8
  class NullSecurityHandler
9
9
 
10
- def self.supports?(encrypt)
11
- encrypt.nil?
12
- end
13
-
14
10
  def decrypt(buf, _ref)
15
11
  buf
16
12
  end