pdf-reader 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: d34a3033638fcf4f78ce2d71dd5f5aebc2ea7d68
4
- data.tar.gz: 912ed3986ab7e7eefce8f987cffc0d0357bcd675
2
+ SHA256:
3
+ metadata.gz: b1e63414d8a3db12b6ea802fc45893ebf35c09dd37ca02c5cc73137d7d782364
4
+ data.tar.gz: afb778860a3dd8aab83d634c393666e159101505aba843262f61f7af49cf30e5
5
5
  SHA512:
6
- metadata.gz: a24c874cae12838cf122c92338e6da1639d15187be9ef5c1f149eedd0167cc21b53544c3148efd37992535efe2d0ec4487a02fc3a668992295cecc7aa5529f2a
7
- data.tar.gz: c88b9daa21de84eb2a841cc024c499ecc7c5d4b5908ad6618494fa3e7f5d694d291ae87ceb72f5ff9f4d2db66b3bc781e1ff499485ba50290710c6e1061a76db
6
+ metadata.gz: 1c32f7ac1b0d9f0d27ec445905af7dce3544d505221d6b940b8f5f37f85eaf95fad81d40850c85788dc459b59dd1f58398b27e9ec8e72bdbb077e94f77e9f332
7
+ data.tar.gz: 88bdd1bebe08ad919344788a9a7416e782c7fb5185ef984447ab1f9c968a8fb6a24af2b95dec99da2b43d4d4861a64ba1e8584f0ec25d01c3c13ae4f81f0191c
data/CHANGELOG CHANGED
@@ -1,3 +1,7 @@
1
+ v2.1.0 (15th February 2018)
2
+ - Support extra encrypted PDF variants (thanks to Gyuchang Jun)
3
+ - various bug fixes
4
+
1
5
  v2.0.0 (25th February 2017)
2
6
  - various bug fixes
3
7
 
@@ -134,6 +134,9 @@ module PDF
134
134
 
135
135
  def page_count
136
136
  pages = @objects.deref(root[:Pages])
137
+ unless pages.kind_of?(::Hash)
138
+ raise MalformedPDFError, 'Pages structure is missing'
139
+ end
137
140
  @page_count ||= @objects.deref(pages[:Count])
138
141
  end
139
142
 
@@ -173,9 +176,13 @@ module PDF
173
176
  # methods available on each page
174
177
  #
175
178
  def pages
176
- (1..self.page_count).map { |num|
177
- PDF::Reader::Page.new(@objects, num, :cache => @cache)
178
- }
179
+ (1..self.page_count).map do |num|
180
+ begin
181
+ PDF::Reader::Page.new(@objects, num, :cache => @cache)
182
+ rescue InvalidPageError => ex
183
+ raise MalformedPDFError, "Missing data for page: #{num}"
184
+ end
185
+ end
179
186
  end
180
187
 
181
188
  # returns a single PDF::Reader::Page for the specified page.
@@ -193,7 +200,7 @@ module PDF
193
200
  def page(num)
194
201
  num = num.to_i
195
202
  if num < 1 || num > self.page_count
196
- raise ArgumentError, "valid pages are 1 .. #{self.page_count}"
203
+ raise InvalidPageError, "Valid pages are 1 .. #{self.page_count}"
197
204
  end
198
205
  PDF::Reader::Page.new(@objects, num, :cache => @cache)
199
206
  end
@@ -219,7 +226,7 @@ module PDF
219
226
  pdfdoc_to_utf8(obj)
220
227
  end
221
228
  else
222
- obj
229
+ @objects.deref(obj)
223
230
  end
224
231
  end
225
232
 
@@ -241,7 +248,13 @@ module PDF
241
248
  end
242
249
 
243
250
  def root
244
- @root ||= @objects.deref(@objects.trailer[:Root])
251
+ @root ||= begin
252
+ obj = @objects.deref(@objects.trailer[:Root])
253
+ unless obj.kind_of?(::Hash)
254
+ raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
255
+ end
256
+ obj
257
+ end
245
258
  end
246
259
 
247
260
  end
@@ -277,6 +290,7 @@ require 'pdf/reader/reference'
277
290
  require 'pdf/reader/register_receiver'
278
291
  require 'pdf/reader/null_security_handler'
279
292
  require 'pdf/reader/standard_security_handler'
293
+ require 'pdf/reader/standard_security_handler_v5'
280
294
  require 'pdf/reader/unimplemented_security_handler'
281
295
  require 'pdf/reader/stream'
282
296
  require 'pdf/reader/text_run'
@@ -198,10 +198,6 @@ class PDF::Reader
198
198
  end
199
199
  end
200
200
 
201
- def has_mapping?
202
- @mapping.size > 0
203
- end
204
-
205
201
  def glyphlist
206
202
  @glyphlist ||= PDF::Reader::GlyphHash.new
207
203
  end
@@ -52,6 +52,10 @@ class PDF::Reader
52
52
  # the PDF spec and cannot be recovered
53
53
  class MalformedPDFError < RuntimeError; end
54
54
 
55
+ ################################################################################
56
+ # an exception that is raised when an invalid page number is used
57
+ class InvalidPageError < ArgumentError; end
58
+
55
59
  ################################################################################
56
60
  # an exception that is raised when a PDF object appears to be invalid
57
61
  class InvalidObjectError < MalformedPDFError; end
@@ -116,11 +116,10 @@ module PDF
116
116
  result
117
117
  end
118
118
 
119
- private
120
-
121
119
  def self.create_new_string(string_table,some_code, other_code)
122
120
  string_table[some_code] + string_table[other_code][0].chr
123
121
  end
122
+ private_class_method :create_new_string
124
123
 
125
124
  end
126
125
  end
@@ -300,7 +300,16 @@ class PDF::Reader
300
300
  permissions: encrypt[:P].to_i,
301
301
  encrypted_metadata: encmeta,
302
302
  file_id: (deref(trailer[:ID]) || []).first,
303
- password: opts[:password]
303
+ password: opts[:password],
304
+ cfm: encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
305
+ )
306
+ elsif StandardSecurityHandlerV5.supports?(encrypt)
307
+ StandardSecurityHandlerV5.new(
308
+ O: encrypt[:O],
309
+ U: encrypt[:U],
310
+ OE: encrypt[:OE],
311
+ UE: encrypt[:UE],
312
+ password: opts[:password]
304
313
  )
305
314
  else
306
315
  UnimplementedSecurityHandler.new
@@ -341,6 +350,10 @@ class PDF::Reader
341
350
  def get_page_objects(ref)
342
351
  obj = deref(ref)
343
352
 
353
+ unless obj.kind_of?(::Hash)
354
+ raise MalformedPDFError, "Dereferenced page object must be a dict"
355
+ end
356
+
344
357
  if obj[:Type] == :Page
345
358
  ref
346
359
  elsif obj[:Kids]
@@ -36,7 +36,7 @@ module PDF
36
36
  @cache = options[:cache] || {}
37
37
 
38
38
  unless @page_object.is_a?(::Hash)
39
- raise ArgumentError, "invalid page: #{pagenum}"
39
+ raise InvalidPageError, "Invalid page: #{pagenum}"
40
40
  end
41
41
  end
42
42
 
@@ -30,8 +30,8 @@ class PDF::Reader
30
30
  @runs.each do |run|
31
31
  x_pos = ((run.x - @x_offset) / col_multiplier).round
32
32
  y_pos = row_count - (run.y / row_multiplier).round
33
- if y_pos < row_count && y_pos >= 0 && x_pos < col_count && x_pos >= 0
34
- local_string_insert(page[y_pos], run.text, x_pos)
33
+ if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
34
+ local_string_insert(page[y_pos-1], run.text, x_pos)
35
35
  end
36
36
  end
37
37
  interesting_rows(page).map(&:rstrip).join("\n")
@@ -25,6 +25,7 @@
25
25
  #
26
26
  ################################################################################
27
27
  require 'digest/md5'
28
+ require 'openssl'
28
29
  require 'rc4'
29
30
 
30
31
  class PDF::Reader
@@ -54,6 +55,7 @@ class PDF::Reader
54
55
  @encryptMeta = opts.fetch(:encrypted_metadata, true)
55
56
  @file_id = opts[:file_id] || ""
56
57
  @encrypt_key = build_standard_key(opts[:password] || "")
58
+ @cfm = opts[:cfm]
57
59
 
58
60
  if @key_length != 5 && @key_length != 16
59
61
  msg = "StandardSecurityHandler only supports 40 and 128 bit\
@@ -62,28 +64,40 @@ class PDF::Reader
62
64
  end
63
65
  end
64
66
 
65
- # This handler supports all RC4 encryption that follows the PDF spec. It does not support
66
- # AES encryption that was added in later versions of the spec.
67
+ # This handler supports all encryption schemes defined up to the PDF 1.5 spec (revision 4)
67
68
  def self.supports?(encrypt)
68
69
  return false if encrypt.nil?
69
70
 
70
71
  filter = encrypt.fetch(:Filter, :Standard)
71
72
  version = encrypt.fetch(:V, 0)
72
73
  algorithm = encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
73
- filter == :Standard &&
74
- (version <= 3 || (version == 4 && algorithm != :AESV2))
74
+ (filter == :Standard) && (encrypt[:StmF] == encrypt[:StrF]) &&
75
+ (version <= 3 || (version == 4 && ((algorithm == :V2) || (algorithm == :AESV2))))
75
76
  end
76
77
 
77
78
  ##7.6.2 General Encryption Algorithm
78
79
  #
79
80
  # Algorithm 1: Encryption of data using the RC4 or AES algorithms
80
81
  #
81
- # used to decrypt RC4 encrypted PDF streams (buf)
82
+ # used to decrypt RC4/AES encrypted PDF streams (buf)
82
83
  #
83
84
  # buf - a string to decrypt
84
85
  # ref - a PDF::Reader::Reference for the object to decrypt
85
86
  #
86
87
  def decrypt( buf, ref )
88
+ case @cfm
89
+ when :AESV2
90
+ decrypt_aes128(buf, ref)
91
+ else
92
+ decrypt_rc4(buf, ref)
93
+ end
94
+ end
95
+
96
+ private
97
+
98
+ # decrypt with RC4 algorithm
99
+ # version <=3 or (version == 4 and CFM == V2)
100
+ def decrypt_rc4( buf, ref )
87
101
  objKey = @encrypt_key.dup
88
102
  (0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
89
103
  (0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
@@ -92,7 +106,20 @@ class PDF::Reader
92
106
  rc4.decrypt(buf)
93
107
  end
94
108
 
95
- private
109
+ # decrypt with AES-128-CBC algorithm
110
+ # when (version == 4 and CFM == AESV2)
111
+ def decrypt_aes128( buf, ref )
112
+ objKey = @encrypt_key.dup
113
+ (0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
114
+ (0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
115
+ objKey << 'sAlT' # Algorithm 1, b)
116
+ length = objKey.length < 16 ? objKey.length : 16
117
+ cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
118
+ cipher.decrypt
119
+ cipher.key = Digest::MD5.digest(objKey)[0,length]
120
+ cipher.iv = buf[0..15]
121
+ cipher.update(buf[16..-1]) + cipher.final
122
+ end
96
123
 
97
124
  # Pads supplied password to 32bytes using PassPadBytes as specified on
98
125
  # pp61 of spec
@@ -125,9 +152,9 @@ class PDF::Reader
125
152
  if @revision > 2 then
126
153
  50.times { md5 = Digest::MD5.digest(md5) }
127
154
  keyBegins = md5[0, key_length]
128
- #first itteration decrypt owner_key
155
+ #first iteration decrypt owner_key
129
156
  out = @owner_key
130
- #RC4 keyed with (keyBegins XOR with itteration #) to decrypt previous out
157
+ #RC4 keyed with (keyBegins XOR with iteration #) to decrypt previous out
131
158
  19.downto(0).each { |i| out=RC4.new(xor_each_byte(keyBegins,i)).decrypt(out) }
132
159
  else
133
160
  out = RC4.new( md5[0, 5] ).decrypt( @owner_key )
@@ -0,0 +1,89 @@
1
+ # coding: utf-8
2
+ require 'digest'
3
+ require 'openssl'
4
+
5
+ class PDF::Reader
6
+
7
+ # class creates interface to encrypt dictionary for use in Decrypt
8
+ class StandardSecurityHandlerV5
9
+
10
+ attr_reader :key_length, :encrypt_key
11
+
12
+ def initialize(opts = {})
13
+ @key_length = 256
14
+ @O = opts[:O] # hash(32B) + validation salt(8B) + key salt(8B)
15
+ @U = opts[:U] # hash(32B) + validation salt(8B) + key salt(8B)
16
+ @OE = opts[:OE] # decryption key, encrypted w/ owner password
17
+ @UE = opts[:UE] # decryption key, encrypted w/ user password
18
+ @encrypt_key = build_standard_key(opts[:password] || '')
19
+ end
20
+
21
+ # This handler supports AES-256 encryption defined in PDF 1.7 Extension Level 3
22
+ def self.supports?(encrypt)
23
+ return false if encrypt.nil?
24
+
25
+ filter = encrypt.fetch(:Filter, :Standard)
26
+ version = encrypt.fetch(:V, 0)
27
+ revision = encrypt.fetch(:R, 0)
28
+ algorithm = encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
29
+ (filter == :Standard) && (encrypt[:StmF] == encrypt[:StrF]) &&
30
+ ((version == 5) && (revision == 5) && (algorithm == :AESV3))
31
+ end
32
+
33
+ ##7.6.2 General Encryption Algorithm
34
+ #
35
+ # Algorithm 1: Encryption of data using the RC4 or AES algorithms
36
+ #
37
+ # used to decrypt AES-256 (CBC mode) encrypted PDF streams (buf)
38
+ #
39
+ # buf - a string to decrypt
40
+ # ref - a PDF::Reader::Reference for the object to decrypt
41
+ #
42
+ def decrypt( buf, ref )
43
+ cipher = OpenSSL::Cipher.new("AES-#{@key_length}-CBC")
44
+ cipher.decrypt
45
+ cipher.key = @encrypt_key.dup
46
+ cipher.iv = buf[0..15]
47
+ cipher.update(buf[16..-1]) + cipher.final
48
+ end
49
+
50
+ private
51
+ # Algorithm 3.2a - Computing an encryption key
52
+ #
53
+ # Defined in PDF 1.7 Extension Level 3
54
+ #
55
+ # if the string is a valid user/owner password, this will return the decryption key
56
+ #
57
+ def auth_owner_pass(password)
58
+ if Digest::SHA256.digest(password + @O[32..39] + @U) == @O[0..31]
59
+ cipher = OpenSSL::Cipher.new('AES-256-CBC')
60
+ cipher.decrypt
61
+ cipher.key = Digest::SHA256.digest(password + @O[40..-1] + @U)
62
+ cipher.iv = "\x00" * 16
63
+ cipher.padding = 0
64
+ cipher.update(@OE) + cipher.final
65
+ end
66
+ end
67
+
68
+ def auth_user_pass(password)
69
+ if Digest::SHA256.digest(password + @U[32..39]) == @U[0..31]
70
+ cipher = OpenSSL::Cipher.new('AES-256-CBC')
71
+ cipher.decrypt
72
+ cipher.key = Digest::SHA256.digest(password + @U[40..-1])
73
+ cipher.iv = "\x00" * 16
74
+ cipher.padding = 0
75
+ cipher.update(@UE) + cipher.final
76
+ end
77
+ end
78
+
79
+ def build_standard_key(pass)
80
+ pass = pass.byteslice(0...127) # UTF-8 encoded password. first 127 bytes
81
+
82
+ encrypt_key = auth_owner_pass(pass)
83
+ encrypt_key ||= auth_user_pass(pass)
84
+
85
+ raise PDF::Reader::EncryptedPDFError, "Invalid password (#{pass})" if encrypt_key.nil?
86
+ encrypt_key
87
+ end
88
+ end
89
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0
4
+ version: 2.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-02-25 00:00:00.000000000 Z
11
+ date: 2018-02-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -257,6 +257,7 @@ files:
257
257
  - lib/pdf/reader/register_receiver.rb
258
258
  - lib/pdf/reader/resource_methods.rb
259
259
  - lib/pdf/reader/standard_security_handler.rb
260
+ - lib/pdf/reader/standard_security_handler_v5.rb
260
261
  - lib/pdf/reader/stream.rb
261
262
  - lib/pdf/reader/synchronized_cache.rb
262
263
  - lib/pdf/reader/text_run.rb
@@ -295,7 +296,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
295
296
  version: '0'
296
297
  requirements: []
297
298
  rubyforge_project:
298
- rubygems_version: 2.6.8
299
+ rubygems_version: 2.7.3
299
300
  signing_key:
300
301
  specification_version: 4
301
302
  summary: A library for accessing the content of PDF files