pdf-reader 0.11.0.alpha → 0.12.0.alpha

Sign up to get free protection for your applications and to get access to all the features.
@@ -22,7 +22,7 @@
22
22
  # damages, whether based on tort (including without limitation negligence
23
23
  # or strict liability), contract or other legal or equitable grounds even
24
24
  # if Adobe has been advised or had reason to know of the possibility of
25
- # such damages.� The Adobe materials are provided on an "AS IS" basis.�
25
+ # such damages. The Adobe materials are provided on an "AS IS" basis.
26
26
  # Adobe specifically disclaims all express, statutory, or implied
27
27
  # warranties relating to the Adobe materials, including but not limited to
28
28
  # those concerning merchantability or fitness for a particular purpose or
@@ -28,15 +28,17 @@ class PDF::Reader
28
28
  class ObjectHash
29
29
  include Enumerable
30
30
 
31
- CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
32
-
33
31
  attr_accessor :default
34
32
  attr_reader :trailer, :pdf_version
35
33
 
36
34
  # Creates a new ObjectHash object. input can be a string with a valid filename,
37
35
  # a string containing a PDF file, or an IO object.
38
36
  #
39
- def initialize(input)
37
+ # valid options
38
+ #
39
+ # :password - the user password to decrypt the source PDF
40
+ #
41
+ def initialize(input, opts = {})
40
42
  if input.respond_to?(:seek) && input.respond_to?(:read)
41
43
  @io = input
42
44
  elsif File.file?(input.to_s)
@@ -53,10 +55,7 @@ class PDF::Reader
53
55
  @xref = PDF::Reader::XRef.new(@io)
54
56
  @trailer = @xref.trailer
55
57
  @cache = PDF::Reader::ObjectCache.new
56
-
57
- if trailer[:Encrypt]
58
- raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
59
- end
58
+ @sec_handler = build_security_handler(opts)
60
59
  end
61
60
 
62
61
  # returns the type of object a ref points to
@@ -92,7 +91,7 @@ class PDF::Reader
92
91
  @cache[key]
93
92
  elsif xref[key].is_a?(Fixnum)
94
93
  buf = new_buffer(xref[key])
95
- @cache[key] = Parser.new(buf, self).object(key.id, key.gen)
94
+ @cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
96
95
  elsif xref[key].is_a?(PDF::Reader::Reference)
97
96
  container_key = xref[key]
98
97
  object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
@@ -103,10 +102,6 @@ class PDF::Reader
103
102
  end
104
103
  end
105
104
 
106
- def cacheable?(obj)
107
- obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
108
- end
109
-
110
105
  # If key is a PDF::Reader::Reference object, lookup the corresponding
111
106
  # object in the PDF and return it. Otherwise return key untouched.
112
107
  #
@@ -252,8 +247,43 @@ class PDF::Reader
252
247
  @page_references ||= get_page_objects(root[:Pages]).flatten
253
248
  end
254
249
 
250
def encrypted?
  # true when the trailer dictionary has an :Encrypt entry
  trailer.has_key?(:Encrypt)
end
253
+
255
254
  private
256
255
 
256
def build_security_handler(opts = {})
  # no :Encrypt entry in the trailer means there is nothing to decrypt
  return nil if trailer[:Encrypt].nil?

  enc = deref(trailer[:Encrypt])

  # only the Standard filter is handled here, anything else is rejected
  unless enc[:Filter] == :Standard
    raise PDF::Reader::EncryptedPDFError, "Unsupported encryption method (#{enc[:Filter]})"
  end

  file_id  = deref(trailer[:ID])
  password = opts[:password]
  StandardSecurityHandler.new(enc, file_id, password)
end
267
+
268
# Recursively decrypts the strings and stream data found in obj.
#
# ref - identifies the object that obj was loaded from. It is handed to the
#       security handler untouched, for every nested value as well.
# obj - the parsed object. Streams have their data replaced in place, Hashes
#       and Arrays are rebuilt with decrypted members, Strings are decrypted
#       and anything else is returned as is.
#
# Returns obj untouched when there is no security handler.
#
def decrypt(ref, obj)
  return obj if @sec_handler.nil?

  case obj
  when PDF::Reader::Stream
    obj.data = @sec_handler.decrypt(obj.data, ref)
    obj
  when Hash
    # build from [key, value] pairs rather than splatting every key and
    # value as a separate argument, which breaks down on large dictionaries
    Hash[obj.map { |key, val| [key, decrypt(ref, val)] }]
  when Array
    obj.map { |item| decrypt(ref, item) }
  when String
    @sec_handler.decrypt(obj, ref)
  else
    obj
  end
end
286
+
257
287
  def new_buffer(offset = 0)
258
288
  PDF::Reader::Buffer.new(@io, :seek => offset)
259
289
  end
@@ -63,27 +63,62 @@ module PDF
63
63
  @resources ||= @objects.deref(attributes[:Resources]) || {}
64
64
  end
65
65
 
66
- # return a hash of fonts used on this page.
66
+ # Returns a Hash of color spaces that are available to this page
67
67
  #
68
- # The keys are the font labels used within the page content stream.
69
- #
70
- # The values are a PDF::Reader::Font instances that provide access
71
- # to most available metrics for each font.
68
def color_spaces
  # fall back to an empty Hash when the lookup yields nothing
  spaces = @objects.deref(resources[:ColorSpace])
  spaces || {}
end
71
+
72
+ # Returns a Hash of fonts that are available to this page
72
73
  #
73
74
  def fonts
74
- raw_fonts = objects.deref(resources[:Font] || {})
75
- ::Hash[raw_fonts.map { |label, font|
76
- [label, PDF::Reader::Font.new(objects, objects.deref(font))]
77
- }]
75
+ @objects.deref(resources[:Font]) || {}
76
+ end
77
+
78
+ # Returns a Hash of external graphic states that are available to this
79
+ # page
80
+ #
81
def graphic_states
  # fall back to an empty Hash when the lookup yields nothing
  states = @objects.deref(resources[:ExtGState])
  states || {}
end
84
+
85
+ # Returns a Hash of patterns that are available to this page
86
+ #
87
def patterns
  # fall back to an empty Hash when the lookup yields nothing
  found = @objects.deref(resources[:Pattern])
  found || {}
end
90
+
91
+ # Returns an Array of procedure sets that are available to this page
92
+ #
93
def procedure_sets
  # unlike the other resource accessors the fallback here is an empty Array
  sets = @objects.deref(resources[:ProcSet])
  sets || []
end
96
+
97
+ # Returns a Hash of property lists that are available to this page
98
+ #
99
def properties
  # fall back to an empty Hash when the lookup yields nothing
  props = @objects.deref(resources[:Properties])
  props || {}
end
102
+
103
+ # Returns a Hash of shadings that are available to this page
104
+ #
105
def shadings
  # fall back to an empty Hash when the lookup yields nothing
  found = @objects.deref(resources[:Shading])
  found || {}
end
108
+
109
+ # Returns a Hash of XObjects that are available to this page
110
+ #
111
def xobjects
  # fall back to an empty Hash when the lookup yields nothing
  found = @objects.deref(resources[:XObject])
  found || {}
end
79
114
 
80
115
  # returns the plain text content of this page encoded as UTF-8. Any
81
116
  # characters that can't be translated will be returned as a ▯
82
117
  #
83
118
  def text
84
- text_receiver = PageTextReceiver.new(fonts)
85
- walk(text_receiver)
86
- text_receiver.content
119
+ receiver = PageTextReceiver.new
120
+ walk(receiver)
121
+ receiver.content
87
122
  end
88
123
  alias :to_s :text
89
124
 
@@ -91,10 +126,25 @@ module PDF
91
126
  # passes callbacks to the receiver objects.
92
127
  #
93
128
  # This is mostly low level and you can probably ignore it unless you need
94
- # access to soemthing like the raw encoded text. For an example of how
129
+ # access to something like the raw encoded text. For an example of how
95
130
  # this can be used as a basis for higher level functionality, see the
96
131
  # text() method
97
132
  #
133
+ # If someone was motivated enough, this method is intended to provide all
134
+ # the data required to faithfully render the entire page. If you find
135
+ # some required data isn't available it's a bug - let me know.
136
+ #
137
+ # Many operators that generate callbacks will reference resources stored
138
+ # in the page header - think images, fonts, etc. To facilitate these
139
+ # operators, the first available callback is page=. If your receiver
140
+ # accepts that callback it will be passed the current
141
+ # PDF::Reader::Page object. Use the Page#resources method to grab any
142
+ # required resources.
143
+ #
144
+ # It may help to think of each page as a self contained program made up of
145
+ # a set of instructions and associated resources. Calling walk() executes
146
+ # the program in the correct order and calls out to your implementation.
147
+ #
98
148
  def walk(*receivers)
99
149
  callback(receivers, :page=, [self])
100
150
  content_stream(receivers, raw_content)
@@ -118,10 +168,6 @@ module PDF
118
168
  root ||= objects.deref(@objects.trailer[:Root])
119
169
  end
120
170
 
121
- def xobjects
122
- resources[:XObject] || {}
123
- end
124
-
125
171
  def content_stream(receivers, instructions)
126
172
  buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
127
173
  parser = Parser.new(buffer, @objects)
@@ -1,6 +1,7 @@
1
1
  # coding: utf-8
2
2
 
3
3
  require 'matrix'
4
+ require 'yaml'
4
5
 
5
6
  module PDF
6
7
  class Reader
@@ -19,10 +20,14 @@ module PDF
19
20
  :text_knockout => 0
20
21
  }
21
22
 
22
- def initialize(fonts)
23
- @fonts = fonts
23
+ # starting a new page
24
+ def page=(page)
25
+ @page = page
26
+ @objects = page.objects
27
+ @fonts = build_fonts(page.fonts)
28
+ @form_fonts = {}
24
29
  @content = ::Hash.new
25
- @stack = [DEFAULT_GRAPHICS_STATE]
30
+ @stack = [DEFAULT_GRAPHICS_STATE]
26
31
  end
27
32
 
28
33
  def content
@@ -151,6 +156,7 @@ module PDF
151
156
 
152
157
  # record text that is drawn on the page
153
158
  def show_text(string) # Tj
159
+ raise PDF::Reader::MalformedPDFError, "current font is invalid" if current_font.nil?
154
160
  at = transform(Point.new(0,0))
155
161
  @content[at.y] ||= ""
156
162
  @content[at.y] << current_font.to_utf8(string)
@@ -178,8 +184,36 @@ module PDF
178
184
  move_to_next_line_and_show_text(string)
179
185
  end
180
186
 
187
+ #####################################################
188
+ # XObjects
189
+ #####################################################
190
# Handles the invocation of a named XObject from the content stream.
#
# label - the name the XObject is registered under in the page's xobjects
#
# The graphics state is saved before and restored after the XObject is
# processed. If the XObject has a :Matrix entry it is concatenated first.
# Only XObjects with a :Subtype of :Form are walked; while one is being
# walked its fonts are made available through @form_fonts.
#
def invoke_xobject(label)
  save_graphics_state
  xobject = @objects.deref(@page.xobjects[label])

  # remember the fonts of whatever invoked us. A form can invoke other
  # XObjects, and once those are done the enclosing form still needs its
  # own fonts for any text that follows.
  outer_fonts = @form_fonts || {}

  matrix = xobject.hash[:Matrix]
  concatenate_matrix(*matrix) if matrix

  if xobject.hash[:Subtype] == :Form
    form = PDF::Reader::FormXObject.new(@page, xobject)
    @form_fonts = form.fonts
    form.walk(self)
  end
  @form_fonts = outer_fonts

  restore_graphics_state
end
206
+
181
207
  private
182
208
 
209
+ # wrap the raw PDF Font objects in handy ruby Font objects.
210
+ #
211
def build_fonts(raw_fonts)
  # pair every label with a Font built from the dereferenced raw font
  wrapped = raw_fonts.map do |label, font|
    resolved = @objects.deref(font)
    [label, PDF::Reader::Font.new(@objects, resolved)]
  end
  ::Hash[wrapped]
end
216
+
183
217
  # transform x and y co-ordinates from the current text space to the
184
218
  # underlying device space.
185
219
  #
@@ -231,7 +265,7 @@ module PDF
231
265
  end
232
266
 
233
267
  def current_font
234
- @fonts[state[:text_font]]
268
+ @form_fonts[state[:text_font]] || @fonts[state[:text_font]]
235
269
  end
236
270
 
237
271
  # private class for representing points on a cartesian plane. Used
@@ -0,0 +1,186 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2011 Evan J Brunner (ejbrun@appittome.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'digest/md5'
26
+ require 'rc4'
27
+
28
+ class PDF::Reader
29
+
30
+ # class creates interface to encrypt dictionary for use in Decrypt
31
# Implements the "Standard" security handler for RC4 encrypted documents.
#
# Given the encryption dictionary, the file ID from the trailer and a user
# or owner password it derives the document encryption key, which is then
# used to decrypt individual strings and streams.
#
# Section and algorithm numbers in the comments are the ones used by the
# PDF specification this code was written against.
#
class StandardSecurityHandler

  ## 7.6.3.3 Encryption Key Algorithm (pp61)
  #
  # The 32 byte string that is used to pad a short password, or that stands
  # in for the password when none is supplied.
  #
  PassPadBytes = [ 0x28, 0xbf, 0x4e, 0x5e, 0x4e, 0x75, 0x8a, 0x41,
                   0x64, 0x00, 0x4e, 0x56, 0xff, 0xfa, 0x01, 0x08,
                   0x2e, 0x2e, 0x00, 0xb6, 0xd0, 0x68, 0x3e, 0x80,
                   0x2f, 0x0c, 0xa9, 0xfe, 0x64, 0x53, 0x69, 0x7a ].freeze

  attr_reader :filter, :subFilter, :version, :key_length,
              :crypt_filter, :stream_filter, :string_filter, :embedded_file_filter,
              :encrypt_key
  attr_reader :revision, :owner_key, :user_key, :permissions, :file_id, :password

  # enc      - the encryption dictionary, a Hash with Symbol keys
  # file_id  - the file ID array from the trailer, only the first element
  #            is used. A missing ID is treated as an empty string.
  # password - a user or owner password. nil and "" both mean "no password"
  #
  # Raises PDF::Reader::EncryptedPDFError if the password is neither a valid
  # owner password nor a valid user password.
  #
  def initialize( enc, file_id, password )
    @filter               = enc[:Filter]
    @subFilter            = enc[:SubFilter]
    @version              = enc[:V].to_i
    # Length is given in bits and is optional, the default is 40
    @key_length           = (enc[:Length] || 40).to_i / 8
    @crypt_filter         = enc[:CF]
    @stream_filter        = enc[:StmF]
    @string_filter        = enc[:StrF]
    @revision             = enc[:R].to_i
    @owner_key            = enc[:O]
    @user_key             = enc[:U]
    @permissions          = enc[:P].to_i
    @embedded_file_filter = enc[:EFF]

    # metadata is encrypted unless the dictionary explicitly says otherwise
    @encryptMeta = enc.has_key?(:EncryptMetadata) ? enc[:EncryptMetadata].to_s == "true" : true

    @file_id = Array(file_id).first.to_s

    @encrypt_key = build_standard_key(password)
  end

  ##7.6.2 General Encryption Algorithm
  #
  # Algorithm 1: Encryption of data using the RC4 or AES algorithms
  #
  # used to decrypt RC4 encrypted PDF strings and streams
  #
  # buf - a string to decrypt
  # ref - a PDF::Reader::Reference for the object to decrypt
  #
  # Returns the decrypted string.
  #
  def decrypt( buf, ref )
    # the encryption key, followed by the low 3 bytes of the object number
    # and the low 2 bytes of the generation number, low byte first
    obj_key = @encrypt_key.unpack('C*')
    (0..2).each { |e| obj_key << ((ref.id  >> (e * 8)) & 0xFF) }
    (0..1).each { |e| obj_key << ((ref.gen >> (e * 8)) & 0xFF) }

    # only the first (n + 5) bytes of the hash form the key, 16 at most
    length = [obj_key.size, 16].min
    rc4 = RC4.new( Digest::MD5.digest(obj_key.pack('C*'))[0, length] )
    rc4.decrypt(buf)
  end

  private

  # Pads supplied password to 32bytes using PassPadBytes as specified on
  # pp61 of spec. Longer passwords are truncated to their first 32 bytes.
  #
  # Works on bytes rather than characters so the result does not depend on
  # the encoding of the supplied string.
  #
  def padPass(p="")
    bytes = p.to_s.unpack('C*').first(32)
    (bytes + PassPadBytes.first(32 - bytes.size)).pack('C*')
  end

  # Returns a copy of buf with every byte XORed with int
  #
  def xorEachByte(buf, int)
    buf.unpack('C*').map { |b| b ^ int }.pack('C*')
  end

  ## 7.6.3.4 Password Algorithms
  #
  # Algorithm 7 - Authenticating the Owner Password
  #
  # Used to test Owner passwords
  #
  # if the string is a valid owner password this will return the key that
  # should be used to decrypt the document.
  #
  # if the supplied password is not a valid owner password for this document
  # then it returns nil
  #
  def authOwnerPass(pass)
    md5 = Digest::MD5.digest(padPass(pass))
    if @revision > 2
      50.times { md5 = Digest::MD5.digest(md5) }
      key_begins = md5[0, @key_length]
      # first iteration decrypts owner_key, each later iteration decrypts
      # the previous output. The RC4 key is key_begins XOR iteration number
      out = @owner_key.to_s
      19.downto(0) { |i| out = RC4.new(xorEachByte(key_begins, i)).decrypt(out) }
    else
      out = RC4.new( md5[0, 5] ).decrypt( @owner_key.to_s )
    end
    # c) the output is the padded user password, check it as such
    authUserPass( out )
  end

  # Algorithm 6 - Authenticating the User Password
  #
  # Used to test User passwords
  #
  # if the string is a valid user password this will return the key that
  # should be used to decrypt the document.
  #
  # if the supplied password is not a valid user password for this document
  # then it returns nil
  #
  def authUserPass(pass)
    key_begins = makeFileKey(pass)
    stored     = @user_key.to_s.unpack('C*')

    if @revision > 2
      # initialize out for first iteration
      out = Digest::MD5.digest((PassPadBytes + @file_id.unpack('C*')).pack('C*'))
      # XOR with zero leaves the key untouched, so iterate over 0-19
      20.times { |i| out = RC4.new(xorEachByte(key_begins, i)).decrypt(out) }
      # only the first 16 bytes of the stored value are significant
      expected = stored.first(16)
    else
      out = RC4.new(key_begins).encrypt(PassPadBytes.pack('C*'))
      # here the stored value is the whole 32 byte encrypted padding string
      expected = stored.first(32)
    end

    # compare raw bytes so the encodings of the two strings do not matter
    expected == out.unpack('C*') ? key_begins : nil
  end

  # Algorithm 2 - Computing an encryption key
  #
  # user_pass - the user password, may be nil or empty
  #
  # Returns the encryption key as a string of bytes. For revision 2 the key
  # is always 5 bytes long, otherwise it is key_length bytes long.
  #
  def makeFileKey( user_pass )
    # a) if there's a password, pad it to 32 bytes, else, just use the padding.
    bytes  = padPass(user_pass).unpack('C*')
    # c) add owner key
    bytes += @owner_key.to_s.unpack('C*')
    # d) add permissions 1 byte at a time, in little-endian order
    bytes += [0, 8, 16, 24].map { |shift| (@permissions >> shift) & 0xFF }
    # e) add the file ID
    bytes += @file_id.unpack('C*')
    # f) if revision >= 4 and the metadata is not encrypted add 4 bytes of 0xFF
    bytes += [0xFF] * 4 if @revision >= 4 && !@encryptMeta
    # b) init MD5 digest + g) finish the hash
    md5 = Digest::MD5.digest(bytes.pack('C*'))
    # h) spin hash 50 times
    if @revision > 2
      50.times { md5 = Digest::MD5.digest(md5[0, @key_length]) }
    end
    # i) n = key_length for revision > 2, n = 5 for revision 2
    md5[0, (@revision < 3) ? 5 : @key_length]
  end

  # Tries pass as an owner password first and as a user password second.
  #
  # Returns the encryption key, raises PDF::Reader::EncryptedPDFError if the
  # password is not valid. The password is deliberately left out of the
  # error message so that it can't leak into logs.
  #
  def build_standard_key(pass)
    encrypt_key = authOwnerPass(pass) || authUserPass(pass)

    raise PDF::Reader::EncryptedPDFError, "Invalid password" if encrypt_key.nil?
    encrypt_key
  end
end
186
+ end