pdf-reader 0.11.0.alpha → 0.12.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,7 +22,7 @@
22
22
  # damages, whether based on tort (including without limitation negligence
23
23
  # or strict liability), contract or other legal or equitable grounds even
24
24
  # if Adobe has been advised or had reason to know of the possibility of
25
- # such damages.� The Adobe materials are provided on an "AS IS" basis.�
25
+ # such damages. The Adobe materials are provided on an "AS IS" basis.
26
26
  # Adobe specifically disclaims all express, statutory, or implied
27
27
  # warranties relating to the Adobe materials, including but not limited to
28
28
  # those concerning merchantability or fitness for a particular purpose or
@@ -28,15 +28,17 @@ class PDF::Reader
28
28
  class ObjectHash
29
29
  include Enumerable
30
30
 
31
- CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
32
-
33
31
  attr_accessor :default
34
32
  attr_reader :trailer, :pdf_version
35
33
 
36
34
  # Creates a new ObjectHash object. input can be a string with a valid filename,
37
35
  # a string containing a PDF file, or an IO object.
38
36
  #
39
- def initialize(input)
37
+ # valid options
38
+ #
39
+ # :password - the user password to decrypt the source PDF
40
+ #
41
+ def initialize(input, opts = {})
40
42
  if input.respond_to?(:seek) && input.respond_to?(:read)
41
43
  @io = input
42
44
  elsif File.file?(input.to_s)
@@ -53,10 +55,7 @@ class PDF::Reader
53
55
  @xref = PDF::Reader::XRef.new(@io)
54
56
  @trailer = @xref.trailer
55
57
  @cache = PDF::Reader::ObjectCache.new
56
-
57
- if trailer[:Encrypt]
58
- raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
59
- end
58
+ @sec_handler = build_security_handler(opts)
60
59
  end
61
60
 
62
61
  # returns the type of object a ref points to
@@ -92,7 +91,7 @@ class PDF::Reader
92
91
  @cache[key]
93
92
  elsif xref[key].is_a?(Fixnum)
94
93
  buf = new_buffer(xref[key])
95
- @cache[key] = Parser.new(buf, self).object(key.id, key.gen)
94
+ @cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
96
95
  elsif xref[key].is_a?(PDF::Reader::Reference)
97
96
  container_key = xref[key]
98
97
  object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
@@ -103,10 +102,6 @@ class PDF::Reader
103
102
  end
104
103
  end
105
104
 
106
- def cacheable?(obj)
107
- obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
108
- end
109
-
110
105
  # If key is a PDF::Reader::Reference object, lookup the corresponding
111
106
  # object in the PDF and return it. Otherwise return key untouched.
112
107
  #
@@ -252,8 +247,43 @@ class PDF::Reader
252
247
  @page_references ||= get_page_objects(root[:Pages]).flatten
253
248
  end
254
249
 
250
+ def encrypted?
251
+ trailer.has_key?(:Encrypt)
252
+ end
253
+
255
254
  private
256
255
 
256
+ def build_security_handler(opts = {})
257
+ return nil if trailer[:Encrypt].nil?
258
+
259
+ enc = deref(trailer[:Encrypt])
260
+ case enc[:Filter]
261
+ when :Standard
262
+ StandardSecurityHandler.new(enc, deref(trailer[:ID]), opts[:password])
263
+ else
264
+ raise PDF::Reader::EncryptedPDFError, "Unsupported encryption method (#{enc[:Filter]})"
265
+ end
266
+ end
267
+
268
+ def decrypt(ref, obj)
269
+ return obj if @sec_handler.nil?
270
+
271
+ case obj
272
+ when PDF::Reader::Stream then
273
+ obj.data = @sec_handler.decrypt(obj.data, ref)
274
+ obj
275
+ when Hash then
276
+ arr = obj.map { |key,val| [key, decrypt(ref, val)] }.flatten(1)
277
+ Hash[*arr]
278
+ when Array then
279
+ obj.collect { |item| decrypt(ref, item) }
280
+ when String
281
+ @sec_handler.decrypt(obj, ref)
282
+ else
283
+ obj
284
+ end
285
+ end
286
+
257
287
  def new_buffer(offset = 0)
258
288
  PDF::Reader::Buffer.new(@io, :seek => offset)
259
289
  end
@@ -63,27 +63,62 @@ module PDF
63
63
  @resources ||= @objects.deref(attributes[:Resources]) || {}
64
64
  end
65
65
 
66
- # return a hash of fonts used on this page.
66
+ # Returns a Hash of color spaces that are available to this page
67
67
  #
68
- # The keys are the font labels used within the page content stream.
69
- #
70
- # The values are a PDF::Reader::Font instances that provide access
71
- # to most available metrics for each font.
68
+ def color_spaces
69
+ @objects.deref(resources[:ColorSpace]) || {}
70
+ end
71
+
72
+ # Returns a Hash of fonts that are available to this page
72
73
  #
73
74
  def fonts
74
- raw_fonts = objects.deref(resources[:Font] || {})
75
- ::Hash[raw_fonts.map { |label, font|
76
- [label, PDF::Reader::Font.new(objects, objects.deref(font))]
77
- }]
75
+ @objects.deref(resources[:Font]) || {}
76
+ end
77
+
78
+ # Returns a Hash of external graphic states that are available to this
79
+ # page
80
+ #
81
+ def graphic_states
82
+ @objects.deref(resources[:ExtGState]) || {}
83
+ end
84
+
85
+ # Returns a Hash of patterns that are available to this page
86
+ #
87
+ def patterns
88
+ @objects.deref(resources[:Pattern]) || {}
89
+ end
90
+
91
+ # Returns an Array of procedure sets that are available to this page
92
+ #
93
+ def procedure_sets
94
+ @objects.deref(resources[:ProcSet]) || []
95
+ end
96
+
97
+ # Returns a Hash of property sets that are available to this page
98
+ #
99
+ def properties
100
+ @objects.deref(resources[:Properties]) || {}
101
+ end
102
+
103
+ # Returns a Hash of shadings that are available to this page
104
+ #
105
+ def shadings
106
+ @objects.deref(resources[:Shading]) || {}
107
+ end
108
+
109
+ # Returns a Hash of XObjects that are available to this page
110
+ #
111
+ def xobjects
112
+ @objects.deref(resources[:XObject]) || {}
78
113
  end
79
114
 
80
115
  # returns the plain text content of this page encoded as UTF-8. Any
81
116
  # characters that can't be translated will be returned as a ▯
82
117
  #
83
118
  def text
84
- text_receiver = PageTextReceiver.new(fonts)
85
- walk(text_receiver)
86
- text_receiver.content
119
+ receiver = PageTextReceiver.new
120
+ walk(receiver)
121
+ receiver.content
87
122
  end
88
123
  alias :to_s :text
89
124
 
@@ -91,10 +126,25 @@ module PDF
91
126
  # passes callbacks to the receiver objects.
92
127
  #
93
128
  # This is mostly low level and you can probably ignore it unless you need
94
- # access to soemthing like the raw encoded text. For an example of how
129
+ # access to something like the raw encoded text. For an example of how
95
130
  # this can be used as a basis for higher level functionality, see the
96
131
  # text() method
97
132
  #
133
+ # If someone was motivated enough, this method is intended to provide all
134
+ # the data required to faithfully render the entire page. If you find
135
+ # some required data isn't available it's a bug - let me know.
136
+ #
137
+ # Many operators that generate callbacks will reference resources stored
138
+ # in the page header - think images, fonts, etc. To facilitate these
139
+ # operators, the first available callback is page=. If your receiver
140
+ # accepts that callback it will be passed the current
141
+ # PDF::Reader::Page object. Use the Page#resources method to grab any
142
+ # required resources.
143
+ #
144
+ # It may help to think of each page as a self contained program made up of
145
+ # a set of instructions and associated resources. Calling walk() executes
146
+ # the program in the correct order and calls out to your implementation.
147
+ #
98
148
  def walk(*receivers)
99
149
  callback(receivers, :page=, [self])
100
150
  content_stream(receivers, raw_content)
@@ -118,10 +168,6 @@ module PDF
118
168
  root ||= objects.deref(@objects.trailer[:Root])
119
169
  end
120
170
 
121
- def xobjects
122
- resources[:XObject] || {}
123
- end
124
-
125
171
  def content_stream(receivers, instructions)
126
172
  buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
127
173
  parser = Parser.new(buffer, @objects)
@@ -1,6 +1,7 @@
1
1
  # coding: utf-8
2
2
 
3
3
  require 'matrix'
4
+ require 'yaml'
4
5
 
5
6
  module PDF
6
7
  class Reader
@@ -19,10 +20,14 @@ module PDF
19
20
  :text_knockout => 0
20
21
  }
21
22
 
22
- def initialize(fonts)
23
- @fonts = fonts
23
+ # starting a new page
24
+ def page=(page)
25
+ @page = page
26
+ @objects = page.objects
27
+ @fonts = build_fonts(page.fonts)
28
+ @form_fonts = {}
24
29
  @content = ::Hash.new
25
- @stack = [DEFAULT_GRAPHICS_STATE]
30
+ @stack = [DEFAULT_GRAPHICS_STATE]
26
31
  end
27
32
 
28
33
  def content
@@ -151,6 +156,7 @@ module PDF
151
156
 
152
157
  # record text that is drawn on the page
153
158
  def show_text(string) # Tj
159
+ raise PDF::Reader::MalformedPDFError, "current font is invalid" if current_font.nil?
154
160
  at = transform(Point.new(0,0))
155
161
  @content[at.y] ||= ""
156
162
  @content[at.y] << current_font.to_utf8(string)
@@ -178,8 +184,36 @@ module PDF
178
184
  move_to_next_line_and_show_text(string)
179
185
  end
180
186
 
187
+ #####################################################
188
+ # XObjects
189
+ #####################################################
190
+ def invoke_xobject(label)
191
+ save_graphics_state
192
+ xobject = @objects.deref(@page.xobjects[label])
193
+
194
+ matrix = xobject.hash[:Matrix]
195
+ concatenate_matrix(*matrix) if matrix
196
+
197
+ if xobject.hash[:Subtype] == :Form
198
+ form = PDF::Reader::FormXObject.new(@page, xobject)
199
+ @form_fonts = form.fonts
200
+ form.walk(self)
201
+ end
202
+ @form_fonts = {}
203
+
204
+ restore_graphics_state
205
+ end
206
+
181
207
  private
182
208
 
209
+ # wrap the raw PDF Font objects in handy ruby Font objects.
210
+ #
211
+ def build_fonts(raw_fonts)
212
+ ::Hash[raw_fonts.map { |label, font|
213
+ [label, PDF::Reader::Font.new(@objects, @objects.deref(font))]
214
+ }]
215
+ end
216
+
183
217
  # transform x and y co-ordinates from the current text space to the
184
218
  # underlying device space.
185
219
  #
@@ -231,7 +265,7 @@ module PDF
231
265
  end
232
266
 
233
267
  def current_font
234
- @fonts[state[:text_font]]
268
+ @form_fonts[state[:text_font]] || @fonts[state[:text_font]]
235
269
  end
236
270
 
237
271
  # private class for representing points on a cartesian plane. Used
@@ -0,0 +1,186 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2011 Evan J Brunner (ejbrun@appittome.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'digest/md5'
26
+ require 'rc4'
27
+
28
+ class PDF::Reader
29
+
30
+ # wraps a PDF's encryption dictionary and builds the key used to decrypt its objects
31
+ class StandardSecurityHandler
32
+
33
+ ## 7.6.3.3 Encryption Key Algorithm (pp61)
34
+ #
35
+ # needs a document's user password to build a key for decrypting an
36
+ # encrypted PDF document
37
+ #
38
+ PassPadBytes = [ 0x28, 0xbf, 0x4e, 0x5e, 0x4e, 0x75, 0x8a, 0x41,
39
+ 0x64, 0x00, 0x4e, 0x56, 0xff, 0xfa, 0x01, 0x08,
40
+ 0x2e, 0x2e, 0x00, 0xb6, 0xd0, 0x68, 0x3e, 0x80,
41
+ 0x2f, 0x0c, 0xa9, 0xfe, 0x64, 0x53, 0x69, 0x7a ]
42
+
43
+ attr_reader :filter, :subFilter, :version, :key_length,
44
+ :crypt_filter, :stream_filter, :string_filter, :embedded_file_filter,
45
+ :encrypt_key
46
+ attr_reader :revision, :owner_key, :user_key, :permissions, :file_id, :password
47
+
48
+ def initialize( enc, file_id, password )
49
+ @filter = enc[:Filter]
50
+ @subFilter = enc[:SubFilter]
51
+ @version = enc[:V].to_i
52
+ @key_length = enc[:Length].to_i/8
53
+ @crypt_filter = enc[:CF]
54
+ @stream_filter = enc[:StmF]
55
+ @string_filter = enc[:StrF]
56
+ @revision = enc[:R].to_i
57
+ @owner_key = enc[:O]
58
+ @user_key = enc[:U]
59
+ @permissions = enc[:P].to_i
60
+ @embedded_file_filter = enc[:EFF]
61
+
62
+ @encryptMeta = enc.has_key?(:EncryptMetadata)? enc[:EncryptMetadata].to_s == "true" : true;
63
+
64
+ @file_id = file_id.first
65
+
66
+ @encrypt_key = build_standard_key(password)
67
+ end
68
+
69
+ ##7.6.2 General Encryption Algorithm
70
+ #
71
+ # Algorithm 1: Encryption of data using the RC4 or AES algorithms
72
+ #
73
+ # used to decrypt RC4 encrypted PDF strings and streams (buf)
74
+ #
75
+ # buf - a string to decrypt
76
+ # ref - a PDF::Reader::Reference for the object to decrypt
77
+ #
78
+ def decrypt( buf, ref )
79
+ objKey = @encrypt_key.dup
80
+ (0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
81
+ (0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
82
+ rc4 = RC4.new( Digest::MD5.digest(objKey) )
83
+ rc4.decrypt(buf)
84
+ end
85
+
86
+ private
87
+
88
+ # Pads supplied password to 32 bytes using PassPadBytes as specified on
89
+ # pp61 of spec
90
+ def padPass(p="")
91
+ if p.nil? || p.empty?
92
+ PassPadBytes.pack('C*')
93
+ else
94
+ p[(0...32)] + PassPadBytes[0...(32-p.length)].pack('C*')
95
+ end
96
+ end
97
+
98
+ def xorEachByte(buf, int)
99
+ buf.each_byte.map{ |b| b^int}.pack("C*")
100
+ end
101
+
102
+ ## 7.6.3.4 Password Algorithms
103
+ #
104
+ # Algorithm 7 - Authenticating the Owner Password
105
+ #
106
+ # Used to test Owner passwords
107
+ #
108
+ # if the string is a valid owner password this will return the user
109
+ # password that should be used to decrypt the document.
110
+ #
111
+ # if the supplied password is not a valid owner password for this document
112
+ # then it returns nil
113
+ #
114
+ def authOwnerPass(pass)
115
+ md5 = Digest::MD5.digest(padPass(pass))
116
+ if @revision > 2 then
117
+ 50.times { md5 = Digest::MD5.digest(md5) }
118
+ keyBegins = md5[(0...@key_length)]
119
+ # first iteration decrypts owner_key
120
+ out = @owner_key
121
+ # RC4 keyed with (keyBegins XORed with the iteration number) decrypts the previous out
122
+ 19.downto(0).each { |i| out=RC4.new(xorEachByte(keyBegins,i)).decrypt(out) }
123
+ else
124
+ out = RC4.new( md5[(0...5)] ).decrypt( @owner_key )
125
+ end
126
+ # c) check output as user password
127
+ authUserPass( out )
128
+ end
129
+
130
+ # Algorithm 6 - Authenticating the User Password
131
+ #
132
+ # Used to test User passwords
133
+ #
134
+ # if the string is a valid user password this will return the user
135
+ # password that should be used to decrypt the document.
136
+ #
137
+ # if the supplied password is not a valid user password for this document
138
+ # then it returns nil
139
+ #
140
+ def authUserPass(pass)
141
+ keyBegins = makeFileKey(pass)
142
+ if @revision > 2
143
+ #initialize out for first iteration
144
+ out = Digest::MD5.digest(PassPadBytes.pack("C*") + @file_id)
145
+ # XOR with zero leaves the key unchanged, so iterate over 0..19
146
+ 20.times{ |i| out=RC4.new(xorEachByte(keyBegins, i)).decrypt(out) }
147
+ else
148
+ out = RC4.new(keyBegins).encrypt(PassPadBytes.pack("C*"))
149
+ end
150
+ @user_key[(0...16)] == out ? keyBegins : nil
151
+ end
152
+
153
+ def makeFileKey( user_pass )
154
+ # a) if there's a password, pad it to 32 bytes, else, just use the padding.
155
+ @buf = padPass(user_pass)
156
+ # c) add owner key
157
+ @buf << @owner_key
158
+ # d) add permissions 1 byte at a time, in little-endian order
159
+ (0..24).step(8){|e| @buf << (@permissions >> e & 0xFF)}
160
+ # e) add the file ID
161
+ @buf << @file_id
162
+ # f) if revision > 4 then if encryptMetadata add 4 bytes of 0x00 else add 4 bytes of 0xFF
163
+ if @revision > 4
164
+ @buf << [ @encryptMetadata ? 0x00 : 0xFF ].pack('C')*4
165
+ end
166
+ # b) init MD5 digest + g) finish the hash
167
+ md5 = Digest::MD5.digest(@buf)
168
+ # h) spin hash 50 times
169
+ if @revision > 2
170
+ 50.times {
171
+ md5 = Digest::MD5.digest(md5[(0...@key_length)])
172
+ }
173
+ end
174
+ # i) n = 5 when revision < 3, otherwise n = key_length
175
+ md5[(0...((@revision < 3) ? 5 : @key_length))]
176
+ end
177
+
178
+ def build_standard_key(pass)
179
+ encrypt_key = authOwnerPass(pass)
180
+ encrypt_key ||= authUserPass(pass)
181
+
182
+ raise PDF::Reader::EncryptedPDFError, "Invalid password (#{pass})" if encrypt_key.nil?
183
+ encrypt_key
184
+ end
185
+ end
186
+ end