pdf-reader 1.4.1 → 2.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/CHANGELOG +53 -3
- data/{README.rdoc → README.md} +40 -23
- data/Rakefile +2 -2
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_object +4 -1
- data/bin/pdf_text +1 -1
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/buffer.rb +14 -12
- data/lib/pdf/reader/cid_widths.rb +2 -0
- data/lib/pdf/reader/cmap.rb +48 -36
- data/lib/pdf/reader/encoding.rb +16 -18
- data/lib/pdf/reader/error.rb +5 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +1 -0
- data/lib/pdf/reader/filter/flate.rb +29 -16
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +4 -6
- data/lib/pdf/reader/filter.rb +2 -0
- data/lib/pdf/reader/font.rb +12 -13
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +7 -2
- data/lib/pdf/reader/lzw.rb +4 -4
- data/lib/pdf/reader/null_security_handler.rb +17 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +91 -37
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +5 -4
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +30 -1
- data/lib/pdf/reader/page_layout.rb +19 -24
- data/lib/pdf/reader/page_state.rb +8 -5
- data/lib/pdf/reader/page_text_receiver.rb +23 -1
- data/lib/pdf/reader/pages_strategy.rb +2 -304
- data/lib/pdf/reader/parser.rb +10 -7
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +80 -42
- data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +28 -9
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +25 -16
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +11 -5
- data/lib/pdf/reader.rb +30 -119
- data/lib/pdf-reader.rb +1 -0
- metadata +35 -61
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/hash.rb +0 -19
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -265
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -27,42 +28,8 @@
|
|
27
28
|
|
28
29
|
class PDF::Reader
|
29
30
|
################################################################################
|
30
|
-
# Walks the pages of the PDF file and calls the appropriate callback methods when
|
31
|
-
# something of interest is found.
|
32
|
-
#
|
33
|
-
# The callback methods should exist on the receiver object passed into the constructor.
|
34
|
-
# Whenever some content is found that will trigger a callback, the receiver is checked
|
35
|
-
# to see if the callback is defined.
|
36
|
-
#
|
37
|
-
# If it is defined it will be called. If not, processing will continue.
|
38
|
-
#
|
39
|
-
# = Available Callbacks
|
40
|
-
# The following callbacks are available and should be methods defined on your receiver class. Only
|
41
|
-
# implement the ones you need - the rest will be ignored.
|
42
|
-
#
|
43
|
-
# Some callbacks will include parameters which will be passed in as an array. For callbacks
|
44
|
-
# that supply no paramters, or where you don't need them, the *params argument can be left off.
|
45
|
-
# Some example callback method definitions are:
|
46
|
-
#
|
47
|
-
# def begin_document
|
48
|
-
# def end_page
|
49
|
-
# def show_text(string, *params)
|
50
|
-
# def fill_stroke(*params)
|
51
|
-
#
|
52
|
-
# You should be able to infer the basic command the callback is reporting based on the name. For
|
53
|
-
# further experimentation, define the callback with just a *params parameter, then print out the
|
54
|
-
# contents of the array using something like:
|
55
|
-
#
|
56
|
-
# puts params.inspect
|
57
|
-
#
|
58
31
|
# == Text Callbacks
|
59
32
|
#
|
60
|
-
# All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
|
61
|
-
# PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be
|
62
|
-
# careful when doing a comparison on strings returned from PDF::Reader (when doing unit tests for
|
63
|
-
# example). The string may not be byte-by-byte identical with the string that was originally
|
64
|
-
# written to the PDF.
|
65
|
-
#
|
66
33
|
# - end_text_object
|
67
34
|
# - move_to_start_of_next_line
|
68
35
|
# - set_character_spacing
|
@@ -80,14 +47,6 @@ class PDF::Reader
|
|
80
47
|
# - move_to_next_line_and_show_text
|
81
48
|
# - set_spacing_next_line_show_text
|
82
49
|
#
|
83
|
-
# If the :raw_text option was passed to the PDF::Reader class the following callbacks
|
84
|
-
# may also appear:
|
85
|
-
#
|
86
|
-
# - show_text_raw
|
87
|
-
# - show_text_with_positioning_raw
|
88
|
-
# - move_to_next_line_and_show_text_raw
|
89
|
-
# - set_spacing_next_line_show_text_raw
|
90
|
-
#
|
91
50
|
# == Graphics Callbacks
|
92
51
|
# - close_fill_stroke
|
93
52
|
# - fill_stroke
|
@@ -145,42 +104,7 @@ class PDF::Reader
|
|
145
104
|
# - set_clipping_path_with_even_odd
|
146
105
|
# - append_curved_segment_final_point_replicated
|
147
106
|
#
|
148
|
-
|
149
|
-
# - begin_compatibility_section
|
150
|
-
# - end_compatibility_section,
|
151
|
-
# - begin_document
|
152
|
-
# - end_document
|
153
|
-
# - begin_page_container
|
154
|
-
# - end_page_container
|
155
|
-
# - begin_page
|
156
|
-
# - end_page
|
157
|
-
# - metadata
|
158
|
-
# - xml_metadata
|
159
|
-
# - page_count
|
160
|
-
# - begin_form_xobject
|
161
|
-
# - end_form_xobject
|
162
|
-
#
|
163
|
-
# == Resource Callbacks
|
164
|
-
#
|
165
|
-
# Each page can contain (or inherit) a range of resources required for the page,
|
166
|
-
# including things like fonts and images. The following callbacks may appear
|
167
|
-
# after begin_page if the relevant resources exist on a page:
|
168
|
-
#
|
169
|
-
# - resource_procset
|
170
|
-
# - resource_xobject
|
171
|
-
# - resource_extgstate
|
172
|
-
# - resource_colorspace
|
173
|
-
# - resource_pattern
|
174
|
-
# - resource_font
|
175
|
-
#
|
176
|
-
# In most cases, these callbacks associate a name with each resource, allowing it
|
177
|
-
# to be referred to by name in the page content. For example, an XObject can hold an image.
|
178
|
-
# If it gets mapped to the name "IM1", then it can be placed on the page using
|
179
|
-
# invoke_xobject "IM1".
|
180
|
-
#
|
181
|
-
# DEPRECATED: this class was deprecated in version 0.11.0 and will
|
182
|
-
# eventually be removed
|
183
|
-
class PagesStrategy< AbstractStrategy # :nodoc:
|
107
|
+
class PagesStrategy # :nodoc:
|
184
108
|
OPERATORS = {
|
185
109
|
'b' => :close_fill_stroke,
|
186
110
|
'B' => :fill_stroke,
|
@@ -256,232 +180,6 @@ class PDF::Reader
|
|
256
180
|
'\'' => :move_to_next_line_and_show_text,
|
257
181
|
'"' => :set_spacing_next_line_show_text,
|
258
182
|
}
|
259
|
-
def self.to_sym
|
260
|
-
:pages
|
261
|
-
end
|
262
|
-
################################################################################
|
263
|
-
# Begin processing the document
|
264
|
-
def process
|
265
|
-
return false unless options[:pages]
|
266
|
-
|
267
|
-
callback(:begin_document, [root])
|
268
|
-
walk_pages(@ohash.object(root[:Pages]))
|
269
|
-
callback(:end_document)
|
270
|
-
end
|
271
|
-
private
|
272
|
-
################################################################################
|
273
|
-
def params_to_utf8(params, font)
|
274
|
-
if params.is_a?(String)
|
275
|
-
font.to_utf8(params)
|
276
|
-
elsif params.is_a?(Array)
|
277
|
-
params.map { |i| params_to_utf8(i, font)}
|
278
|
-
else
|
279
|
-
params
|
280
|
-
end
|
281
|
-
end
|
282
|
-
################################################################################
|
283
|
-
# Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
|
284
|
-
# its content
|
285
|
-
def walk_pages(page)
|
286
|
-
|
287
|
-
# extract page content
|
288
|
-
if page[:Type] == :Pages
|
289
|
-
callback(:begin_page_container, [page])
|
290
|
-
res = @ohash.object(page[:Resources])
|
291
|
-
resources.push res if res
|
292
|
-
@ohash.object(page[:Kids]).each {|child| walk_pages(@ohash.object(child))}
|
293
|
-
resources.pop if res
|
294
|
-
callback(:end_page_container)
|
295
|
-
elsif page[:Type] == :Page
|
296
|
-
callback(:begin_page, [page])
|
297
|
-
res = @ohash.object(page[:Resources])
|
298
|
-
resources.push res if res
|
299
|
-
walk_resources(current_resources)
|
300
|
-
|
301
|
-
if @ohash.object(page[:Contents]).kind_of?(Array)
|
302
|
-
contents = @ohash.object(page[:Contents])
|
303
|
-
else
|
304
|
-
contents = [page[:Contents]]
|
305
|
-
end
|
306
|
-
|
307
|
-
fonts = font_hash_from_resources(current_resources)
|
308
|
-
|
309
|
-
if page.has_key?(:Contents) and page[:Contents]
|
310
|
-
direct_contents = contents.map { |content| @ohash.object(content) }
|
311
|
-
content_stream(direct_contents, fonts)
|
312
|
-
end
|
313
|
-
|
314
|
-
resources.pop if res
|
315
|
-
callback(:end_page)
|
316
|
-
end
|
317
|
-
end
|
318
|
-
################################################################################
|
319
|
-
# Retreive the XObject for the supplied label and if it's a Form, walk it
|
320
|
-
# like a regular page content stream.
|
321
|
-
#
|
322
|
-
def walk_xobject_form(label)
|
323
|
-
xobjects = @ohash.object(current_resources[:XObject]) || {}
|
324
|
-
xobject = @ohash.object(xobjects[label])
|
325
|
-
|
326
|
-
if xobject && xobject.hash[:Subtype] == :Form
|
327
|
-
callback(:begin_form_xobject)
|
328
|
-
xobj_resources = @ohash.object(xobject.hash[:Resources])
|
329
|
-
if xobj_resources
|
330
|
-
resources.push xobj_resources
|
331
|
-
walk_resources(xobj_resources)
|
332
|
-
end
|
333
|
-
fonts = font_hash_from_resources(xobj_resources)
|
334
|
-
content_stream(xobject, fonts)
|
335
|
-
callback(:end_form_xobject)
|
336
|
-
resources.pop if xobj_resources
|
337
|
-
end
|
338
|
-
end
|
339
|
-
|
340
|
-
################################################################################
|
341
|
-
# Return a merged hash of all resources that are current. Pages, page and xobject
|
342
|
-
#
|
343
|
-
def current_resources
|
344
|
-
hash = {}
|
345
|
-
resources.each do |res|
|
346
|
-
hash.merge!(res)
|
347
|
-
end
|
348
|
-
hash
|
349
|
-
end
|
350
|
-
################################################################################
|
351
|
-
# Reads a PDF content stream and calls all the appropriate callback methods for the operators
|
352
|
-
# it contains
|
353
|
-
#
|
354
|
-
def content_stream(instructions, fonts = {})
|
355
|
-
instructions = [instructions] unless instructions.kind_of?(Array)
|
356
|
-
instructions = instructions.map { |ins|
|
357
|
-
ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
|
358
|
-
}.join
|
359
|
-
buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
|
360
|
-
parser = Parser.new(buffer, @ohash)
|
361
|
-
current_font = nil
|
362
|
-
params = []
|
363
|
-
|
364
|
-
while (token = parser.parse_token(OPERATORS))
|
365
|
-
if token.kind_of?(Token) and OPERATORS.has_key?(token)
|
366
|
-
if OPERATORS[token] == :set_text_font_and_size
|
367
|
-
current_font = params.first
|
368
|
-
if fonts[current_font].nil?
|
369
|
-
raise MalformedPDFError, "Unknown font #{current_font}"
|
370
|
-
end
|
371
|
-
end
|
372
|
-
|
373
|
-
# handle special cases in response to certain operators
|
374
|
-
if OPERATORS[token].to_s.include?("show_text")
|
375
|
-
# convert any text to utf-8, but output the raw string if the user wants it
|
376
|
-
if options[:raw_text]
|
377
|
-
callback("#{OPERATORS[token]}_raw".to_sym, params)
|
378
|
-
end
|
379
|
-
params = params_to_utf8(params, fonts[current_font])
|
380
|
-
elsif token == "ID"
|
381
|
-
# inline image data, first convert the current params into a more familiar hash
|
382
|
-
map = {}
|
383
|
-
params.each_slice(2) do |key, value|
|
384
|
-
map[key] = value
|
385
|
-
end
|
386
|
-
params = [map, buffer.token]
|
387
|
-
end
|
388
|
-
|
389
|
-
callback(OPERATORS[token], params)
|
390
|
-
|
391
|
-
if OPERATORS[token] == :invoke_xobject
|
392
|
-
xobject_label = params.first
|
393
|
-
params.clear
|
394
|
-
walk_xobject_form(xobject_label)
|
395
|
-
else
|
396
|
-
params.clear
|
397
|
-
end
|
398
|
-
else
|
399
|
-
params << token
|
400
|
-
end
|
401
|
-
end
|
402
|
-
rescue EOFError
|
403
|
-
raise MalformedPDFError, "End Of File while processing a content stream"
|
404
|
-
end
|
405
|
-
################################################################################
|
406
|
-
def walk_resources(resources)
|
407
|
-
return unless resources.respond_to?(:[])
|
408
|
-
|
409
|
-
resources = resolve_references(resources)
|
410
|
-
|
411
|
-
# extract any procset information
|
412
|
-
if resources[:ProcSet]
|
413
|
-
callback(:resource_procset, resources[:ProcSet])
|
414
|
-
end
|
415
|
-
|
416
|
-
# extract any xobject information
|
417
|
-
if resources[:XObject]
|
418
|
-
@ohash.object(resources[:XObject]).each do |name, val|
|
419
|
-
callback(:resource_xobject, [name, @ohash.object(val)])
|
420
|
-
end
|
421
|
-
end
|
422
|
-
|
423
|
-
# extract any extgstate information
|
424
|
-
if resources[:ExtGState]
|
425
|
-
@ohash.object(resources[:ExtGState]).each do |name, val|
|
426
|
-
callback(:resource_extgstate, [name, @ohash.object(val)])
|
427
|
-
end
|
428
|
-
end
|
429
|
-
|
430
|
-
# extract any colorspace information
|
431
|
-
if resources[:ColorSpace]
|
432
|
-
@ohash.object(resources[:ColorSpace]).each do |name, val|
|
433
|
-
callback(:resource_colorspace, [name, @ohash.object(val)])
|
434
|
-
end
|
435
|
-
end
|
436
|
-
|
437
|
-
# extract any pattern information
|
438
|
-
if resources[:Pattern]
|
439
|
-
@ohash.object(resources[:Pattern]).each do |name, val|
|
440
|
-
callback(:resource_pattern, [name, @ohash.object(val)])
|
441
|
-
end
|
442
|
-
end
|
443
|
-
|
444
|
-
# extract any font information
|
445
|
-
if resources[:Font]
|
446
|
-
fonts = font_hash_from_resources(resources)
|
447
|
-
fonts.each do |label, font|
|
448
|
-
callback(:resource_font, [label, font])
|
449
|
-
end
|
450
|
-
end
|
451
|
-
end
|
452
|
-
################################################################################
|
453
|
-
# Convert any PDF::Reader::Resource objects into a real object
|
454
|
-
def resolve_references(obj)
|
455
|
-
case obj
|
456
|
-
when PDF::Reader::Stream then
|
457
|
-
obj.hash = resolve_references(obj.hash)
|
458
|
-
obj
|
459
|
-
when PDF::Reader::Reference then
|
460
|
-
resolve_references(@ohash.object(obj))
|
461
|
-
when Hash then
|
462
|
-
arr = obj.map { |key,val| [key, resolve_references(val)] }.flatten(1)
|
463
|
-
Hash[*arr]
|
464
|
-
when Array then
|
465
|
-
obj.collect { |item| resolve_references(item) }
|
466
|
-
else
|
467
|
-
obj
|
468
|
-
end
|
469
|
-
end
|
470
|
-
################################################################################
|
471
|
-
################################################################################
|
472
|
-
def font_hash_from_resources(resources)
|
473
|
-
return {} unless resources.respond_to?(:[])
|
474
|
-
|
475
|
-
fonts = {}
|
476
|
-
resources = @ohash.object(resources[:Font]) || {}
|
477
|
-
resources.each do |label, desc|
|
478
|
-
fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc))
|
479
|
-
end
|
480
|
-
fonts
|
481
|
-
end
|
482
|
-
def resources
|
483
|
-
@resources ||= []
|
484
|
-
end
|
485
183
|
end
|
486
184
|
################################################################################
|
487
185
|
end
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -118,6 +119,7 @@ class PDF::Reader
|
|
118
119
|
loop do
|
119
120
|
key = parse_token
|
120
121
|
break if key.kind_of?(Token) and key == ">>"
|
122
|
+
raise MalformedPDFError, "unterminated dict" if @buffer.empty?
|
121
123
|
raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name" unless key.kind_of?(Symbol)
|
122
124
|
|
123
125
|
value = parse_token
|
@@ -131,8 +133,7 @@ class PDF::Reader
|
|
131
133
|
# reads a PDF name from the buffer and converts it to a Ruby Symbol
|
132
134
|
def pdf_name
|
133
135
|
tok = @buffer.token
|
134
|
-
tok =
|
135
|
-
tok.gsub!(/#([A-Fa-f0-9]{2})/) do |match|
|
136
|
+
tok = tok.dup.gsub(/#([A-Fa-f0-9]{2})/) do |match|
|
136
137
|
match[1, 2].hex.chr
|
137
138
|
end
|
138
139
|
tok.to_sym
|
@@ -145,6 +146,7 @@ class PDF::Reader
|
|
145
146
|
loop do
|
146
147
|
item = parse_token
|
147
148
|
break if item.kind_of?(Token) and item == "]"
|
149
|
+
raise MalformedPDFError, "unterminated array" if @buffer.empty?
|
148
150
|
a << item
|
149
151
|
end
|
150
152
|
|
@@ -153,29 +155,30 @@ class PDF::Reader
|
|
153
155
|
################################################################################
|
154
156
|
# Reads a PDF hex string from the buffer and converts it to a Ruby String
|
155
157
|
def hex_string
|
156
|
-
str = ""
|
158
|
+
str = "".dup
|
157
159
|
|
158
160
|
loop do
|
159
161
|
token = @buffer.token
|
160
162
|
break if token == ">"
|
163
|
+
raise MalformedPDFError, "unterminated hex string" if @buffer.empty?
|
161
164
|
str << token
|
162
165
|
end
|
163
166
|
|
164
167
|
# add a missing digit if required, as required by the spec
|
165
168
|
str << "0" unless str.size % 2 == 0
|
166
|
-
str.scan(/../).map {|i| i.hex.chr}.join
|
169
|
+
str.scan(/../).map {|i| i.hex.chr}.join.force_encoding("binary")
|
167
170
|
end
|
168
171
|
################################################################################
|
169
172
|
# Reads a PDF String from the buffer and converts it to a Ruby String
|
170
173
|
def string
|
171
174
|
str = @buffer.token
|
172
|
-
return "" if str == ")"
|
175
|
+
return "".dup.force_encoding("binary") if str == ")"
|
173
176
|
Error.assert_equal(parse_token, ")")
|
174
177
|
|
175
178
|
str.gsub!(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
|
176
|
-
MAPPING[match] || ""
|
179
|
+
MAPPING[match] || "".dup
|
177
180
|
end
|
178
|
-
str
|
181
|
+
str.force_encoding("binary")
|
179
182
|
end
|
180
183
|
|
181
184
|
MAPPING = {
|
data/lib/pdf/reader/reference.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -25,6 +26,7 @@
|
|
25
26
|
#
|
26
27
|
################################################################################
|
27
28
|
require 'digest/md5'
|
29
|
+
require 'openssl'
|
28
30
|
require 'rc4'
|
29
31
|
|
30
32
|
class PDF::Reader
|
@@ -42,51 +44,83 @@ class PDF::Reader
|
|
42
44
|
0x2e, 0x2e, 0x00, 0xb6, 0xd0, 0x68, 0x3e, 0x80,
|
43
45
|
0x2f, 0x0c, 0xa9, 0xfe, 0x64, 0x53, 0x69, 0x7a ]
|
44
46
|
|
45
|
-
attr_reader :
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
@
|
52
|
-
@
|
53
|
-
@
|
54
|
-
@
|
55
|
-
@
|
56
|
-
@
|
57
|
-
@
|
58
|
-
|
59
|
-
@
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
47
|
+
attr_reader :key_length, :revision, :encrypt_key
|
48
|
+
attr_reader :owner_key, :user_key, :permissions, :file_id, :password
|
49
|
+
|
50
|
+
def initialize(opts = {})
|
51
|
+
@key_length = opts[:key_length].to_i/8
|
52
|
+
@revision = opts[:revision].to_i
|
53
|
+
@owner_key = opts[:owner_key]
|
54
|
+
@user_key = opts[:user_key]
|
55
|
+
@permissions = opts[:permissions].to_i
|
56
|
+
@encryptMeta = opts.fetch(:encrypted_metadata, true)
|
57
|
+
@file_id = opts[:file_id] || ""
|
58
|
+
@encrypt_key = build_standard_key(opts[:password] || "")
|
59
|
+
@cfm = opts[:cfm]
|
60
|
+
|
61
|
+
if @key_length != 5 && @key_length != 16
|
62
|
+
msg = "StandardSecurityHandler only supports 40 and 128 bit\
|
63
|
+
encryption (#{@key_length * 8}bit)"
|
64
|
+
raise ArgumentError, msg
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
# This handler supports all encryption that follows upto PDF 1.5 spec (revision 4)
|
69
|
+
def self.supports?(encrypt)
|
70
|
+
return false if encrypt.nil?
|
71
|
+
|
72
|
+
filter = encrypt.fetch(:Filter, :Standard)
|
73
|
+
version = encrypt.fetch(:V, 0)
|
74
|
+
algorithm = encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
|
75
|
+
(filter == :Standard) && (encrypt[:StmF] == encrypt[:StrF]) &&
|
76
|
+
(version <= 3 || (version == 4 && ((algorithm == :V2) || (algorithm == :AESV2))))
|
69
77
|
end
|
70
78
|
|
71
79
|
##7.6.2 General Encryption Algorithm
|
72
80
|
#
|
73
81
|
# Algorithm 1: Encryption of data using the RC4 or AES algorithms
|
74
82
|
#
|
75
|
-
# used to decrypt RC4 encrypted PDF streams (buf)
|
83
|
+
# used to decrypt RC4/AES encrypted PDF streams (buf)
|
76
84
|
#
|
77
85
|
# buf - a string to decrypt
|
78
86
|
# ref - a PDF::Reader::Reference for the object to decrypt
|
79
87
|
#
|
80
88
|
def decrypt( buf, ref )
|
89
|
+
case @cfm
|
90
|
+
when :AESV2
|
91
|
+
decrypt_aes128(buf, ref)
|
92
|
+
else
|
93
|
+
decrypt_rc4(buf, ref)
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
private
|
98
|
+
|
99
|
+
# decrypt with RC4 algorithm
|
100
|
+
# version <=3 or (version == 4 and CFM == V2)
|
101
|
+
def decrypt_rc4( buf, ref )
|
81
102
|
objKey = @encrypt_key.dup
|
82
103
|
(0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
|
83
104
|
(0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
|
84
105
|
length = objKey.length < 16 ? objKey.length : 16
|
85
|
-
rc4 = RC4.new( Digest::MD5.digest(objKey)[
|
106
|
+
rc4 = RC4.new( Digest::MD5.digest(objKey)[0,length] )
|
86
107
|
rc4.decrypt(buf)
|
87
108
|
end
|
88
109
|
|
89
|
-
|
110
|
+
# decrypt with AES-128-CBC algorithm
|
111
|
+
# when (version == 4 and CFM == AESV2)
|
112
|
+
def decrypt_aes128( buf, ref )
|
113
|
+
objKey = @encrypt_key.dup
|
114
|
+
(0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
|
115
|
+
(0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
|
116
|
+
objKey << 'sAlT' # Algorithm 1, b)
|
117
|
+
length = objKey.length < 16 ? objKey.length : 16
|
118
|
+
cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
|
119
|
+
cipher.decrypt
|
120
|
+
cipher.key = Digest::MD5.digest(objKey)[0,length]
|
121
|
+
cipher.iv = buf[0..15]
|
122
|
+
cipher.update(buf[16..-1]) + cipher.final
|
123
|
+
end
|
90
124
|
|
91
125
|
# Pads supplied password to 32bytes using PassPadBytes as specified on
|
92
126
|
# pp61 of spec
|
@@ -94,7 +128,7 @@ class PDF::Reader
|
|
94
128
|
if p.nil? || p.empty?
|
95
129
|
PassPadBytes.pack('C*')
|
96
130
|
else
|
97
|
-
p[
|
131
|
+
p[0, 32] + PassPadBytes[0, 32-p.length].pack('C*')
|
98
132
|
end
|
99
133
|
end
|
100
134
|
|
@@ -118,13 +152,13 @@ class PDF::Reader
|
|
118
152
|
md5 = Digest::MD5.digest(pad_pass(pass))
|
119
153
|
if @revision > 2 then
|
120
154
|
50.times { md5 = Digest::MD5.digest(md5) }
|
121
|
-
keyBegins = md5[
|
122
|
-
#first
|
155
|
+
keyBegins = md5[0, key_length]
|
156
|
+
#first iteration decrypt owner_key
|
123
157
|
out = @owner_key
|
124
|
-
#RC4 keyed with (keyBegins XOR with
|
158
|
+
#RC4 keyed with (keyBegins XOR with iteration #) to decrypt previous out
|
125
159
|
19.downto(0).each { |i| out=RC4.new(xor_each_byte(keyBegins,i)).decrypt(out) }
|
126
160
|
else
|
127
|
-
out = RC4.new( md5[
|
161
|
+
out = RC4.new( md5[0, 5] ).decrypt( @owner_key )
|
128
162
|
end
|
129
163
|
# c) check output as user password
|
130
164
|
auth_user_pass( out )
|
@@ -142,12 +176,12 @@ class PDF::Reader
|
|
142
176
|
#
|
143
177
|
def auth_user_pass(pass)
|
144
178
|
keyBegins = make_file_key(pass)
|
145
|
-
if @revision
|
179
|
+
if @revision >= 3
|
146
180
|
#initialize out for first iteration
|
147
181
|
out = Digest::MD5.digest(PassPadBytes.pack("C*") + @file_id)
|
148
182
|
#zero doesn't matter -> so from 0-19
|
149
|
-
20.times{ |i| out=RC4.new(xor_each_byte(keyBegins, i)).
|
150
|
-
pass = @user_key[
|
183
|
+
20.times{ |i| out=RC4.new(xor_each_byte(keyBegins, i)).encrypt(out) }
|
184
|
+
pass = @user_key[0, 16] == out
|
151
185
|
else
|
152
186
|
pass = RC4.new(keyBegins).encrypt(PassPadBytes.pack("C*")) == @user_key
|
153
187
|
end
|
@@ -163,20 +197,24 @@ class PDF::Reader
|
|
163
197
|
(0..24).step(8){|e| @buf << (@permissions >> e & 0xFF)}
|
164
198
|
# e) add the file ID
|
165
199
|
@buf << @file_id
|
166
|
-
# f) if revision
|
167
|
-
if @revision
|
168
|
-
@buf << [
|
200
|
+
# f) if revision >= 4 and metadata not encrypted then add 4 bytes of 0xFF
|
201
|
+
if @revision >= 4 && !@encryptMeta
|
202
|
+
@buf << [0xFF,0xFF,0xFF,0xFF].pack('C*')
|
169
203
|
end
|
170
204
|
# b) init MD5 digest + g) finish the hash
|
171
205
|
md5 = Digest::MD5.digest(@buf)
|
172
206
|
# h) spin hash 50 times
|
173
|
-
if @revision
|
207
|
+
if @revision >= 3
|
174
208
|
50.times {
|
175
|
-
md5 = Digest::MD5.digest(md5[
|
209
|
+
md5 = Digest::MD5.digest(md5[0, @key_length])
|
176
210
|
}
|
177
211
|
end
|
178
|
-
# i) n = key_length revision
|
179
|
-
|
212
|
+
# i) n = key_length revision >= 3, n = 5 revision == 2
|
213
|
+
if @revision < 3
|
214
|
+
md5[0, 5]
|
215
|
+
else
|
216
|
+
md5[0, @key_length]
|
217
|
+
end
|
180
218
|
end
|
181
219
|
|
182
220
|
def build_standard_key(pass)
|