pdf-reader 1.4.1 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/CHANGELOG +53 -3
- data/{README.rdoc → README.md} +40 -23
- data/Rakefile +2 -2
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_object +4 -1
- data/bin/pdf_text +1 -1
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/buffer.rb +14 -12
- data/lib/pdf/reader/cid_widths.rb +2 -0
- data/lib/pdf/reader/cmap.rb +48 -36
- data/lib/pdf/reader/encoding.rb +16 -18
- data/lib/pdf/reader/error.rb +5 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +1 -0
- data/lib/pdf/reader/filter/flate.rb +29 -16
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +4 -6
- data/lib/pdf/reader/filter.rb +2 -0
- data/lib/pdf/reader/font.rb +12 -13
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +7 -2
- data/lib/pdf/reader/lzw.rb +4 -4
- data/lib/pdf/reader/null_security_handler.rb +17 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +91 -37
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +5 -4
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +30 -1
- data/lib/pdf/reader/page_layout.rb +19 -24
- data/lib/pdf/reader/page_state.rb +8 -5
- data/lib/pdf/reader/page_text_receiver.rb +23 -1
- data/lib/pdf/reader/pages_strategy.rb +2 -304
- data/lib/pdf/reader/parser.rb +10 -7
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +80 -42
- data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +28 -9
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +25 -16
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +11 -5
- data/lib/pdf/reader.rb +30 -119
- data/lib/pdf-reader.rb +1 -0
- metadata +35 -61
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/hash.rb +0 -19
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -265
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -27,42 +28,8 @@
|
|
27
28
|
|
28
29
|
class PDF::Reader
|
29
30
|
################################################################################
|
30
|
-
# Walks the pages of the PDF file and calls the appropriate callback methods when
|
31
|
-
# something of interest is found.
|
32
|
-
#
|
33
|
-
# The callback methods should exist on the receiver object passed into the constructor.
|
34
|
-
# Whenever some content is found that will trigger a callback, the receiver is checked
|
35
|
-
# to see if the callback is defined.
|
36
|
-
#
|
37
|
-
# If it is defined it will be called. If not, processing will continue.
|
38
|
-
#
|
39
|
-
# = Available Callbacks
|
40
|
-
# The following callbacks are available and should be methods defined on your receiver class. Only
|
41
|
-
# implement the ones you need - the rest will be ignored.
|
42
|
-
#
|
43
|
-
# Some callbacks will include parameters which will be passed in as an array. For callbacks
|
44
|
-
# that supply no paramters, or where you don't need them, the *params argument can be left off.
|
45
|
-
# Some example callback method definitions are:
|
46
|
-
#
|
47
|
-
# def begin_document
|
48
|
-
# def end_page
|
49
|
-
# def show_text(string, *params)
|
50
|
-
# def fill_stroke(*params)
|
51
|
-
#
|
52
|
-
# You should be able to infer the basic command the callback is reporting based on the name. For
|
53
|
-
# further experimentation, define the callback with just a *params parameter, then print out the
|
54
|
-
# contents of the array using something like:
|
55
|
-
#
|
56
|
-
# puts params.inspect
|
57
|
-
#
|
58
31
|
# == Text Callbacks
|
59
32
|
#
|
60
|
-
# All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
|
61
|
-
# PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be
|
62
|
-
# careful when doing a comparison on strings returned from PDF::Reader (when doing unit tests for
|
63
|
-
# example). The string may not be byte-by-byte identical with the string that was originally
|
64
|
-
# written to the PDF.
|
65
|
-
#
|
66
33
|
# - end_text_object
|
67
34
|
# - move_to_start_of_next_line
|
68
35
|
# - set_character_spacing
|
@@ -80,14 +47,6 @@ class PDF::Reader
|
|
80
47
|
# - move_to_next_line_and_show_text
|
81
48
|
# - set_spacing_next_line_show_text
|
82
49
|
#
|
83
|
-
# If the :raw_text option was passed to the PDF::Reader class the following callbacks
|
84
|
-
# may also appear:
|
85
|
-
#
|
86
|
-
# - show_text_raw
|
87
|
-
# - show_text_with_positioning_raw
|
88
|
-
# - move_to_next_line_and_show_text_raw
|
89
|
-
# - set_spacing_next_line_show_text_raw
|
90
|
-
#
|
91
50
|
# == Graphics Callbacks
|
92
51
|
# - close_fill_stroke
|
93
52
|
# - fill_stroke
|
@@ -145,42 +104,7 @@ class PDF::Reader
|
|
145
104
|
# - set_clipping_path_with_even_odd
|
146
105
|
# - append_curved_segment_final_point_replicated
|
147
106
|
#
|
148
|
-
|
149
|
-
# - begin_compatibility_section
|
150
|
-
# - end_compatibility_section,
|
151
|
-
# - begin_document
|
152
|
-
# - end_document
|
153
|
-
# - begin_page_container
|
154
|
-
# - end_page_container
|
155
|
-
# - begin_page
|
156
|
-
# - end_page
|
157
|
-
# - metadata
|
158
|
-
# - xml_metadata
|
159
|
-
# - page_count
|
160
|
-
# - begin_form_xobject
|
161
|
-
# - end_form_xobject
|
162
|
-
#
|
163
|
-
# == Resource Callbacks
|
164
|
-
#
|
165
|
-
# Each page can contain (or inherit) a range of resources required for the page,
|
166
|
-
# including things like fonts and images. The following callbacks may appear
|
167
|
-
# after begin_page if the relevant resources exist on a page:
|
168
|
-
#
|
169
|
-
# - resource_procset
|
170
|
-
# - resource_xobject
|
171
|
-
# - resource_extgstate
|
172
|
-
# - resource_colorspace
|
173
|
-
# - resource_pattern
|
174
|
-
# - resource_font
|
175
|
-
#
|
176
|
-
# In most cases, these callbacks associate a name with each resource, allowing it
|
177
|
-
# to be referred to by name in the page content. For example, an XObject can hold an image.
|
178
|
-
# If it gets mapped to the name "IM1", then it can be placed on the page using
|
179
|
-
# invoke_xobject "IM1".
|
180
|
-
#
|
181
|
-
# DEPRECATED: this class was deprecated in version 0.11.0 and will
|
182
|
-
# eventually be removed
|
183
|
-
class PagesStrategy< AbstractStrategy # :nodoc:
|
107
|
+
class PagesStrategy # :nodoc:
|
184
108
|
OPERATORS = {
|
185
109
|
'b' => :close_fill_stroke,
|
186
110
|
'B' => :fill_stroke,
|
@@ -256,232 +180,6 @@ class PDF::Reader
|
|
256
180
|
'\'' => :move_to_next_line_and_show_text,
|
257
181
|
'"' => :set_spacing_next_line_show_text,
|
258
182
|
}
|
259
|
-
def self.to_sym
|
260
|
-
:pages
|
261
|
-
end
|
262
|
-
################################################################################
|
263
|
-
# Begin processing the document
|
264
|
-
def process
|
265
|
-
return false unless options[:pages]
|
266
|
-
|
267
|
-
callback(:begin_document, [root])
|
268
|
-
walk_pages(@ohash.object(root[:Pages]))
|
269
|
-
callback(:end_document)
|
270
|
-
end
|
271
|
-
private
|
272
|
-
################################################################################
|
273
|
-
def params_to_utf8(params, font)
|
274
|
-
if params.is_a?(String)
|
275
|
-
font.to_utf8(params)
|
276
|
-
elsif params.is_a?(Array)
|
277
|
-
params.map { |i| params_to_utf8(i, font)}
|
278
|
-
else
|
279
|
-
params
|
280
|
-
end
|
281
|
-
end
|
282
|
-
################################################################################
|
283
|
-
# Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
|
284
|
-
# its content
|
285
|
-
def walk_pages(page)
|
286
|
-
|
287
|
-
# extract page content
|
288
|
-
if page[:Type] == :Pages
|
289
|
-
callback(:begin_page_container, [page])
|
290
|
-
res = @ohash.object(page[:Resources])
|
291
|
-
resources.push res if res
|
292
|
-
@ohash.object(page[:Kids]).each {|child| walk_pages(@ohash.object(child))}
|
293
|
-
resources.pop if res
|
294
|
-
callback(:end_page_container)
|
295
|
-
elsif page[:Type] == :Page
|
296
|
-
callback(:begin_page, [page])
|
297
|
-
res = @ohash.object(page[:Resources])
|
298
|
-
resources.push res if res
|
299
|
-
walk_resources(current_resources)
|
300
|
-
|
301
|
-
if @ohash.object(page[:Contents]).kind_of?(Array)
|
302
|
-
contents = @ohash.object(page[:Contents])
|
303
|
-
else
|
304
|
-
contents = [page[:Contents]]
|
305
|
-
end
|
306
|
-
|
307
|
-
fonts = font_hash_from_resources(current_resources)
|
308
|
-
|
309
|
-
if page.has_key?(:Contents) and page[:Contents]
|
310
|
-
direct_contents = contents.map { |content| @ohash.object(content) }
|
311
|
-
content_stream(direct_contents, fonts)
|
312
|
-
end
|
313
|
-
|
314
|
-
resources.pop if res
|
315
|
-
callback(:end_page)
|
316
|
-
end
|
317
|
-
end
|
318
|
-
################################################################################
|
319
|
-
# Retreive the XObject for the supplied label and if it's a Form, walk it
|
320
|
-
# like a regular page content stream.
|
321
|
-
#
|
322
|
-
def walk_xobject_form(label)
|
323
|
-
xobjects = @ohash.object(current_resources[:XObject]) || {}
|
324
|
-
xobject = @ohash.object(xobjects[label])
|
325
|
-
|
326
|
-
if xobject && xobject.hash[:Subtype] == :Form
|
327
|
-
callback(:begin_form_xobject)
|
328
|
-
xobj_resources = @ohash.object(xobject.hash[:Resources])
|
329
|
-
if xobj_resources
|
330
|
-
resources.push xobj_resources
|
331
|
-
walk_resources(xobj_resources)
|
332
|
-
end
|
333
|
-
fonts = font_hash_from_resources(xobj_resources)
|
334
|
-
content_stream(xobject, fonts)
|
335
|
-
callback(:end_form_xobject)
|
336
|
-
resources.pop if xobj_resources
|
337
|
-
end
|
338
|
-
end
|
339
|
-
|
340
|
-
################################################################################
|
341
|
-
# Return a merged hash of all resources that are current. Pages, page and xobject
|
342
|
-
#
|
343
|
-
def current_resources
|
344
|
-
hash = {}
|
345
|
-
resources.each do |res|
|
346
|
-
hash.merge!(res)
|
347
|
-
end
|
348
|
-
hash
|
349
|
-
end
|
350
|
-
################################################################################
|
351
|
-
# Reads a PDF content stream and calls all the appropriate callback methods for the operators
|
352
|
-
# it contains
|
353
|
-
#
|
354
|
-
def content_stream(instructions, fonts = {})
|
355
|
-
instructions = [instructions] unless instructions.kind_of?(Array)
|
356
|
-
instructions = instructions.map { |ins|
|
357
|
-
ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
|
358
|
-
}.join
|
359
|
-
buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
|
360
|
-
parser = Parser.new(buffer, @ohash)
|
361
|
-
current_font = nil
|
362
|
-
params = []
|
363
|
-
|
364
|
-
while (token = parser.parse_token(OPERATORS))
|
365
|
-
if token.kind_of?(Token) and OPERATORS.has_key?(token)
|
366
|
-
if OPERATORS[token] == :set_text_font_and_size
|
367
|
-
current_font = params.first
|
368
|
-
if fonts[current_font].nil?
|
369
|
-
raise MalformedPDFError, "Unknown font #{current_font}"
|
370
|
-
end
|
371
|
-
end
|
372
|
-
|
373
|
-
# handle special cases in response to certain operators
|
374
|
-
if OPERATORS[token].to_s.include?("show_text")
|
375
|
-
# convert any text to utf-8, but output the raw string if the user wants it
|
376
|
-
if options[:raw_text]
|
377
|
-
callback("#{OPERATORS[token]}_raw".to_sym, params)
|
378
|
-
end
|
379
|
-
params = params_to_utf8(params, fonts[current_font])
|
380
|
-
elsif token == "ID"
|
381
|
-
# inline image data, first convert the current params into a more familiar hash
|
382
|
-
map = {}
|
383
|
-
params.each_slice(2) do |key, value|
|
384
|
-
map[key] = value
|
385
|
-
end
|
386
|
-
params = [map, buffer.token]
|
387
|
-
end
|
388
|
-
|
389
|
-
callback(OPERATORS[token], params)
|
390
|
-
|
391
|
-
if OPERATORS[token] == :invoke_xobject
|
392
|
-
xobject_label = params.first
|
393
|
-
params.clear
|
394
|
-
walk_xobject_form(xobject_label)
|
395
|
-
else
|
396
|
-
params.clear
|
397
|
-
end
|
398
|
-
else
|
399
|
-
params << token
|
400
|
-
end
|
401
|
-
end
|
402
|
-
rescue EOFError
|
403
|
-
raise MalformedPDFError, "End Of File while processing a content stream"
|
404
|
-
end
|
405
|
-
################################################################################
|
406
|
-
def walk_resources(resources)
|
407
|
-
return unless resources.respond_to?(:[])
|
408
|
-
|
409
|
-
resources = resolve_references(resources)
|
410
|
-
|
411
|
-
# extract any procset information
|
412
|
-
if resources[:ProcSet]
|
413
|
-
callback(:resource_procset, resources[:ProcSet])
|
414
|
-
end
|
415
|
-
|
416
|
-
# extract any xobject information
|
417
|
-
if resources[:XObject]
|
418
|
-
@ohash.object(resources[:XObject]).each do |name, val|
|
419
|
-
callback(:resource_xobject, [name, @ohash.object(val)])
|
420
|
-
end
|
421
|
-
end
|
422
|
-
|
423
|
-
# extract any extgstate information
|
424
|
-
if resources[:ExtGState]
|
425
|
-
@ohash.object(resources[:ExtGState]).each do |name, val|
|
426
|
-
callback(:resource_extgstate, [name, @ohash.object(val)])
|
427
|
-
end
|
428
|
-
end
|
429
|
-
|
430
|
-
# extract any colorspace information
|
431
|
-
if resources[:ColorSpace]
|
432
|
-
@ohash.object(resources[:ColorSpace]).each do |name, val|
|
433
|
-
callback(:resource_colorspace, [name, @ohash.object(val)])
|
434
|
-
end
|
435
|
-
end
|
436
|
-
|
437
|
-
# extract any pattern information
|
438
|
-
if resources[:Pattern]
|
439
|
-
@ohash.object(resources[:Pattern]).each do |name, val|
|
440
|
-
callback(:resource_pattern, [name, @ohash.object(val)])
|
441
|
-
end
|
442
|
-
end
|
443
|
-
|
444
|
-
# extract any font information
|
445
|
-
if resources[:Font]
|
446
|
-
fonts = font_hash_from_resources(resources)
|
447
|
-
fonts.each do |label, font|
|
448
|
-
callback(:resource_font, [label, font])
|
449
|
-
end
|
450
|
-
end
|
451
|
-
end
|
452
|
-
################################################################################
|
453
|
-
# Convert any PDF::Reader::Resource objects into a real object
|
454
|
-
def resolve_references(obj)
|
455
|
-
case obj
|
456
|
-
when PDF::Reader::Stream then
|
457
|
-
obj.hash = resolve_references(obj.hash)
|
458
|
-
obj
|
459
|
-
when PDF::Reader::Reference then
|
460
|
-
resolve_references(@ohash.object(obj))
|
461
|
-
when Hash then
|
462
|
-
arr = obj.map { |key,val| [key, resolve_references(val)] }.flatten(1)
|
463
|
-
Hash[*arr]
|
464
|
-
when Array then
|
465
|
-
obj.collect { |item| resolve_references(item) }
|
466
|
-
else
|
467
|
-
obj
|
468
|
-
end
|
469
|
-
end
|
470
|
-
################################################################################
|
471
|
-
################################################################################
|
472
|
-
def font_hash_from_resources(resources)
|
473
|
-
return {} unless resources.respond_to?(:[])
|
474
|
-
|
475
|
-
fonts = {}
|
476
|
-
resources = @ohash.object(resources[:Font]) || {}
|
477
|
-
resources.each do |label, desc|
|
478
|
-
fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc))
|
479
|
-
end
|
480
|
-
fonts
|
481
|
-
end
|
482
|
-
def resources
|
483
|
-
@resources ||= []
|
484
|
-
end
|
485
183
|
end
|
486
184
|
################################################################################
|
487
185
|
end
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -118,6 +119,7 @@ class PDF::Reader
|
|
118
119
|
loop do
|
119
120
|
key = parse_token
|
120
121
|
break if key.kind_of?(Token) and key == ">>"
|
122
|
+
raise MalformedPDFError, "unterminated dict" if @buffer.empty?
|
121
123
|
raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name" unless key.kind_of?(Symbol)
|
122
124
|
|
123
125
|
value = parse_token
|
@@ -131,8 +133,7 @@ class PDF::Reader
|
|
131
133
|
# reads a PDF name from the buffer and converts it to a Ruby Symbol
|
132
134
|
def pdf_name
|
133
135
|
tok = @buffer.token
|
134
|
-
tok =
|
135
|
-
tok.gsub!(/#([A-Fa-f0-9]{2})/) do |match|
|
136
|
+
tok = tok.dup.gsub(/#([A-Fa-f0-9]{2})/) do |match|
|
136
137
|
match[1, 2].hex.chr
|
137
138
|
end
|
138
139
|
tok.to_sym
|
@@ -145,6 +146,7 @@ class PDF::Reader
|
|
145
146
|
loop do
|
146
147
|
item = parse_token
|
147
148
|
break if item.kind_of?(Token) and item == "]"
|
149
|
+
raise MalformedPDFError, "unterminated array" if @buffer.empty?
|
148
150
|
a << item
|
149
151
|
end
|
150
152
|
|
@@ -153,29 +155,30 @@ class PDF::Reader
|
|
153
155
|
################################################################################
|
154
156
|
# Reads a PDF hex string from the buffer and converts it to a Ruby String
|
155
157
|
def hex_string
|
156
|
-
str = ""
|
158
|
+
str = "".dup
|
157
159
|
|
158
160
|
loop do
|
159
161
|
token = @buffer.token
|
160
162
|
break if token == ">"
|
163
|
+
raise MalformedPDFError, "unterminated hex string" if @buffer.empty?
|
161
164
|
str << token
|
162
165
|
end
|
163
166
|
|
164
167
|
# add a missing digit if required, as required by the spec
|
165
168
|
str << "0" unless str.size % 2 == 0
|
166
|
-
str.scan(/../).map {|i| i.hex.chr}.join
|
169
|
+
str.scan(/../).map {|i| i.hex.chr}.join.force_encoding("binary")
|
167
170
|
end
|
168
171
|
################################################################################
|
169
172
|
# Reads a PDF String from the buffer and converts it to a Ruby String
|
170
173
|
def string
|
171
174
|
str = @buffer.token
|
172
|
-
return "" if str == ")"
|
175
|
+
return "".dup.force_encoding("binary") if str == ")"
|
173
176
|
Error.assert_equal(parse_token, ")")
|
174
177
|
|
175
178
|
str.gsub!(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
|
176
|
-
MAPPING[match] || ""
|
179
|
+
MAPPING[match] || "".dup
|
177
180
|
end
|
178
|
-
str
|
181
|
+
str.force_encoding("binary")
|
179
182
|
end
|
180
183
|
|
181
184
|
MAPPING = {
|
data/lib/pdf/reader/reference.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -25,6 +26,7 @@
|
|
25
26
|
#
|
26
27
|
################################################################################
|
27
28
|
require 'digest/md5'
|
29
|
+
require 'openssl'
|
28
30
|
require 'rc4'
|
29
31
|
|
30
32
|
class PDF::Reader
|
@@ -42,51 +44,83 @@ class PDF::Reader
|
|
42
44
|
0x2e, 0x2e, 0x00, 0xb6, 0xd0, 0x68, 0x3e, 0x80,
|
43
45
|
0x2f, 0x0c, 0xa9, 0xfe, 0x64, 0x53, 0x69, 0x7a ]
|
44
46
|
|
45
|
-
attr_reader :
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
@
|
52
|
-
@
|
53
|
-
@
|
54
|
-
@
|
55
|
-
@
|
56
|
-
@
|
57
|
-
@
|
58
|
-
|
59
|
-
@
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
47
|
+
attr_reader :key_length, :revision, :encrypt_key
|
48
|
+
attr_reader :owner_key, :user_key, :permissions, :file_id, :password
|
49
|
+
|
50
|
+
def initialize(opts = {})
|
51
|
+
@key_length = opts[:key_length].to_i/8
|
52
|
+
@revision = opts[:revision].to_i
|
53
|
+
@owner_key = opts[:owner_key]
|
54
|
+
@user_key = opts[:user_key]
|
55
|
+
@permissions = opts[:permissions].to_i
|
56
|
+
@encryptMeta = opts.fetch(:encrypted_metadata, true)
|
57
|
+
@file_id = opts[:file_id] || ""
|
58
|
+
@encrypt_key = build_standard_key(opts[:password] || "")
|
59
|
+
@cfm = opts[:cfm]
|
60
|
+
|
61
|
+
if @key_length != 5 && @key_length != 16
|
62
|
+
msg = "StandardSecurityHandler only supports 40 and 128 bit\
|
63
|
+
encryption (#{@key_length * 8}bit)"
|
64
|
+
raise ArgumentError, msg
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
# This handler supports all encryption that follows upto PDF 1.5 spec (revision 4)
|
69
|
+
def self.supports?(encrypt)
|
70
|
+
return false if encrypt.nil?
|
71
|
+
|
72
|
+
filter = encrypt.fetch(:Filter, :Standard)
|
73
|
+
version = encrypt.fetch(:V, 0)
|
74
|
+
algorithm = encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
|
75
|
+
(filter == :Standard) && (encrypt[:StmF] == encrypt[:StrF]) &&
|
76
|
+
(version <= 3 || (version == 4 && ((algorithm == :V2) || (algorithm == :AESV2))))
|
69
77
|
end
|
70
78
|
|
71
79
|
##7.6.2 General Encryption Algorithm
|
72
80
|
#
|
73
81
|
# Algorithm 1: Encryption of data using the RC4 or AES algorithms
|
74
82
|
#
|
75
|
-
# used to decrypt RC4 encrypted PDF streams (buf)
|
83
|
+
# used to decrypt RC4/AES encrypted PDF streams (buf)
|
76
84
|
#
|
77
85
|
# buf - a string to decrypt
|
78
86
|
# ref - a PDF::Reader::Reference for the object to decrypt
|
79
87
|
#
|
80
88
|
def decrypt( buf, ref )
|
89
|
+
case @cfm
|
90
|
+
when :AESV2
|
91
|
+
decrypt_aes128(buf, ref)
|
92
|
+
else
|
93
|
+
decrypt_rc4(buf, ref)
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
private
|
98
|
+
|
99
|
+
# decrypt with RC4 algorithm
|
100
|
+
# version <=3 or (version == 4 and CFM == V2)
|
101
|
+
def decrypt_rc4( buf, ref )
|
81
102
|
objKey = @encrypt_key.dup
|
82
103
|
(0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
|
83
104
|
(0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
|
84
105
|
length = objKey.length < 16 ? objKey.length : 16
|
85
|
-
rc4 = RC4.new( Digest::MD5.digest(objKey)[
|
106
|
+
rc4 = RC4.new( Digest::MD5.digest(objKey)[0,length] )
|
86
107
|
rc4.decrypt(buf)
|
87
108
|
end
|
88
109
|
|
89
|
-
|
110
|
+
# decrypt with AES-128-CBC algorithm
|
111
|
+
# when (version == 4 and CFM == AESV2)
|
112
|
+
def decrypt_aes128( buf, ref )
|
113
|
+
objKey = @encrypt_key.dup
|
114
|
+
(0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
|
115
|
+
(0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
|
116
|
+
objKey << 'sAlT' # Algorithm 1, b)
|
117
|
+
length = objKey.length < 16 ? objKey.length : 16
|
118
|
+
cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
|
119
|
+
cipher.decrypt
|
120
|
+
cipher.key = Digest::MD5.digest(objKey)[0,length]
|
121
|
+
cipher.iv = buf[0..15]
|
122
|
+
cipher.update(buf[16..-1]) + cipher.final
|
123
|
+
end
|
90
124
|
|
91
125
|
# Pads supplied password to 32bytes using PassPadBytes as specified on
|
92
126
|
# pp61 of spec
|
@@ -94,7 +128,7 @@ class PDF::Reader
|
|
94
128
|
if p.nil? || p.empty?
|
95
129
|
PassPadBytes.pack('C*')
|
96
130
|
else
|
97
|
-
p[
|
131
|
+
p[0, 32] + PassPadBytes[0, 32-p.length].pack('C*')
|
98
132
|
end
|
99
133
|
end
|
100
134
|
|
@@ -118,13 +152,13 @@ class PDF::Reader
|
|
118
152
|
md5 = Digest::MD5.digest(pad_pass(pass))
|
119
153
|
if @revision > 2 then
|
120
154
|
50.times { md5 = Digest::MD5.digest(md5) }
|
121
|
-
keyBegins = md5[
|
122
|
-
#first
|
155
|
+
keyBegins = md5[0, key_length]
|
156
|
+
#first iteration decrypt owner_key
|
123
157
|
out = @owner_key
|
124
|
-
#RC4 keyed with (keyBegins XOR with
|
158
|
+
#RC4 keyed with (keyBegins XOR with iteration #) to decrypt previous out
|
125
159
|
19.downto(0).each { |i| out=RC4.new(xor_each_byte(keyBegins,i)).decrypt(out) }
|
126
160
|
else
|
127
|
-
out = RC4.new( md5[
|
161
|
+
out = RC4.new( md5[0, 5] ).decrypt( @owner_key )
|
128
162
|
end
|
129
163
|
# c) check output as user password
|
130
164
|
auth_user_pass( out )
|
@@ -142,12 +176,12 @@ class PDF::Reader
|
|
142
176
|
#
|
143
177
|
def auth_user_pass(pass)
|
144
178
|
keyBegins = make_file_key(pass)
|
145
|
-
if @revision
|
179
|
+
if @revision >= 3
|
146
180
|
#initialize out for first iteration
|
147
181
|
out = Digest::MD5.digest(PassPadBytes.pack("C*") + @file_id)
|
148
182
|
#zero doesn't matter -> so from 0-19
|
149
|
-
20.times{ |i| out=RC4.new(xor_each_byte(keyBegins, i)).
|
150
|
-
pass = @user_key[
|
183
|
+
20.times{ |i| out=RC4.new(xor_each_byte(keyBegins, i)).encrypt(out) }
|
184
|
+
pass = @user_key[0, 16] == out
|
151
185
|
else
|
152
186
|
pass = RC4.new(keyBegins).encrypt(PassPadBytes.pack("C*")) == @user_key
|
153
187
|
end
|
@@ -163,20 +197,24 @@ class PDF::Reader
|
|
163
197
|
(0..24).step(8){|e| @buf << (@permissions >> e & 0xFF)}
|
164
198
|
# e) add the file ID
|
165
199
|
@buf << @file_id
|
166
|
-
# f) if revision
|
167
|
-
if @revision
|
168
|
-
@buf << [
|
200
|
+
# f) if revision >= 4 and metadata not encrypted then add 4 bytes of 0xFF
|
201
|
+
if @revision >= 4 && !@encryptMeta
|
202
|
+
@buf << [0xFF,0xFF,0xFF,0xFF].pack('C*')
|
169
203
|
end
|
170
204
|
# b) init MD5 digest + g) finish the hash
|
171
205
|
md5 = Digest::MD5.digest(@buf)
|
172
206
|
# h) spin hash 50 times
|
173
|
-
if @revision
|
207
|
+
if @revision >= 3
|
174
208
|
50.times {
|
175
|
-
md5 = Digest::MD5.digest(md5[
|
209
|
+
md5 = Digest::MD5.digest(md5[0, @key_length])
|
176
210
|
}
|
177
211
|
end
|
178
|
-
# i) n = key_length revision
|
179
|
-
|
212
|
+
# i) n = key_length revision >= 3, n = 5 revision == 2
|
213
|
+
if @revision < 3
|
214
|
+
md5[0, 5]
|
215
|
+
else
|
216
|
+
md5[0, @key_length]
|
217
|
+
end
|
180
218
|
end
|
181
219
|
|
182
220
|
def build_standard_key(pass)
|