pdf-reader 0.12.0.alpha → 1.0.0.beta1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +6 -1
- data/bin/pdf_callbacks +23 -0
- data/examples/extract_fonts.rb +76 -0
- data/lib/pdf/reader/buffer.rb +14 -3
- data/lib/pdf/reader/filter.rb +6 -1
- data/lib/pdf/reader/glyph_hash.rb +1 -1
- data/lib/pdf/reader/glyphlist.txt +1 -40
- data/lib/pdf/reader/lzw.rb +6 -4
- data/lib/pdf/reader/object_hash.rb +39 -34
- data/lib/pdf/reader/page.rb +15 -13
- data/lib/pdf/reader/page_text_receiver.rb +33 -24
- data/lib/pdf/reader/parser.rb +7 -6
- data/lib/pdf/reader/standard_security_handler.rb +13 -13
- metadata +37 -34
data/CHANGELOG
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
v1.0.0.beta1 (6th October 2011)
|
2
|
+
- ensure inline images that contain "EI" are correctly parsed
|
3
|
+
(thanks Bernerd Schaefer)
|
4
|
+
- fix parsing of inline image data
|
5
|
+
|
1
6
|
v0.12.0.alpha (28th August 2011)
|
2
7
|
- small breaking changes to the page-based API - it's alpha for a reason
|
3
8
|
- resource related methods on Page object return raw PDF objects
|
@@ -6,7 +11,7 @@ v0.12.0.alpha (28th August 2011)
|
|
6
11
|
need to do so themselves
|
7
12
|
- add support for RunLengthDecode filters (thanks Bernerd Schaefer)
|
8
13
|
- add support for standard PDF encryption (thanks Evan Brunner)
|
9
|
-
- add support for decoding stream
|
14
|
+
- add support for decoding stream with TIFF prediction
|
10
15
|
- new PDF::Reader::FormXObject class to simplify working with form XObjects
|
11
16
|
|
12
17
|
v0.11.0.alpha (19th July 2011)
|
data/bin/pdf_callbacks
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
|
5
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
6
|
+
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
receiver = PDF::Reader::PrintReceiver.new
|
10
|
+
|
11
|
+
if ARGV.empty?
|
12
|
+
browser = PDF::Reader.new($stdin)
|
13
|
+
else
|
14
|
+
browser = PDF::Reader.new(ARGV[0])
|
15
|
+
end
|
16
|
+
browser.pages.each do |page|
|
17
|
+
puts
|
18
|
+
puts "********************************"
|
19
|
+
puts "page #{page.number}"
|
20
|
+
puts page.attributes.inspect
|
21
|
+
puts "********************************"
|
22
|
+
page.walk(receiver)
|
23
|
+
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
# This demonstrates a way to extract TTF fonts from a PDF. It could be expanded
|
4
|
+
# to support extra font formats if required. Be aware that many PDFs subset
|
5
|
+
# fonts before they're embedded so glyphs may be missing or re-arranged.
|
6
|
+
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
module ExtractFonts
|
10
|
+
|
11
|
+
class Extractor
|
12
|
+
|
13
|
+
def page(page)
|
14
|
+
count = 0
|
15
|
+
|
16
|
+
return count if page.fonts.nil? || page.fonts.empty?
|
17
|
+
|
18
|
+
page.fonts.each do |label, font|
|
19
|
+
next if complete_refs[font]
|
20
|
+
complete_refs[font] = true
|
21
|
+
|
22
|
+
process_font(page, font)
|
23
|
+
|
24
|
+
count += 1
|
25
|
+
end
|
26
|
+
|
27
|
+
count
|
28
|
+
end
|
29
|
+
|
30
|
+
private
|
31
|
+
|
32
|
+
def process_font(page, font)
|
33
|
+
font = page.objects.deref(font)
|
34
|
+
|
35
|
+
case font[:Subtype]
|
36
|
+
when :Type0 then
|
37
|
+
font[:DescendantFonts].each { |f| process_font(page, f) }
|
38
|
+
when :TrueType, :CIDFontType2 then
|
39
|
+
ExtractFonts::TTF.new(page.objects, font).save("#{font[:BaseFont]}.ttf")
|
40
|
+
else
|
41
|
+
$stderr.puts "unsupported font type #{font[:Subtype]}"
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
def complete_refs
|
46
|
+
@complete_refs ||= {}
|
47
|
+
end
|
48
|
+
|
49
|
+
end
|
50
|
+
|
51
|
+
class TTF
|
52
|
+
|
53
|
+
def initialize(objects, font)
|
54
|
+
@objects, @font = objects, font
|
55
|
+
@descriptor = @objects.deref(@font[:FontDescriptor])
|
56
|
+
end
|
57
|
+
|
58
|
+
def save(filename)
|
59
|
+
puts "#{filename}"
|
60
|
+
if @descriptor && @descriptor[:FontFile2]
|
61
|
+
stream = @objects.deref(@descriptor[:FontFile2])
|
62
|
+
File.open(filename, "wb") { |file| file.write stream.unfiltered_data }
|
63
|
+
else
|
64
|
+
$stderr.puts "- TTF font not embedded"
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-unicode.pdf"
|
71
|
+
extractor = ExtractFonts::Extractor.new
|
72
|
+
|
73
|
+
PDF::Reader.open(filename) do |reader|
|
74
|
+
page = reader.page(1)
|
75
|
+
extractor.page(page)
|
76
|
+
end
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -209,14 +209,14 @@ class PDF::Reader
|
|
209
209
|
def prepare_inline_token
|
210
210
|
str = ""
|
211
211
|
|
212
|
-
while str
|
212
|
+
while str !~ /\sEI$/
|
213
213
|
chr = @io.read(1)
|
214
214
|
break if chr.nil?
|
215
215
|
str << chr
|
216
216
|
end
|
217
217
|
|
218
|
-
@tokens << str[0
|
219
|
-
@io.seek(-
|
218
|
+
@tokens << string_token(str[0..-3].strip)
|
219
|
+
@io.seek(-3, IO::SEEK_CUR) unless chr.nil?
|
220
220
|
end
|
221
221
|
|
222
222
|
# if we're currently inside a hex string, read hex nibbles until
|
@@ -342,5 +342,16 @@ class PDF::Reader
|
|
342
342
|
@io.seek(-1, IO::SEEK_CUR) unless chr.nil?
|
343
343
|
chr
|
344
344
|
end
|
345
|
+
|
346
|
+
# for a handful of tokens we want to tell the parser how to convert them
|
347
|
+
# into higher level tokens. This method adds a to_token() method
|
348
|
+
# to tokens that should remain as strings.
|
349
|
+
#
|
350
|
+
def string_token(token)
|
351
|
+
def token.to_token
|
352
|
+
to_s
|
353
|
+
end
|
354
|
+
token
|
355
|
+
end
|
345
356
|
end
|
346
357
|
end
|
data/lib/pdf/reader/filter.rb
CHANGED
@@ -48,6 +48,7 @@ class PDF::Reader
|
|
48
48
|
when :DCTDecode then @filter = nil
|
49
49
|
when :FlateDecode then @filter = :flate
|
50
50
|
when :JBIG2Decode then @filter = nil
|
51
|
+
when :JPXDecode then @filter = nil
|
51
52
|
when :LZWDecode then @filter = :lzw
|
52
53
|
when :RunLengthDecode then @filter = :runlength
|
53
54
|
else
|
@@ -126,7 +127,11 @@ class PDF::Reader
|
|
126
127
|
out = ""
|
127
128
|
|
128
129
|
while pos < data.length
|
129
|
-
|
130
|
+
if data.respond_to?(:getbyte)
|
131
|
+
length = data.getbyte(pos)
|
132
|
+
else
|
133
|
+
length = data[pos]
|
134
|
+
end
|
130
135
|
pos += 1
|
131
136
|
|
132
137
|
case
|
@@ -56,7 +56,7 @@ class PDF::Reader
|
|
56
56
|
"0x#{str[3,4]}".hex
|
57
57
|
elsif str.match(/\Au[A-F\d]{4,6}\Z/)
|
58
58
|
"0x#{str[1,6]}".hex
|
59
|
-
elsif str.match(/\A[A-Za-z]\d{
|
59
|
+
elsif str.match(/\A[A-Za-z]\d{1,4}\Z/)
|
60
60
|
str[1,4].to_i
|
61
61
|
elsif str.match(/\A[A-Za-z]{2}\d{2,4}\Z/)
|
62
62
|
str[2,4].to_i
|
@@ -1,43 +1,4 @@
|
|
1
|
-
#
|
2
|
-
# Copyright (c) 1997,1998,2002,2007 Adobe Systems Incorporated
|
3
|
-
#
|
4
|
-
# Permission is hereby granted, free of charge, to any person obtaining a
|
5
|
-
# copy of this documentation file to use, copy, publish, distribute,
|
6
|
-
# sublicense, and/or sell copies of the documentation, and to permit
|
7
|
-
# others to do the same, provided that:
|
8
|
-
# - No modification, editing or other alteration of this document is
|
9
|
-
# allowed; and
|
10
|
-
# - The above copyright notice and this permission notice shall be
|
11
|
-
# included in all copies of the documentation.
|
12
|
-
#
|
13
|
-
# Permission is hereby granted, free of charge, to any person obtaining a
|
14
|
-
# copy of this documentation file, to create their own derivative works
|
15
|
-
# from the content of this document to use, copy, publish, distribute,
|
16
|
-
# sublicense, and/or sell the derivative works, and to permit others to do
|
17
|
-
# the same, provided that the derived work is not represented as being a
|
18
|
-
# copy or version of this document.
|
19
|
-
#
|
20
|
-
# Adobe shall not be liable to any party for any loss of revenue or profit
|
21
|
-
# or for indirect, incidental, special, consequential, or other similar
|
22
|
-
# damages, whether based on tort (including without limitation negligence
|
23
|
-
# or strict liability), contract or other legal or equitable grounds even
|
24
|
-
# if Adobe has been advised or had reason to know of the possibility of
|
25
|
-
# such damages. The Adobe materials are provided on an "AS IS" basis.
|
26
|
-
# Adobe specifically disclaims all express, statutory, or implied
|
27
|
-
# warranties relating to the Adobe materials, including but not limited to
|
28
|
-
# those concerning merchantability or fitness for a particular purpose or
|
29
|
-
# non-infringement of any third party rights regarding the Adobe
|
30
|
-
# materials.
|
31
|
-
# ###################################################################################
|
32
|
-
# Name: Adobe Glyph List
|
33
|
-
# Table version: 2.0
|
34
|
-
# Date: September 20, 2002
|
35
|
-
#
|
36
|
-
# See http://partners.adobe.com/asn/developer/typeforum/unicodegn.html
|
37
|
-
#
|
38
|
-
# Format: Semicolon-delimited fields:
|
39
|
-
# (1) glyph name
|
40
|
-
# (2) Unicode scalar value
|
1
|
+
# This file maps glyph names to unicode codepoints
|
41
2
|
A;0041
|
42
3
|
AE;00C6
|
43
4
|
AEacute;01FC
|
data/lib/pdf/reader/lzw.rb
CHANGED
@@ -102,10 +102,12 @@ module PDF
|
|
102
102
|
old_code = code
|
103
103
|
end
|
104
104
|
# increase the size of the codes when the limit is reached
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
105
|
+
if string_table.string_table_pos == 511
|
106
|
+
stream.set_bits_in_chunk(10)
|
107
|
+
elsif string_table.string_table_pos == 1023
|
108
|
+
stream.set_bits_in_chunk(11)
|
109
|
+
elsif string_table.string_table_pos == 2047
|
110
|
+
stream.set_bits_in_chunk(12)
|
109
111
|
end
|
110
112
|
end
|
111
113
|
end
|
@@ -31,26 +31,15 @@ class PDF::Reader
|
|
31
31
|
attr_accessor :default
|
32
32
|
attr_reader :trailer, :pdf_version
|
33
33
|
|
34
|
-
# Creates a new ObjectHash object.
|
35
|
-
#
|
34
|
+
# Creates a new ObjectHash object. Input can be a string with a valid filename
|
35
|
+
# or an IO-like object.
|
36
36
|
#
|
37
|
-
#
|
37
|
+
# Valid options:
|
38
38
|
#
|
39
39
|
# :password - the user password to decrypt the source PDF
|
40
40
|
#
|
41
41
|
def initialize(input, opts = {})
|
42
|
-
|
43
|
-
@io = input
|
44
|
-
elsif File.file?(input.to_s)
|
45
|
-
if File.respond_to?(:binread)
|
46
|
-
input = File.binread(input.to_s)
|
47
|
-
else
|
48
|
-
input = File.read(input.to_s)
|
49
|
-
end
|
50
|
-
@io = StringIO.new(input)
|
51
|
-
else
|
52
|
-
raise ArgumentError, "input must be an IO-like object or a filename"
|
53
|
-
end
|
42
|
+
@io = extract_io_from(input)
|
54
43
|
@pdf_version = read_version
|
55
44
|
@xref = PDF::Reader::XRef.new(@io)
|
56
45
|
@trailer = @xref.trailer
|
@@ -67,9 +56,7 @@ class PDF::Reader
|
|
67
56
|
|
68
57
|
# returns true if the supplied reference points to an object with a stream
|
69
58
|
def stream?(ref)
|
70
|
-
self[ref].
|
71
|
-
rescue
|
72
|
-
false
|
59
|
+
self.has_key?(ref) && self[ref].is_a?(PDF::Reader::Stream)
|
73
60
|
end
|
74
61
|
|
75
62
|
# Access an object from the PDF. key can be an int or a PDF::Reader::Reference
|
@@ -83,23 +70,23 @@ class PDF::Reader
|
|
83
70
|
#
|
84
71
|
def [](key)
|
85
72
|
return default if key.to_i <= 0
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
rescue InvalidObjectError
|
101
|
-
return default
|
73
|
+
|
74
|
+
unless key.is_a?(PDF::Reader::Reference)
|
75
|
+
key = PDF::Reader::Reference.new(key.to_i, 0)
|
76
|
+
end
|
77
|
+
|
78
|
+
if @cache.has_key?(key)
|
79
|
+
@cache[key]
|
80
|
+
elsif xref[key].is_a?(Fixnum)
|
81
|
+
buf = new_buffer(xref[key])
|
82
|
+
@cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
|
83
|
+
elsif xref[key].is_a?(PDF::Reader::Reference)
|
84
|
+
container_key = xref[key]
|
85
|
+
object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
|
86
|
+
@cache[key] = object_streams[container_key][key.id]
|
102
87
|
end
|
88
|
+
rescue InvalidObjectError
|
89
|
+
return default
|
103
90
|
end
|
104
91
|
|
105
92
|
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
@@ -315,5 +302,23 @@ class PDF::Reader
|
|
315
302
|
version.to_f
|
316
303
|
end
|
317
304
|
|
305
|
+
def extract_io_from(input)
|
306
|
+
if input.respond_to?(:seek) && input.respond_to?(:read)
|
307
|
+
input
|
308
|
+
elsif File.file?(input.to_s)
|
309
|
+
StringIO.new read_as_binary(input)
|
310
|
+
else
|
311
|
+
raise ArgumentError, "input must be an IO-like object or a filename"
|
312
|
+
end
|
313
|
+
end
|
314
|
+
|
315
|
+
def read_as_binary(input)
|
316
|
+
if File.respond_to?(:binread)
|
317
|
+
File.binread(input.to_s)
|
318
|
+
else
|
319
|
+
File.read(input.to_s)
|
320
|
+
end
|
321
|
+
end
|
322
|
+
|
318
323
|
end
|
319
324
|
end
|
data/lib/pdf/reader/page.rb
CHANGED
@@ -49,11 +49,11 @@ module PDF
|
|
49
49
|
# attributes inherited from parents.
|
50
50
|
#
|
51
51
|
def attributes
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
52
|
+
{}.tap { |hash|
|
53
|
+
page_with_ancestors.reverse.each do |obj|
|
54
|
+
hash.merge!(@objects.deref(obj))
|
55
|
+
end
|
56
|
+
}
|
57
57
|
end
|
58
58
|
|
59
59
|
# Returns the resources that accompany this page. Includes
|
@@ -185,7 +185,7 @@ module PDF
|
|
185
185
|
raise MalformedPDFError, "End Of File while processing a content stream"
|
186
186
|
end
|
187
187
|
|
188
|
-
# calls the name callback method on
|
188
|
+
# calls the name callback method on each receiver object with params as the arguments
|
189
189
|
#
|
190
190
|
def callback (receivers, name, params=[])
|
191
191
|
receivers.each do |receiver|
|
@@ -193,14 +193,16 @@ module PDF
|
|
193
193
|
end
|
194
194
|
end
|
195
195
|
|
196
|
-
def page_with_ancestors
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
196
|
+
def page_with_ancestors
|
197
|
+
[ @page_object ] + ancestors
|
198
|
+
end
|
199
|
+
|
200
|
+
def ancestors(origin = @page_object[:Parent])
|
201
|
+
if origin.nil?
|
202
|
+
[]
|
202
203
|
else
|
203
|
-
|
204
|
+
obj = objects.deref(origin)
|
205
|
+
[ select_inheritable(obj) ] + ancestors(obj[:Parent])
|
204
206
|
end
|
205
207
|
end
|
206
208
|
|
@@ -3,6 +3,11 @@
|
|
3
3
|
require 'matrix'
|
4
4
|
require 'yaml'
|
5
5
|
|
6
|
+
begin
|
7
|
+
require 'psych'
|
8
|
+
rescue LoadError
|
9
|
+
end
|
10
|
+
|
6
11
|
module PDF
|
7
12
|
class Reader
|
8
13
|
class PageTextReceiver
|
@@ -26,7 +31,7 @@ module PDF
|
|
26
31
|
@objects = page.objects
|
27
32
|
@fonts = build_fonts(page.fonts)
|
28
33
|
@form_fonts = {}
|
29
|
-
@content =
|
34
|
+
@content = {}
|
30
35
|
@stack = [DEFAULT_GRAPHICS_STATE]
|
31
36
|
end
|
32
37
|
|
@@ -126,10 +131,10 @@ module PDF
|
|
126
131
|
|
127
132
|
def move_text_position(x, y) # Td
|
128
133
|
temp_matrix = Matrix[
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
134
|
+
[1, 0, 0],
|
135
|
+
[0, 1, 0],
|
136
|
+
[x, y, 1]
|
137
|
+
]
|
133
138
|
@text_matrix = @text_line_matrix = temp_matrix * @text_line_matrix
|
134
139
|
end
|
135
140
|
|
@@ -140,14 +145,14 @@ module PDF
|
|
140
145
|
|
141
146
|
def set_text_matrix_and_text_line_matrix(a, b, c, d, e, f) # Tm
|
142
147
|
@text_matrix = @text_line_matrix = Matrix[
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
148
|
+
[a, b, 0],
|
149
|
+
[c, d, 0],
|
150
|
+
[e, f, 1]
|
151
|
+
]
|
147
152
|
end
|
148
153
|
|
149
154
|
def move_to_start_of_next_line # T*
|
150
|
-
move_text_position(0, state[:text_leading])
|
155
|
+
move_text_position(0, -state[:text_leading])
|
151
156
|
end
|
152
157
|
|
153
158
|
#####################################################
|
@@ -209,9 +214,11 @@ module PDF
|
|
209
214
|
# wrap the raw PDF Font objects in handy ruby Font objects.
|
210
215
|
#
|
211
216
|
def build_fonts(raw_fonts)
|
212
|
-
|
217
|
+
wrapped_fonts = raw_fonts.map { |label, font|
|
213
218
|
[label, PDF::Reader::Font.new(@objects, @objects.deref(font))]
|
214
|
-
}
|
219
|
+
}
|
220
|
+
|
221
|
+
::Hash[wrapped_fonts]
|
215
222
|
end
|
216
223
|
|
217
224
|
# transform x and y co-ordinates from the current text space to the
|
@@ -219,10 +226,8 @@ module PDF
|
|
219
226
|
#
|
220
227
|
def transform(point, z = 1)
|
221
228
|
trm = text_rendering_matrix
|
222
|
-
|
223
|
-
|
224
|
-
(trm[0,1] * point.x) + (trm[1,1] * point.y) + (trm[2,1] * z)
|
225
|
-
)
|
229
|
+
|
230
|
+
point.transform(text_rendering_matrix, z)
|
226
231
|
end
|
227
232
|
|
228
233
|
def text_rendering_matrix
|
@@ -253,11 +258,14 @@ module PDF
|
|
253
258
|
if @stack.empty?
|
254
259
|
{}
|
255
260
|
else
|
256
|
-
|
257
|
-
YAML.load(yaml_state)
|
261
|
+
yaml_lib.load yaml_lib.dump(@stack.last)
|
258
262
|
end
|
259
263
|
end
|
260
264
|
|
265
|
+
def yaml_lib
|
266
|
+
Kernel.const_defined?("Psych") ? Psych : YAML
|
267
|
+
end
|
268
|
+
|
261
269
|
# return the current transformation matrix
|
262
270
|
#
|
263
271
|
def ctm
|
@@ -271,15 +279,16 @@ module PDF
|
|
271
279
|
# private class for representing points on a cartesian plane. Used
|
272
280
|
# to simplify maths in the MinPpi class.
|
273
281
|
#
|
274
|
-
class Point
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
282
|
+
class Point < Struct.new(:x, :y)
|
283
|
+
def transform(trm, z)
|
284
|
+
Point.new(
|
285
|
+
(trm[0,0] * x) + (trm[1,0] * y) + (trm[2,0] * z),
|
286
|
+
(trm[0,1] * x) + (trm[1,1] * y) + (trm[2,1] * z)
|
287
|
+
)
|
279
288
|
end
|
280
289
|
|
281
290
|
def distance(point)
|
282
|
-
Math.hypot(point.x - x, point.y - y)
|
291
|
+
Math.hypot(point.x - @x, point.y - @y)
|
283
292
|
end
|
284
293
|
end
|
285
294
|
end
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -32,10 +32,10 @@ class PDF::Reader
|
|
32
32
|
# Create a new parser around a PDF::Reader::Buffer object
|
33
33
|
#
|
34
34
|
# buffer - a PDF::Reader::Buffer object that contains PDF data
|
35
|
-
#
|
36
|
-
def initialize (buffer,
|
35
|
+
# objects - a PDF::Reader::ObjectHash object that can return objects from the PDF file
|
36
|
+
def initialize (buffer, objects=nil)
|
37
37
|
@buffer = buffer
|
38
|
-
@
|
38
|
+
@objects = objects
|
39
39
|
end
|
40
40
|
################################################################################
|
41
41
|
# Reads the next token from the underlying buffer and converts it to an appropriate
|
@@ -59,7 +59,8 @@ class PDF::Reader
|
|
59
59
|
when "stream", "endstream" then return Token.new(token)
|
60
60
|
when ">>", "]", ">", ")" then return Token.new(token)
|
61
61
|
else
|
62
|
-
if
|
62
|
+
if token.respond_to?(:to_token) then return token.to_token
|
63
|
+
elsif operators.has_key?(token) then return Token.new(token)
|
63
64
|
elsif token =~ /\d*\.\d/ then return token.to_f
|
64
65
|
else return token.to_i
|
65
66
|
end
|
@@ -206,8 +207,8 @@ class PDF::Reader
|
|
206
207
|
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
|
207
208
|
def stream (dict)
|
208
209
|
raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
|
209
|
-
if @
|
210
|
-
length = @
|
210
|
+
if @objects
|
211
|
+
length = @objects.deref(dict[:Length])
|
211
212
|
else
|
212
213
|
length = dict[:Length] || 0
|
213
214
|
end
|
@@ -87,7 +87,7 @@ class PDF::Reader
|
|
87
87
|
|
88
88
|
# Pads supplied password to 32bytes using PassPadBytes as specified on
|
89
89
|
# pp61 of spec
|
90
|
-
def
|
90
|
+
def pad_pass(p="")
|
91
91
|
if p.nil? || p.empty?
|
92
92
|
PassPadBytes.pack('C*')
|
93
93
|
else
|
@@ -95,7 +95,7 @@ class PDF::Reader
|
|
95
95
|
end
|
96
96
|
end
|
97
97
|
|
98
|
-
def
|
98
|
+
def xor_each_byte(buf, int)
|
99
99
|
buf.each_byte.map{ |b| b^int}.pack("C*")
|
100
100
|
end
|
101
101
|
|
@@ -111,20 +111,20 @@ class PDF::Reader
|
|
111
111
|
# if the supplied password is not a valid owner password for this document
|
112
112
|
# then it returns nil
|
113
113
|
#
|
114
|
-
def
|
115
|
-
md5 = Digest::MD5.digest(
|
114
|
+
def auth_owner_pass(pass)
|
115
|
+
md5 = Digest::MD5.digest(pad_pass(pass))
|
116
116
|
if @revision > 2 then
|
117
117
|
50.times { md5 = Digest::MD5.digest(md5) }
|
118
118
|
keyBegins = md5[(0...@key_length)]
|
119
119
|
#first itteration decrypt owner_key
|
120
120
|
out = @owner_key
|
121
121
|
#RC4 keyed with (keyBegins XOR with iteration #) to decrypt previous out
|
122
|
-
19.downto(0).each { |i| out=RC4.new(
|
122
|
+
19.downto(0).each { |i| out=RC4.new(xor_each_byte(keyBegins,i)).decrypt(out) }
|
123
123
|
else
|
124
124
|
out = RC4.new( md5[(0...5)] ).decrypt( @owner_key )
|
125
125
|
end
|
126
126
|
# c) check output as user password
|
127
|
-
|
127
|
+
auth_user_pass( out )
|
128
128
|
end
|
129
129
|
|
130
130
|
# Algorithm 6 - Authenticating the User Password
|
@@ -137,22 +137,22 @@ class PDF::Reader
|
|
137
137
|
# if the supplied password is not a valid user password for this document
|
138
138
|
# then it returns nil
|
139
139
|
#
|
140
|
-
def
|
141
|
-
keyBegins =
|
140
|
+
def auth_user_pass(pass)
|
141
|
+
keyBegins = make_file_key(pass)
|
142
142
|
if @revision > 2
|
143
143
|
#initialize out for first iteration
|
144
144
|
out = Digest::MD5.digest(PassPadBytes.pack("C*") + @file_id)
|
145
145
|
#zero doesn't matter -> so from 0-19
|
146
|
-
20.times{ |i| out=RC4.new(
|
146
|
+
20.times{ |i| out=RC4.new(xor_each_byte(keyBegins, i)).decrypt(out) }
|
147
147
|
else
|
148
148
|
out = RC4.new(keyBegins).encrypt(PassPadBytes.pack("C*"))
|
149
149
|
end
|
150
150
|
@user_key[(0...16)] == out ? keyBegins : nil
|
151
151
|
end
|
152
152
|
|
153
|
-
def
|
153
|
+
def make_file_key( user_pass )
|
154
154
|
# a) if there's a password, pad it to 32 bytes, else, just use the padding.
|
155
|
-
@buf =
|
155
|
+
@buf = pad_pass(user_pass)
|
156
156
|
# c) add owner key
|
157
157
|
@buf << @owner_key
|
158
158
|
# d) add permissions 1 byte at a time, in little-endian order
|
@@ -176,8 +176,8 @@ class PDF::Reader
|
|
176
176
|
end
|
177
177
|
|
178
178
|
def build_standard_key(pass)
|
179
|
-
encrypt_key =
|
180
|
-
encrypt_key ||=
|
179
|
+
encrypt_key = auth_owner_pass(pass)
|
180
|
+
encrypt_key ||= auth_user_pass(pass)
|
181
181
|
|
182
182
|
raise PDF::Reader::EncryptedPDFError, "Invalid password (#{pass})" if encrypt_key.nil?
|
183
183
|
encrypt_key
|
metadata
CHANGED
@@ -3,11 +3,11 @@ name: pdf-reader
|
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease: true
|
5
5
|
segments:
|
6
|
+
- 1
|
6
7
|
- 0
|
7
|
-
- 12
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.
|
9
|
+
- beta1
|
10
|
+
version: 1.0.0.beta1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- James Healy
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
18
|
+
date: 2011-10-06 00:00:00 +11:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -108,6 +108,7 @@ executables:
|
|
108
108
|
- pdf_object
|
109
109
|
- pdf_text
|
110
110
|
- pdf_list_callbacks
|
111
|
+
- pdf_callbacks
|
111
112
|
extensions: []
|
112
113
|
|
113
114
|
extra_rdoc_files:
|
@@ -116,51 +117,52 @@ extra_rdoc_files:
|
|
116
117
|
- CHANGELOG
|
117
118
|
- MIT-LICENSE
|
118
119
|
files:
|
119
|
-
- examples/rspec.rb
|
120
120
|
- examples/metadata.rb
|
121
|
+
- examples/extract_images.rb
|
121
122
|
- examples/extract_bates.rb
|
122
|
-
- examples/hash.rb
|
123
123
|
- examples/callbacks.rb
|
124
|
+
- examples/rspec.rb
|
125
|
+
- examples/hash.rb
|
124
126
|
- examples/text.rb
|
125
|
-
- examples/
|
127
|
+
- examples/extract_fonts.rb
|
126
128
|
- examples/page_count.rb
|
127
|
-
- examples/
|
128
|
-
- lib/pdf/reader
|
129
|
-
- lib/pdf/
|
130
|
-
- lib/pdf/reader/glyph_hash.rb
|
131
|
-
- lib/pdf/reader/font.rb
|
132
|
-
- lib/pdf/reader/lzw.rb
|
129
|
+
- examples/version.rb
|
130
|
+
- lib/pdf/reader.rb
|
131
|
+
- lib/pdf/hash.rb
|
133
132
|
- lib/pdf/reader/print_receiver.rb
|
134
|
-
- lib/pdf/reader/
|
133
|
+
- lib/pdf/reader/xref.rb
|
134
|
+
- lib/pdf/reader/buffer.rb
|
135
|
+
- lib/pdf/reader/font.rb
|
136
|
+
- lib/pdf/reader/parser.rb
|
137
|
+
- lib/pdf/reader/error.rb
|
135
138
|
- lib/pdf/reader/filter.rb
|
136
|
-
- lib/pdf/reader/
|
139
|
+
- lib/pdf/reader/object_hash.rb
|
140
|
+
- lib/pdf/reader/stream.rb
|
137
141
|
- lib/pdf/reader/standard_security_handler.rb
|
138
|
-
- lib/pdf/reader/
|
142
|
+
- lib/pdf/reader/cmap.rb
|
139
143
|
- lib/pdf/reader/form_xobject.rb
|
140
|
-
- lib/pdf/reader/
|
141
|
-
- lib/pdf/reader/
|
144
|
+
- lib/pdf/reader/object_cache.rb
|
145
|
+
- lib/pdf/reader/object_stream.rb
|
142
146
|
- lib/pdf/reader/encoding.rb
|
143
|
-
- lib/pdf/reader/
|
147
|
+
- lib/pdf/reader/page_text_receiver.rb
|
148
|
+
- lib/pdf/reader/text_receiver.rb
|
149
|
+
- lib/pdf/reader/glyph_hash.rb
|
150
|
+
- lib/pdf/reader/glyphlist.txt
|
151
|
+
- lib/pdf/reader/lzw.rb
|
144
152
|
- lib/pdf/reader/register_receiver.rb
|
145
|
-
- lib/pdf/reader/object_hash.rb
|
146
|
-
- lib/pdf/reader/object_cache.rb
|
147
|
-
- lib/pdf/reader/token.rb
|
148
153
|
- lib/pdf/reader/page.rb
|
149
|
-
- lib/pdf/reader/
|
150
|
-
- lib/pdf/reader/
|
151
|
-
- lib/pdf/reader/
|
152
|
-
- lib/pdf/reader/metadata_strategy.rb
|
153
|
-
- lib/pdf/reader/buffer.rb
|
154
|
-
- lib/pdf/reader/encodings/zapf_dingbats.txt
|
154
|
+
- lib/pdf/reader/abstract_strategy.rb
|
155
|
+
- lib/pdf/reader/pages_strategy.rb
|
156
|
+
- lib/pdf/reader/reference.rb
|
155
157
|
- lib/pdf/reader/encodings/standard.txt
|
156
158
|
- lib/pdf/reader/encodings/mac_roman.txt
|
157
|
-
- lib/pdf/reader/encodings/mac_expert.txt
|
158
|
-
- lib/pdf/reader/encodings/win_ansi.txt
|
159
159
|
- lib/pdf/reader/encodings/symbol.txt
|
160
|
+
- lib/pdf/reader/encodings/win_ansi.txt
|
161
|
+
- lib/pdf/reader/encodings/zapf_dingbats.txt
|
160
162
|
- lib/pdf/reader/encodings/pdf_doc.txt
|
161
|
-
- lib/pdf/reader/
|
162
|
-
- lib/pdf/
|
163
|
-
- lib/pdf/reader.rb
|
163
|
+
- lib/pdf/reader/encodings/mac_expert.txt
|
164
|
+
- lib/pdf/reader/metadata_strategy.rb
|
165
|
+
- lib/pdf/reader/token.rb
|
164
166
|
- lib/pdf-reader.rb
|
165
167
|
- Rakefile
|
166
168
|
- README.rdoc
|
@@ -170,11 +172,12 @@ files:
|
|
170
172
|
- bin/pdf_object
|
171
173
|
- bin/pdf_text
|
172
174
|
- bin/pdf_list_callbacks
|
175
|
+
- bin/pdf_callbacks
|
173
176
|
has_rdoc: true
|
174
177
|
homepage: http://github.com/yob/pdf-reader
|
175
178
|
licenses: []
|
176
179
|
|
177
|
-
post_install_message: "\n ********************************************\n\n This is
|
180
|
+
post_install_message: "\n ********************************************\n\n This is a beta release of PDF::Reader to gather feedback on the proposed\n API changes.\n\n The old API is marked as deprecated but will continue to work with no\n visible warnings for now.\n\n The new API is documented in the README and in rdoc for the PDF::Reader,\n PDF::Reader::Page and PDF::Reader::ObjectHash classes.\n\n Do not use this in production, stick to stable releases for that. If you do\n take the new API for a spin, please send any feedback my way.\n\n ********************************************\n\n"
|
178
181
|
rdoc_options:
|
179
182
|
- --title
|
180
183
|
- PDF::Reader Documentation
|