pdf-reader 1.4.1 → 2.0.0.beta1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +8 -3
- data/{README.rdoc → README.md} +40 -23
- data/Rakefile +2 -2
- data/bin/pdf_object +4 -1
- data/lib/pdf/reader.rb +7 -112
- data/lib/pdf/reader/buffer.rb +2 -1
- data/lib/pdf/reader/cmap.rb +26 -24
- data/lib/pdf/reader/encoding.rb +4 -5
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +1 -5
- data/lib/pdf/reader/font.rb +1 -11
- data/lib/pdf/reader/glyph_hash.rb +6 -2
- data/lib/pdf/reader/lzw.rb +1 -1
- data/lib/pdf/reader/object_hash.rb +35 -16
- data/lib/pdf/reader/page_layout.rb +6 -17
- data/lib/pdf/reader/pages_strategy.rb +1 -304
- data/lib/pdf/reader/parser.rb +6 -4
- data/lib/pdf/reader/standard_security_handler.rb +18 -14
- data/lib/pdf/reader/text_run.rb +3 -9
- metadata +14 -47
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -265
data/lib/pdf/reader/parser.rb
CHANGED
@@ -118,6 +118,7 @@ class PDF::Reader
|
|
118
118
|
loop do
|
119
119
|
key = parse_token
|
120
120
|
break if key.kind_of?(Token) and key == ">>"
|
121
|
+
raise MalformedPDFError, "unterminated dict" if @buffer.empty?
|
121
122
|
raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name" unless key.kind_of?(Symbol)
|
122
123
|
|
123
124
|
value = parse_token
|
@@ -131,7 +132,6 @@ class PDF::Reader
|
|
131
132
|
# reads a PDF name from the buffer and converts it to a Ruby Symbol
|
132
133
|
def pdf_name
|
133
134
|
tok = @buffer.token
|
134
|
-
tok = " " if tok == "" && RUBY_VERSION < "1.9"
|
135
135
|
tok.gsub!(/#([A-Fa-f0-9]{2})/) do |match|
|
136
136
|
match[1, 2].hex.chr
|
137
137
|
end
|
@@ -145,6 +145,7 @@ class PDF::Reader
|
|
145
145
|
loop do
|
146
146
|
item = parse_token
|
147
147
|
break if item.kind_of?(Token) and item == "]"
|
148
|
+
raise MalformedPDFError, "unterminated array" if @buffer.empty?
|
148
149
|
a << item
|
149
150
|
end
|
150
151
|
|
@@ -158,24 +159,25 @@ class PDF::Reader
|
|
158
159
|
loop do
|
159
160
|
token = @buffer.token
|
160
161
|
break if token == ">"
|
162
|
+
raise MalformedPDFError, "unterminated hex string" if @buffer.empty?
|
161
163
|
str << token
|
162
164
|
end
|
163
165
|
|
164
166
|
# add a missing digit if required, as required by the spec
|
165
167
|
str << "0" unless str.size % 2 == 0
|
166
|
-
str.scan(/../).map {|i| i.hex.chr}.join
|
168
|
+
str.scan(/../).map {|i| i.hex.chr}.join.force_encoding("binary")
|
167
169
|
end
|
168
170
|
################################################################################
|
169
171
|
# Reads a PDF String from the buffer and converts it to a Ruby String
|
170
172
|
def string
|
171
173
|
str = @buffer.token
|
172
|
-
return "" if str == ")"
|
174
|
+
return "".force_encoding("binary") if str == ")"
|
173
175
|
Error.assert_equal(parse_token, ")")
|
174
176
|
|
175
177
|
str.gsub!(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
|
176
178
|
MAPPING[match] || ""
|
177
179
|
end
|
178
|
-
str
|
180
|
+
str.force_encoding("binary")
|
179
181
|
end
|
180
182
|
|
181
183
|
MAPPING = {
|
data/lib/pdf/reader/standard_security_handler.rb
CHANGED
@@ -82,7 +82,7 @@ class PDF::Reader
|
|
82
82
|
(0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
|
83
83
|
(0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
|
84
84
|
length = objKey.length < 16 ? objKey.length : 16
|
85
|
-
rc4 = RC4.new( Digest::MD5.digest(objKey)[
|
85
|
+
rc4 = RC4.new( Digest::MD5.digest(objKey)[0,length] )
|
86
86
|
rc4.decrypt(buf)
|
87
87
|
end
|
88
88
|
|
@@ -94,7 +94,7 @@ class PDF::Reader
|
|
94
94
|
if p.nil? || p.empty?
|
95
95
|
PassPadBytes.pack('C*')
|
96
96
|
else
|
97
|
-
p[
|
97
|
+
p[0, 32] + PassPadBytes[0, 32-p.length].pack('C*')
|
98
98
|
end
|
99
99
|
end
|
100
100
|
|
@@ -118,13 +118,13 @@ class PDF::Reader
|
|
118
118
|
md5 = Digest::MD5.digest(pad_pass(pass))
|
119
119
|
if @revision > 2 then
|
120
120
|
50.times { md5 = Digest::MD5.digest(md5) }
|
121
|
-
keyBegins = md5[
|
121
|
+
keyBegins = md5[0, key_length]
|
122
122
|
#first itteration decrypt owner_key
|
123
123
|
out = @owner_key
|
124
124
|
#RC4 keyed with (keyBegins XOR with itteration #) to decrypt previous out
|
125
125
|
19.downto(0).each { |i| out=RC4.new(xor_each_byte(keyBegins,i)).decrypt(out) }
|
126
126
|
else
|
127
|
-
out = RC4.new( md5[
|
127
|
+
out = RC4.new( md5[0, 5] ).decrypt( @owner_key )
|
128
128
|
end
|
129
129
|
# c) check output as user password
|
130
130
|
auth_user_pass( out )
|
@@ -142,12 +142,12 @@ class PDF::Reader
|
|
142
142
|
#
|
143
143
|
def auth_user_pass(pass)
|
144
144
|
keyBegins = make_file_key(pass)
|
145
|
-
if @revision
|
145
|
+
if @revision >= 3
|
146
146
|
#initialize out for first iteration
|
147
147
|
out = Digest::MD5.digest(PassPadBytes.pack("C*") + @file_id)
|
148
148
|
#zero doesn't matter -> so from 0-19
|
149
|
-
20.times{ |i| out=RC4.new(xor_each_byte(keyBegins, i)).
|
150
|
-
pass = @user_key[
|
149
|
+
20.times{ |i| out=RC4.new(xor_each_byte(keyBegins, i)).encrypt(out) }
|
150
|
+
pass = @user_key[0, 16] == out
|
151
151
|
else
|
152
152
|
pass = RC4.new(keyBegins).encrypt(PassPadBytes.pack("C*")) == @user_key
|
153
153
|
end
|
@@ -163,20 +163,24 @@ class PDF::Reader
|
|
163
163
|
(0..24).step(8){|e| @buf << (@permissions >> e & 0xFF)}
|
164
164
|
# e) add the file ID
|
165
165
|
@buf << @file_id
|
166
|
-
# f) if revision
|
167
|
-
if @revision
|
168
|
-
@buf << [
|
166
|
+
# f) if revision >= 4 and metadata not encrypted then add 4 bytes of 0xFF
|
167
|
+
if @revision >= 4 && !@encryptMeta
|
168
|
+
@buf << [0xFF,0xFF,0xFF,0xFF].pack('C*')
|
169
169
|
end
|
170
170
|
# b) init MD5 digest + g) finish the hash
|
171
171
|
md5 = Digest::MD5.digest(@buf)
|
172
172
|
# h) spin hash 50 times
|
173
|
-
if @revision
|
173
|
+
if @revision >= 3
|
174
174
|
50.times {
|
175
|
-
md5 = Digest::MD5.digest(md5[
|
175
|
+
md5 = Digest::MD5.digest(md5[0, @key_length])
|
176
176
|
}
|
177
177
|
end
|
178
|
-
# i) n = key_length revision
|
179
|
-
|
178
|
+
# i) n = key_length revision >= 3, n = 5 revision == 2
|
179
|
+
if @revision < 3
|
180
|
+
md5[0, 5]
|
181
|
+
else
|
182
|
+
md5[0, @key_length]
|
183
|
+
end
|
180
184
|
end
|
181
185
|
|
182
186
|
def build_standard_key(pass)
|
data/lib/pdf/reader/text_run.rb
CHANGED
@@ -65,16 +65,10 @@ class PDF::Reader
|
|
65
65
|
@mergable_range ||= Range.new(endx - 3, endx + font_size)
|
66
66
|
end
|
67
67
|
|
68
|
+
# Assume string encoding is marked correctly and we can trust String#size to return a
|
69
|
+
# character count
|
68
70
|
def character_count
|
69
|
-
|
70
|
-
1.0
|
71
|
-
elsif @text.respond_to?(:bytesize)
|
72
|
-
# M17N aware VM
|
73
|
-
# so we can trust String#size to return a character count
|
74
|
-
@text.size.to_f
|
75
|
-
else
|
76
|
-
text.unpack("U*").size.to_f
|
77
|
-
end
|
71
|
+
@text.size.to_f
|
78
72
|
end
|
79
73
|
end
|
80
74
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0.beta1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -30,28 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '3.
|
33
|
+
version: '3.5'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '3.
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: ZenTest
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - "~>"
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: 4.4.2
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - "~>"
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: 4.4.2
|
40
|
+
version: '3.5'
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
42
|
name: cane
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -70,16 +56,16 @@ dependencies:
|
|
70
56
|
name: morecane
|
71
57
|
requirement: !ruby/object:Gem::Requirement
|
72
58
|
requirements:
|
73
|
-
- - "
|
59
|
+
- - "~>"
|
74
60
|
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
61
|
+
version: '0.2'
|
76
62
|
type: :development
|
77
63
|
prerelease: false
|
78
64
|
version_requirements: !ruby/object:Gem::Requirement
|
79
65
|
requirements:
|
80
|
-
- - "
|
66
|
+
- - "~>"
|
81
67
|
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
68
|
+
version: '0.2'
|
83
69
|
- !ruby/object:Gem::Dependency
|
84
70
|
name: ir_b
|
85
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -185,22 +171,20 @@ email:
|
|
185
171
|
executables:
|
186
172
|
- pdf_object
|
187
173
|
- pdf_text
|
188
|
-
- pdf_list_callbacks
|
189
174
|
- pdf_callbacks
|
190
175
|
extensions: []
|
191
176
|
extra_rdoc_files:
|
192
|
-
- README.
|
177
|
+
- README.md
|
193
178
|
- TODO
|
194
179
|
- CHANGELOG
|
195
180
|
- MIT-LICENSE
|
196
181
|
files:
|
197
182
|
- CHANGELOG
|
198
183
|
- MIT-LICENSE
|
199
|
-
- README.
|
184
|
+
- README.md
|
200
185
|
- Rakefile
|
201
186
|
- TODO
|
202
187
|
- bin/pdf_callbacks
|
203
|
-
- bin/pdf_list_callbacks
|
204
188
|
- bin/pdf_object
|
205
189
|
- bin/pdf_text
|
206
190
|
- examples/callbacks.rb
|
@@ -217,7 +201,6 @@ files:
|
|
217
201
|
- lib/pdf-reader.rb
|
218
202
|
- lib/pdf/hash.rb
|
219
203
|
- lib/pdf/reader.rb
|
220
|
-
- lib/pdf/reader/abstract_strategy.rb
|
221
204
|
- lib/pdf/reader/afm/Courier-Bold.afm
|
222
205
|
- lib/pdf/reader/afm/Courier-BoldOblique.afm
|
223
206
|
- lib/pdf/reader/afm/Courier-Oblique.afm
|
@@ -258,7 +241,6 @@ files:
|
|
258
241
|
- lib/pdf/reader/glyph_hash.rb
|
259
242
|
- lib/pdf/reader/glyphlist.txt
|
260
243
|
- lib/pdf/reader/lzw.rb
|
261
|
-
- lib/pdf/reader/metadata_strategy.rb
|
262
244
|
- lib/pdf/reader/object_cache.rb
|
263
245
|
- lib/pdf/reader/object_hash.rb
|
264
246
|
- lib/pdf/reader/object_stream.rb
|
@@ -276,7 +258,6 @@ files:
|
|
276
258
|
- lib/pdf/reader/standard_security_handler.rb
|
277
259
|
- lib/pdf/reader/stream.rb
|
278
260
|
- lib/pdf/reader/synchronized_cache.rb
|
279
|
-
- lib/pdf/reader/text_receiver.rb
|
280
261
|
- lib/pdf/reader/text_run.rb
|
281
262
|
- lib/pdf/reader/token.rb
|
282
263
|
- lib/pdf/reader/transformation_matrix.rb
|
@@ -291,26 +272,12 @@ homepage: http://github.com/yob/pdf-reader
|
|
291
272
|
licenses:
|
292
273
|
- MIT
|
293
274
|
metadata: {}
|
294
|
-
post_install_message:
|
295
|
-
|
296
|
-
********************************************
|
297
|
-
|
298
|
-
v1.0.0 of PDF::Reader introduced a new page-based API. There are extensive
|
299
|
-
examples showing how to use it in the README and examples directory.
|
300
|
-
|
301
|
-
For detailed documentation, check the rdocs for the PDF::Reader,
|
302
|
-
PDF::Reader::Page and PDF::Reader::ObjectHash classes.
|
303
|
-
|
304
|
-
The old API is marked as deprecated but will continue to work with no
|
305
|
-
visible warnings for now.
|
306
|
-
|
307
|
-
********************************************
|
308
|
-
|
275
|
+
post_install_message:
|
309
276
|
rdoc_options:
|
310
277
|
- "--title"
|
311
278
|
- PDF::Reader Documentation
|
312
279
|
- "--main"
|
313
|
-
- README.
|
280
|
+
- README.md
|
314
281
|
- "-q"
|
315
282
|
require_paths:
|
316
283
|
- lib
|
@@ -321,9 +288,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
321
288
|
version: 1.9.3
|
322
289
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
323
290
|
requirements:
|
324
|
-
- - "
|
291
|
+
- - ">"
|
325
292
|
- !ruby/object:Gem::Version
|
326
|
-
version:
|
293
|
+
version: 1.3.1
|
327
294
|
requirements: []
|
328
295
|
rubyforge_project:
|
329
296
|
rubygems_version: 2.5.2
|
data/bin/pdf_list_callbacks
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# this executable is deprecated, use pdf_callbacks instead
|
4
|
-
|
5
|
-
require 'rubygems'
|
6
|
-
|
7
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
8
|
-
|
9
|
-
require 'pdf/reader'
|
10
|
-
|
11
|
-
receiver = PDF::Reader::PrintReceiver.new
|
12
|
-
|
13
|
-
if ARGV.empty?
|
14
|
-
PDF::Reader.new.parse($stdin, receiver)
|
15
|
-
else
|
16
|
-
PDF::Reader.file(ARGV[0], receiver)
|
17
|
-
end
|
data/lib/pdf/reader/abstract_strategy.rb
DELETED
@@ -1,81 +0,0 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
|
3
|
-
class PDF::Reader
|
4
|
-
|
5
|
-
# DEPRECATED: this class was deprecated in version 0.11.0 and will
|
6
|
-
# eventually be removed
|
7
|
-
class AbstractStrategy # :nodoc:
|
8
|
-
|
9
|
-
def initialize(ohash, receivers, options = {})
|
10
|
-
@ohash, @options = ohash, options
|
11
|
-
if receivers.is_a?(Array)
|
12
|
-
@receivers = receivers
|
13
|
-
else
|
14
|
-
@receivers = [receivers]
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
private
|
19
|
-
|
20
|
-
def options
|
21
|
-
@options || {}
|
22
|
-
end
|
23
|
-
|
24
|
-
# calls the name callback method on the receiver class with params as the arguments
|
25
|
-
#
|
26
|
-
def callback(name, params=[])
|
27
|
-
@receivers.each do |receiver|
|
28
|
-
receiver.send(name, *params) if receiver.respond_to?(name)
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
# strings outside of page content should be in either PDFDocEncoding or UTF-16.
|
33
|
-
def decode_strings(obj)
|
34
|
-
case obj
|
35
|
-
when String then
|
36
|
-
if obj[0,2].unpack("C*").slice(0,2) == [254,255]
|
37
|
-
PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
|
38
|
-
else
|
39
|
-
PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
|
40
|
-
end
|
41
|
-
when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
|
42
|
-
when Array then obj.collect { |item| decode_strings(item) }
|
43
|
-
else
|
44
|
-
obj
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
def info
|
49
|
-
ohash.object(trailer[:Info])
|
50
|
-
end
|
51
|
-
|
52
|
-
def info?
|
53
|
-
info ? true : false
|
54
|
-
end
|
55
|
-
|
56
|
-
def ohash
|
57
|
-
@ohash
|
58
|
-
end
|
59
|
-
|
60
|
-
def pages
|
61
|
-
ohash.object(root[:Pages])
|
62
|
-
end
|
63
|
-
|
64
|
-
def pages?
|
65
|
-
pages ? true : false
|
66
|
-
end
|
67
|
-
|
68
|
-
def root
|
69
|
-
ohash.object(trailer[:Root])
|
70
|
-
end
|
71
|
-
|
72
|
-
def root?
|
73
|
-
root ? true : false
|
74
|
-
end
|
75
|
-
|
76
|
-
def trailer
|
77
|
-
ohash.trailer
|
78
|
-
end
|
79
|
-
|
80
|
-
end
|
81
|
-
end
|
data/lib/pdf/reader/metadata_strategy.rb
DELETED
@@ -1,56 +0,0 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
|
3
|
-
class PDF::Reader
|
4
|
-
|
5
|
-
# DEPRECATED: this class was deprecated in version 0.11.0 and will
|
6
|
-
# eventually be removed
|
7
|
-
#
|
8
|
-
class MetadataStrategy < AbstractStrategy # :nodoc:
|
9
|
-
|
10
|
-
def self.to_sym
|
11
|
-
:metadata
|
12
|
-
end
|
13
|
-
|
14
|
-
def process
|
15
|
-
return false unless options[:metadata]
|
16
|
-
|
17
|
-
# may be useful to some people
|
18
|
-
callback(:pdf_version, ohash.pdf_version)
|
19
|
-
|
20
|
-
# ye olde metadata
|
21
|
-
callback(:metadata, [decoded_info]) if info?
|
22
|
-
|
23
|
-
# new style xml metadata
|
24
|
-
callback(:xml_metadata, [xml_metadata]) if xml_metadata?
|
25
|
-
|
26
|
-
# page count
|
27
|
-
if pages?
|
28
|
-
count = ohash.object(pages[:Count])
|
29
|
-
callback(:page_count, count.to_i)
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
private
|
34
|
-
|
35
|
-
def xml_metadata
|
36
|
-
return @xml_metadata if defined?(@xml_metadata)
|
37
|
-
|
38
|
-
if root[:Metadata].nil?
|
39
|
-
@xml_metadata = nil
|
40
|
-
else
|
41
|
-
string = ohash.object(root[:Metadata]).unfiltered_data
|
42
|
-
string.force_encoding("utf-8") if string.respond_to?(:force_encoding)
|
43
|
-
@xml_metadata = string
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
def xml_metadata?
|
48
|
-
xml_metadata ? true : false
|
49
|
-
end
|
50
|
-
|
51
|
-
def decoded_info
|
52
|
-
@decoded_info ||= decode_strings(info)
|
53
|
-
end
|
54
|
-
|
55
|
-
end
|
56
|
-
end
|