pdf-reader 1.4.1 → 2.0.0.beta1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +8 -3
- data/{README.rdoc → README.md} +40 -23
- data/Rakefile +2 -2
- data/bin/pdf_object +4 -1
- data/lib/pdf/reader.rb +7 -112
- data/lib/pdf/reader/buffer.rb +2 -1
- data/lib/pdf/reader/cmap.rb +26 -24
- data/lib/pdf/reader/encoding.rb +4 -5
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +1 -5
- data/lib/pdf/reader/font.rb +1 -11
- data/lib/pdf/reader/glyph_hash.rb +6 -2
- data/lib/pdf/reader/lzw.rb +1 -1
- data/lib/pdf/reader/object_hash.rb +35 -16
- data/lib/pdf/reader/page_layout.rb +6 -17
- data/lib/pdf/reader/pages_strategy.rb +1 -304
- data/lib/pdf/reader/parser.rb +6 -4
- data/lib/pdf/reader/standard_security_handler.rb +18 -14
- data/lib/pdf/reader/text_run.rb +3 -9
- metadata +14 -47
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -265
data/lib/pdf/reader/parser.rb
CHANGED
@@ -118,6 +118,7 @@ class PDF::Reader
|
|
118
118
|
loop do
|
119
119
|
key = parse_token
|
120
120
|
break if key.kind_of?(Token) and key == ">>"
|
121
|
+
raise MalformedPDFError, "unterminated dict" if @buffer.empty?
|
121
122
|
raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name" unless key.kind_of?(Symbol)
|
122
123
|
|
123
124
|
value = parse_token
|
@@ -131,7 +132,6 @@ class PDF::Reader
|
|
131
132
|
# reads a PDF name from the buffer and converts it to a Ruby Symbol
|
132
133
|
def pdf_name
|
133
134
|
tok = @buffer.token
|
134
|
-
tok = " " if tok == "" && RUBY_VERSION < "1.9"
|
135
135
|
tok.gsub!(/#([A-Fa-f0-9]{2})/) do |match|
|
136
136
|
match[1, 2].hex.chr
|
137
137
|
end
|
@@ -145,6 +145,7 @@ class PDF::Reader
|
|
145
145
|
loop do
|
146
146
|
item = parse_token
|
147
147
|
break if item.kind_of?(Token) and item == "]"
|
148
|
+
raise MalformedPDFError, "unterminated array" if @buffer.empty?
|
148
149
|
a << item
|
149
150
|
end
|
150
151
|
|
@@ -158,24 +159,25 @@ class PDF::Reader
|
|
158
159
|
loop do
|
159
160
|
token = @buffer.token
|
160
161
|
break if token == ">"
|
162
|
+
raise MalformedPDFError, "unterminated hex string" if @buffer.empty?
|
161
163
|
str << token
|
162
164
|
end
|
163
165
|
|
164
166
|
# add a missing digit if required, as required by the spec
|
165
167
|
str << "0" unless str.size % 2 == 0
|
166
|
-
str.scan(/../).map {|i| i.hex.chr}.join
|
168
|
+
str.scan(/../).map {|i| i.hex.chr}.join.force_encoding("binary")
|
167
169
|
end
|
168
170
|
################################################################################
|
169
171
|
# Reads a PDF String from the buffer and converts it to a Ruby String
|
170
172
|
def string
|
171
173
|
str = @buffer.token
|
172
|
-
return "" if str == ")"
|
174
|
+
return "".force_encoding("binary") if str == ")"
|
173
175
|
Error.assert_equal(parse_token, ")")
|
174
176
|
|
175
177
|
str.gsub!(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
|
176
178
|
MAPPING[match] || ""
|
177
179
|
end
|
178
|
-
str
|
180
|
+
str.force_encoding("binary")
|
179
181
|
end
|
180
182
|
|
181
183
|
MAPPING = {
|
data/lib/pdf/reader/standard_security_handler.rb
CHANGED
@@ -82,7 +82,7 @@ class PDF::Reader
|
|
82
82
|
(0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
|
83
83
|
(0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
|
84
84
|
length = objKey.length < 16 ? objKey.length : 16
|
85
|
-
rc4 = RC4.new( Digest::MD5.digest(objKey)[
|
85
|
+
rc4 = RC4.new( Digest::MD5.digest(objKey)[0,length] )
|
86
86
|
rc4.decrypt(buf)
|
87
87
|
end
|
88
88
|
|
@@ -94,7 +94,7 @@ class PDF::Reader
|
|
94
94
|
if p.nil? || p.empty?
|
95
95
|
PassPadBytes.pack('C*')
|
96
96
|
else
|
97
|
-
p[
|
97
|
+
p[0, 32] + PassPadBytes[0, 32-p.length].pack('C*')
|
98
98
|
end
|
99
99
|
end
|
100
100
|
|
@@ -118,13 +118,13 @@ class PDF::Reader
|
|
118
118
|
md5 = Digest::MD5.digest(pad_pass(pass))
|
119
119
|
if @revision > 2 then
|
120
120
|
50.times { md5 = Digest::MD5.digest(md5) }
|
121
|
-
keyBegins = md5[
|
121
|
+
keyBegins = md5[0, key_length]
|
122
122
|
#first itteration decrypt owner_key
|
123
123
|
out = @owner_key
|
124
124
|
#RC4 keyed with (keyBegins XOR with itteration #) to decrypt previous out
|
125
125
|
19.downto(0).each { |i| out=RC4.new(xor_each_byte(keyBegins,i)).decrypt(out) }
|
126
126
|
else
|
127
|
-
out = RC4.new( md5[
|
127
|
+
out = RC4.new( md5[0, 5] ).decrypt( @owner_key )
|
128
128
|
end
|
129
129
|
# c) check output as user password
|
130
130
|
auth_user_pass( out )
|
@@ -142,12 +142,12 @@ class PDF::Reader
|
|
142
142
|
#
|
143
143
|
def auth_user_pass(pass)
|
144
144
|
keyBegins = make_file_key(pass)
|
145
|
-
if @revision
|
145
|
+
if @revision >= 3
|
146
146
|
#initialize out for first iteration
|
147
147
|
out = Digest::MD5.digest(PassPadBytes.pack("C*") + @file_id)
|
148
148
|
#zero doesn't matter -> so from 0-19
|
149
|
-
20.times{ |i| out=RC4.new(xor_each_byte(keyBegins, i)).
|
150
|
-
pass = @user_key[
|
149
|
+
20.times{ |i| out=RC4.new(xor_each_byte(keyBegins, i)).encrypt(out) }
|
150
|
+
pass = @user_key[0, 16] == out
|
151
151
|
else
|
152
152
|
pass = RC4.new(keyBegins).encrypt(PassPadBytes.pack("C*")) == @user_key
|
153
153
|
end
|
@@ -163,20 +163,24 @@ class PDF::Reader
|
|
163
163
|
(0..24).step(8){|e| @buf << (@permissions >> e & 0xFF)}
|
164
164
|
# e) add the file ID
|
165
165
|
@buf << @file_id
|
166
|
-
# f) if revision
|
167
|
-
if @revision
|
168
|
-
@buf << [
|
166
|
+
# f) if revision >= 4 and metadata not encrypted then add 4 bytes of 0xFF
|
167
|
+
if @revision >= 4 && !@encryptMeta
|
168
|
+
@buf << [0xFF,0xFF,0xFF,0xFF].pack('C*')
|
169
169
|
end
|
170
170
|
# b) init MD5 digest + g) finish the hash
|
171
171
|
md5 = Digest::MD5.digest(@buf)
|
172
172
|
# h) spin hash 50 times
|
173
|
-
if @revision
|
173
|
+
if @revision >= 3
|
174
174
|
50.times {
|
175
|
-
md5 = Digest::MD5.digest(md5[
|
175
|
+
md5 = Digest::MD5.digest(md5[0, @key_length])
|
176
176
|
}
|
177
177
|
end
|
178
|
-
# i) n = key_length revision
|
179
|
-
|
178
|
+
# i) n = key_length revision >= 3, n = 5 revision == 2
|
179
|
+
if @revision < 3
|
180
|
+
md5[0, 5]
|
181
|
+
else
|
182
|
+
md5[0, @key_length]
|
183
|
+
end
|
180
184
|
end
|
181
185
|
|
182
186
|
def build_standard_key(pass)
|
data/lib/pdf/reader/text_run.rb
CHANGED
@@ -65,16 +65,10 @@ class PDF::Reader
|
|
65
65
|
@mergable_range ||= Range.new(endx - 3, endx + font_size)
|
66
66
|
end
|
67
67
|
|
68
|
+
# Assume string encoding is marked correctly and we can trust String#size to return a
|
69
|
+
# character count
|
68
70
|
def character_count
|
69
|
-
|
70
|
-
1.0
|
71
|
-
elsif @text.respond_to?(:bytesize)
|
72
|
-
# M17N aware VM
|
73
|
-
# so we can trust String#size to return a character count
|
74
|
-
@text.size.to_f
|
75
|
-
else
|
76
|
-
text.unpack("U*").size.to_f
|
77
|
-
end
|
71
|
+
@text.size.to_f
|
78
72
|
end
|
79
73
|
end
|
80
74
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0.beta1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -30,28 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '3.
|
33
|
+
version: '3.5'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '3.
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: ZenTest
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - "~>"
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: 4.4.2
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - "~>"
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: 4.4.2
|
40
|
+
version: '3.5'
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
42
|
name: cane
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -70,16 +56,16 @@ dependencies:
|
|
70
56
|
name: morecane
|
71
57
|
requirement: !ruby/object:Gem::Requirement
|
72
58
|
requirements:
|
73
|
-
- - "
|
59
|
+
- - "~>"
|
74
60
|
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
61
|
+
version: '0.2'
|
76
62
|
type: :development
|
77
63
|
prerelease: false
|
78
64
|
version_requirements: !ruby/object:Gem::Requirement
|
79
65
|
requirements:
|
80
|
-
- - "
|
66
|
+
- - "~>"
|
81
67
|
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
68
|
+
version: '0.2'
|
83
69
|
- !ruby/object:Gem::Dependency
|
84
70
|
name: ir_b
|
85
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -185,22 +171,20 @@ email:
|
|
185
171
|
executables:
|
186
172
|
- pdf_object
|
187
173
|
- pdf_text
|
188
|
-
- pdf_list_callbacks
|
189
174
|
- pdf_callbacks
|
190
175
|
extensions: []
|
191
176
|
extra_rdoc_files:
|
192
|
-
- README.
|
177
|
+
- README.md
|
193
178
|
- TODO
|
194
179
|
- CHANGELOG
|
195
180
|
- MIT-LICENSE
|
196
181
|
files:
|
197
182
|
- CHANGELOG
|
198
183
|
- MIT-LICENSE
|
199
|
-
- README.
|
184
|
+
- README.md
|
200
185
|
- Rakefile
|
201
186
|
- TODO
|
202
187
|
- bin/pdf_callbacks
|
203
|
-
- bin/pdf_list_callbacks
|
204
188
|
- bin/pdf_object
|
205
189
|
- bin/pdf_text
|
206
190
|
- examples/callbacks.rb
|
@@ -217,7 +201,6 @@ files:
|
|
217
201
|
- lib/pdf-reader.rb
|
218
202
|
- lib/pdf/hash.rb
|
219
203
|
- lib/pdf/reader.rb
|
220
|
-
- lib/pdf/reader/abstract_strategy.rb
|
221
204
|
- lib/pdf/reader/afm/Courier-Bold.afm
|
222
205
|
- lib/pdf/reader/afm/Courier-BoldOblique.afm
|
223
206
|
- lib/pdf/reader/afm/Courier-Oblique.afm
|
@@ -258,7 +241,6 @@ files:
|
|
258
241
|
- lib/pdf/reader/glyph_hash.rb
|
259
242
|
- lib/pdf/reader/glyphlist.txt
|
260
243
|
- lib/pdf/reader/lzw.rb
|
261
|
-
- lib/pdf/reader/metadata_strategy.rb
|
262
244
|
- lib/pdf/reader/object_cache.rb
|
263
245
|
- lib/pdf/reader/object_hash.rb
|
264
246
|
- lib/pdf/reader/object_stream.rb
|
@@ -276,7 +258,6 @@ files:
|
|
276
258
|
- lib/pdf/reader/standard_security_handler.rb
|
277
259
|
- lib/pdf/reader/stream.rb
|
278
260
|
- lib/pdf/reader/synchronized_cache.rb
|
279
|
-
- lib/pdf/reader/text_receiver.rb
|
280
261
|
- lib/pdf/reader/text_run.rb
|
281
262
|
- lib/pdf/reader/token.rb
|
282
263
|
- lib/pdf/reader/transformation_matrix.rb
|
@@ -291,26 +272,12 @@ homepage: http://github.com/yob/pdf-reader
|
|
291
272
|
licenses:
|
292
273
|
- MIT
|
293
274
|
metadata: {}
|
294
|
-
post_install_message:
|
295
|
-
|
296
|
-
********************************************
|
297
|
-
|
298
|
-
v1.0.0 of PDF::Reader introduced a new page-based API. There are extensive
|
299
|
-
examples showing how to use it in the README and examples directory.
|
300
|
-
|
301
|
-
For detailed documentation, check the rdocs for the PDF::Reader,
|
302
|
-
PDF::Reader::Page and PDF::Reader::ObjectHash classes.
|
303
|
-
|
304
|
-
The old API is marked as deprecated but will continue to work with no
|
305
|
-
visible warnings for now.
|
306
|
-
|
307
|
-
********************************************
|
308
|
-
|
275
|
+
post_install_message:
|
309
276
|
rdoc_options:
|
310
277
|
- "--title"
|
311
278
|
- PDF::Reader Documentation
|
312
279
|
- "--main"
|
313
|
-
- README.
|
280
|
+
- README.md
|
314
281
|
- "-q"
|
315
282
|
require_paths:
|
316
283
|
- lib
|
@@ -321,9 +288,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
321
288
|
version: 1.9.3
|
322
289
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
323
290
|
requirements:
|
324
|
-
- - "
|
291
|
+
- - ">"
|
325
292
|
- !ruby/object:Gem::Version
|
326
|
-
version:
|
293
|
+
version: 1.3.1
|
327
294
|
requirements: []
|
328
295
|
rubyforge_project:
|
329
296
|
rubygems_version: 2.5.2
|
data/bin/pdf_list_callbacks
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# this executable is deprecated, use pdf_callbacks instead
|
4
|
-
|
5
|
-
require 'rubygems'
|
6
|
-
|
7
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
8
|
-
|
9
|
-
require 'pdf/reader'
|
10
|
-
|
11
|
-
receiver = PDF::Reader::PrintReceiver.new
|
12
|
-
|
13
|
-
if ARGV.empty?
|
14
|
-
PDF::Reader.new.parse($stdin, receiver)
|
15
|
-
else
|
16
|
-
PDF::Reader.file(ARGV[0], receiver)
|
17
|
-
end
|
data/lib/pdf/reader/abstract_strategy.rb
DELETED
@@ -1,81 +0,0 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
|
3
|
-
class PDF::Reader
|
4
|
-
|
5
|
-
# DEPRECATED: this class was deprecated in version 0.11.0 and will
|
6
|
-
# eventually be removed
|
7
|
-
class AbstractStrategy # :nodoc:
|
8
|
-
|
9
|
-
def initialize(ohash, receivers, options = {})
|
10
|
-
@ohash, @options = ohash, options
|
11
|
-
if receivers.is_a?(Array)
|
12
|
-
@receivers = receivers
|
13
|
-
else
|
14
|
-
@receivers = [receivers]
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
private
|
19
|
-
|
20
|
-
def options
|
21
|
-
@options || {}
|
22
|
-
end
|
23
|
-
|
24
|
-
# calls the name callback method on the receiver class with params as the arguments
|
25
|
-
#
|
26
|
-
def callback(name, params=[])
|
27
|
-
@receivers.each do |receiver|
|
28
|
-
receiver.send(name, *params) if receiver.respond_to?(name)
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
# strings outside of page content should be in either PDFDocEncoding or UTF-16.
|
33
|
-
def decode_strings(obj)
|
34
|
-
case obj
|
35
|
-
when String then
|
36
|
-
if obj[0,2].unpack("C*").slice(0,2) == [254,255]
|
37
|
-
PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
|
38
|
-
else
|
39
|
-
PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
|
40
|
-
end
|
41
|
-
when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
|
42
|
-
when Array then obj.collect { |item| decode_strings(item) }
|
43
|
-
else
|
44
|
-
obj
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
def info
|
49
|
-
ohash.object(trailer[:Info])
|
50
|
-
end
|
51
|
-
|
52
|
-
def info?
|
53
|
-
info ? true : false
|
54
|
-
end
|
55
|
-
|
56
|
-
def ohash
|
57
|
-
@ohash
|
58
|
-
end
|
59
|
-
|
60
|
-
def pages
|
61
|
-
ohash.object(root[:Pages])
|
62
|
-
end
|
63
|
-
|
64
|
-
def pages?
|
65
|
-
pages ? true : false
|
66
|
-
end
|
67
|
-
|
68
|
-
def root
|
69
|
-
ohash.object(trailer[:Root])
|
70
|
-
end
|
71
|
-
|
72
|
-
def root?
|
73
|
-
root ? true : false
|
74
|
-
end
|
75
|
-
|
76
|
-
def trailer
|
77
|
-
ohash.trailer
|
78
|
-
end
|
79
|
-
|
80
|
-
end
|
81
|
-
end
|
data/lib/pdf/reader/metadata_strategy.rb
DELETED
@@ -1,56 +0,0 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
|
3
|
-
class PDF::Reader
|
4
|
-
|
5
|
-
# DEPRECATED: this class was deprecated in version 0.11.0 and will
|
6
|
-
# eventually be removed
|
7
|
-
#
|
8
|
-
class MetadataStrategy < AbstractStrategy # :nodoc:
|
9
|
-
|
10
|
-
def self.to_sym
|
11
|
-
:metadata
|
12
|
-
end
|
13
|
-
|
14
|
-
def process
|
15
|
-
return false unless options[:metadata]
|
16
|
-
|
17
|
-
# may be useful to some people
|
18
|
-
callback(:pdf_version, ohash.pdf_version)
|
19
|
-
|
20
|
-
# ye olde metadata
|
21
|
-
callback(:metadata, [decoded_info]) if info?
|
22
|
-
|
23
|
-
# new style xml metadata
|
24
|
-
callback(:xml_metadata, [xml_metadata]) if xml_metadata?
|
25
|
-
|
26
|
-
# page count
|
27
|
-
if pages?
|
28
|
-
count = ohash.object(pages[:Count])
|
29
|
-
callback(:page_count, count.to_i)
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
private
|
34
|
-
|
35
|
-
def xml_metadata
|
36
|
-
return @xml_metadata if defined?(@xml_metadata)
|
37
|
-
|
38
|
-
if root[:Metadata].nil?
|
39
|
-
@xml_metadata = nil
|
40
|
-
else
|
41
|
-
string = ohash.object(root[:Metadata]).unfiltered_data
|
42
|
-
string.force_encoding("utf-8") if string.respond_to?(:force_encoding)
|
43
|
-
@xml_metadata = string
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
def xml_metadata?
|
48
|
-
xml_metadata ? true : false
|
49
|
-
end
|
50
|
-
|
51
|
-
def decoded_info
|
52
|
-
@decoded_info ||= decode_strings(info)
|
53
|
-
end
|
54
|
-
|
55
|
-
end
|
56
|
-
end
|