pdf-reader 1.4.1 → 2.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/CHANGELOG +53 -3
- data/{README.rdoc → README.md} +40 -23
- data/Rakefile +2 -2
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_object +4 -1
- data/bin/pdf_text +1 -1
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/buffer.rb +14 -12
- data/lib/pdf/reader/cid_widths.rb +2 -0
- data/lib/pdf/reader/cmap.rb +48 -36
- data/lib/pdf/reader/encoding.rb +16 -18
- data/lib/pdf/reader/error.rb +5 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +1 -0
- data/lib/pdf/reader/filter/flate.rb +29 -16
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +4 -6
- data/lib/pdf/reader/filter.rb +2 -0
- data/lib/pdf/reader/font.rb +12 -13
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +7 -2
- data/lib/pdf/reader/lzw.rb +4 -4
- data/lib/pdf/reader/null_security_handler.rb +17 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +91 -37
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +5 -4
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +30 -1
- data/lib/pdf/reader/page_layout.rb +19 -24
- data/lib/pdf/reader/page_state.rb +8 -5
- data/lib/pdf/reader/page_text_receiver.rb +23 -1
- data/lib/pdf/reader/pages_strategy.rb +2 -304
- data/lib/pdf/reader/parser.rb +10 -7
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +80 -42
- data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +28 -9
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +25 -16
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +11 -5
- data/lib/pdf/reader.rb +30 -119
- data/lib/pdf-reader.rb +1 -0
- metadata +35 -61
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/hash.rb +0 -19
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -265
@@ -0,0 +1,91 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'digest'
|
5
|
+
require 'openssl'
|
6
|
+
|
7
|
+
class PDF::Reader
|
8
|
+
|
9
|
+
# Security handler for documents encrypted with AES-256 as introduced by
# Adobe's "PDF 1.7 Extension Level 3" (encryption dictionary V=5, R=5 with
# the AESV3 crypt filter method).
#
# Building a handler authenticates the supplied password against the O/U
# entries of the encryption dictionary and recovers the file encryption key
# from OE/UE. That key is then used by #decrypt.
class StandardSecurityHandlerV5

  attr_reader :key_length, :encrypt_key

  # opts - a Hash of values taken from the encryption dictionary, plus the
  #        password to try:
  #        :O        - hash(32B) + validation salt(8B) + key salt(8B)
  #        :U        - hash(32B) + validation salt(8B) + key salt(8B)
  #        :OE       - decryption key, encrypted w/ owner password
  #        :UE       - decryption key, encrypted w/ user password
  #        :password - owner or user password, defaults to ''
  #
  # Raises PDF::Reader::EncryptedPDFError if the password matches neither the
  # owner entry nor the user entry.
  def initialize(opts = {})
    @key_length = 256
    # All key material is tagged as binary so that concatenating it with the
    # password bytes and comparing it with SHA-256 digests never depends on
    # how the incoming strings happened to be encoded.
    @O  = binary(opts[:O])
    @U  = binary(opts[:U])
    @OE = binary(opts[:OE])
    @UE = binary(opts[:UE])
    @encrypt_key = build_standard_key(opts[:password] || '')
  end

  # This handler supports AES-256 encryption defined in PDF 1.7 Extension Level 3
  #
  # encrypt - the encryption dictionary (a Hash), or nil
  #
  # Returns true only for the Standard filter with V=5, R=5, an AESV3 crypt
  # filter, and identical stream (:StmF) and string (:StrF) filters.
  def self.supports?(encrypt)
    return false if encrypt.nil?

    filter = encrypt.fetch(:Filter, :Standard)
    version = encrypt.fetch(:V, 0)
    revision = encrypt.fetch(:R, 0)
    algorithm = encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
    (filter == :Standard) && (encrypt[:StmF] == encrypt[:StrF]) &&
      ((version == 5) && (revision == 5) && (algorithm == :AESV3))
  end

  # Decrypts one AES-256-CBC encrypted buffer.
  #
  # The first 16 bytes of buf are used as the initialisation vector and the
  # remaining bytes as the ciphertext. The file encryption key is used as-is.
  #
  # buf - a string to decrypt
  # ref - a PDF::Reader::Reference for the object to decrypt. It is not used
  #       by this handler; the parameter keeps the signature aligned with the
  #       other security handlers.
  #
  # Returns the decrypted bytes.
  def decrypt( buf, ref )
    cipher = OpenSSL::Cipher.new("AES-#{@key_length}-CBC")
    cipher.decrypt
    cipher.key = @encrypt_key.dup
    cipher.iv = buf[0..15]
    cipher.update(buf[16..-1]) + cipher.final
  end

  private

  # Returns a binary-tagged copy of str. Values that are not strings (such as
  # nil for a missing dictionary entry) are returned untouched.
  def binary(str)
    str.respond_to?(:b) ? str.b : str
  end

  # Decrypts a 32 byte file key that was wrapped with AES-256-CBC, a zero IV
  # and no padding.
  #
  # key     - the 32 byte intermediate key derived from the password
  # wrapped - the OE or UE value from the encryption dictionary
  def unwrap_key(key, wrapped)
    cipher = OpenSSL::Cipher.new('AES-256-CBC')
    cipher.decrypt
    cipher.key = key
    cipher.iv = "\x00" * 16
    cipher.padding = 0
    cipher.update(wrapped) + cipher.final
  end

  # Algorithm 3.2a - Computing an encryption key
  #
  # Defined in PDF 1.7 Extension Level 3
  #
  # if the string is a valid owner password, this will return the decryption
  # key, otherwise nil
  #
  def auth_owner_pass(password)
    return nil unless Digest::SHA256.digest(password + @O[32..39] + @U) == @O[0..31]

    unwrap_key(Digest::SHA256.digest(password + @O[40..-1] + @U), @OE)
  end

  # Algorithm 3.2a - Computing an encryption key
  #
  # if the string is a valid user password, this will return the decryption
  # key, otherwise nil
  #
  def auth_user_pass(password)
    return nil unless Digest::SHA256.digest(password + @U[32..39]) == @U[0..31]

    unwrap_key(Digest::SHA256.digest(password + @U[40..-1]), @UE)
  end

  # Tries pass as the owner password and then as the user password.
  #
  # Returns the file encryption key.
  # Raises PDF::Reader::EncryptedPDFError when neither check succeeds.
  def build_standard_key(pass)
    # UTF-8 encoded password, first 127 bytes. The bytes are re-tagged as
    # binary because a non-ASCII UTF-8 string cannot be concatenated with the
    # binary salts (Encoding::CompatibilityError).
    pass = pass.byteslice(0...127).b

    key = auth_owner_pass(pass) || auth_user_pass(pass)

    # The rejected password is deliberately left out of the message so that it
    # does not end up in logs or error reports.
    raise PDF::Reader::EncryptedPDFError, "Invalid password" if key.nil?
    key
  end
end
|
91
|
+
end
|
data/lib/pdf/reader/stream.rb
CHANGED
data/lib/pdf/reader/text_run.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
class PDF::Reader
|
4
5
|
# A value object that represents one or more consecutive characters on a page.
|
@@ -37,6 +38,10 @@ class PDF::Reader
|
|
37
38
|
@endx ||= x + width
|
38
39
|
end
|
39
40
|
|
41
|
+
# Returns y + font_size. The value is computed on first use and then cached
# in @endy.
def endy
  return @endy if @endy

  @endy = y + font_size
end
|
44
|
+
|
40
45
|
# Returns the run's @width divided by its character count.
def mean_character_width
  chars = character_count
  @width / chars
end
|
@@ -59,22 +64,36 @@ class PDF::Reader
|
|
59
64
|
"#{text} w:#{width} f:#{font_size} @#{x},#{y}"
|
60
65
|
end
|
61
66
|
|
67
|
+
# Returns true when the box spanned by (x, y)..(endx, endy) of this run
# overlaps or touches the corresponding box of other_run.
#
# other_run - an object responding to x, y, endx and endy
def intersect?(other_run)
  # horizontal extents must overlap (touching edges count) ...
  return false unless x <= other_run.endx
  return false unless endx >= other_run.x
  # ... and so must the vertical extents
  return false unless endy >= other_run.y

  y <= other_run.endy
end
|
71
|
+
|
72
|
+
# Returns how much of this text run's box is covered by other_run's box.
# Despite the method name the result is a fraction (overlapping area divided
# by this run's area), not a number out of 100.
#
# other_run - an object responding to x, y, endx and endy
#
# Returns 0 when the runs do not intersect or when this run has no area.
def intersection_area_percent(other_run)
  return 0 unless intersect?(other_run)

  own_area = area
  # A zero-width or zero-height run would otherwise divide by zero and hand
  # NaN or Infinity back to the caller.
  return 0 if own_area.zero?

  dx = [endx, other_run.endx].min - [x, other_run.x].max
  dy = [endy, other_run.endy].min - [y, other_run.y].max

  (dx * dy).to_f / own_area
end
|
82
|
+
|
62
83
|
private
|
63
84
|
|
85
|
+
# Area of the box spanned by (x, y) and (endx, endy).
def area
  horizontal_extent = endx - x
  vertical_extent = endy - y
  horizontal_extent * vertical_extent
end
|
88
|
+
|
64
89
|
def mergable_range
|
65
90
|
@mergable_range ||= Range.new(endx - 3, endx + font_size)
|
66
91
|
end
|
67
92
|
|
93
|
+
# Assume string encoding is marked correctly and we can trust String#size to return a
|
94
|
+
# character count
|
68
95
|
def character_count
|
69
|
-
|
70
|
-
1.0
|
71
|
-
elsif @text.respond_to?(:bytesize)
|
72
|
-
# M17N aware VM
|
73
|
-
# so we can trust String#size to return a character count
|
74
|
-
@text.size.to_f
|
75
|
-
else
|
76
|
-
text.unpack("U*").size.to_f
|
77
|
-
end
|
96
|
+
@text.size.to_f
|
78
97
|
end
|
79
98
|
end
|
80
99
|
end
|
data/lib/pdf/reader/token.rb
CHANGED
@@ -0,0 +1,17 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
class PDF::Reader
|
5
|
+
|
6
|
+
# Security handler used when a PDF is encrypted with a scheme this library
# cannot decrypt. It claims support for every encryption dictionary and
# fails as soon as something asks it to decrypt data.
class UnimplementedSecurityHandler
  class << self
    # Always true, whatever the encryption dictionary contains.
    def supports?(encrypt)
      true
    end
  end

  # Always raises PDF::Reader::EncryptedPDFError; buf and ref are ignored.
  def decrypt(buf, ref)
    raise PDF::Reader::EncryptedPDFError.new("Unsupported encryption style")
  end
end
|
17
|
+
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'afm'
|
4
5
|
require 'pdf/reader/synchronized_cache'
|
@@ -11,11 +12,20 @@ class PDF::Reader
|
|
11
12
|
# see Section 9.6.2.2, PDF 32000-1:2008, pp 256
|
12
13
|
class BuiltIn
|
13
14
|
|
15
|
+
# Font names accepted as-is when picking a bundled AFM metrics file. Any
# other name is replaced by a fallback in extract_basefont. Frozen so the
# shared list cannot be modified at runtime.
BUILTINS = [
  :Courier, :"Courier-Bold", :"Courier-BoldOblique", :"Courier-Oblique",
  :Helvetica, :"Helvetica-Bold", :"Helvetica-BoldOblique", :"Helvetica-Oblique",
  :Symbol,
  :"Times-Roman", :"Times-Bold", :"Times-BoldItalic", :"Times-Italic",
  :ZapfDingbats
].freeze
|
22
|
+
|
14
23
|
def initialize(font)
|
15
24
|
@font = font
|
16
25
|
@@all_metrics ||= PDF::Reader::SynchronizedCache.new
|
17
26
|
|
18
|
-
|
27
|
+
basefont = extract_basefont(font.basefont)
|
28
|
+
metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")
|
19
29
|
|
20
30
|
if File.file?(metrics_path)
|
21
31
|
@metrics = @@all_metrics[metrics_path] ||= AFM::Font.new(metrics_path)
|
@@ -27,23 +37,15 @@ class PDF::Reader
|
|
27
37
|
def glyph_width(code_point)
|
28
38
|
return 0 if code_point.nil? || code_point < 0
|
29
39
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
m = names.map { |name|
|
35
|
-
@metrics.char_metrics[name.to_s]
|
36
|
-
}.compact.first
|
37
|
-
end
|
40
|
+
names = @font.encoding.int_to_name(code_point)
|
41
|
+
metrics = names.map { |name|
|
42
|
+
@metrics.char_metrics[name.to_s]
|
43
|
+
}.compact.first
|
38
44
|
|
39
|
-
if
|
40
|
-
|
41
|
-
elsif @font.widths[code_point - 1]
|
42
|
-
@font.widths[code_point - 1]
|
43
|
-
elsif control_character?(code_point)
|
44
|
-
0
|
45
|
+
if metrics
|
46
|
+
metrics[:wx]
|
45
47
|
else
|
46
|
-
0
|
48
|
+
@font.widths[code_point - 1] || 0
|
47
49
|
end
|
48
50
|
end
|
49
51
|
|
@@ -53,6 +55,13 @@ class PDF::Reader
|
|
53
55
|
@font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
|
54
56
|
end
|
55
57
|
|
58
|
+
# Returns font_name unchanged when it is listed in BUILTINS, otherwise the
# fallback name "Times-Roman".
def extract_basefont(font_name)
  BUILTINS.include?(font_name) ? font_name : "Times-Roman"
end
|
56
65
|
end
|
57
66
|
end
|
58
67
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
class PDF::Reader
|
4
5
|
module WidthCalculator
|
@@ -17,8 +18,7 @@ class PDF::Reader
|
|
17
18
|
|
18
19
|
def glyph_width(code_point)
|
19
20
|
return 0 if code_point.nil? || code_point < 0
|
20
|
-
|
21
|
-
glyph_width_from_font(code_point) || glyph_width_from_descriptor(code_point)
|
21
|
+
glyph_width_from_font(code_point) || glyph_width_from_descriptor(code_point) || 0
|
22
22
|
end
|
23
23
|
|
24
24
|
private
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -203,8 +204,10 @@ class PDF::Reader
|
|
203
204
|
("\x00" + bytes).unpack("N")[0]
|
204
205
|
elsif bytes.size == 4
|
205
206
|
bytes.unpack("N")[0]
|
207
|
+
elsif bytes.size == 8
|
208
|
+
bytes.unpack("Q>")[0]
|
206
209
|
else
|
207
|
-
raise UnsupportedFeatureError, "Unable to unpack xref stream entries
|
210
|
+
raise UnsupportedFeatureError, "Unable to unpack xref stream entries of #{bytes.size} bytes"
|
208
211
|
end
|
209
212
|
end
|
210
213
|
################################################################################
|
@@ -227,18 +230,21 @@ class PDF::Reader
|
|
227
230
|
# should always be 0, but all sort of crazy junk is prefixed to PDF files
|
228
231
|
# in the real world.
|
229
232
|
#
|
230
|
-
# Checks up to
|
233
|
+
# Checks up to 1024 chars into the file,
|
234
|
+
# returns nil if no PDF data detected.
|
235
|
+
# Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
|
236
|
+
# header appear somewhere within the first 1024 bytes of the file
|
231
237
|
#
|
232
238
|
def calc_junk_offset(io)
|
233
239
|
io.rewind
|
234
240
|
offset = io.pos
|
235
|
-
until (c = io.readchar) == '%' || c == 37 || offset >
|
241
|
+
until (c = io.readchar) == '%' || c == 37 || offset > 1024
|
236
242
|
offset += 1
|
237
243
|
end
|
238
244
|
io.rewind
|
239
|
-
offset <
|
245
|
+
offset < 1024 ? offset : nil
|
240
246
|
rescue EOFError
|
241
|
-
|
247
|
+
nil
|
242
248
|
end
|
243
249
|
end
|
244
250
|
################################################################################
|
data/lib/pdf/reader.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -110,16 +111,10 @@ module PDF
|
|
110
111
|
#
|
111
112
|
# reader = PDF::Reader.new("somefile.pdf", :password => "apples")
|
112
113
|
#
|
113
|
-
def initialize(input
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
@objects = PDF::Reader::ObjectHash.new(input, opts)
|
118
|
-
else
|
119
|
-
msg = "Calling PDF::Reader#new with no arguments is deprecated and will be removed "
|
120
|
-
msg += "in the 2.0 release"
|
121
|
-
$stderr.puts(msg)
|
122
|
-
end
|
114
|
+
# Creates a reader for the given input.
#
# input - passed straight through to PDF::Reader::ObjectHash.new
# opts  - options Hash, forwarded to PDF::Reader::ObjectHash.new together
#         with a :cache entry holding this reader's ObjectCache
#
# The caller's opts hash is left untouched: the :cache entry is added to a
# copy, so a shared or frozen options hash can be passed safely.
def initialize(input, opts = {})
  @cache = PDF::Reader::ObjectCache.new
  @objects = PDF::Reader::ObjectHash.new(input, opts.merge(:cache => @cache))
end
|
124
119
|
|
125
120
|
def info
|
@@ -133,13 +128,16 @@ module PDF
|
|
133
128
|
nil
|
134
129
|
else
|
135
130
|
xml = stream.unfiltered_data
|
136
|
-
xml.force_encoding("utf-8")
|
131
|
+
xml.force_encoding("utf-8")
|
137
132
|
xml
|
138
133
|
end
|
139
134
|
end
|
140
135
|
|
141
136
|
# Returns the dereferenced :Count entry of the object that root[:Pages]
# points to, cached in @page_count.
#
# Raises MalformedPDFError when root[:Pages] does not resolve to a Hash.
def page_count
  page_tree = @objects.deref(root[:Pages])
  raise MalformedPDFError, 'Pages structure is missing' unless page_tree.kind_of?(::Hash)

  @page_count ||= @objects.deref(page_tree[:Count])
end
|
145
143
|
|
@@ -164,61 +162,6 @@ module PDF
|
|
164
162
|
yield PDF::Reader.new(input, opts)
|
165
163
|
end
|
166
164
|
|
167
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
168
|
-
# eventually be removed
|
169
|
-
#
|
170
|
-
#
|
171
|
-
# Parse the file with the given name, sending events to the given receiver.
|
172
|
-
#
|
173
|
-
def self.file(name, receivers, opts = {})
|
174
|
-
msg = "PDF::Reader#file is deprecated and will be removed in the 2.0 release"
|
175
|
-
$stderr.puts(msg)
|
176
|
-
File.open(name,"rb") do |f|
|
177
|
-
new.parse(f, receivers, opts)
|
178
|
-
end
|
179
|
-
end
|
180
|
-
|
181
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
182
|
-
# eventually be removed
|
183
|
-
#
|
184
|
-
# Parse the given string, sending events to the given receiver.
|
185
|
-
#
|
186
|
-
def self.string(str, receivers, opts = {})
|
187
|
-
msg = "PDF::Reader#string is deprecated and will be removed in the 2.0 release"
|
188
|
-
$stderr.puts(msg)
|
189
|
-
StringIO.open(str) do |s|
|
190
|
-
new.parse(s, receivers, opts)
|
191
|
-
end
|
192
|
-
end
|
193
|
-
|
194
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
195
|
-
# eventually be removed
|
196
|
-
#
|
197
|
-
# Parse the file with the given name, returning an unmarshalled ruby version of
|
198
|
-
# represents the requested pdf object
|
199
|
-
#
|
200
|
-
def self.object_file(name, id, gen = 0)
|
201
|
-
msg = "PDF::Reader#object_file is deprecated and will be removed in the 2.0 release"
|
202
|
-
$stderr.puts(msg)
|
203
|
-
File.open(name,"rb") { |f|
|
204
|
-
new.object(f, id.to_i, gen.to_i)
|
205
|
-
}
|
206
|
-
end
|
207
|
-
|
208
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
209
|
-
# eventually be removed
|
210
|
-
#
|
211
|
-
# Parse the given string, returning an unmarshalled ruby version of represents
|
212
|
-
# the requested pdf object
|
213
|
-
#
|
214
|
-
def self.object_string(str, id, gen = 0)
|
215
|
-
msg = "PDF::Reader#object_string is deprecated and will be removed in the 2.0 release"
|
216
|
-
$stderr.puts(msg)
|
217
|
-
StringIO.open(str) { |s|
|
218
|
-
new.object(s, id.to_i, gen.to_i)
|
219
|
-
}
|
220
|
-
end
|
221
|
-
|
222
165
|
# returns an array of PDF::Reader::Page objects, one for each
|
223
166
|
# page in the source PDF.
|
224
167
|
#
|
@@ -234,9 +177,13 @@ module PDF
|
|
234
177
|
# methods available on each page
|
235
178
|
#
|
236
179
|
def pages
|
237
|
-
(1..self.page_count).map
|
238
|
-
|
239
|
-
|
180
|
+
(1..self.page_count).map do |num|
|
181
|
+
begin
|
182
|
+
PDF::Reader::Page.new(@objects, num, :cache => @cache)
|
183
|
+
rescue InvalidPageError
|
184
|
+
raise MalformedPDFError, "Missing data for page: #{num}"
|
185
|
+
end
|
186
|
+
end
|
240
187
|
end
|
241
188
|
|
242
189
|
# returns a single PDF::Reader::Page for the specified page.
|
@@ -254,45 +201,11 @@ module PDF
|
|
254
201
|
def page(num)
|
255
202
|
num = num.to_i
|
256
203
|
if num < 1 || num > self.page_count
|
257
|
-
raise
|
204
|
+
raise InvalidPageError, "Valid pages are 1 .. #{self.page_count}"
|
258
205
|
end
|
259
206
|
PDF::Reader::Page.new(@objects, num, :cache => @cache)
|
260
207
|
end
|
261
208
|
|
262
|
-
|
263
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
264
|
-
# eventually be removed
|
265
|
-
#
|
266
|
-
# Given an IO object that contains PDF data, parse it.
|
267
|
-
#
|
268
|
-
def parse(io, receivers, opts = {})
|
269
|
-
msg = "PDF::Reader#parse is deprecated and will be removed in the 2.0 release"
|
270
|
-
$stderr.puts(msg)
|
271
|
-
ohash = ObjectHash.new(io)
|
272
|
-
|
273
|
-
options = {:pages => true, :raw_text => false, :metadata => true}
|
274
|
-
options.merge!(opts)
|
275
|
-
|
276
|
-
strategies.each do |s|
|
277
|
-
s.new(ohash, receivers, options).process
|
278
|
-
end
|
279
|
-
|
280
|
-
self
|
281
|
-
end
|
282
|
-
|
283
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
284
|
-
# eventually be removed
|
285
|
-
#
|
286
|
-
# Given an IO object that contains PDF data, return the contents of a single object
|
287
|
-
#
|
288
|
-
def object(io, id, gen)
|
289
|
-
msg = "PDF::Reader#object is deprecated and will be removed in the 2.0 release"
|
290
|
-
$stderr.puts(msg)
|
291
|
-
@objects = ObjectHash.new(io)
|
292
|
-
|
293
|
-
@objects.deref(Reference.new(id, gen))
|
294
|
-
end
|
295
|
-
|
296
209
|
private
|
297
210
|
|
298
211
|
# recursively convert strings from outside a content stream into UTF-8
|
@@ -314,14 +227,14 @@ module PDF
|
|
314
227
|
pdfdoc_to_utf8(obj)
|
315
228
|
end
|
316
229
|
else
|
317
|
-
obj
|
230
|
+
@objects.deref(obj)
|
318
231
|
end
|
319
232
|
end
|
320
233
|
|
321
234
|
# TODO find a PDF I can use to spec this behaviour
|
322
235
|
#
|
323
236
|
def pdfdoc_to_utf8(obj)
|
324
|
-
obj.force_encoding("utf-8")
|
237
|
+
obj.force_encoding("utf-8")
|
325
238
|
obj
|
326
239
|
end
|
327
240
|
|
@@ -331,19 +244,18 @@ module PDF
|
|
331
244
|
def utf16_to_utf8(obj)
|
332
245
|
str = obj[2, obj.size]
|
333
246
|
str = str.unpack("n*").pack("U*")
|
334
|
-
str.force_encoding("utf-8")
|
247
|
+
str.force_encoding("utf-8")
|
335
248
|
str
|
336
249
|
end
|
337
250
|
|
338
|
-
def strategies
|
339
|
-
@strategies ||= [
|
340
|
-
::PDF::Reader::MetadataStrategy,
|
341
|
-
::PDF::Reader::PagesStrategy
|
342
|
-
]
|
343
|
-
end
|
344
|
-
|
345
251
|
def root
|
346
|
-
@root ||=
|
252
|
+
@root ||= begin
|
253
|
+
obj = @objects.deref(@objects.trailer[:Root])
|
254
|
+
unless obj.kind_of?(::Hash)
|
255
|
+
raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
|
256
|
+
end
|
257
|
+
obj
|
258
|
+
end
|
347
259
|
end
|
348
260
|
|
349
261
|
end
|
@@ -351,7 +263,6 @@ end
|
|
351
263
|
################################################################################
|
352
264
|
|
353
265
|
require 'pdf/reader/resource_methods'
|
354
|
-
require 'pdf/reader/abstract_strategy'
|
355
266
|
require 'pdf/reader/buffer'
|
356
267
|
require 'pdf/reader/cid_widths'
|
357
268
|
require 'pdf/reader/cmap'
|
@@ -370,7 +281,6 @@ require 'pdf/reader/font_descriptor'
|
|
370
281
|
require 'pdf/reader/form_xobject'
|
371
282
|
require 'pdf/reader/glyph_hash'
|
372
283
|
require 'pdf/reader/lzw'
|
373
|
-
require 'pdf/reader/metadata_strategy'
|
374
284
|
require 'pdf/reader/object_cache'
|
375
285
|
require 'pdf/reader/object_hash'
|
376
286
|
require 'pdf/reader/object_stream'
|
@@ -379,9 +289,11 @@ require 'pdf/reader/parser'
|
|
379
289
|
require 'pdf/reader/print_receiver'
|
380
290
|
require 'pdf/reader/reference'
|
381
291
|
require 'pdf/reader/register_receiver'
|
292
|
+
require 'pdf/reader/null_security_handler'
|
382
293
|
require 'pdf/reader/standard_security_handler'
|
294
|
+
require 'pdf/reader/standard_security_handler_v5'
|
295
|
+
require 'pdf/reader/unimplemented_security_handler'
|
383
296
|
require 'pdf/reader/stream'
|
384
|
-
require 'pdf/reader/text_receiver'
|
385
297
|
require 'pdf/reader/text_run'
|
386
298
|
require 'pdf/reader/page_state'
|
387
299
|
require 'pdf/reader/page_text_receiver'
|
@@ -389,4 +301,3 @@ require 'pdf/reader/token'
|
|
389
301
|
require 'pdf/reader/xref'
|
390
302
|
require 'pdf/reader/orientation_detector'
|
391
303
|
require 'pdf/reader/page'
|
392
|
-
require 'pdf/hash'
|
data/lib/pdf-reader.rb
CHANGED