pdf-reader 1.4.1 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/CHANGELOG +53 -3
- data/{README.rdoc → README.md} +40 -23
- data/Rakefile +2 -2
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_object +4 -1
- data/bin/pdf_text +1 -1
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/buffer.rb +14 -12
- data/lib/pdf/reader/cid_widths.rb +2 -0
- data/lib/pdf/reader/cmap.rb +48 -36
- data/lib/pdf/reader/encoding.rb +16 -18
- data/lib/pdf/reader/error.rb +5 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +1 -0
- data/lib/pdf/reader/filter/flate.rb +29 -16
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +4 -6
- data/lib/pdf/reader/filter.rb +2 -0
- data/lib/pdf/reader/font.rb +12 -13
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +7 -2
- data/lib/pdf/reader/lzw.rb +4 -4
- data/lib/pdf/reader/null_security_handler.rb +17 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +91 -37
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +5 -4
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +30 -1
- data/lib/pdf/reader/page_layout.rb +19 -24
- data/lib/pdf/reader/page_state.rb +8 -5
- data/lib/pdf/reader/page_text_receiver.rb +23 -1
- data/lib/pdf/reader/pages_strategy.rb +2 -304
- data/lib/pdf/reader/parser.rb +10 -7
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +80 -42
- data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +28 -9
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +25 -16
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +11 -5
- data/lib/pdf/reader.rb +30 -119
- data/lib/pdf-reader.rb +1 -0
- metadata +35 -61
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/hash.rb +0 -19
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -265
@@ -0,0 +1,91 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'digest'
|
5
|
+
require 'openssl'
|
6
|
+
|
7
|
+
class PDF::Reader

  # Security handler for documents encrypted with 256-bit AES (crypt filter
  # method AESV3, V=5, R=5), as introduced by PDF 1.7 Extension Level 3.
  #
  # Creating an instance authenticates the supplied password against the
  # encryption dictionary values and recovers the file encryption key; #decrypt
  # then uses that key for every encrypted string or stream.
  class StandardSecurityHandlerV5

    attr_reader :key_length, :encrypt_key

    # opts - a Hash of values taken from the encryption dictionary, plus the
    #        password supplied by the caller:
    #        :O, :U    - hash(32B) + validation salt(8B) + key salt(8B)
    #        :OE, :UE  - file key, encrypted w/ owner / user password
    #        :password - optional, defaults to the empty string
    #
    # Raises PDF::Reader::EncryptedPDFError if the password authenticates as
    # neither the owner nor the user password.
    def initialize(opts = {})
      @key_length = 256
      @O = opts[:O]
      @U = opts[:U]
      @OE = opts[:OE]
      @UE = opts[:UE]
      @encrypt_key = build_standard_key(opts[:password] || '')
    end

    # This handler supports AES-256 encryption defined in PDF 1.7 Extension Level 3
    #
    # encrypt - the encryption dictionary (or nil for unencrypted files)
    #
    # Returns true only for the Standard filter with V=5, R=5, an AESV3 crypt
    # filter, and identical filters for streams and strings.
    def self.supports?(encrypt)
      return false if encrypt.nil?

      filter = encrypt.fetch(:Filter, :Standard)
      version = encrypt.fetch(:V, 0)
      revision = encrypt.fetch(:R, 0)
      algorithm = encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
      (filter == :Standard) && (encrypt[:StmF] == encrypt[:StrF]) &&
        ((version == 5) && (revision == 5) && (algorithm == :AESV3))
    end

    # Decrypt an AES-256-CBC encrypted string or stream.
    #
    # buf - a string to decrypt; the first 16 bytes are used as the IV and the
    #       remainder as the ciphertext
    # ref - a PDF::Reader::Reference for the object to decrypt. It is accepted
    #       for interface compatibility with the other handlers but not used
    #       here: the same file key is applied to every object.
    #
    def decrypt( buf, ref )
      cipher = OpenSSL::Cipher.new("AES-#{@key_length}-CBC")
      cipher.decrypt
      cipher.key = @encrypt_key.dup
      cipher.iv = buf[0..15]
      cipher.update(buf[16..-1]) + cipher.final
    end

    private

    # Algorithm 3.2a - Computing an encryption key
    #
    # Defined in PDF 1.7 Extension Level 3
    #
    # If the string is the valid owner password, returns the decryption key,
    # otherwise nil. Only the first 48 bytes of /O and /U are consulted so that
    # strings padded beyond their defined length still validate.
    #
    def auth_owner_pass(password)
      user_string = @U.byteslice(0, 48)
      expected = @O.byteslice(0, 32).b
      return nil unless sha256(password, @O.byteslice(32, 8), user_string) == expected

      unwrap_file_key(sha256(password, @O.byteslice(40, 8), user_string), @OE)
    end

    # If the string is the valid user password, returns the decryption key,
    # otherwise nil.
    def auth_user_pass(password)
      expected = @U.byteslice(0, 32).b
      return nil unless sha256(password, @U.byteslice(32, 8)) == expected

      unwrap_file_key(sha256(password, @U.byteslice(40, 8)), @UE)
    end

    # Authenticate the password (owner first, then user) and return the file
    # encryption key.
    #
    # The error message deliberately omits the password so that it cannot leak
    # into logs or error reports.
    def build_standard_key(pass)
      pass = pass.byteslice(0...127)   # UTF-8 encoded password. first 127 bytes

      encrypt_key = auth_owner_pass(pass) || auth_user_pass(pass)

      raise PDF::Reader::EncryptedPDFError, "Invalid password" if encrypt_key.nil?
      encrypt_key
    end

    # SHA-256 over the byte-wise concatenation of parts. Every part is treated
    # as binary so a UTF-8 password can be joined with binary salts without an
    # Encoding::CompatibilityError.
    def sha256(*parts)
      Digest::SHA256.digest(parts.each_with_object(String.new) { |part, buf| buf << part.b })
    end

    # Decrypt the wrapped file key (/OE or /UE) with AES-256-CBC, a zero IV and
    # no padding.
    def unwrap_file_key(intermediate_key, wrapped_key)
      cipher = OpenSSL::Cipher.new('AES-256-CBC')
      cipher.decrypt
      cipher.key = intermediate_key
      cipher.iv = "\x00" * 16
      cipher.padding = 0
      cipher.update(wrapped_key) + cipher.final
    end
  end
end
|
data/lib/pdf/reader/stream.rb
CHANGED
data/lib/pdf/reader/text_run.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
class PDF::Reader
|
4
5
|
# A value object that represents one or more consecutive characters on a page.
|
@@ -37,6 +38,10 @@ class PDF::Reader
|
|
37
38
|
@endx ||= x + width
|
38
39
|
end
|
39
40
|
|
41
|
+
# Returns y + font_size, computing it on first use and caching it in @endy.
def endy
  return @endy if @endy

  @endy = y + font_size
end
|
44
|
+
|
40
45
|
def mean_character_width
|
41
46
|
@width / character_count
|
42
47
|
end
|
@@ -59,22 +64,36 @@ class PDF::Reader
|
|
59
64
|
"#{text} w:#{width} f:#{font_size} @#{x},#{y}"
|
60
65
|
end
|
61
66
|
|
67
|
+
# True when the box spanned by x..endx and y..endy touches or overlaps the
# corresponding box of other_run (edges that merely meet count as a hit).
def intersect?(other_run)
  return false unless x <= other_run.endx && endx >= other_run.x

  endy >= other_run.y && y <= other_run.endy
end
|
71
|
+
|
72
|
+
# Returns how much of this run's area is covered by other_run, as a fraction
# of this run's own area (1.0 means fully covered, not 100). Returns 0 when
# the two runs do not intersect.
def intersection_area_percent(other_run)
  return 0 unless intersect?(other_run)

  overlap_width  = [endx, other_run.endx].min - [x, other_run.x].max
  overlap_height = [endy, other_run.endy].min - [y, other_run.y].max

  (overlap_width * overlap_height).to_f / area
end
|
82
|
+
|
62
83
|
private
|
63
84
|
|
85
|
+
# Area of the box between (x, y) and (endx, endy).
def area
  horizontal_span = endx - x
  vertical_span = endy - y
  horizontal_span * vertical_span
end
|
88
|
+
|
64
89
|
def mergable_range
|
65
90
|
@mergable_range ||= Range.new(endx - 3, endx + font_size)
|
66
91
|
end
|
67
92
|
|
93
|
+
# Number of characters in the run's text, as a Float. This trusts the string's
# encoding tag, so String#length is taken to be a character count.
def character_count
  @text.length.to_f
end
|
79
98
|
end
|
80
99
|
end
|
data/lib/pdf/reader/token.rb
CHANGED
@@ -0,0 +1,17 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
class PDF::Reader

  # Catch-all security handler for encryption flavours this library does not
  # implement. It claims support for any encryption dictionary and then fails
  # as soon as something actually needs decrypting.
  class UnimplementedSecurityHandler
    class << self
      # Always true, whatever the encryption dictionary contains.
      def supports?(encrypt)
        true
      end
    end

    # Always raises PDF::Reader::EncryptedPDFError; no decryption is attempted.
    def decrypt(buf, ref)
      raise PDF::Reader::EncryptedPDFError, "Unsupported encryption style"
    end
  end
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'afm'
|
4
5
|
require 'pdf/reader/synchronized_cache'
|
@@ -11,11 +12,20 @@ class PDF::Reader
|
|
11
12
|
# see Section 9.6.2.2, PDF 32000-1:2008, pp 256
|
12
13
|
class BuiltIn
|
13
14
|
|
15
|
+
# Base font names accepted as-is when choosing which bundled AFM metrics file
# to load. Frozen so the shared list cannot be modified at runtime.
BUILTINS = %i[
  Courier Courier-Bold Courier-BoldOblique Courier-Oblique
  Helvetica Helvetica-Bold Helvetica-BoldOblique Helvetica-Oblique
  Symbol
  Times-Roman Times-Bold Times-BoldItalic Times-Italic
  ZapfDingbats
].freeze
|
22
|
+
|
14
23
|
def initialize(font)
|
15
24
|
@font = font
|
16
25
|
@@all_metrics ||= PDF::Reader::SynchronizedCache.new
|
17
26
|
|
18
|
-
|
27
|
+
basefont = extract_basefont(font.basefont)
|
28
|
+
metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")
|
19
29
|
|
20
30
|
if File.file?(metrics_path)
|
21
31
|
@metrics = @@all_metrics[metrics_path] ||= AFM::Font.new(metrics_path)
|
@@ -27,23 +37,15 @@ class PDF::Reader
|
|
27
37
|
# Width of the glyph for code_point.
#
# The code point is translated to glyph names through the font's encoding and
# the first name with an entry in the AFM metrics supplies the width (:wx).
# When no name matches, the font's own widths array is consulted at
# code_point - 1, and 0 is returned if that is empty too. nil or negative
# code points yield 0.
def glyph_width(code_point)
  return 0 if code_point.nil? || code_point < 0

  glyph_names = @font.encoding.int_to_name(code_point)
  candidates = glyph_names.map { |glyph_name| @metrics.char_metrics[glyph_name.to_s] }
  afm_entry = candidates.find { |entry| !entry.nil? }

  return afm_entry[:wx] if afm_entry

  @font.widths[code_point - 1] || 0
end
|
49
51
|
|
@@ -53,6 +55,13 @@ class PDF::Reader
|
|
53
55
|
@font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
|
54
56
|
end
|
55
57
|
|
58
|
+
# Returns font_name unchanged when it is listed in BUILTINS; any other value
# falls back to "Times-Roman".
def extract_basefont(font_name)
  BUILTINS.include?(font_name) ? font_name : "Times-Roman"
end
|
56
65
|
end
|
57
66
|
end
|
58
67
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
class PDF::Reader
|
4
5
|
module WidthCalculator
|
@@ -17,8 +18,7 @@ class PDF::Reader
|
|
17
18
|
|
18
19
|
# Width of the glyph for code_point: the font-level lookup is tried first,
# then the descriptor-level lookup, and 0 is returned when neither yields a
# value. nil or negative code points yield 0.
def glyph_width(code_point)
  return 0 if code_point.nil? || code_point < 0

  width = glyph_width_from_font(code_point)
  width ||= glyph_width_from_descriptor(code_point)
  width || 0
end
|
23
23
|
|
24
24
|
private
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -203,8 +204,10 @@ class PDF::Reader
|
|
203
204
|
("\x00" + bytes).unpack("N")[0]
|
204
205
|
elsif bytes.size == 4
|
205
206
|
bytes.unpack("N")[0]
|
207
|
+
elsif bytes.size == 8
|
208
|
+
bytes.unpack("Q>")[0]
|
206
209
|
else
|
207
|
-
raise UnsupportedFeatureError, "Unable to unpack xref stream entries
|
210
|
+
raise UnsupportedFeatureError, "Unable to unpack xref stream entries of #{bytes.size} bytes"
|
208
211
|
end
|
209
212
|
end
|
210
213
|
################################################################################
|
@@ -227,18 +230,21 @@ class PDF::Reader
|
|
227
230
|
# should always be 0, but all sort of crazy junk is prefixed to PDF files
|
228
231
|
# in the real world.
|
229
232
|
#
|
230
|
-
# Checks up to
|
233
|
+
# Checks up to 1024 chars into the file,
|
234
|
+
# returns nil if no PDF data detected.
|
235
|
+
# Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
|
236
|
+
# header appear somewhere within the first 1024 bytes of the file
|
231
237
|
#
|
232
238
|
# Returns the byte offset of the first '%' within the first 1024 bytes of io,
# or nil when there is none (including empty or short input).
#
# The head of the file is read in one binary chunk, so the offset is counted
# in bytes even when io carries a multibyte external encoding. io is rewound
# before returning on every path.
def calc_junk_offset(io)
  io.rewind
  head = io.read(1024)
  head && head.index("%")
ensure
  io.rewind
end
|
243
249
|
end
|
244
250
|
################################################################################
|
data/lib/pdf/reader.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -110,16 +111,10 @@ module PDF
|
|
110
111
|
#
|
111
112
|
# reader = PDF::Reader.new("somefile.pdf", :password => "apples")
|
112
113
|
#
|
113
|
-
def initialize(input
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
@objects = PDF::Reader::ObjectHash.new(input, opts)
|
118
|
-
else
|
119
|
-
msg = "Calling PDF::Reader#new with no arguments is deprecated and will be removed "
|
120
|
-
msg += "in the 2.0 release"
|
121
|
-
$stderr.puts(msg)
|
122
|
-
end
|
114
|
+
# input - passed straight through to PDF::Reader::ObjectHash.new
# opts  - options Hash, also passed to ObjectHash. A fresh ObjectCache is
#         added under :cache on a copy, so the caller's hash is left
#         untouched (and may be frozen).
def initialize(input, opts = {})
  @cache = PDF::Reader::ObjectCache.new
  @objects = PDF::Reader::ObjectHash.new(input, opts.merge(:cache => @cache))
end
|
124
119
|
|
125
120
|
def info
|
@@ -133,13 +128,16 @@ module PDF
|
|
133
128
|
nil
|
134
129
|
else
|
135
130
|
xml = stream.unfiltered_data
|
136
|
-
xml.force_encoding("utf-8")
|
131
|
+
xml.force_encoding("utf-8")
|
137
132
|
xml
|
138
133
|
end
|
139
134
|
end
|
140
135
|
|
141
136
|
# Returns the dereferenced :Count entry of the dictionary found at
# root[:Pages].
#
# The value is memoized, and the memoized value is returned before the page
# tree is looked up again, so repeated calls do no further dereferencing.
#
# Raises MalformedPDFError when root[:Pages] does not dereference to a Hash.
def page_count
  @page_count ||= begin
    pages = @objects.deref(root[:Pages])
    unless pages.kind_of?(::Hash)
      raise MalformedPDFError, 'Pages structure is missing'
    end
    @objects.deref(pages[:Count])
  end
end
|
145
143
|
|
@@ -164,61 +162,6 @@ module PDF
|
|
164
162
|
yield PDF::Reader.new(input, opts)
|
165
163
|
end
|
166
164
|
|
167
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
168
|
-
# eventually be removed
|
169
|
-
#
|
170
|
-
#
|
171
|
-
# Parse the file with the given name, sending events to the given receiver.
|
172
|
-
#
|
173
|
-
def self.file(name, receivers, opts = {})
|
174
|
-
msg = "PDF::Reader#file is deprecated and will be removed in the 2.0 release"
|
175
|
-
$stderr.puts(msg)
|
176
|
-
File.open(name,"rb") do |f|
|
177
|
-
new.parse(f, receivers, opts)
|
178
|
-
end
|
179
|
-
end
|
180
|
-
|
181
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
182
|
-
# eventually be removed
|
183
|
-
#
|
184
|
-
# Parse the given string, sending events to the given receiver.
|
185
|
-
#
|
186
|
-
def self.string(str, receivers, opts = {})
|
187
|
-
msg = "PDF::Reader#string is deprecated and will be removed in the 2.0 release"
|
188
|
-
$stderr.puts(msg)
|
189
|
-
StringIO.open(str) do |s|
|
190
|
-
new.parse(s, receivers, opts)
|
191
|
-
end
|
192
|
-
end
|
193
|
-
|
194
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
195
|
-
# eventually be removed
|
196
|
-
#
|
197
|
-
# Parse the file with the given name, returning an unmarshalled ruby version of
|
198
|
-
# represents the requested pdf object
|
199
|
-
#
|
200
|
-
def self.object_file(name, id, gen = 0)
|
201
|
-
msg = "PDF::Reader#object_file is deprecated and will be removed in the 2.0 release"
|
202
|
-
$stderr.puts(msg)
|
203
|
-
File.open(name,"rb") { |f|
|
204
|
-
new.object(f, id.to_i, gen.to_i)
|
205
|
-
}
|
206
|
-
end
|
207
|
-
|
208
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
209
|
-
# eventually be removed
|
210
|
-
#
|
211
|
-
# Parse the given string, returning an unmarshalled ruby version of represents
|
212
|
-
# the requested pdf object
|
213
|
-
#
|
214
|
-
def self.object_string(str, id, gen = 0)
|
215
|
-
msg = "PDF::Reader#object_string is deprecated and will be removed in the 2.0 release"
|
216
|
-
$stderr.puts(msg)
|
217
|
-
StringIO.open(str) { |s|
|
218
|
-
new.object(s, id.to_i, gen.to_i)
|
219
|
-
}
|
220
|
-
end
|
221
|
-
|
222
165
|
# returns an array of PDF::Reader::Page objects, one for each
|
223
166
|
# page in the source PDF.
|
224
167
|
#
|
@@ -234,9 +177,13 @@ module PDF
|
|
234
177
|
# methods available on each page
|
235
178
|
#
|
236
179
|
# Builds one PDF::Reader::Page per page number from 1 to page_count and
# returns them as an Array. An InvalidPageError raised while building any
# page is re-raised as MalformedPDFError naming that page number.
def pages
  page_numbers = Range.new(1, page_count)
  page_numbers.collect { |number|
    begin
      PDF::Reader::Page.new(@objects, number, :cache => @cache)
    rescue InvalidPageError
      raise MalformedPDFError, "Missing data for page: #{number}"
    end
  }
end
|
241
188
|
|
242
189
|
# returns a single PDF::Reader::Page for the specified page.
|
@@ -254,45 +201,11 @@ module PDF
|
|
254
201
|
# Returns a PDF::Reader::Page for page number num (converted with to_i).
#
# Raises InvalidPageError unless num lies between 1 and page_count inclusive.
def page(num)
  num = num.to_i
  in_range = num >= 1 && num <= page_count
  raise InvalidPageError, "Valid pages are 1 .. #{page_count}" unless in_range

  PDF::Reader::Page.new(@objects, num, :cache => @cache)
end
|
261
208
|
|
262
|
-
|
263
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
264
|
-
# eventually be removed
|
265
|
-
#
|
266
|
-
# Given an IO object that contains PDF data, parse it.
|
267
|
-
#
|
268
|
-
def parse(io, receivers, opts = {})
|
269
|
-
msg = "PDF::Reader#parse is deprecated and will be removed in the 2.0 release"
|
270
|
-
$stderr.puts(msg)
|
271
|
-
ohash = ObjectHash.new(io)
|
272
|
-
|
273
|
-
options = {:pages => true, :raw_text => false, :metadata => true}
|
274
|
-
options.merge!(opts)
|
275
|
-
|
276
|
-
strategies.each do |s|
|
277
|
-
s.new(ohash, receivers, options).process
|
278
|
-
end
|
279
|
-
|
280
|
-
self
|
281
|
-
end
|
282
|
-
|
283
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
284
|
-
# eventually be removed
|
285
|
-
#
|
286
|
-
# Given an IO object that contains PDF data, return the contents of a single object
|
287
|
-
#
|
288
|
-
def object(io, id, gen)
|
289
|
-
msg = "PDF::Reader#object is deprecated and will be removed in the 2.0 release"
|
290
|
-
$stderr.puts(msg)
|
291
|
-
@objects = ObjectHash.new(io)
|
292
|
-
|
293
|
-
@objects.deref(Reference.new(id, gen))
|
294
|
-
end
|
295
|
-
|
296
209
|
private
|
297
210
|
|
298
211
|
# recursively convert strings from outside a content stream into UTF-8
|
@@ -314,14 +227,14 @@ module PDF
|
|
314
227
|
pdfdoc_to_utf8(obj)
|
315
228
|
end
|
316
229
|
else
|
317
|
-
obj
|
230
|
+
@objects.deref(obj)
|
318
231
|
end
|
319
232
|
end
|
320
233
|
|
321
234
|
# TODO find a PDF I can use to spec this behaviour
|
322
235
|
#
|
323
236
|
# Re-tags obj as UTF-8 in place (no bytes are changed) and returns the same
# object.
def pdfdoc_to_utf8(obj)
  obj.tap { |str| str.force_encoding("utf-8") }
end
|
327
240
|
|
@@ -331,19 +244,18 @@ module PDF
|
|
331
244
|
# Converts a big-endian UTF-16 string that starts with a 2-byte byte order
# mark into a new UTF-8 string.
#
# The first two bytes are dropped and the rest is transcoded, so surrogate
# pairs become a single character instead of two invalid UTF-8 sequences.
# A trailing odd byte is ignored, invalid sequences such as lone surrogates
# are replaced with U+FFFD, and input no longer than the BOM yields "".
def utf16_to_utf8(obj)
  payload = obj.byteslice(2, obj.bytesize).to_s
  # drop an incomplete trailing code unit rather than treat it as invalid
  payload = payload.byteslice(0, payload.bytesize - (payload.bytesize % 2))
  payload.force_encoding("UTF-16BE").encode("UTF-8", :invalid => :replace, :undef => :replace)
end
|
337
250
|
|
338
|
-
def strategies
|
339
|
-
@strategies ||= [
|
340
|
-
::PDF::Reader::MetadataStrategy,
|
341
|
-
::PDF::Reader::PagesStrategy
|
342
|
-
]
|
343
|
-
end
|
344
|
-
|
345
251
|
# Returns the object the trailer's :Root entry dereferences to, memoized in
# @root.
#
# Raises MalformedPDFError when that object is not a Hash.
def root
  return @root if @root

  candidate = @objects.deref(@objects.trailer[:Root])
  unless candidate.kind_of?(::Hash)
    raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
  end

  @root = candidate
end
|
348
260
|
|
349
261
|
end
|
@@ -351,7 +263,6 @@ end
|
|
351
263
|
################################################################################
|
352
264
|
|
353
265
|
require 'pdf/reader/resource_methods'
|
354
|
-
require 'pdf/reader/abstract_strategy'
|
355
266
|
require 'pdf/reader/buffer'
|
356
267
|
require 'pdf/reader/cid_widths'
|
357
268
|
require 'pdf/reader/cmap'
|
@@ -370,7 +281,6 @@ require 'pdf/reader/font_descriptor'
|
|
370
281
|
require 'pdf/reader/form_xobject'
|
371
282
|
require 'pdf/reader/glyph_hash'
|
372
283
|
require 'pdf/reader/lzw'
|
373
|
-
require 'pdf/reader/metadata_strategy'
|
374
284
|
require 'pdf/reader/object_cache'
|
375
285
|
require 'pdf/reader/object_hash'
|
376
286
|
require 'pdf/reader/object_stream'
|
@@ -379,9 +289,11 @@ require 'pdf/reader/parser'
|
|
379
289
|
require 'pdf/reader/print_receiver'
|
380
290
|
require 'pdf/reader/reference'
|
381
291
|
require 'pdf/reader/register_receiver'
|
292
|
+
require 'pdf/reader/null_security_handler'
|
382
293
|
require 'pdf/reader/standard_security_handler'
|
294
|
+
require 'pdf/reader/standard_security_handler_v5'
|
295
|
+
require 'pdf/reader/unimplemented_security_handler'
|
383
296
|
require 'pdf/reader/stream'
|
384
|
-
require 'pdf/reader/text_receiver'
|
385
297
|
require 'pdf/reader/text_run'
|
386
298
|
require 'pdf/reader/page_state'
|
387
299
|
require 'pdf/reader/page_text_receiver'
|
@@ -389,4 +301,3 @@ require 'pdf/reader/token'
|
|
389
301
|
require 'pdf/reader/xref'
|
390
302
|
require 'pdf/reader/orientation_detector'
|
391
303
|
require 'pdf/reader/page'
|
392
|
-
require 'pdf/hash'
|
data/lib/pdf-reader.rb
CHANGED