pdf-reader 2.5.0 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +42 -0
- data/README.md +16 -1
- data/Rakefile +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +90 -46
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +65 -50
- data/lib/pdf/reader/encoding.rb +3 -2
- data/lib/pdf/reader/error.rb +19 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +11 -9
- data/lib/pdf/reader/filter/flate.rb +4 -2
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -1
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +2 -1
- data/lib/pdf/reader/font.rb +72 -16
- data/lib/pdf/reader/font_descriptor.rb +19 -17
- data/lib/pdf/reader/form_xobject.rb +15 -5
- data/lib/pdf/reader/glyph_hash.rb +16 -9
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -4
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +252 -44
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +99 -19
- data/lib/pdf/reader/page_layout.rb +36 -37
- data/lib/pdf/reader/page_state.rb +12 -11
- data/lib/pdf/reader/page_text_receiver.rb +57 -10
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +23 -12
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +16 -9
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
- data/lib/pdf/reader/stream.rb +2 -1
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +14 -6
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/type_check.rb +52 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -1
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +27 -4
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +46 -15
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1978 -0
- metadata +21 -10
- data/lib/pdf/reader/orientation_detector.rb +0 -34
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
class PDF::Reader
|
@@ -47,7 +48,11 @@ class PDF::Reader
|
|
47
48
|
@trailer = @xref.trailer
|
48
49
|
@cache = opts[:cache] || PDF::Reader::ObjectCache.new
|
49
50
|
@sec_handler = NullSecurityHandler.new
|
50
|
-
@sec_handler =
|
51
|
+
@sec_handler = SecurityHandlerFactory.build(
|
52
|
+
deref(trailer[:Encrypt]),
|
53
|
+
deref(trailer[:ID]),
|
54
|
+
opts[:password]
|
55
|
+
)
|
51
56
|
end
|
52
57
|
|
53
58
|
# returns the type of object a ref points to
|
@@ -91,6 +96,218 @@ class PDF::Reader
|
|
91
96
|
end
|
92
97
|
alias :deref :object
|
93
98
|
|
99
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
100
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
101
|
+
#
|
102
|
+
# Guaranteed to only return an Array or nil. If the dereference results in
|
103
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
104
|
+
# expecting an Array and no other type will do.
|
105
|
+
def deref_array(key)
|
106
|
+
obj = deref(key)
|
107
|
+
|
108
|
+
return obj if obj.nil?
|
109
|
+
|
110
|
+
obj.tap { |obj|
|
111
|
+
raise MalformedPDFError, "expected object to be an Array or nil" if !obj.is_a?(Array)
|
112
|
+
}
|
113
|
+
end
|
114
|
+
|
115
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
116
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
117
|
+
#
|
118
|
+
# Guaranteed to only return an Array of Numerics or nil. If the dereference results in
|
119
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
120
|
+
# expecting an Array and no other type will do.
|
121
|
+
#
|
122
|
+
# Some effort to cast array elements to a number is made for any non-numeric elements.
|
123
|
+
def deref_array_of_numbers(key)
|
124
|
+
arr = deref(key)
|
125
|
+
|
126
|
+
return arr if arr.nil?
|
127
|
+
|
128
|
+
raise MalformedPDFError, "expected object to be an Array" unless arr.is_a?(Array)
|
129
|
+
|
130
|
+
arr.map { |item|
|
131
|
+
if item.is_a?(Numeric)
|
132
|
+
item
|
133
|
+
elsif item.respond_to?(:to_f)
|
134
|
+
item.to_f
|
135
|
+
elsif item.respond_to?(:to_i)
|
136
|
+
item.to_i
|
137
|
+
else
|
138
|
+
raise MalformedPDFError, "expected object to be a number"
|
139
|
+
end
|
140
|
+
}
|
141
|
+
end
|
142
|
+
|
143
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
144
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
145
|
+
#
|
146
|
+
# Guaranteed to only return a Hash or nil. If the dereference results in
|
147
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
148
|
+
# expecting a Hash and no other type will do.
|
149
|
+
def deref_hash(key)
|
150
|
+
obj = deref(key)
|
151
|
+
|
152
|
+
return obj if obj.nil?
|
153
|
+
|
154
|
+
obj.tap { |obj|
|
155
|
+
raise MalformedPDFError, "expected object to be a Hash or nil" if !obj.is_a?(Hash)
|
156
|
+
}
|
157
|
+
end
|
158
|
+
|
159
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
160
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
161
|
+
#
|
162
|
+
# Guaranteed to only return a PDF name (Symbol) or nil. If the dereference results in
|
163
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
164
|
+
# expecting a Name and no other type will do.
|
165
|
+
#
|
166
|
+
# Some effort to cast to a symbol is made when the reference points to a non-symbol.
|
167
|
+
def deref_name(key)
|
168
|
+
obj = deref(key)
|
169
|
+
|
170
|
+
return obj if obj.nil?
|
171
|
+
|
172
|
+
if !obj.is_a?(Symbol)
|
173
|
+
if obj.respond_to?(:to_sym)
|
174
|
+
obj = obj.to_sym
|
175
|
+
else
|
176
|
+
raise MalformedPDFError, "expected object to be a Name"
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
obj
|
181
|
+
end
|
182
|
+
|
183
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
184
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
185
|
+
#
|
186
|
+
# Guaranteed to only return an Integer or nil. If the dereference results in
|
187
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
188
|
+
# expecting an Integer and no other type will do.
|
189
|
+
#
|
190
|
+
# Some effort to cast to an int is made when the reference points to a non-integer.
|
191
|
+
def deref_integer(key)
|
192
|
+
obj = deref(key)
|
193
|
+
|
194
|
+
return obj if obj.nil?
|
195
|
+
|
196
|
+
if !obj.is_a?(Integer)
|
197
|
+
if obj.respond_to?(:to_i)
|
198
|
+
obj = obj.to_i
|
199
|
+
else
|
200
|
+
raise MalformedPDFError, "expected object to be an Integer"
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
obj
|
205
|
+
end
|
206
|
+
|
207
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
208
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
209
|
+
#
|
210
|
+
# Guaranteed to only return a Numeric or nil. If the dereference results in
|
211
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
212
|
+
# expecting a number and no other type will do.
|
213
|
+
#
|
214
|
+
# Some effort to cast to a number is made when the reference points to a non-number.
|
215
|
+
def deref_number(key)
|
216
|
+
obj = deref(key)
|
217
|
+
|
218
|
+
return obj if obj.nil?
|
219
|
+
|
220
|
+
if !obj.is_a?(Numeric)
|
221
|
+
if obj.respond_to?(:to_f)
|
222
|
+
obj = obj.to_f
|
223
|
+
elsif obj.respond_to?(:to_i)
|
224
|
+
obj.to_i
|
225
|
+
else
|
226
|
+
raise MalformedPDFError, "expected object to be a number"
|
227
|
+
end
|
228
|
+
end
|
229
|
+
|
230
|
+
obj
|
231
|
+
end
|
232
|
+
|
233
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
234
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
235
|
+
#
|
236
|
+
# Guaranteed to only return a PDF::Reader::Stream or nil. If the dereference results in
|
237
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
238
|
+
# expecting a stream and no other type will do.
|
239
|
+
def deref_stream(key)
|
240
|
+
obj = deref(key)
|
241
|
+
|
242
|
+
return obj if obj.nil?
|
243
|
+
|
244
|
+
obj.tap { |obj|
|
245
|
+
if !obj.is_a?(PDF::Reader::Stream)
|
246
|
+
raise MalformedPDFError, "expected object to be an Array or nil"
|
247
|
+
end
|
248
|
+
}
|
249
|
+
end
|
250
|
+
|
251
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
252
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
253
|
+
#
|
254
|
+
# Guaranteed to only return a String or nil. If the dereference results in
|
255
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
256
|
+
# expecting a string and no other type will do.
|
257
|
+
#
|
258
|
+
# Some effort to cast to a string is made when the reference points to a non-string.
|
259
|
+
def deref_string(key)
|
260
|
+
obj = deref(key)
|
261
|
+
|
262
|
+
return obj if obj.nil?
|
263
|
+
|
264
|
+
if !obj.is_a?(String)
|
265
|
+
if obj.respond_to?(:to_s)
|
266
|
+
obj = obj.to_s
|
267
|
+
else
|
268
|
+
raise MalformedPDFError, "expected object to be a string"
|
269
|
+
end
|
270
|
+
end
|
271
|
+
|
272
|
+
obj
|
273
|
+
end
|
274
|
+
|
275
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
276
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
277
|
+
#
|
278
|
+
# Guaranteed to only return a PDF Name (symbol), Array or nil. If the dereference results in
|
279
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
280
|
+
# expecting a Name or Array and no other type will do.
|
281
|
+
def deref_name_or_array(key)
|
282
|
+
obj = deref(key)
|
283
|
+
|
284
|
+
return obj if obj.nil?
|
285
|
+
|
286
|
+
obj.tap { |obj|
|
287
|
+
if !obj.is_a?(Symbol) && !obj.is_a?(Array)
|
288
|
+
raise MalformedPDFError, "expected object to be an Array or Name"
|
289
|
+
end
|
290
|
+
}
|
291
|
+
end
|
292
|
+
|
293
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
294
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
295
|
+
#
|
296
|
+
# Guaranteed to only return a PDF::Reader::Stream, Array or nil. If the dereference results in
|
297
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
298
|
+
# expecting a stream or Array and no other type will do.
|
299
|
+
def deref_stream_or_array(key)
|
300
|
+
obj = deref(key)
|
301
|
+
|
302
|
+
return obj if obj.nil?
|
303
|
+
|
304
|
+
obj.tap { |obj|
|
305
|
+
if !obj.is_a?(PDF::Reader::Stream) && !obj.is_a?(Array)
|
306
|
+
raise MalformedPDFError, "expected object to be an Array or Stream"
|
307
|
+
end
|
308
|
+
}
|
309
|
+
end
|
310
|
+
|
94
311
|
# Recursively dereferences the object referred to by +key+. If +key+ is not
|
95
312
|
# a PDF::Reader::Reference, the key is returned unchanged.
|
96
313
|
#
|
@@ -98,6 +315,22 @@ class PDF::Reader
|
|
98
315
|
deref_internal!(key, {})
|
99
316
|
end
|
100
317
|
|
318
|
+
def deref_array!(key)
|
319
|
+
deref!(key).tap { |obj|
|
320
|
+
if !obj.nil? && !obj.is_a?(Array)
|
321
|
+
raise MalformedPDFError, "expected object (#{obj.inspect}) to be an Array or nil"
|
322
|
+
end
|
323
|
+
}
|
324
|
+
end
|
325
|
+
|
326
|
+
def deref_hash!(key)
|
327
|
+
deref!(key).tap { |obj|
|
328
|
+
if !obj.nil? && !obj.is_a?(Hash)
|
329
|
+
raise MalformedPDFError, "expected object (#{obj.inspect}) to be a Hash or nil"
|
330
|
+
end
|
331
|
+
}
|
332
|
+
end
|
333
|
+
|
101
334
|
# Access an object from the PDF. key can be an int or a PDF::Reader::Reference
|
102
335
|
# object.
|
103
336
|
#
|
@@ -232,7 +465,10 @@ class PDF::Reader
|
|
232
465
|
#
|
233
466
|
def page_references
|
234
467
|
root = fetch(trailer[:Root])
|
235
|
-
@page_references ||=
|
468
|
+
@page_references ||= begin
|
469
|
+
pages_root = deref_hash(root[:Pages]) || {}
|
470
|
+
get_page_objects(pages_root)
|
471
|
+
end
|
236
472
|
end
|
237
473
|
|
238
474
|
def encrypted?
|
@@ -298,36 +534,6 @@ class PDF::Reader
|
|
298
534
|
end
|
299
535
|
end
|
300
536
|
|
301
|
-
def build_security_handler(opts = {})
|
302
|
-
encrypt = deref(trailer[:Encrypt])
|
303
|
-
if NullSecurityHandler.supports?(encrypt)
|
304
|
-
NullSecurityHandler.new
|
305
|
-
elsif StandardSecurityHandler.supports?(encrypt)
|
306
|
-
encmeta = !encrypt.has_key?(:EncryptMetadata) || encrypt[:EncryptMetadata].to_s == "true"
|
307
|
-
StandardSecurityHandler.new(
|
308
|
-
key_length: (encrypt[:Length] || 40).to_i,
|
309
|
-
revision: encrypt[:R],
|
310
|
-
owner_key: encrypt[:O],
|
311
|
-
user_key: encrypt[:U],
|
312
|
-
permissions: encrypt[:P].to_i,
|
313
|
-
encrypted_metadata: encmeta,
|
314
|
-
file_id: (deref(trailer[:ID]) || []).first,
|
315
|
-
password: opts[:password],
|
316
|
-
cfm: encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
|
317
|
-
)
|
318
|
-
elsif StandardSecurityHandlerV5.supports?(encrypt)
|
319
|
-
StandardSecurityHandlerV5.new(
|
320
|
-
O: encrypt[:O],
|
321
|
-
U: encrypt[:U],
|
322
|
-
OE: encrypt[:OE],
|
323
|
-
UE: encrypt[:UE],
|
324
|
-
password: opts[:password]
|
325
|
-
)
|
326
|
-
else
|
327
|
-
UnimplementedSecurityHandler.new
|
328
|
-
end
|
329
|
-
end
|
330
|
-
|
331
537
|
def decrypt(ref, obj)
|
332
538
|
case obj
|
333
539
|
when PDF::Reader::Stream then
|
@@ -336,8 +542,10 @@ class PDF::Reader
|
|
336
542
|
obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
|
337
543
|
obj
|
338
544
|
when Hash then
|
339
|
-
arr = obj.map { |key,val| [key, decrypt(ref, val)] }
|
340
|
-
|
545
|
+
arr = obj.map { |key,val| [key, decrypt(ref, val)] }
|
546
|
+
arr.each_with_object({}) { |(k,v), accum|
|
547
|
+
accum[k] = v
|
548
|
+
}
|
341
549
|
when Array then
|
342
550
|
obj.collect { |item| decrypt(ref, item) }
|
343
551
|
when String
|
@@ -359,19 +567,19 @@ class PDF::Reader
|
|
359
567
|
@object_stream ||= {}
|
360
568
|
end
|
361
569
|
|
362
|
-
# returns
|
570
|
+
# returns an array of object references for all pages in this object store. The ordering of
|
571
|
+
# the Array is significant and matches the page ordering of the document
|
363
572
|
#
|
364
|
-
def get_page_objects(
|
365
|
-
obj = deref(ref)
|
366
|
-
|
367
|
-
unless obj.kind_of?(::Hash)
|
368
|
-
raise MalformedPDFError, "Dereferenced page object must be a dict"
|
369
|
-
end
|
370
|
-
|
573
|
+
def get_page_objects(obj)
|
371
574
|
if obj[:Type] == :Page
|
372
|
-
|
575
|
+
[obj]
|
373
576
|
elsif obj[:Kids]
|
374
|
-
|
577
|
+
kids = deref_array(obj[:Kids]) || []
|
578
|
+
kids.map { |kid|
|
579
|
+
get_page_objects(deref_hash(kid) || {})
|
580
|
+
}.flatten
|
581
|
+
else
|
582
|
+
raise MalformedPDFError, "Expected Page or Pages object"
|
375
583
|
end
|
376
584
|
end
|
377
585
|
|
@@ -1,4 +1,6 @@
|
|
1
|
+
# typed: true
|
1
2
|
# coding: utf-8
|
3
|
+
# frozen_string_literal: true
|
2
4
|
|
3
5
|
class PDF::Reader
|
4
6
|
# remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
|
@@ -38,7 +40,8 @@ class PDF::Reader
|
|
38
40
|
|
39
41
|
def self.detect_intersection(sweep_line_status, event_point)
|
40
42
|
sweep_line_status.each do |open_text_run|
|
41
|
-
if
|
43
|
+
if open_text_run.text == event_point.run.text &&
|
44
|
+
event_point.x >= open_text_run.x &&
|
42
45
|
event_point.x <= open_text_run.endx &&
|
43
46
|
open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
|
44
47
|
return true
|
@@ -51,10 +54,14 @@ class PDF::Reader
|
|
51
54
|
# Utility class used to avoid modifying the underlying TextRun objects while we're
|
52
55
|
# looking for duplicates
|
53
56
|
class EventPoint
|
54
|
-
attr_reader :x, :run
|
55
57
|
|
56
|
-
|
57
|
-
|
58
|
+
attr_reader :x
|
59
|
+
|
60
|
+
attr_reader :run
|
61
|
+
|
62
|
+
def initialize(x, run)
|
63
|
+
@x = x
|
64
|
+
@run = run
|
58
65
|
end
|
59
66
|
|
60
67
|
def start?
|
data/lib/pdf/reader/page.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
module PDF
|
@@ -13,7 +14,7 @@ module PDF
|
|
13
14
|
# objects accessor to help walk the page dictionary in any useful way.
|
14
15
|
#
|
15
16
|
class Page
|
16
|
-
|
17
|
+
extend Forwardable
|
17
18
|
|
18
19
|
# lowlevel hash-like access to all objects in the underlying PDF
|
19
20
|
attr_reader :objects
|
@@ -26,6 +27,15 @@ module PDF
|
|
26
27
|
# operations
|
27
28
|
attr_reader :cache
|
28
29
|
|
30
|
+
def_delegators :resources, :color_spaces
|
31
|
+
def_delegators :resources, :fonts
|
32
|
+
def_delegators :resources, :graphic_states
|
33
|
+
def_delegators :resources, :patterns
|
34
|
+
def_delegators :resources, :procedure_sets
|
35
|
+
def_delegators :resources, :properties
|
36
|
+
def_delegators :resources, :shadings
|
37
|
+
def_delegators :resources, :xobjects
|
38
|
+
|
29
39
|
# creates a new page wrapper.
|
30
40
|
#
|
31
41
|
# * objects - an ObjectHash instance that wraps a PDF file
|
@@ -33,7 +43,7 @@ module PDF
|
|
33
43
|
#
|
34
44
|
def initialize(objects, pagenum, options = {})
|
35
45
|
@objects, @pagenum = objects, pagenum
|
36
|
-
@page_object = objects.
|
46
|
+
@page_object = objects.deref_hash(objects.page_references[pagenum - 1])
|
37
47
|
@cache = options[:cache] || {}
|
38
48
|
|
39
49
|
unless @page_object.is_a?(::Hash)
|
@@ -59,7 +69,7 @@ module PDF
|
|
59
69
|
def attributes
|
60
70
|
@attributes ||= {}.tap { |hash|
|
61
71
|
page_with_ancestors.reverse.each do |obj|
|
62
|
-
hash.merge!(@objects.
|
72
|
+
hash.merge!(@objects.deref_hash(obj) || {})
|
63
73
|
end
|
64
74
|
}
|
65
75
|
# This shouldn't be necessary, but some non-compliant PDFs leave MediaBox
|
@@ -68,22 +78,56 @@ module PDF
|
|
68
78
|
@attributes
|
69
79
|
end
|
70
80
|
|
81
|
+
def height
|
82
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
83
|
+
rect.apply_rotation(rotate) if rotate > 0
|
84
|
+
rect.height
|
85
|
+
end
|
86
|
+
|
87
|
+
def width
|
88
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
89
|
+
rect.apply_rotation(rotate) if rotate > 0
|
90
|
+
rect.width
|
91
|
+
end
|
92
|
+
|
93
|
+
def origin
|
94
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
95
|
+
rect.apply_rotation(rotate) if rotate > 0
|
96
|
+
|
97
|
+
rect.bottom_left
|
98
|
+
end
|
99
|
+
|
71
100
|
# Convenience method to identify the page's orientation.
|
72
101
|
#
|
73
102
|
def orientation
|
74
|
-
|
103
|
+
if height > width
|
104
|
+
"portrait"
|
105
|
+
else
|
106
|
+
"landscape"
|
107
|
+
end
|
75
108
|
end
|
76
109
|
|
77
110
|
# returns the plain text content of this page encoded as UTF-8. Any
|
78
111
|
# characters that can't be translated will be returned as a ▯
|
79
112
|
#
|
80
|
-
def text
|
113
|
+
def text(opts = {})
|
81
114
|
receiver = PageTextReceiver.new
|
82
115
|
walk(receiver)
|
83
|
-
receiver.
|
116
|
+
runs = receiver.runs(opts)
|
117
|
+
|
118
|
+
# rectangles[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
|
119
|
+
mediabox = rectangles[:MediaBox] || Rectangle.new(0, 0, 0, 0)
|
120
|
+
|
121
|
+
PageLayout.new(runs, mediabox).to_s
|
84
122
|
end
|
85
123
|
alias :to_s :text
|
86
124
|
|
125
|
+
def runs(opts = {})
|
126
|
+
receiver = PageTextReceiver.new
|
127
|
+
walk(receiver)
|
128
|
+
receiver.runs(opts)
|
129
|
+
end
|
130
|
+
|
87
131
|
# processes the raw content stream for this page in sequential order and
|
88
132
|
# passes callbacks to the receiver objects.
|
89
133
|
#
|
@@ -108,6 +152,9 @@ module PDF
|
|
108
152
|
# the program in the correct order and calls out to your implementation.
|
109
153
|
#
|
110
154
|
def walk(*receivers)
|
155
|
+
receivers = receivers.map { |receiver|
|
156
|
+
ValidatingReceiver.new(receiver)
|
157
|
+
}
|
111
158
|
callback(receivers, :page=, [self])
|
112
159
|
content_stream(receivers, raw_content)
|
113
160
|
end
|
@@ -116,10 +163,10 @@ module PDF
|
|
116
163
|
# see here unless you're a PDF nerd like me.
|
117
164
|
#
|
118
165
|
def raw_content
|
119
|
-
contents = objects.
|
166
|
+
contents = objects.deref_stream_or_array(@page_object[:Contents])
|
120
167
|
[contents].flatten.compact.map { |obj|
|
121
|
-
objects.
|
122
|
-
}.map { |obj|
|
168
|
+
objects.deref_stream(obj)
|
169
|
+
}.compact.map { |obj|
|
123
170
|
obj.unfiltered_data
|
124
171
|
}.join(" ")
|
125
172
|
end
|
@@ -139,30 +186,62 @@ module PDF
|
|
139
186
|
# returns the "boxes" that define the page object.
|
140
187
|
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
141
188
|
#
|
189
|
+
# DEPRECATED. Recommend using Page#rectangles instead
|
190
|
+
#
|
142
191
|
def boxes
|
143
|
-
|
144
|
-
|
192
|
+
# In ruby 2.4+ we could use Hash#transform_values
|
193
|
+
Hash[rectangles.map{ |k,rect| [k,rect.to_a] } ]
|
194
|
+
end
|
195
|
+
|
196
|
+
# returns the "boxes" that define the page object.
|
197
|
+
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
198
|
+
#
|
199
|
+
def rectangles
|
200
|
+
# attributes[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
|
201
|
+
mediabox = objects.deref_array_of_numbers(attributes[:MediaBox]) || []
|
202
|
+
cropbox = objects.deref_array_of_numbers(attributes[:CropBox]) || mediabox
|
203
|
+
bleedbox = objects.deref_array_of_numbers(attributes[:BleedBox]) || cropbox
|
204
|
+
trimbox = objects.deref_array_of_numbers(attributes[:TrimBox]) || cropbox
|
205
|
+
artbox = objects.deref_array_of_numbers(attributes[:ArtBox]) || cropbox
|
206
|
+
|
207
|
+
begin
|
208
|
+
mediarect = Rectangle.from_array(mediabox)
|
209
|
+
croprect = Rectangle.from_array(cropbox)
|
210
|
+
bleedrect = Rectangle.from_array(bleedbox)
|
211
|
+
trimrect = Rectangle.from_array(trimbox)
|
212
|
+
artrect = Rectangle.from_array(artbox)
|
213
|
+
rescue ArgumentError => e
|
214
|
+
raise MalformedPDFError, e.message
|
215
|
+
end
|
216
|
+
|
217
|
+
if rotate > 0
|
218
|
+
mediarect.apply_rotation(rotate)
|
219
|
+
croprect.apply_rotation(rotate)
|
220
|
+
bleedrect.apply_rotation(rotate)
|
221
|
+
trimrect.apply_rotation(rotate)
|
222
|
+
artrect.apply_rotation(rotate)
|
223
|
+
end
|
145
224
|
|
146
225
|
{
|
147
|
-
MediaBox:
|
148
|
-
CropBox:
|
149
|
-
BleedBox:
|
150
|
-
TrimBox:
|
151
|
-
ArtBox:
|
226
|
+
MediaBox: mediarect,
|
227
|
+
CropBox: croprect,
|
228
|
+
BleedBox: bleedrect,
|
229
|
+
TrimBox: trimrect,
|
230
|
+
ArtBox: artrect,
|
152
231
|
}
|
153
232
|
end
|
154
233
|
|
155
234
|
private
|
156
235
|
|
157
236
|
def root
|
158
|
-
root ||= objects.
|
237
|
+
@root ||= objects.deref_hash(@objects.trailer[:Root]) || {}
|
159
238
|
end
|
160
239
|
|
161
240
|
# Returns the resources that accompany this page. Includes
|
162
241
|
# resources inherited from parents.
|
163
242
|
#
|
164
243
|
def resources
|
165
|
-
@resources ||= @objects.
|
244
|
+
@resources ||= Resources.new(@objects, @objects.deref_hash(attributes[:Resources]) || {})
|
166
245
|
end
|
167
246
|
|
168
247
|
def content_stream(receivers, instructions)
|
@@ -198,7 +277,8 @@ module PDF
|
|
198
277
|
if origin.nil?
|
199
278
|
[]
|
200
279
|
else
|
201
|
-
obj = objects.
|
280
|
+
obj = objects.deref_hash(origin)
|
281
|
+
PDF::Reader::Error.validate_not_nil_as_malformed(obj, "parent")
|
202
282
|
[ select_inheritable(obj) ] + ancestors(obj[:Parent])
|
203
283
|
end
|
204
284
|
end
|