pdf-reader 2.7.0 → 2.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +20 -0
- data/Rakefile +1 -1
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +36 -34
- data/lib/pdf/reader/cmap.rb +64 -51
- data/lib/pdf/reader/error.rb +8 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +1 -1
- data/lib/pdf/reader/filter/depredict.rb +1 -1
- data/lib/pdf/reader/filter/flate.rb +3 -3
- data/lib/pdf/reader/filter/lzw.rb +1 -1
- data/lib/pdf/reader/filter/null.rb +1 -2
- data/lib/pdf/reader/filter/run_length.rb +1 -1
- data/lib/pdf/reader/filter.rb +10 -11
- data/lib/pdf/reader/font.rb +71 -16
- data/lib/pdf/reader/font_descriptor.rb +18 -17
- data/lib/pdf/reader/form_xobject.rb +14 -5
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/null_security_handler.rb +0 -4
- data/lib/pdf/reader/object_hash.rb +251 -44
- data/lib/pdf/reader/page.rb +51 -22
- data/lib/pdf/reader/page_layout.rb +14 -28
- data/lib/pdf/reader/page_state.rb +1 -1
- data/lib/pdf/reader/page_text_receiver.rb +52 -10
- data/lib/pdf/reader/parser.rb +22 -7
- data/lib/pdf/reader/point.rb +1 -1
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +20 -2
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +15 -13
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -95
- data/lib/pdf/reader/stream.rb +2 -2
- data/lib/pdf/reader/text_run.rb +13 -6
- data/lib/pdf/reader/type_check.rb +52 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -1
- data/lib/pdf/reader/xref.rb +20 -3
- data/lib/pdf/reader.rb +32 -11
- data/rbi/pdf-reader.rbi +408 -174
- metadata +16 -9
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -92
@@ -48,7 +48,11 @@ class PDF::Reader
|
|
48
48
|
@trailer = @xref.trailer
|
49
49
|
@cache = opts[:cache] || PDF::Reader::ObjectCache.new
|
50
50
|
@sec_handler = NullSecurityHandler.new
|
51
|
-
@sec_handler =
|
51
|
+
@sec_handler = SecurityHandlerFactory.build(
|
52
|
+
deref(trailer[:Encrypt]),
|
53
|
+
deref(trailer[:ID]),
|
54
|
+
opts[:password]
|
55
|
+
)
|
52
56
|
end
|
53
57
|
|
54
58
|
# returns the type of object a ref points to
|
@@ -92,6 +96,218 @@ class PDF::Reader
|
|
92
96
|
end
|
93
97
|
alias :deref :object
|
94
98
|
|
99
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Guaranteed to only return an Array or nil. If the dereference results in
# any other type then a MalformedPDFError exception will raise. Useful when
# expecting an Array and no other type will do.
def deref_array(key)
  obj = deref(key)

  # nil is an accepted result; anything else must already be an Array
  return obj if obj.nil? || obj.is_a?(Array)

  raise MalformedPDFError, "expected object to be an Array or nil"
end
|
114
|
+
|
115
|
+
# Dereferences key (looking it up when it is a PDF::Reader::Reference, using
# it untouched otherwise) and insists that the result is an Array of numbers.
#
# Returns nil when the dereferenced value is nil. Raises MalformedPDFError
# when the value is not an Array, or when an element is neither Numeric nor
# convertible via to_f / to_i.
#
# Non-numeric elements are cast on a best-effort basis: to_f is tried first,
# with to_i as the fallback.
def deref_array_of_numbers(key)
  arr = deref(key)

  return arr if arr.nil?

  unless arr.is_a?(Array)
    raise MalformedPDFError, "expected object to be an Array"
  end

  arr.map do |element|
    next element if element.is_a?(Numeric)
    next element.to_f if element.respond_to?(:to_f)
    next element.to_i if element.respond_to?(:to_i)

    raise MalformedPDFError, "expected object to be a number"
  end
end
|
142
|
+
|
143
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Guaranteed to only return a Hash or nil. If the dereference results in
# any other type then a MalformedPDFError exception will raise. Useful when
# expecting a Hash and no other type will do.
def deref_hash(key)
  obj = deref(key)

  # nil is an accepted result; anything else must already be a Hash
  return obj if obj.nil? || obj.is_a?(Hash)

  raise MalformedPDFError, "expected object to be a Hash or nil"
end
|
158
|
+
|
159
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Guaranteed to only return a PDF name (Symbol) or nil. If the dereference results in
# any other type then a MalformedPDFError exception will raise. Useful when
# expecting a Name and no other type will do.
#
# Some effort to cast to a symbol is made when the reference points to a non-symbol.
def deref_name(key)
  obj = deref(key)

  return obj if obj.nil? || obj.is_a?(Symbol)

  # best-effort cast: anything that can become a Symbol is accepted
  raise MalformedPDFError, "expected object to be a Name" unless obj.respond_to?(:to_sym)

  obj.to_sym
end
|
182
|
+
|
183
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Guaranteed to only return an Integer or nil. If the dereference results in
# any other type then a MalformedPDFError exception will raise. Useful when
# expecting an Integer and no other type will do.
#
# Some effort to cast to an int is made when the reference points to a non-integer.
def deref_integer(key)
  obj = deref(key)

  return obj if obj.nil? || obj.is_a?(Integer)

  # best-effort cast: anything that responds to to_i is converted
  raise MalformedPDFError, "expected object to be an Integer" unless obj.respond_to?(:to_i)

  obj.to_i
end
|
206
|
+
|
207
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Guaranteed to only return a Numeric or nil. If the dereference results in
# any other type then a MalformedPDFError exception will raise. Useful when
# expecting a number and no other type will do.
#
# Some effort to cast to a number is made when the reference points to a non-number.
def deref_number(key)
  obj = deref(key)

  return obj if obj.nil? || obj.is_a?(Numeric)

  # best-effort cast: to_f is preferred, to_i is the fallback. The converted
  # value is what gets returned, never the original non-numeric object.
  if obj.respond_to?(:to_f)
    obj.to_f
  elsif obj.respond_to?(:to_i)
    obj.to_i
  else
    raise MalformedPDFError, "expected object to be a number"
  end
end
|
232
|
+
|
233
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Guaranteed to only return a PDF::Reader::Stream or nil. If the dereference results in
# any other type then a MalformedPDFError exception will raise. Useful when
# expecting a stream and no other type will do.
def deref_stream(key)
  obj = deref(key)

  # nil is an accepted result; anything else must already be a Stream
  return obj if obj.nil? || obj.is_a?(PDF::Reader::Stream)

  raise MalformedPDFError, "expected object to be a Stream or nil"
end
|
250
|
+
|
251
|
+
# Dereferences key (looking it up when it is a PDF::Reader::Reference, using
# it untouched otherwise) and insists that the result is a String.
#
# Returns nil when the dereferenced value is nil and returns Strings as-is.
# Any other value is cast with to_s when it responds to it; otherwise a
# MalformedPDFError is raised.
def deref_string(key)
  obj = deref(key)

  return obj if obj.nil? || obj.is_a?(String)

  unless obj.respond_to?(:to_s)
    raise MalformedPDFError, "expected object to be a string"
  end

  obj.to_s
end
|
274
|
+
|
275
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Guaranteed to only return a PDF Name (symbol), Array or nil. If the dereference results in
# any other type then a MalformedPDFError exception will raise. Useful when
# expecting a Name or Array and no other type will do.
def deref_name_or_array(key)
  obj = deref(key)

  # nil, Symbols and Arrays are returned untouched; nothing is cast here
  return obj if obj.nil? || obj.is_a?(Symbol) || obj.is_a?(Array)

  raise MalformedPDFError, "expected object to be an Array or Name"
end
|
292
|
+
|
293
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Guaranteed to only return a PDF::Reader::Stream, Array or nil. If the dereference results in
# any other type then a MalformedPDFError exception will raise. Useful when
# expecting a stream or Array and no other type will do.
def deref_stream_or_array(key)
  obj = deref(key)

  # nil, Streams and Arrays are returned untouched; nothing is cast here
  return obj if obj.nil? || obj.is_a?(PDF::Reader::Stream) || obj.is_a?(Array)

  raise MalformedPDFError, "expected object to be an Array or Stream"
end
|
310
|
+
|
95
311
|
# Recursively dereferences the object referred to by +key+. If +key+ is not
|
96
312
|
# a PDF::Reader::Reference, the key is returned unchanged.
|
97
313
|
#
|
@@ -99,6 +315,22 @@ class PDF::Reader
|
|
99
315
|
deref_internal!(key, {})
|
100
316
|
end
|
101
317
|
|
318
|
+
# Dereferences key with deref! and verifies that the result is an Array or
# nil. Any other result raises MalformedPDFError; the message includes the
# inspected value.
def deref_array!(key)
  result = deref!(key)

  return result if result.nil? || result.is_a?(Array)

  raise MalformedPDFError, "expected object (#{result.inspect}) to be an Array or nil"
end
|
325
|
+
|
326
|
+
# Dereferences key with deref! and verifies that the result is a Hash or
# nil. Any other result raises MalformedPDFError; the message includes the
# inspected value.
def deref_hash!(key)
  result = deref!(key)

  return result if result.nil? || result.is_a?(Hash)

  raise MalformedPDFError, "expected object (#{result.inspect}) to be a Hash or nil"
end
|
333
|
+
|
102
334
|
# Access an object from the PDF. key can be an int or a PDF::Reader::Reference
|
103
335
|
# object.
|
104
336
|
#
|
@@ -233,7 +465,10 @@ class PDF::Reader
|
|
233
465
|
#
|
234
466
|
def page_references
|
235
467
|
root = fetch(trailer[:Root])
|
236
|
-
@page_references ||=
|
468
|
+
@page_references ||= begin
|
469
|
+
pages_root = deref_hash(root[:Pages]) || {}
|
470
|
+
get_page_objects(pages_root)
|
471
|
+
end
|
237
472
|
end
|
238
473
|
|
239
474
|
def encrypted?
|
@@ -299,36 +534,6 @@ class PDF::Reader
|
|
299
534
|
end
|
300
535
|
end
|
301
536
|
|
302
|
-
def build_security_handler(opts = {})
|
303
|
-
encrypt = deref(trailer[:Encrypt])
|
304
|
-
if NullSecurityHandler.supports?(encrypt)
|
305
|
-
NullSecurityHandler.new
|
306
|
-
elsif StandardSecurityHandler.supports?(encrypt)
|
307
|
-
encmeta = !encrypt.has_key?(:EncryptMetadata) || encrypt[:EncryptMetadata].to_s == "true"
|
308
|
-
StandardSecurityHandler.new(
|
309
|
-
key_length: (encrypt[:Length] || 40).to_i,
|
310
|
-
revision: encrypt[:R],
|
311
|
-
owner_key: encrypt[:O],
|
312
|
-
user_key: encrypt[:U],
|
313
|
-
permissions: encrypt[:P].to_i,
|
314
|
-
encrypted_metadata: encmeta,
|
315
|
-
file_id: (deref(trailer[:ID]) || []).first,
|
316
|
-
password: opts[:password],
|
317
|
-
cfm: encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
|
318
|
-
)
|
319
|
-
elsif StandardSecurityHandlerV5.supports?(encrypt)
|
320
|
-
StandardSecurityHandlerV5.new(
|
321
|
-
O: encrypt[:O],
|
322
|
-
U: encrypt[:U],
|
323
|
-
OE: encrypt[:OE],
|
324
|
-
UE: encrypt[:UE],
|
325
|
-
password: opts[:password]
|
326
|
-
)
|
327
|
-
else
|
328
|
-
UnimplementedSecurityHandler.new
|
329
|
-
end
|
330
|
-
end
|
331
|
-
|
332
537
|
def decrypt(ref, obj)
|
333
538
|
case obj
|
334
539
|
when PDF::Reader::Stream then
|
@@ -362,19 +567,21 @@ class PDF::Reader
|
|
362
567
|
@object_stream ||= {}
|
363
568
|
end
|
364
569
|
|
365
|
-
# returns
|
570
|
+
# returns an array of object references for all pages in this object store. The ordering of
|
571
|
+
# the Array is significant and matches the page ordering of the document
|
366
572
|
#
|
367
|
-
def get_page_objects(
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
573
|
+
# Walks the page tree starting at obj and collects the entries that resolve
# to a dictionary whose :Type is :Page. The original (un-dereferenced) obj is
# what gets collected, in tree order.
#
# obj is dereferenced with deref_hash. Raises MalformedPDFError when it
# resolves to nil, or to a dictionary that is neither a Page nor has :Kids.
def get_page_objects(obj)
  derefed_obj = deref_hash(obj)

  # deref_hash may legitimately return nil (e.g. a dangling reference);
  # report that as a malformed file instead of failing with NoMethodError
  if derefed_obj.nil?
    raise MalformedPDFError, "Expected Page or Pages object, got nil"
  elsif derefed_obj[:Type] == :Page
    [obj]
  elsif derefed_obj[:Kids]
    kids = deref_array(derefed_obj[:Kids]) || []
    kids.flat_map { |kid| get_page_objects(kid) }
  else
    raise MalformedPDFError, "Expected Page or Pages object"
  end
end
|
380
587
|
|
data/lib/pdf/reader/page.rb
CHANGED
@@ -14,7 +14,7 @@ module PDF
|
|
14
14
|
# objects accessor to help walk the page dictionary in any useful way.
|
15
15
|
#
|
16
16
|
class Page
|
17
|
-
|
17
|
+
extend Forwardable
|
18
18
|
|
19
19
|
# lowlevel hash-like access to all objects in the underlying PDF
|
20
20
|
attr_reader :objects
|
@@ -27,6 +27,15 @@ module PDF
|
|
27
27
|
# operations
|
28
28
|
attr_reader :cache
|
29
29
|
|
30
|
+
# Forward these readers to whatever #resources returns.
def_delegators :resources,
               :color_spaces,
               :fonts,
               :graphic_states,
               :patterns,
               :procedure_sets,
               :properties,
               :shadings,
               :xobjects
|
38
|
+
|
30
39
|
# creates a new page wrapper.
|
31
40
|
#
|
32
41
|
# * objects - an ObjectHash instance that wraps a PDF file
|
@@ -34,7 +43,7 @@ module PDF
|
|
34
43
|
#
|
35
44
|
def initialize(objects, pagenum, options = {})
|
36
45
|
@objects, @pagenum = objects, pagenum
|
37
|
-
@page_object = objects.
|
46
|
+
@page_object = objects.deref_hash(objects.page_references[pagenum - 1])
|
38
47
|
@cache = options[:cache] || {}
|
39
48
|
|
40
49
|
unless @page_object.is_a?(::Hash)
|
@@ -60,7 +69,7 @@ module PDF
|
|
60
69
|
def attributes
|
61
70
|
@attributes ||= {}.tap { |hash|
|
62
71
|
page_with_ancestors.reverse.each do |obj|
|
63
|
-
hash.merge!(@objects.
|
72
|
+
hash.merge!(@objects.deref_hash(obj) || {})
|
64
73
|
end
|
65
74
|
}
|
66
75
|
# This shouldn't be necessary, but some non-compliant PDFs leave MediaBox
|
@@ -101,13 +110,24 @@ module PDF
|
|
101
110
|
# returns the plain text content of this page encoded as UTF-8. Any
|
102
111
|
# characters that can't be translated will be returned as a ▯
|
103
112
|
#
|
104
|
-
def text
|
113
|
+
def text(opts = {})
|
105
114
|
receiver = PageTextReceiver.new
|
106
115
|
walk(receiver)
|
107
|
-
receiver.
|
116
|
+
runs = receiver.runs(opts)
|
117
|
+
|
118
|
+
# rectangles[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
|
119
|
+
mediabox = rectangles[:MediaBox] || Rectangle.new(0, 0, 0, 0)
|
120
|
+
|
121
|
+
PageLayout.new(runs, mediabox).to_s
|
108
122
|
end
|
109
123
|
alias :to_s :text
|
110
124
|
|
125
|
+
# Walks the page content with a fresh PageTextReceiver and returns the result
# of calling runs(opts) on that receiver. opts is forwarded untouched.
def runs(opts = {})
  PageTextReceiver.new.tap { |receiver| walk(receiver) }.runs(opts)
end
|
130
|
+
|
111
131
|
# processes the raw content stream for this page in sequential order and
|
112
132
|
# passes callbacks to the receiver objects.
|
113
133
|
#
|
@@ -132,6 +152,9 @@ module PDF
|
|
132
152
|
# the program in the correct order and calls out to your implementation.
|
133
153
|
#
|
134
154
|
def walk(*receivers)
  # every receiver is wrapped in a ValidatingReceiver before any callback is sent to it
  validated = receivers.map { |receiver| ValidatingReceiver.new(receiver) }

  callback(validated, :page=, [self])
  content_stream(validated, raw_content)
end
|
@@ -140,10 +163,10 @@ module PDF
|
|
140
163
|
# see here unless you're a PDF nerd like me.
|
141
164
|
#
|
142
165
|
def raw_content
|
143
|
-
contents = objects.
|
166
|
+
contents = objects.deref_stream_or_array(@page_object[:Contents])
|
144
167
|
[contents].flatten.compact.map { |obj|
|
145
|
-
objects.
|
146
|
-
}.map { |obj|
|
168
|
+
objects.deref_stream(obj)
|
169
|
+
}.compact.map { |obj|
|
147
170
|
obj.unfiltered_data
|
148
171
|
}.join(" ")
|
149
172
|
end
|
@@ -174,17 +197,22 @@ module PDF
|
|
174
197
|
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
175
198
|
#
|
176
199
|
def rectangles
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
200
|
+
# attributes[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
|
201
|
+
mediabox = objects.deref_array_of_numbers(attributes[:MediaBox]) || []
|
202
|
+
cropbox = objects.deref_array_of_numbers(attributes[:CropBox]) || mediabox
|
203
|
+
bleedbox = objects.deref_array_of_numbers(attributes[:BleedBox]) || cropbox
|
204
|
+
trimbox = objects.deref_array_of_numbers(attributes[:TrimBox]) || cropbox
|
205
|
+
artbox = objects.deref_array_of_numbers(attributes[:ArtBox]) || cropbox
|
206
|
+
|
207
|
+
begin
|
208
|
+
mediarect = Rectangle.from_array(mediabox)
|
209
|
+
croprect = Rectangle.from_array(cropbox)
|
210
|
+
bleedrect = Rectangle.from_array(bleedbox)
|
211
|
+
trimrect = Rectangle.from_array(trimbox)
|
212
|
+
artrect = Rectangle.from_array(artbox)
|
213
|
+
rescue ArgumentError => e
|
214
|
+
raise MalformedPDFError, e.message
|
215
|
+
end
|
188
216
|
|
189
217
|
if rotate > 0
|
190
218
|
mediarect.apply_rotation(rotate)
|
@@ -206,14 +234,14 @@ module PDF
|
|
206
234
|
private
|
207
235
|
|
208
236
|
def root
|
209
|
-
|
237
|
+
@root ||= objects.deref_hash(@objects.trailer[:Root]) || {}
|
210
238
|
end
|
211
239
|
|
212
240
|
# Returns the resources that accompany this page. Includes
|
213
241
|
# resources inherited from parents.
|
214
242
|
#
|
215
243
|
def resources
|
216
|
-
@resources ||= @objects.
|
244
|
+
@resources ||= Resources.new(@objects, @objects.deref_hash(attributes[:Resources]) || {})
|
217
245
|
end
|
218
246
|
|
219
247
|
def content_stream(receivers, instructions)
|
@@ -249,7 +277,8 @@ module PDF
|
|
249
277
|
if origin.nil?
|
250
278
|
[]
|
251
279
|
else
|
252
|
-
obj = objects.
|
280
|
+
obj = objects.deref_hash(origin)
|
281
|
+
PDF::Reader::Error.validate_not_nil_as_malformed(obj, "parent")
|
253
282
|
[ select_inheritable(obj) ] + ancestors(obj[:Parent])
|
254
283
|
end
|
255
284
|
end
|
@@ -21,10 +21,8 @@ class PDF::Reader
|
|
21
21
|
# PDF::Reader::Rectangle at some point
|
22
22
|
PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
|
23
23
|
|
24
|
-
|
25
|
-
runs =
|
26
|
-
@mediabox = mediabox
|
27
|
-
@runs = merge_runs(runs)
|
24
|
+
@mediabox = process_mediabox(mediabox)
|
25
|
+
@runs = runs
|
28
26
|
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
|
29
27
|
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
|
30
28
|
@median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
|
@@ -51,13 +49,11 @@ class PDF::Reader
|
|
51
49
|
private
|
52
50
|
|
53
51
|
def page_width
|
54
|
-
|
55
|
-
(@mediabox[2].to_f - @mediabox[0].to_f).abs
|
52
|
+
@mediabox.width
|
56
53
|
end
|
57
54
|
|
58
55
|
def page_height
|
59
|
-
|
60
|
-
(@mediabox[3].to_f - @mediabox[1].to_f).abs
|
56
|
+
@mediabox.height
|
61
57
|
end
|
62
58
|
|
63
59
|
# given an array of strings, return a new array with empty rows from the
|
@@ -109,30 +105,20 @@ class PDF::Reader
|
|
109
105
|
end
|
110
106
|
end
|
111
107
|
|
112
|
-
|
113
|
-
|
114
|
-
def merge_runs(runs)
|
115
|
-
runs.group_by { |char|
|
116
|
-
char.y.to_i
|
117
|
-
}.map { |y, chars|
|
118
|
-
group_chars_into_runs(chars.sort)
|
119
|
-
}.flatten.sort
|
108
|
+
def local_string_insert(haystack, needle, index)
|
109
|
+
haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
|
120
110
|
end
|
121
111
|
|
122
|
-
def
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
end
|
112
|
+
def process_mediabox(mediabox)
|
113
|
+
if mediabox.is_a?(Array)
|
114
|
+
msg = "Passing the mediabox to PageLayout as an Array is deprecated," +
|
115
|
+
" please use a Rectangle instead"
|
116
|
+
$stderr.puts msg
|
117
|
+
PDF::Reader::Rectangle.from_array(mediabox)
|
118
|
+
else
|
119
|
+
mediabox
|
131
120
|
end
|
132
121
|
end
|
133
122
|
|
134
|
-
def local_string_insert(haystack, needle, index)
|
135
|
-
haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
|
136
|
-
end
|
137
123
|
end
|
138
124
|
end
|
@@ -384,7 +384,7 @@ class PDF::Reader
|
|
384
384
|
#
|
385
385
|
def build_fonts(raw_fonts)
|
386
386
|
wrapped_fonts = raw_fonts.map { |label, font|
|
387
|
-
[label, PDF::Reader::Font.new(@objects, @objects.
|
387
|
+
[label, PDF::Reader::Font.new(@objects, @objects.deref_hash(font) || {})]
|
388
388
|
}
|
389
389
|
|
390
390
|
::Hash[wrapped_fonts]
|
@@ -47,9 +47,32 @@ module PDF
|
|
47
47
|
@characters = []
|
48
48
|
end
|
49
49
|
|
50
|
+
# Returns the collected characters after passing them through a series of
# optional steps, each controlled by a key in opts:
#
# * :rect - when truthy, BoundingRectangleRunsFilter.runs_within_rect is
#   applied with it. Defaults to the page's CropBox rectangle; pass nil to
#   skip this step.
# * :skip_zero_width - apply ZeroWidthRunsFilter.exclude_zero_width_runs
#   (default true)
# * :skip_overlapping - apply OverlappingRunsFilter.exclude_redundant_runs
#   (default true)
# * :merge - apply merge_runs (default true)
def runs(opts = {})
  selected = @characters

  # block form of fetch: the page rectangles are only computed when the
  # caller did not supply :rect
  rect = opts.fetch(:rect) { @page.rectangles[:CropBox] }
  selected = BoundingRectangleRunsFilter.runs_within_rect(selected, rect) if rect

  if opts.fetch(:skip_zero_width, true)
    selected = ZeroWidthRunsFilter.exclude_zero_width_runs(selected)
  end

  if opts.fetch(:skip_overlapping, true)
    selected = OverlappingRunsFilter.exclude_redundant_runs(selected)
  end

  selected = merge_runs(selected) if opts.fetch(:merge, true)

  selected
end
|
71
|
+
|
72
|
+
# deprecated
|
50
73
|
def content
|
51
|
-
mediabox = @page.rectangles[:MediaBox]
|
52
|
-
PageLayout.new(
|
74
|
+
mediabox = @page.rectangles[:MediaBox]
|
75
|
+
PageLayout.new(runs, mediabox).to_s
|
53
76
|
end
|
54
77
|
|
55
78
|
#####################################################
|
@@ -64,8 +87,10 @@ module PDF
|
|
64
87
|
params.each do |arg|
|
65
88
|
if arg.is_a?(String)
|
66
89
|
internal_show_text(arg)
|
67
|
-
|
90
|
+
elsif arg.is_a?(Numeric)
|
68
91
|
@state.process_glyph_displacement(0, arg, false)
|
92
|
+
else
|
93
|
+
# skip it
|
69
94
|
end
|
70
95
|
end
|
71
96
|
end
|
@@ -96,6 +121,7 @@ module PDF
|
|
96
121
|
private
|
97
122
|
|
98
123
|
def internal_show_text(string)
|
124
|
+
PDF::Reader::Error.validate_type_as_malformed(string, "string", String)
|
99
125
|
if @state.current_font.nil?
|
100
126
|
raise PDF::Reader::MalformedPDFError, "current font is invalid"
|
101
127
|
end
|
@@ -109,7 +135,7 @@ module PDF
|
|
109
135
|
|
110
136
|
# apply to glyph displacement for the current glyph so the next
|
111
137
|
# glyph will appear in the correct position
|
112
|
-
glyph_width = @state.current_font.
|
138
|
+
glyph_width = @state.current_font.glyph_width_in_text_space(glyph_code)
|
113
139
|
th = 1
|
114
140
|
scaled_glyph_width = glyph_width * @state.font_size * th
|
115
141
|
unless utf8_chars == SPACE
|
@@ -119,12 +145,6 @@ module PDF
|
|
119
145
|
end
|
120
146
|
end
|
121
147
|
|
122
|
-
# TODO: revist this. It rotates the co-ordinates to the right direction, but I don't
|
123
|
-
# think it sets the correct x,y values. We get away with it because we don't
|
124
|
-
# return the text with co-ordinates, only the full text arranged in a string.
|
125
|
-
#
|
126
|
-
# We should provide an API for extracting the text with positioning data and spec
|
127
|
-
# that. I suspect the co-ords might be wrong for rotated pages
|
128
148
|
def apply_rotation(x, y)
|
129
149
|
if @page.rotate == 90
|
130
150
|
tmp = x
|
@@ -141,6 +161,28 @@ module PDF
|
|
141
161
|
return x, y
|
142
162
|
end
|
143
163
|
|
164
|
+
# take a collection of TextRun objects and merge any that are in close
# proximity
#
# Runs are bucketed by the integer part of their y value, each bucket is
# sorted and handed to group_chars_into_runs, and the combined result is
# sorted again before being returned.
def merge_runs(runs)
  rows = runs.group_by { |char| char.y.to_i }

  rows.values.flat_map { |chars| group_chars_into_runs(chars.sort) }.sort
end
|
173
|
+
|
174
|
+
# Folds chars, in the order given, into a new Array. A char is combined
# (via +) with the most recently collected entry when that entry reports
# mergable?(char); otherwise it is appended as a new entry.
def group_chars_into_runs(chars)
  chars.each_with_object([]) do |char, runs|
    if !runs.empty? && runs.last.mergable?(char)
      runs[-1] = runs.last + char
    else
      runs << char
    end
  end
end
|
185
|
+
|
144
186
|
end
|
145
187
|
end
|
146
188
|
end
|