pdf-reader 2.7.0 → 2.9.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +20 -0
- data/Rakefile +1 -1
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +36 -34
- data/lib/pdf/reader/cmap.rb +64 -51
- data/lib/pdf/reader/error.rb +8 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +1 -1
- data/lib/pdf/reader/filter/depredict.rb +1 -1
- data/lib/pdf/reader/filter/flate.rb +3 -3
- data/lib/pdf/reader/filter/lzw.rb +1 -1
- data/lib/pdf/reader/filter/null.rb +1 -2
- data/lib/pdf/reader/filter/run_length.rb +1 -1
- data/lib/pdf/reader/filter.rb +10 -11
- data/lib/pdf/reader/font.rb +71 -16
- data/lib/pdf/reader/font_descriptor.rb +18 -17
- data/lib/pdf/reader/form_xobject.rb +14 -5
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/null_security_handler.rb +0 -4
- data/lib/pdf/reader/object_hash.rb +251 -44
- data/lib/pdf/reader/page.rb +51 -22
- data/lib/pdf/reader/page_layout.rb +14 -28
- data/lib/pdf/reader/page_state.rb +1 -1
- data/lib/pdf/reader/page_text_receiver.rb +52 -10
- data/lib/pdf/reader/parser.rb +22 -7
- data/lib/pdf/reader/point.rb +1 -1
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +20 -2
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +15 -13
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -95
- data/lib/pdf/reader/stream.rb +2 -2
- data/lib/pdf/reader/text_run.rb +13 -6
- data/lib/pdf/reader/type_check.rb +52 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -1
- data/lib/pdf/reader/xref.rb +20 -3
- data/lib/pdf/reader.rb +32 -11
- data/rbi/pdf-reader.rbi +408 -174
- metadata +16 -9
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -92
@@ -48,7 +48,11 @@ class PDF::Reader
|
|
48
48
|
@trailer = @xref.trailer
|
49
49
|
@cache = opts[:cache] || PDF::Reader::ObjectCache.new
|
50
50
|
@sec_handler = NullSecurityHandler.new
|
51
|
-
@sec_handler =
|
51
|
+
@sec_handler = SecurityHandlerFactory.build(
|
52
|
+
deref(trailer[:Encrypt]),
|
53
|
+
deref(trailer[:ID]),
|
54
|
+
opts[:password]
|
55
|
+
)
|
52
56
|
end
|
53
57
|
|
54
58
|
# returns the type of object a ref points to
|
@@ -92,6 +96,218 @@ class PDF::Reader
|
|
92
96
|
end
|
93
97
|
alias :deref :object
|
94
98
|
|
99
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
100
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
101
|
+
#
|
102
|
+
# Guaranteed to only return an Array or nil. If the dereference results in
|
103
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
104
|
+
# expecting an Array and no other type will do.
|
105
|
+
def deref_array(key)
|
106
|
+
obj = deref(key)
|
107
|
+
|
108
|
+
return obj if obj.nil?
|
109
|
+
|
110
|
+
obj.tap { |obj|
|
111
|
+
raise MalformedPDFError, "expected object to be an Array or nil" if !obj.is_a?(Array)
|
112
|
+
}
|
113
|
+
end
|
114
|
+
|
115
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
116
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
117
|
+
#
|
118
|
+
# Guaranteed to only return an Array of Numerics or nil. If the dereference results in
|
119
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
120
|
+
# expecting an Array and no other type will do.
|
121
|
+
#
|
122
|
+
# Some effort to cast array elements to a number is made for any non-numeric elements.
|
123
|
+
def deref_array_of_numbers(key)
|
124
|
+
arr = deref(key)
|
125
|
+
|
126
|
+
return arr if arr.nil?
|
127
|
+
|
128
|
+
raise MalformedPDFError, "expected object to be an Array" unless arr.is_a?(Array)
|
129
|
+
|
130
|
+
arr.map { |item|
|
131
|
+
if item.is_a?(Numeric)
|
132
|
+
item
|
133
|
+
elsif item.respond_to?(:to_f)
|
134
|
+
item.to_f
|
135
|
+
elsif item.respond_to?(:to_i)
|
136
|
+
item.to_i
|
137
|
+
else
|
138
|
+
raise MalformedPDFError, "expected object to be a number"
|
139
|
+
end
|
140
|
+
}
|
141
|
+
end
|
142
|
+
|
143
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
144
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
145
|
+
#
|
146
|
+
# Guaranteed to only return a Hash or nil. If the dereference results in
|
147
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
148
|
+
# expecting a Hash and no other type will do.
|
149
|
+
def deref_hash(key)
|
150
|
+
obj = deref(key)
|
151
|
+
|
152
|
+
return obj if obj.nil?
|
153
|
+
|
154
|
+
obj.tap { |obj|
|
155
|
+
raise MalformedPDFError, "expected object to be a Hash or nil" if !obj.is_a?(Hash)
|
156
|
+
}
|
157
|
+
end
|
158
|
+
|
159
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
160
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
161
|
+
#
|
162
|
+
# Guaranteed to only return a PDF name (Symbol) or nil. If the dereference results in
|
163
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
164
|
+
# expecting a Name and no other type will do.
|
165
|
+
#
|
166
|
+
# Some effort to cast to a symbol is made when the reference points to a non-symbol.
|
167
|
+
def deref_name(key)
|
168
|
+
obj = deref(key)
|
169
|
+
|
170
|
+
return obj if obj.nil?
|
171
|
+
|
172
|
+
if !obj.is_a?(Symbol)
|
173
|
+
if obj.respond_to?(:to_sym)
|
174
|
+
obj = obj.to_sym
|
175
|
+
else
|
176
|
+
raise MalformedPDFError, "expected object to be a Name"
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
obj
|
181
|
+
end
|
182
|
+
|
183
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
184
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
185
|
+
#
|
186
|
+
# Guaranteed to only return an Integer or nil. If the dereference results in
|
187
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
188
|
+
# expecting an Integer and no other type will do.
|
189
|
+
#
|
190
|
+
# Some effort to cast to an int is made when the reference points to a non-integer.
|
191
|
+
def deref_integer(key)
|
192
|
+
obj = deref(key)
|
193
|
+
|
194
|
+
return obj if obj.nil?
|
195
|
+
|
196
|
+
if !obj.is_a?(Integer)
|
197
|
+
if obj.respond_to?(:to_i)
|
198
|
+
obj = obj.to_i
|
199
|
+
else
|
200
|
+
raise MalformedPDFError, "expected object to be an Integer"
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
obj
|
205
|
+
end
|
206
|
+
|
207
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
208
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
209
|
+
#
|
210
|
+
# Guaranteed to only return a Numeric or nil. If the dereference results in
|
211
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
212
|
+
# expecting a Numeric and no other type will do.
|
213
|
+
#
|
214
|
+
# Some effort to cast to a number is made when the reference points to a non-number.
|
215
|
+
def deref_number(key)
|
216
|
+
obj = deref(key)
|
217
|
+
|
218
|
+
return obj if obj.nil?
|
219
|
+
|
220
|
+
if !obj.is_a?(Numeric)
|
221
|
+
if obj.respond_to?(:to_f)
|
222
|
+
obj = obj.to_f
|
223
|
+
elsif obj.respond_to?(:to_i)
|
224
|
+
obj.to_i
|
225
|
+
else
|
226
|
+
raise MalformedPDFError, "expected object to be a number"
|
227
|
+
end
|
228
|
+
end
|
229
|
+
|
230
|
+
obj
|
231
|
+
end
|
232
|
+
|
233
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
234
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
235
|
+
#
|
236
|
+
# Guaranteed to only return a PDF::Reader::Stream or nil. If the dereference results in
|
237
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
238
|
+
# expecting a stream and no other type will do.
|
239
|
+
def deref_stream(key)
|
240
|
+
obj = deref(key)
|
241
|
+
|
242
|
+
return obj if obj.nil?
|
243
|
+
|
244
|
+
obj.tap { |obj|
|
245
|
+
if !obj.is_a?(PDF::Reader::Stream)
|
246
|
+
raise MalformedPDFError, "expected object to be an Array or nil"
|
247
|
+
end
|
248
|
+
}
|
249
|
+
end
|
250
|
+
|
251
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
252
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
253
|
+
#
|
254
|
+
# Guaranteed to only return a String or nil. If the dereference results in
|
255
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
256
|
+
# expecting a string and no other type will do.
|
257
|
+
#
|
258
|
+
# Some effort to cast to a string is made when the reference points to a non-string.
|
259
|
+
def deref_string(key)
|
260
|
+
obj = deref(key)
|
261
|
+
|
262
|
+
return obj if obj.nil?
|
263
|
+
|
264
|
+
if !obj.is_a?(String)
|
265
|
+
if obj.respond_to?(:to_s)
|
266
|
+
obj = obj.to_s
|
267
|
+
else
|
268
|
+
raise MalformedPDFError, "expected object to be a string"
|
269
|
+
end
|
270
|
+
end
|
271
|
+
|
272
|
+
obj
|
273
|
+
end
|
274
|
+
|
275
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
276
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
277
|
+
#
|
278
|
+
# Guaranteed to only return a PDF Name (symbol), Array or nil. If the dereference results in
|
279
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
280
|
+
# expecting a Name or Array and no other type will do.
|
281
|
+
def deref_name_or_array(key)
|
282
|
+
obj = deref(key)
|
283
|
+
|
284
|
+
return obj if obj.nil?
|
285
|
+
|
286
|
+
obj.tap { |obj|
|
287
|
+
if !obj.is_a?(Symbol) && !obj.is_a?(Array)
|
288
|
+
raise MalformedPDFError, "expected object to be an Array or Name"
|
289
|
+
end
|
290
|
+
}
|
291
|
+
end
|
292
|
+
|
293
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
294
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
295
|
+
#
|
296
|
+
# Guaranteed to only return a PDF::Reader::Stream, Array or nil. If the dereference results in
|
297
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
298
|
+
# expecting a stream or Array and no other type will do.
|
299
|
+
def deref_stream_or_array(key)
|
300
|
+
obj = deref(key)
|
301
|
+
|
302
|
+
return obj if obj.nil?
|
303
|
+
|
304
|
+
obj.tap { |obj|
|
305
|
+
if !obj.is_a?(PDF::Reader::Stream) && !obj.is_a?(Array)
|
306
|
+
raise MalformedPDFError, "expected object to be an Array or Stream"
|
307
|
+
end
|
308
|
+
}
|
309
|
+
end
|
310
|
+
|
95
311
|
# Recursively dereferences the object referred to by +key+. If +key+ is not
|
96
312
|
# a PDF::Reader::Reference, the key is returned unchanged.
|
97
313
|
#
|
@@ -99,6 +315,22 @@ class PDF::Reader
|
|
99
315
|
deref_internal!(key, {})
|
100
316
|
end
|
101
317
|
|
318
|
+
def deref_array!(key)
|
319
|
+
deref!(key).tap { |obj|
|
320
|
+
if !obj.nil? && !obj.is_a?(Array)
|
321
|
+
raise MalformedPDFError, "expected object (#{obj.inspect}) to be an Array or nil"
|
322
|
+
end
|
323
|
+
}
|
324
|
+
end
|
325
|
+
|
326
|
+
def deref_hash!(key)
|
327
|
+
deref!(key).tap { |obj|
|
328
|
+
if !obj.nil? && !obj.is_a?(Hash)
|
329
|
+
raise MalformedPDFError, "expected object (#{obj.inspect}) to be a Hash or nil"
|
330
|
+
end
|
331
|
+
}
|
332
|
+
end
|
333
|
+
|
102
334
|
# Access an object from the PDF. key can be an int or a PDF::Reader::Reference
|
103
335
|
# object.
|
104
336
|
#
|
@@ -233,7 +465,10 @@ class PDF::Reader
|
|
233
465
|
#
|
234
466
|
def page_references
|
235
467
|
root = fetch(trailer[:Root])
|
236
|
-
@page_references ||=
|
468
|
+
@page_references ||= begin
|
469
|
+
pages_root = deref_hash(root[:Pages]) || {}
|
470
|
+
get_page_objects(pages_root)
|
471
|
+
end
|
237
472
|
end
|
238
473
|
|
239
474
|
def encrypted?
|
@@ -299,36 +534,6 @@ class PDF::Reader
|
|
299
534
|
end
|
300
535
|
end
|
301
536
|
|
302
|
-
def build_security_handler(opts = {})
|
303
|
-
encrypt = deref(trailer[:Encrypt])
|
304
|
-
if NullSecurityHandler.supports?(encrypt)
|
305
|
-
NullSecurityHandler.new
|
306
|
-
elsif StandardSecurityHandler.supports?(encrypt)
|
307
|
-
encmeta = !encrypt.has_key?(:EncryptMetadata) || encrypt[:EncryptMetadata].to_s == "true"
|
308
|
-
StandardSecurityHandler.new(
|
309
|
-
key_length: (encrypt[:Length] || 40).to_i,
|
310
|
-
revision: encrypt[:R],
|
311
|
-
owner_key: encrypt[:O],
|
312
|
-
user_key: encrypt[:U],
|
313
|
-
permissions: encrypt[:P].to_i,
|
314
|
-
encrypted_metadata: encmeta,
|
315
|
-
file_id: (deref(trailer[:ID]) || []).first,
|
316
|
-
password: opts[:password],
|
317
|
-
cfm: encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
|
318
|
-
)
|
319
|
-
elsif StandardSecurityHandlerV5.supports?(encrypt)
|
320
|
-
StandardSecurityHandlerV5.new(
|
321
|
-
O: encrypt[:O],
|
322
|
-
U: encrypt[:U],
|
323
|
-
OE: encrypt[:OE],
|
324
|
-
UE: encrypt[:UE],
|
325
|
-
password: opts[:password]
|
326
|
-
)
|
327
|
-
else
|
328
|
-
UnimplementedSecurityHandler.new
|
329
|
-
end
|
330
|
-
end
|
331
|
-
|
332
537
|
def decrypt(ref, obj)
|
333
538
|
case obj
|
334
539
|
when PDF::Reader::Stream then
|
@@ -362,19 +567,21 @@ class PDF::Reader
|
|
362
567
|
@object_stream ||= {}
|
363
568
|
end
|
364
569
|
|
365
|
-
# returns
|
570
|
+
# returns an array of object references for all pages in this object store. The ordering of
|
571
|
+
# the Array is significant and matches the page ordering of the document
|
366
572
|
#
|
367
|
-
def get_page_objects(
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
573
|
+
def get_page_objects(obj)
|
574
|
+
derefed_obj = deref_hash(obj)
|
575
|
+
|
576
|
+
if derefed_obj[:Type] == :Page
|
577
|
+
[obj]
|
578
|
+
elsif derefed_obj[:Kids]
|
579
|
+
kids = deref_array(derefed_obj[:Kids]) || []
|
580
|
+
kids.map { |kid|
|
581
|
+
get_page_objects(kid)
|
582
|
+
}.flatten
|
583
|
+
else
|
584
|
+
raise MalformedPDFError, "Expected Page or Pages object"
|
378
585
|
end
|
379
586
|
end
|
380
587
|
|
data/lib/pdf/reader/page.rb
CHANGED
@@ -14,7 +14,7 @@ module PDF
|
|
14
14
|
# objects accessor to help walk the page dictionary in any useful way.
|
15
15
|
#
|
16
16
|
class Page
|
17
|
-
|
17
|
+
extend Forwardable
|
18
18
|
|
19
19
|
# lowlevel hash-like access to all objects in the underlying PDF
|
20
20
|
attr_reader :objects
|
@@ -27,6 +27,15 @@ module PDF
|
|
27
27
|
# operations
|
28
28
|
attr_reader :cache
|
29
29
|
|
30
|
+
def_delegators :resources, :color_spaces
|
31
|
+
def_delegators :resources, :fonts
|
32
|
+
def_delegators :resources, :graphic_states
|
33
|
+
def_delegators :resources, :patterns
|
34
|
+
def_delegators :resources, :procedure_sets
|
35
|
+
def_delegators :resources, :properties
|
36
|
+
def_delegators :resources, :shadings
|
37
|
+
def_delegators :resources, :xobjects
|
38
|
+
|
30
39
|
# creates a new page wrapper.
|
31
40
|
#
|
32
41
|
# * objects - an ObjectHash instance that wraps a PDF file
|
@@ -34,7 +43,7 @@ module PDF
|
|
34
43
|
#
|
35
44
|
def initialize(objects, pagenum, options = {})
|
36
45
|
@objects, @pagenum = objects, pagenum
|
37
|
-
@page_object = objects.
|
46
|
+
@page_object = objects.deref_hash(objects.page_references[pagenum - 1])
|
38
47
|
@cache = options[:cache] || {}
|
39
48
|
|
40
49
|
unless @page_object.is_a?(::Hash)
|
@@ -60,7 +69,7 @@ module PDF
|
|
60
69
|
def attributes
|
61
70
|
@attributes ||= {}.tap { |hash|
|
62
71
|
page_with_ancestors.reverse.each do |obj|
|
63
|
-
hash.merge!(@objects.
|
72
|
+
hash.merge!(@objects.deref_hash(obj) || {})
|
64
73
|
end
|
65
74
|
}
|
66
75
|
# This shouldn't be necessary, but some non-compliant PDFs leave MediaBox
|
@@ -101,13 +110,24 @@ module PDF
|
|
101
110
|
# returns the plain text content of this page encoded as UTF-8. Any
|
102
111
|
# characters that can't be translated will be returned as a ▯
|
103
112
|
#
|
104
|
-
def text
|
113
|
+
def text(opts = {})
|
105
114
|
receiver = PageTextReceiver.new
|
106
115
|
walk(receiver)
|
107
|
-
receiver.
|
116
|
+
runs = receiver.runs(opts)
|
117
|
+
|
118
|
+
# rectangles[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
|
119
|
+
mediabox = rectangles[:MediaBox] || Rectangle.new(0, 0, 0, 0)
|
120
|
+
|
121
|
+
PageLayout.new(runs, mediabox).to_s
|
108
122
|
end
|
109
123
|
alias :to_s :text
|
110
124
|
|
125
|
+
def runs(opts = {})
|
126
|
+
receiver = PageTextReceiver.new
|
127
|
+
walk(receiver)
|
128
|
+
receiver.runs(opts)
|
129
|
+
end
|
130
|
+
|
111
131
|
# processes the raw content stream for this page in sequential order and
|
112
132
|
# passes callbacks to the receiver objects.
|
113
133
|
#
|
@@ -132,6 +152,9 @@ module PDF
|
|
132
152
|
# the program in the correct order and calls out to your implementation.
|
133
153
|
#
|
134
154
|
def walk(*receivers)
|
155
|
+
receivers = receivers.map { |receiver|
|
156
|
+
ValidatingReceiver.new(receiver)
|
157
|
+
}
|
135
158
|
callback(receivers, :page=, [self])
|
136
159
|
content_stream(receivers, raw_content)
|
137
160
|
end
|
@@ -140,10 +163,10 @@ module PDF
|
|
140
163
|
# see here unless you're a PDF nerd like me.
|
141
164
|
#
|
142
165
|
def raw_content
|
143
|
-
contents = objects.
|
166
|
+
contents = objects.deref_stream_or_array(@page_object[:Contents])
|
144
167
|
[contents].flatten.compact.map { |obj|
|
145
|
-
objects.
|
146
|
-
}.map { |obj|
|
168
|
+
objects.deref_stream(obj)
|
169
|
+
}.compact.map { |obj|
|
147
170
|
obj.unfiltered_data
|
148
171
|
}.join(" ")
|
149
172
|
end
|
@@ -174,17 +197,22 @@ module PDF
|
|
174
197
|
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
175
198
|
#
|
176
199
|
def rectangles
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
200
|
+
# attributes[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
|
201
|
+
mediabox = objects.deref_array_of_numbers(attributes[:MediaBox]) || []
|
202
|
+
cropbox = objects.deref_array_of_numbers(attributes[:CropBox]) || mediabox
|
203
|
+
bleedbox = objects.deref_array_of_numbers(attributes[:BleedBox]) || cropbox
|
204
|
+
trimbox = objects.deref_array_of_numbers(attributes[:TrimBox]) || cropbox
|
205
|
+
artbox = objects.deref_array_of_numbers(attributes[:ArtBox]) || cropbox
|
206
|
+
|
207
|
+
begin
|
208
|
+
mediarect = Rectangle.from_array(mediabox)
|
209
|
+
croprect = Rectangle.from_array(cropbox)
|
210
|
+
bleedrect = Rectangle.from_array(bleedbox)
|
211
|
+
trimrect = Rectangle.from_array(trimbox)
|
212
|
+
artrect = Rectangle.from_array(artbox)
|
213
|
+
rescue ArgumentError => e
|
214
|
+
raise MalformedPDFError, e.message
|
215
|
+
end
|
188
216
|
|
189
217
|
if rotate > 0
|
190
218
|
mediarect.apply_rotation(rotate)
|
@@ -206,14 +234,14 @@ module PDF
|
|
206
234
|
private
|
207
235
|
|
208
236
|
def root
|
209
|
-
|
237
|
+
@root ||= objects.deref_hash(@objects.trailer[:Root]) || {}
|
210
238
|
end
|
211
239
|
|
212
240
|
# Returns the resources that accompany this page. Includes
|
213
241
|
# resources inherited from parents.
|
214
242
|
#
|
215
243
|
def resources
|
216
|
-
@resources ||= @objects.
|
244
|
+
@resources ||= Resources.new(@objects, @objects.deref_hash(attributes[:Resources]) || {})
|
217
245
|
end
|
218
246
|
|
219
247
|
def content_stream(receivers, instructions)
|
@@ -249,7 +277,8 @@ module PDF
|
|
249
277
|
if origin.nil?
|
250
278
|
[]
|
251
279
|
else
|
252
|
-
obj = objects.
|
280
|
+
obj = objects.deref_hash(origin)
|
281
|
+
PDF::Reader::Error.validate_not_nil_as_malformed(obj, "parent")
|
253
282
|
[ select_inheritable(obj) ] + ancestors(obj[:Parent])
|
254
283
|
end
|
255
284
|
end
|
@@ -21,10 +21,8 @@ class PDF::Reader
|
|
21
21
|
# PDF::Reader::Rectangle at some point
|
22
22
|
PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
|
23
23
|
|
24
|
-
|
25
|
-
runs =
|
26
|
-
@mediabox = mediabox
|
27
|
-
@runs = merge_runs(runs)
|
24
|
+
@mediabox = process_mediabox(mediabox)
|
25
|
+
@runs = runs
|
28
26
|
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
|
29
27
|
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
|
30
28
|
@median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
|
@@ -51,13 +49,11 @@ class PDF::Reader
|
|
51
49
|
private
|
52
50
|
|
53
51
|
def page_width
|
54
|
-
|
55
|
-
(@mediabox[2].to_f - @mediabox[0].to_f).abs
|
52
|
+
@mediabox.width
|
56
53
|
end
|
57
54
|
|
58
55
|
def page_height
|
59
|
-
|
60
|
-
(@mediabox[3].to_f - @mediabox[1].to_f).abs
|
56
|
+
@mediabox.height
|
61
57
|
end
|
62
58
|
|
63
59
|
# given an array of strings, return a new array with empty rows from the
|
@@ -109,30 +105,20 @@ class PDF::Reader
|
|
109
105
|
end
|
110
106
|
end
|
111
107
|
|
112
|
-
|
113
|
-
|
114
|
-
def merge_runs(runs)
|
115
|
-
runs.group_by { |char|
|
116
|
-
char.y.to_i
|
117
|
-
}.map { |y, chars|
|
118
|
-
group_chars_into_runs(chars.sort)
|
119
|
-
}.flatten.sort
|
108
|
+
def local_string_insert(haystack, needle, index)
|
109
|
+
haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
|
120
110
|
end
|
121
111
|
|
122
|
-
def
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
end
|
112
|
+
def process_mediabox(mediabox)
|
113
|
+
if mediabox.is_a?(Array)
|
114
|
+
msg = "Passing the mediabox to PageLayout as an Array is deprecated," +
|
115
|
+
" please use a Rectangle instead"
|
116
|
+
$stderr.puts msg
|
117
|
+
PDF::Reader::Rectangle.from_array(mediabox)
|
118
|
+
else
|
119
|
+
mediabox
|
131
120
|
end
|
132
121
|
end
|
133
122
|
|
134
|
-
def local_string_insert(haystack, needle, index)
|
135
|
-
haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
|
136
|
-
end
|
137
123
|
end
|
138
124
|
end
|
@@ -384,7 +384,7 @@ class PDF::Reader
|
|
384
384
|
#
|
385
385
|
def build_fonts(raw_fonts)
|
386
386
|
wrapped_fonts = raw_fonts.map { |label, font|
|
387
|
-
[label, PDF::Reader::Font.new(@objects, @objects.
|
387
|
+
[label, PDF::Reader::Font.new(@objects, @objects.deref_hash(font) || {})]
|
388
388
|
}
|
389
389
|
|
390
390
|
::Hash[wrapped_fonts]
|
@@ -47,9 +47,32 @@ module PDF
|
|
47
47
|
@characters = []
|
48
48
|
end
|
49
49
|
|
50
|
+
def runs(opts = {})
|
51
|
+
runs = @characters
|
52
|
+
|
53
|
+
if rect = opts.fetch(:rect, @page.rectangles[:CropBox])
|
54
|
+
runs = BoundingRectangleRunsFilter.runs_within_rect(runs, rect)
|
55
|
+
end
|
56
|
+
|
57
|
+
if opts.fetch(:skip_zero_width, true)
|
58
|
+
runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
|
59
|
+
end
|
60
|
+
|
61
|
+
if opts.fetch(:skip_overlapping, true)
|
62
|
+
runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
|
63
|
+
end
|
64
|
+
|
65
|
+
if opts.fetch(:merge, true)
|
66
|
+
runs = merge_runs(runs)
|
67
|
+
end
|
68
|
+
|
69
|
+
runs
|
70
|
+
end
|
71
|
+
|
72
|
+
# deprecated
|
50
73
|
def content
|
51
|
-
mediabox = @page.rectangles[:MediaBox]
|
52
|
-
PageLayout.new(
|
74
|
+
mediabox = @page.rectangles[:MediaBox]
|
75
|
+
PageLayout.new(runs, mediabox).to_s
|
53
76
|
end
|
54
77
|
|
55
78
|
#####################################################
|
@@ -64,8 +87,10 @@ module PDF
|
|
64
87
|
params.each do |arg|
|
65
88
|
if arg.is_a?(String)
|
66
89
|
internal_show_text(arg)
|
67
|
-
|
90
|
+
elsif arg.is_a?(Numeric)
|
68
91
|
@state.process_glyph_displacement(0, arg, false)
|
92
|
+
else
|
93
|
+
# skip it
|
69
94
|
end
|
70
95
|
end
|
71
96
|
end
|
@@ -96,6 +121,7 @@ module PDF
|
|
96
121
|
private
|
97
122
|
|
98
123
|
def internal_show_text(string)
|
124
|
+
PDF::Reader::Error.validate_type_as_malformed(string, "string", String)
|
99
125
|
if @state.current_font.nil?
|
100
126
|
raise PDF::Reader::MalformedPDFError, "current font is invalid"
|
101
127
|
end
|
@@ -109,7 +135,7 @@ module PDF
|
|
109
135
|
|
110
136
|
# apply to glyph displacement for the current glyph so the next
|
111
137
|
# glyph will appear in the correct position
|
112
|
-
glyph_width = @state.current_font.
|
138
|
+
glyph_width = @state.current_font.glyph_width_in_text_space(glyph_code)
|
113
139
|
th = 1
|
114
140
|
scaled_glyph_width = glyph_width * @state.font_size * th
|
115
141
|
unless utf8_chars == SPACE
|
@@ -119,12 +145,6 @@ module PDF
|
|
119
145
|
end
|
120
146
|
end
|
121
147
|
|
122
|
-
# TODO: revist this. It rotates the co-ordinates to the right direction, but I don't
|
123
|
-
# think it sets the correct x,y values. We get away with it because we don't
|
124
|
-
# return the text with co-ordinates, only the full text arranged in a string.
|
125
|
-
#
|
126
|
-
# We should provide an API for extracting the text with positioning data and spec
|
127
|
-
# that. I suspect the co-ords might be wrong for rotated pages
|
128
148
|
def apply_rotation(x, y)
|
129
149
|
if @page.rotate == 90
|
130
150
|
tmp = x
|
@@ -141,6 +161,28 @@ module PDF
|
|
141
161
|
return x, y
|
142
162
|
end
|
143
163
|
|
164
|
+
# take a collection of TextRun objects and merge any that are in close
|
165
|
+
# proximity
|
166
|
+
def merge_runs(runs)
|
167
|
+
runs.group_by { |char|
|
168
|
+
char.y.to_i
|
169
|
+
}.map { |y, chars|
|
170
|
+
group_chars_into_runs(chars.sort)
|
171
|
+
}.flatten.sort
|
172
|
+
end
|
173
|
+
|
174
|
+
def group_chars_into_runs(chars)
|
175
|
+
chars.each_with_object([]) do |char, runs|
|
176
|
+
if runs.empty?
|
177
|
+
runs << char
|
178
|
+
elsif runs.last.mergable?(char)
|
179
|
+
runs[-1] = runs.last + char
|
180
|
+
else
|
181
|
+
runs << char
|
182
|
+
end
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
144
186
|
end
|
145
187
|
end
|
146
188
|
end
|