pdf-reader 2.2.0 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +90 -0
- data/README.md +18 -3
- data/Rakefile +1 -1
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_text +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +91 -47
- data/lib/pdf/reader/cid_widths.rb +7 -4
- data/lib/pdf/reader/cmap.rb +83 -59
- data/lib/pdf/reader/encoding.rb +17 -14
- data/lib/pdf/reader/error.rb +15 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +12 -10
- data/lib/pdf/reader/filter/flate.rb +30 -16
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -1
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +11 -11
- data/lib/pdf/reader/font.rb +89 -26
- data/lib/pdf/reader/font_descriptor.rb +22 -18
- data/lib/pdf/reader/form_xobject.rb +18 -5
- data/lib/pdf/reader/glyph_hash.rb +28 -13
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/lzw.rb +28 -11
- data/lib/pdf/reader/no_text_filter.rb +14 -0
- data/lib/pdf/reader/null_security_handler.rb +1 -4
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +292 -63
- data/lib/pdf/reader/object_stream.rb +3 -2
- data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
- data/lib/pdf/reader/page.rb +143 -16
- data/lib/pdf/reader/page_layout.rb +43 -39
- data/lib/pdf/reader/page_state.rb +26 -17
- data/lib/pdf/reader/page_text_receiver.rb +74 -4
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +34 -14
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +3 -1
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
- data/lib/pdf/reader/stream.rb +3 -2
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +40 -5
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +8 -7
- data/lib/pdf/reader/type_check.rb +98 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
- data/lib/pdf/reader/width_calculator/composite.rb +6 -1
- data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
- data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +37 -11
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +49 -24
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +2048 -0
- metadata +39 -23
- data/lib/pdf/hash.rb +0 -20
- data/lib/pdf/reader/orientation_detector.rb +0 -34
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
@@ -1,6 +1,9 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
5
|
+
require 'tempfile'
|
6
|
+
|
4
7
|
class PDF::Reader
|
5
8
|
# Provides low level access to the objects in a PDF file via a hash-like
|
6
9
|
# object.
|
@@ -47,7 +50,11 @@ class PDF::Reader
|
|
47
50
|
@trailer = @xref.trailer
|
48
51
|
@cache = opts[:cache] || PDF::Reader::ObjectCache.new
|
49
52
|
@sec_handler = NullSecurityHandler.new
|
50
|
-
@sec_handler =
|
53
|
+
@sec_handler = SecurityHandlerFactory.build(
|
54
|
+
deref(trailer[:Encrypt]),
|
55
|
+
deref(trailer[:ID]),
|
56
|
+
opts[:password]
|
57
|
+
)
|
51
58
|
end
|
52
59
|
|
53
60
|
# returns the type of object a ref points to
|
@@ -78,16 +85,7 @@ class PDF::Reader
|
|
78
85
|
key = PDF::Reader::Reference.new(key.to_i, 0)
|
79
86
|
end
|
80
87
|
|
81
|
-
|
82
|
-
@cache[key]
|
83
|
-
elsif xref[key].is_a?(Integer)
|
84
|
-
buf = new_buffer(xref[key])
|
85
|
-
@cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
|
86
|
-
elsif xref[key].is_a?(PDF::Reader::Reference)
|
87
|
-
container_key = xref[key]
|
88
|
-
object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
|
89
|
-
@cache[key] = object_streams[container_key][key.id]
|
90
|
-
end
|
88
|
+
@cache[key] ||= fetch_object(key) || fetch_object_stream(key)
|
91
89
|
rescue InvalidObjectError
|
92
90
|
return default
|
93
91
|
end
|
@@ -100,6 +98,218 @@ class PDF::Reader
|
|
100
98
|
end
|
101
99
|
alias :deref :object
|
102
100
|
|
101
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
102
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
103
|
+
#
|
104
|
+
# Guaranteed to only return an Array or nil. If the dereference results in
|
105
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
106
|
+
# expecting an Array and no other type will do.
|
107
|
+
def deref_array(key)
|
108
|
+
obj = deref(key)
|
109
|
+
|
110
|
+
return obj if obj.nil?
|
111
|
+
|
112
|
+
obj.tap { |obj|
|
113
|
+
raise MalformedPDFError, "expected object to be an Array or nil" if !obj.is_a?(Array)
|
114
|
+
}
|
115
|
+
end
|
116
|
+
|
117
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
118
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
119
|
+
#
|
120
|
+
# Guaranteed to only return an Array of Numerics or nil. If the dereference results in
|
121
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
122
|
+
# expecting an Array and no other type will do.
|
123
|
+
#
|
124
|
+
# Some effort to cast array elements to a number is made for any non-numeric elements.
|
125
|
+
def deref_array_of_numbers(key)
|
126
|
+
arr = deref(key)
|
127
|
+
|
128
|
+
return arr if arr.nil?
|
129
|
+
|
130
|
+
raise MalformedPDFError, "expected object to be an Array" unless arr.is_a?(Array)
|
131
|
+
|
132
|
+
arr.map { |item|
|
133
|
+
if item.is_a?(Numeric)
|
134
|
+
item
|
135
|
+
elsif item.respond_to?(:to_f)
|
136
|
+
item.to_f
|
137
|
+
elsif item.respond_to?(:to_i)
|
138
|
+
item.to_i
|
139
|
+
else
|
140
|
+
raise MalformedPDFError, "expected object to be a number"
|
141
|
+
end
|
142
|
+
}
|
143
|
+
end
|
144
|
+
|
145
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
146
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
147
|
+
#
|
148
|
+
# Guaranteed to only return a Hash or nil. If the dereference results in
|
149
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
150
|
+
# expecting a Hash and no other type will do.
|
151
|
+
def deref_hash(key)
|
152
|
+
obj = deref(key)
|
153
|
+
|
154
|
+
return obj if obj.nil?
|
155
|
+
|
156
|
+
obj.tap { |obj|
|
157
|
+
raise MalformedPDFError, "expected object to be a Hash or nil" if !obj.is_a?(Hash)
|
158
|
+
}
|
159
|
+
end
|
160
|
+
|
161
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
162
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
163
|
+
#
|
164
|
+
# Guaranteed to only return a PDF name (Symbol) or nil. If the dereference results in
|
165
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
166
|
+
# expecting a Name and no other type will do.
|
167
|
+
#
|
168
|
+
# Some effort to cast to a symbol is made when the reference points to a non-symbol.
|
169
|
+
def deref_name(key)
|
170
|
+
obj = deref(key)
|
171
|
+
|
172
|
+
return obj if obj.nil?
|
173
|
+
|
174
|
+
if !obj.is_a?(Symbol)
|
175
|
+
if obj.respond_to?(:to_sym)
|
176
|
+
obj = obj.to_sym
|
177
|
+
else
|
178
|
+
raise MalformedPDFError, "expected object to be a Name"
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
obj
|
183
|
+
end
|
184
|
+
|
185
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
186
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
187
|
+
#
|
188
|
+
# Guaranteed to only return an Integer or nil. If the dereference results in
|
189
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
190
|
+
# expecting an Integer and no other type will do.
|
191
|
+
#
|
192
|
+
# Some effort to cast to an int is made when the reference points to a non-integer.
|
193
|
+
def deref_integer(key)
|
194
|
+
obj = deref(key)
|
195
|
+
|
196
|
+
return obj if obj.nil?
|
197
|
+
|
198
|
+
if !obj.is_a?(Integer)
|
199
|
+
if obj.respond_to?(:to_i)
|
200
|
+
obj = obj.to_i
|
201
|
+
else
|
202
|
+
raise MalformedPDFError, "expected object to be an Integer"
|
203
|
+
end
|
204
|
+
end
|
205
|
+
|
206
|
+
obj
|
207
|
+
end
|
208
|
+
|
209
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
210
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
211
|
+
#
|
212
|
+
# Guaranteed to only return a Numeric or nil. If the dereference results in
|
213
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
214
|
+
# expecting a number and no other type will do.
|
215
|
+
#
|
216
|
+
# Some effort to cast to a number is made when the reference points to a non-number.
|
217
|
+
def deref_number(key)
|
218
|
+
obj = deref(key)
|
219
|
+
|
220
|
+
return obj if obj.nil?
|
221
|
+
|
222
|
+
if !obj.is_a?(Numeric)
|
223
|
+
if obj.respond_to?(:to_f)
|
224
|
+
obj = obj.to_f
|
225
|
+
elsif obj.respond_to?(:to_i)
|
226
|
+
obj.to_i
|
227
|
+
else
|
228
|
+
raise MalformedPDFError, "expected object to be a number"
|
229
|
+
end
|
230
|
+
end
|
231
|
+
|
232
|
+
obj
|
233
|
+
end
|
234
|
+
|
235
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
236
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
237
|
+
#
|
238
|
+
# Guaranteed to only return a PDF::Reader::Stream or nil. If the dereference results in
|
239
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
240
|
+
# expecting a stream and no other type will do.
|
241
|
+
def deref_stream(key)
|
242
|
+
obj = deref(key)
|
243
|
+
|
244
|
+
return obj if obj.nil?
|
245
|
+
|
246
|
+
obj.tap { |obj|
|
247
|
+
if !obj.is_a?(PDF::Reader::Stream)
|
248
|
+
raise MalformedPDFError, "expected object to be a Stream or nil"
|
249
|
+
end
|
250
|
+
}
|
251
|
+
end
|
252
|
+
|
253
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
254
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
255
|
+
#
|
256
|
+
# Guaranteed to only return a String or nil. If the dereference results in
|
257
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
258
|
+
# expecting a string and no other type will do.
|
259
|
+
#
|
260
|
+
# Some effort to cast to a string is made when the reference points to a non-string.
|
261
|
+
def deref_string(key)
|
262
|
+
obj = deref(key)
|
263
|
+
|
264
|
+
return obj if obj.nil?
|
265
|
+
|
266
|
+
if !obj.is_a?(String)
|
267
|
+
if obj.respond_to?(:to_s)
|
268
|
+
obj = obj.to_s
|
269
|
+
else
|
270
|
+
raise MalformedPDFError, "expected object to be a string"
|
271
|
+
end
|
272
|
+
end
|
273
|
+
|
274
|
+
obj
|
275
|
+
end
|
276
|
+
|
277
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
278
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
279
|
+
#
|
280
|
+
# Guaranteed to only return a PDF Name (symbol), Array or nil. If the dereference results in
|
281
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
282
|
+
# expecting a Name or Array and no other type will do.
|
283
|
+
def deref_name_or_array(key)
|
284
|
+
obj = deref(key)
|
285
|
+
|
286
|
+
return obj if obj.nil?
|
287
|
+
|
288
|
+
obj.tap { |obj|
|
289
|
+
if !obj.is_a?(Symbol) && !obj.is_a?(Array)
|
290
|
+
raise MalformedPDFError, "expected object to be an Array or Name"
|
291
|
+
end
|
292
|
+
}
|
293
|
+
end
|
294
|
+
|
295
|
+
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
296
|
+
# object in the PDF and return it. Otherwise return key untouched.
|
297
|
+
#
|
298
|
+
# Guaranteed to only return a PDF::Reader::Stream, Array or nil. If the dereference results in
|
299
|
+
# any other type then a MalformedPDFError exception will raise. Useful when
|
300
|
+
# expecting a stream or Array and no other type will do.
|
301
|
+
def deref_stream_or_array(key)
|
302
|
+
obj = deref(key)
|
303
|
+
|
304
|
+
return obj if obj.nil?
|
305
|
+
|
306
|
+
obj.tap { |obj|
|
307
|
+
if !obj.is_a?(PDF::Reader::Stream) && !obj.is_a?(Array)
|
308
|
+
raise MalformedPDFError, "expected object to be an Array or Stream"
|
309
|
+
end
|
310
|
+
}
|
311
|
+
end
|
312
|
+
|
103
313
|
# Recursively dereferences the object referred to by +key+. If +key+ is not
|
104
314
|
# a PDF::Reader::Reference, the key is returned unchanged.
|
105
315
|
#
|
@@ -107,6 +317,22 @@ class PDF::Reader
|
|
107
317
|
deref_internal!(key, {})
|
108
318
|
end
|
109
319
|
|
320
|
+
def deref_array!(key)
|
321
|
+
deref!(key).tap { |obj|
|
322
|
+
if !obj.nil? && !obj.is_a?(Array)
|
323
|
+
raise MalformedPDFError, "expected object (#{obj.inspect}) to be an Array or nil"
|
324
|
+
end
|
325
|
+
}
|
326
|
+
end
|
327
|
+
|
328
|
+
def deref_hash!(key)
|
329
|
+
deref!(key).tap { |obj|
|
330
|
+
if !obj.nil? && !obj.is_a?(Hash)
|
331
|
+
raise MalformedPDFError, "expected object (#{obj.inspect}) to be a Hash or nil"
|
332
|
+
end
|
333
|
+
}
|
334
|
+
end
|
335
|
+
|
110
336
|
# Access an object from the PDF. key can be an int or a PDF::Reader::Reference
|
111
337
|
# object.
|
112
338
|
#
|
@@ -241,7 +467,10 @@ class PDF::Reader
|
|
241
467
|
#
|
242
468
|
def page_references
|
243
469
|
root = fetch(trailer[:Root])
|
244
|
-
@page_references ||=
|
470
|
+
@page_references ||= begin
|
471
|
+
pages_root = deref_hash(root[:Pages]) || {}
|
472
|
+
get_page_objects(pages_root)
|
473
|
+
end
|
245
474
|
end
|
246
475
|
|
247
476
|
def encrypted?
|
@@ -254,6 +483,28 @@ class PDF::Reader
|
|
254
483
|
|
255
484
|
private
|
256
485
|
|
486
|
+
# parse a traditional object from the PDF, starting from the byte offset indicated
|
487
|
+
# in the xref table
|
488
|
+
#
|
489
|
+
def fetch_object(key)
|
490
|
+
if xref[key].is_a?(Integer)
|
491
|
+
buf = new_buffer(xref[key])
|
492
|
+
decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
|
493
|
+
end
|
494
|
+
end
|
495
|
+
|
496
|
+
# parse an object that's embedded in an object stream in the PDF
|
497
|
+
#
|
498
|
+
def fetch_object_stream(key)
|
499
|
+
if xref[key].is_a?(PDF::Reader::Reference)
|
500
|
+
container_key = xref[key]
|
501
|
+
stream = deref_stream(container_key)
|
502
|
+
raise MalformedPDFError, "Object Stream cannot be nil" if stream.nil?
|
503
|
+
object_streams[container_key] ||= PDF::Reader::ObjectStream.new(stream)
|
504
|
+
object_streams[container_key][key.id]
|
505
|
+
end
|
506
|
+
end
|
507
|
+
|
257
508
|
# Private implementation of deref!, which exists to ensure the `seen` argument
|
258
509
|
# isn't publicly available. It's used to avoid endless loops in the recursion, and
|
259
510
|
# doesn't need to be part of the public API.
|
@@ -287,44 +538,18 @@ class PDF::Reader
|
|
287
538
|
end
|
288
539
|
end
|
289
540
|
|
290
|
-
def build_security_handler(opts = {})
|
291
|
-
encrypt = deref(trailer[:Encrypt])
|
292
|
-
if NullSecurityHandler.supports?(encrypt)
|
293
|
-
NullSecurityHandler.new
|
294
|
-
elsif StandardSecurityHandler.supports?(encrypt)
|
295
|
-
encmeta = !encrypt.has_key?(:EncryptMetadata) || encrypt[:EncryptMetadata].to_s == "true"
|
296
|
-
StandardSecurityHandler.new(
|
297
|
-
key_length: (encrypt[:Length] || 40).to_i,
|
298
|
-
revision: encrypt[:R],
|
299
|
-
owner_key: encrypt[:O],
|
300
|
-
user_key: encrypt[:U],
|
301
|
-
permissions: encrypt[:P].to_i,
|
302
|
-
encrypted_metadata: encmeta,
|
303
|
-
file_id: (deref(trailer[:ID]) || []).first,
|
304
|
-
password: opts[:password],
|
305
|
-
cfm: encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
|
306
|
-
)
|
307
|
-
elsif StandardSecurityHandlerV5.supports?(encrypt)
|
308
|
-
StandardSecurityHandlerV5.new(
|
309
|
-
O: encrypt[:O],
|
310
|
-
U: encrypt[:U],
|
311
|
-
OE: encrypt[:OE],
|
312
|
-
UE: encrypt[:UE],
|
313
|
-
password: opts[:password]
|
314
|
-
)
|
315
|
-
else
|
316
|
-
UnimplementedSecurityHandler.new
|
317
|
-
end
|
318
|
-
end
|
319
|
-
|
320
541
|
def decrypt(ref, obj)
|
321
542
|
case obj
|
322
543
|
when PDF::Reader::Stream then
|
323
|
-
|
544
|
+
# PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
|
545
|
+
# Therefore we shouldn't try to decrypt it.
|
546
|
+
obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
|
324
547
|
obj
|
325
548
|
when Hash then
|
326
|
-
arr = obj.map { |key,val| [key, decrypt(ref, val)] }
|
327
|
-
|
549
|
+
arr = obj.map { |key,val| [key, decrypt(ref, val)] }
|
550
|
+
arr.each_with_object({}) { |(k,v), accum|
|
551
|
+
accum[k] = v
|
552
|
+
}
|
328
553
|
when Array then
|
329
554
|
obj.collect { |item| decrypt(ref, item) }
|
330
555
|
when String
|
@@ -343,39 +568,43 @@ class PDF::Reader
|
|
343
568
|
end
|
344
569
|
|
345
570
|
def object_streams
|
346
|
-
@
|
571
|
+
@object_streams ||= {}
|
347
572
|
end
|
348
573
|
|
349
|
-
# returns
|
574
|
+
# returns an array of object references for all pages in this object store. The ordering of
|
575
|
+
# the Array is significant and matches the page ordering of the document
|
350
576
|
#
|
351
|
-
def get_page_objects(
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
raise MalformedPDFError, "
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
577
|
+
def get_page_objects(obj)
|
578
|
+
derefed_obj = deref_hash(obj)
|
579
|
+
|
580
|
+
if derefed_obj.nil?
|
581
|
+
raise MalformedPDFError, "Expected Page or Pages object, got nil"
|
582
|
+
elsif derefed_obj[:Type] == :Page
|
583
|
+
[obj]
|
584
|
+
elsif derefed_obj[:Kids]
|
585
|
+
kids = deref_array(derefed_obj[:Kids]) || []
|
586
|
+
kids.map { |kid|
|
587
|
+
get_page_objects(kid)
|
588
|
+
}.flatten
|
589
|
+
else
|
590
|
+
raise MalformedPDFError, "Expected Page or Pages object"
|
362
591
|
end
|
363
592
|
end
|
364
593
|
|
365
594
|
def read_version
|
366
595
|
@io.seek(0)
|
367
|
-
_m, version = *@io.read(10).match(/PDF-(\d.\d)/)
|
596
|
+
_m, version = *@io.read(10).to_s.match(/PDF-(\d.\d)/)
|
368
597
|
@io.seek(0)
|
369
598
|
version.to_f
|
370
599
|
end
|
371
600
|
|
372
601
|
def extract_io_from(input)
|
373
|
-
if input.
|
602
|
+
if input.is_a?(IO) || input.is_a?(StringIO) || input.is_a?(Tempfile)
|
374
603
|
input
|
375
604
|
elsif File.file?(input.to_s)
|
376
|
-
StringIO.new read_as_binary(input)
|
605
|
+
StringIO.new read_as_binary(input.to_s)
|
377
606
|
else
|
378
|
-
raise ArgumentError, "input must be an IO-like object or a filename"
|
607
|
+
raise ArgumentError, "input must be an IO-like object or a filename (#{input.class})"
|
379
608
|
end
|
380
609
|
end
|
381
610
|
|
@@ -383,7 +612,7 @@ class PDF::Reader
|
|
383
612
|
if File.respond_to?(:binread)
|
384
613
|
File.binread(input.to_s)
|
385
614
|
else
|
386
|
-
File.open(input.to_s,"rb") { |f| f.read }
|
615
|
+
File.open(input.to_s,"rb") { |f| f.read } || ""
|
387
616
|
end
|
388
617
|
end
|
389
618
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
class PDF::Reader
|
@@ -23,7 +24,7 @@ class PDF::Reader
|
|
23
24
|
end
|
24
25
|
|
25
26
|
def size
|
26
|
-
@dict[:N]
|
27
|
+
TypeCheck.cast_to_int!(@dict[:N])
|
27
28
|
end
|
28
29
|
|
29
30
|
private
|
@@ -39,7 +40,7 @@ class PDF::Reader
|
|
39
40
|
end
|
40
41
|
|
41
42
|
def first
|
42
|
-
@dict[:First]
|
43
|
+
TypeCheck.cast_to_int!(@dict[:First])
|
43
44
|
end
|
44
45
|
|
45
46
|
def buffer
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
# typed: strict
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
# remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
|
7
|
+
# uses slightly offset overlapping characters to achieve a fake 'bold' effect.
|
8
|
+
class OverlappingRunsFilter
|
9
|
+
|
10
|
+
# This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
|
11
|
+
# have identical characters) then one will be discarded
|
12
|
+
OVERLAPPING_THRESHOLD = 0.5
|
13
|
+
|
14
|
+
def self.exclude_redundant_runs(runs)
|
15
|
+
sweep_line_status = Array.new
|
16
|
+
event_point_schedule = Array.new
|
17
|
+
to_exclude = []
|
18
|
+
|
19
|
+
runs.each do |run|
|
20
|
+
event_point_schedule << EventPoint.new(run.x, run)
|
21
|
+
event_point_schedule << EventPoint.new(run.endx, run)
|
22
|
+
end
|
23
|
+
|
24
|
+
event_point_schedule.sort! { |a,b| a.x <=> b.x }
|
25
|
+
|
26
|
+
event_point_schedule.each do |event_point|
|
27
|
+
run = event_point.run
|
28
|
+
|
29
|
+
if event_point.start?
|
30
|
+
if detect_intersection(sweep_line_status, event_point)
|
31
|
+
to_exclude << run
|
32
|
+
end
|
33
|
+
sweep_line_status.push(run)
|
34
|
+
else
|
35
|
+
sweep_line_status.delete(run)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
runs - to_exclude
|
39
|
+
end
|
40
|
+
|
41
|
+
def self.detect_intersection(sweep_line_status, event_point)
|
42
|
+
sweep_line_status.each do |open_text_run|
|
43
|
+
if open_text_run.text == event_point.run.text &&
|
44
|
+
event_point.x >= open_text_run.x &&
|
45
|
+
event_point.x <= open_text_run.endx &&
|
46
|
+
open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
|
47
|
+
return true
|
48
|
+
end
|
49
|
+
end
|
50
|
+
return false
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
# Utility class used to avoid modifying the underlying TextRun objects while we're
|
55
|
+
# looking for duplicates
|
56
|
+
class EventPoint
|
57
|
+
|
58
|
+
attr_reader :x
|
59
|
+
|
60
|
+
attr_reader :run
|
61
|
+
|
62
|
+
def initialize(x, run)
|
63
|
+
@x = x
|
64
|
+
@run = run
|
65
|
+
end
|
66
|
+
|
67
|
+
def start?
|
68
|
+
@x == @run.x
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|