pdf-reader 2.9.2 → 2.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +39 -0
- data/README.md +33 -33
- data/Rakefile +2 -2
- data/lib/pdf/reader/advanced_text_run_filter.rb +152 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +30 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +35 -3
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +1 -0
- data/lib/pdf/reader/buffer.rb +39 -22
- data/lib/pdf/reader/cid_widths.rb +14 -6
- data/lib/pdf/reader/cmap.rb +16 -5
- data/lib/pdf/reader/encoding.rb +42 -18
- data/lib/pdf/reader/error.rb +6 -4
- data/lib/pdf/reader/filter/ascii85.rb +2 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +6 -2
- data/lib/pdf/reader/filter/flate.rb +5 -2
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +2 -0
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +99 -32
- data/lib/pdf/reader/font_descriptor.rb +79 -24
- data/lib/pdf/reader/form_xobject.rb +15 -1
- data/lib/pdf/reader/glyph_hash.rb +41 -8
- data/lib/pdf/reader/key_builder_v5.rb +17 -9
- data/lib/pdf/reader/lzw.rb +42 -16
- data/lib/pdf/reader/no_text_filter.rb +15 -0
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +7 -2
- data/lib/pdf/reader/object_hash.rb +129 -16
- data/lib/pdf/reader/object_stream.rb +22 -5
- data/lib/pdf/reader/overlapping_runs_filter.rb +8 -2
- data/lib/pdf/reader/page.rb +66 -13
- data/lib/pdf/reader/page_layout.rb +26 -9
- data/lib/pdf/reader/page_state.rb +12 -3
- data/lib/pdf/reader/page_text_receiver.rb +16 -2
- data/lib/pdf/reader/pages_strategy.rb +1 -1
- data/lib/pdf/reader/parser.rb +52 -13
- data/lib/pdf/reader/point.rb +9 -2
- data/lib/pdf/reader/print_receiver.rb +2 -6
- data/lib/pdf/reader/rc4_security_handler.rb +2 -0
- data/lib/pdf/reader/rectangle.rb +24 -1
- data/lib/pdf/reader/reference.rb +13 -3
- data/lib/pdf/reader/register_receiver.rb +15 -2
- data/lib/pdf/reader/resources.rb +12 -2
- data/lib/pdf/reader/security_handler_factory.rb +13 -0
- data/lib/pdf/reader/standard_key_builder.rb +37 -23
- data/lib/pdf/reader/stream.rb +9 -3
- data/lib/pdf/reader/synchronized_cache.rb +6 -3
- data/lib/pdf/reader/text_run.rb +33 -3
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +41 -10
- data/lib/pdf/reader/type_check.rb +53 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +2 -0
- data/lib/pdf/reader/validating_receiver.rb +29 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +13 -5
- data/lib/pdf/reader/width_calculator/composite.rb +11 -3
- data/lib/pdf/reader/width_calculator/true_type.rb +14 -12
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +8 -5
- data/lib/pdf/reader/width_calculator/type_zero.rb +8 -3
- data/lib/pdf/reader/xref.rb +31 -10
- data/lib/pdf/reader/zero_width_runs_filter.rb +1 -0
- data/lib/pdf/reader.rb +24 -12
- data/rbi/pdf-reader.rbi +1504 -1480
- metadata +34 -17
data/lib/pdf/reader/lzw.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
-
# typed:
|
2
|
+
# typed: strict
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
module PDF
|
@@ -22,25 +22,31 @@ module PDF
|
|
22
22
|
# Wraps an LZW encoded string
|
23
23
|
class BitStream # :nodoc:
|
24
24
|
|
25
|
+
#: (String, Integer) -> void
|
25
26
|
def initialize(data, bits_in_chunk)
|
26
27
|
@data = data
|
27
28
|
@data.force_encoding("BINARY")
|
28
|
-
@
|
29
|
-
@
|
30
|
-
@
|
29
|
+
@current_pos = 0 #: Integer
|
30
|
+
@bits_left_in_byte = 8 #: Integer
|
31
|
+
@bits_in_chunk = 0 #: Integer
|
32
|
+
set_bits_in_chunk(bits_in_chunk)
|
31
33
|
end
|
32
34
|
|
35
|
+
#: (Integer) -> void
|
33
36
|
def set_bits_in_chunk(bits_in_chunk)
|
37
|
+
raise MalformedPDFError, "invalid LZW bits" if bits_in_chunk < 9 || bits_in_chunk > 12
|
38
|
+
|
34
39
|
@bits_in_chunk = bits_in_chunk
|
35
40
|
end
|
36
41
|
|
42
|
+
#: () -> Integer
|
37
43
|
def read
|
38
44
|
bits_left_in_chunk = @bits_in_chunk
|
39
45
|
chunk = -1
|
40
46
|
while bits_left_in_chunk > 0 and @current_pos < @data.size
|
41
47
|
chunk = 0 if chunk < 0
|
42
|
-
codepoint = @data[@current_pos, 1].unpack("C*")[0]
|
43
|
-
current_byte = codepoint & (2**@bits_left_in_byte - 1) #clear consumed bits
|
48
|
+
codepoint = @data[@current_pos, 1].to_s.unpack("C*")[0].to_i
|
49
|
+
current_byte = codepoint & (2**@bits_left_in_byte - 1).to_i #clear consumed bits
|
44
50
|
dif = bits_left_in_chunk - @bits_left_in_byte
|
45
51
|
if dif > 0 then current_byte <<= dif
|
46
52
|
elsif dif < 0 then current_byte >>= dif.abs
|
@@ -57,33 +63,43 @@ module PDF
|
|
57
63
|
end
|
58
64
|
end
|
59
65
|
|
60
|
-
CODE_EOD = 257 #end of data
|
61
|
-
CODE_CLEAR_TABLE = 256 #clear table
|
66
|
+
CODE_EOD = 257 #: Integer #end of data
|
67
|
+
CODE_CLEAR_TABLE = 256 #: Integer #clear table
|
62
68
|
|
63
69
|
# stores the code => string pairs
|
64
|
-
class StringTable
|
70
|
+
class StringTable
|
71
|
+
#: Integer
|
65
72
|
attr_reader :string_table_pos
|
66
73
|
|
74
|
+
#: () -> void
|
67
75
|
def initialize
|
68
|
-
|
69
|
-
|
76
|
+
@data = Hash.new #: Hash[Integer, String]
|
77
|
+
# The initial code
|
78
|
+
@string_table_pos = 258 #: Integer
|
70
79
|
end
|
71
80
|
|
72
81
|
#if code less than 258 return fixed string
|
82
|
+
#: (Integer) -> String?
|
73
83
|
def [](key)
|
74
|
-
if key > 257
|
84
|
+
if key > 257
|
85
|
+
@data[key]
|
86
|
+
else
|
87
|
+
key.chr
|
88
|
+
end
|
75
89
|
end
|
76
90
|
|
91
|
+
#: (String) -> void
|
77
92
|
def add(string)
|
78
|
-
store(@string_table_pos, string)
|
93
|
+
@data.store(@string_table_pos, string)
|
79
94
|
@string_table_pos += 1
|
80
95
|
end
|
81
96
|
end
|
82
97
|
|
83
98
|
# Decompresses a LZW compressed string.
|
84
99
|
#
|
100
|
+
#: (String) -> String
|
85
101
|
def self.decode(data)
|
86
|
-
stream = BitStream.new
|
102
|
+
stream = BitStream.new(data.to_s, 9) # size of codes between 9 and 12 bits
|
87
103
|
string_table = StringTable.new
|
88
104
|
result = "".dup
|
89
105
|
until (code = stream.read) == CODE_EOD
|
@@ -119,8 +135,18 @@ module PDF
|
|
119
135
|
result
|
120
136
|
end
|
121
137
|
|
122
|
-
|
123
|
-
|
138
|
+
#: (PDF::Reader::LZW::StringTable, Integer?, Integer?) -> String
|
139
|
+
def self.create_new_string(string_table, some_code, other_code)
|
140
|
+
raise MalformedPDFError, "invalid LZW data" if some_code.nil? || other_code.nil?
|
141
|
+
|
142
|
+
item_one = string_table[some_code]
|
143
|
+
item_two = string_table[other_code]
|
144
|
+
|
145
|
+
if item_one && item_two
|
146
|
+
item_one + item_two.chr
|
147
|
+
else
|
148
|
+
raise MalformedPDFError, "invalid LZW data"
|
149
|
+
end
|
124
150
|
end
|
125
151
|
private_class_method :create_new_string
|
126
152
|
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
# There's no point rendering runs that contain no text
|
7
|
+
class NoTextFilter
|
8
|
+
|
9
|
+
#: (Array[PDF::Reader::TextRun]) -> Array[PDF::Reader::TextRun]
|
10
|
+
def self.exclude_empty_strings(runs)
|
11
|
+
runs.reject { |run| run.text.to_s.size == 0 }
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
@@ -15,10 +15,15 @@ class PDF::Reader
|
|
15
15
|
# These object types use little memory and are accessed a heap of times as
|
16
16
|
# part of random page access, so we'll cache the unmarshalled objects and
|
17
17
|
# avoid lots of repetitive (and expensive) tokenising
|
18
|
-
CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
|
18
|
+
CACHEABLE_TYPES = [:Catalog, :Page, :Pages] #: Array[Symbol]
|
19
19
|
|
20
|
-
|
20
|
+
#: untyped
|
21
|
+
attr_reader :hits
|
21
22
|
|
23
|
+
#: untyped
|
24
|
+
attr_reader :misses
|
25
|
+
|
26
|
+
#: (?untyped) -> void
|
22
27
|
def initialize(lru_size = 1000)
|
23
28
|
@objects = {}
|
24
29
|
@lru_cache = Hashery::LRUHash.new(lru_size.to_i)
|
@@ -2,6 +2,8 @@
|
|
2
2
|
# typed: true
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
|
+
require 'tempfile'
|
6
|
+
|
5
7
|
class PDF::Reader
|
6
8
|
# Provides low level access to the objects in a PDF file via a hash-like
|
7
9
|
# object.
|
@@ -30,8 +32,24 @@ class PDF::Reader
|
|
30
32
|
class ObjectHash
|
31
33
|
include Enumerable
|
32
34
|
|
35
|
+
#: type securityHandler = (
|
36
|
+
#| PDF::Reader::NullSecurityHandler |
|
37
|
+
#| PDF::Reader::AesV2SecurityHandler |
|
38
|
+
#| PDF::Reader::Rc4SecurityHandler |
|
39
|
+
#| PDF::Reader::AesV3SecurityHandler |
|
40
|
+
#| PDF::Reader::UnimplementedSecurityHandler
|
41
|
+
#| )
|
42
|
+
|
43
|
+
#: untyped
|
33
44
|
attr_accessor :default
|
34
|
-
|
45
|
+
|
46
|
+
#: Hash[Symbol, untyped]
|
47
|
+
attr_reader :trailer
|
48
|
+
|
49
|
+
#: Float
|
50
|
+
attr_reader :pdf_version
|
51
|
+
|
52
|
+
#: securityHandler
|
35
53
|
attr_reader :sec_handler
|
36
54
|
|
37
55
|
# Creates a new ObjectHash object. Input can be a string with a valid filename
|
@@ -41,21 +59,25 @@ class PDF::Reader
|
|
41
59
|
#
|
42
60
|
# :password - the user password to decrypt the source PDF
|
43
61
|
#
|
62
|
+
#: ((IO | Tempfile | StringIO | String), ?Hash[Symbol, untyped]) -> void
|
44
63
|
def initialize(input, opts = {})
|
45
|
-
@io = extract_io_from(input)
|
46
|
-
@xref = PDF::Reader::XRef.new(@io)
|
47
|
-
@pdf_version = read_version
|
48
|
-
@trailer = @xref.trailer
|
49
|
-
@cache = opts[:cache] || PDF::Reader::ObjectCache.new
|
50
|
-
@sec_handler = NullSecurityHandler.new
|
64
|
+
@io = extract_io_from(input) #: IO | Tempfile | StringIO
|
65
|
+
@xref = PDF::Reader::XRef.new(@io) #: PDF::Reader::XRef[PDF::Reader::Reference]
|
66
|
+
@pdf_version = read_version #: Float
|
67
|
+
@trailer = @xref.trailer #: Hash[Symbol, untyped]
|
68
|
+
@cache = opts[:cache] || PDF::Reader::ObjectCache.new #: PDF::Reader::ObjectCache
|
69
|
+
@sec_handler = NullSecurityHandler.new #: securityHandler
|
51
70
|
@sec_handler = SecurityHandlerFactory.build(
|
52
71
|
deref(trailer[:Encrypt]),
|
53
72
|
deref(trailer[:ID]),
|
54
73
|
opts[:password]
|
55
74
|
)
|
75
|
+
@page_references = nil #: Array[PDF::Reader::Reference | Hash[Symbol, untyped]]?
|
76
|
+
@object_streams = nil #: Hash[PDF::Reader::Reference, PDF::Reader::ObjectStream]?
|
56
77
|
end
|
57
78
|
|
58
79
|
# returns the type of object a ref points to
|
80
|
+
#: ((Integer | PDF::Reader::Reference)) -> Symbol?
|
59
81
|
def obj_type(ref)
|
60
82
|
self[ref].class.to_s.to_sym
|
61
83
|
rescue
|
@@ -63,6 +85,7 @@ class PDF::Reader
|
|
63
85
|
end
|
64
86
|
|
65
87
|
# returns true if the supplied references points to an object with a stream
|
88
|
+
#: ((Integer | PDF::Reader::Reference)) -> bool
|
66
89
|
def stream?(ref)
|
67
90
|
self.has_key?(ref) && self[ref].is_a?(PDF::Reader::Stream)
|
68
91
|
end
|
@@ -76,6 +99,7 @@ class PDF::Reader
|
|
76
99
|
# If a PDF::Reader::Reference object is used the exact ID and generation number
|
77
100
|
# can be specified.
|
78
101
|
#
|
102
|
+
#: ((Integer | PDF::Reader::Reference)) -> untyped
|
79
103
|
def [](key)
|
80
104
|
return default if key.to_i <= 0
|
81
105
|
|
@@ -91,6 +115,7 @@ class PDF::Reader
|
|
91
115
|
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
92
116
|
# object in the PDF and return it. Otherwise return key untouched.
|
93
117
|
#
|
118
|
+
#: (untyped) -> untyped
|
94
119
|
def object(key)
|
95
120
|
key.is_a?(PDF::Reader::Reference) ? self[key] : key
|
96
121
|
end
|
@@ -102,6 +127,7 @@ class PDF::Reader
|
|
102
127
|
# Guaranteed to only return an Array or nil. If the dereference results in
|
103
128
|
# any other type then a MalformedPDFError exception will raise. Useful when
|
104
129
|
# expecting an Array and no other type will do.
|
130
|
+
#: (untyped) -> Array[untyped]?
|
105
131
|
def deref_array(key)
|
106
132
|
obj = deref(key)
|
107
133
|
|
@@ -120,6 +146,7 @@ class PDF::Reader
|
|
120
146
|
# expecting an Array and no other type will do.
|
121
147
|
#
|
122
148
|
# Some effort to cast array elements to a number is made for any non-numeric elements.
|
149
|
+
#: (untyped) -> Array[Numeric]?
|
123
150
|
def deref_array_of_numbers(key)
|
124
151
|
arr = deref(key)
|
125
152
|
|
@@ -146,6 +173,7 @@ class PDF::Reader
|
|
146
173
|
# Guaranteed to only return a Hash or nil. If the dereference results in
|
147
174
|
# any other type then a MalformedPDFError exception will raise. Useful when
|
148
175
|
# expecting a Hash and no other type will do.
|
176
|
+
#: (untyped) -> Hash[Symbol, untyped]?
|
149
177
|
def deref_hash(key)
|
150
178
|
obj = deref(key)
|
151
179
|
|
@@ -164,6 +192,7 @@ class PDF::Reader
|
|
164
192
|
# expecting a Name (symbol) and no other type will do.
|
165
193
|
#
|
166
194
|
# Some effort to cast to a symbol is made when the reference points to a non-symbol.
|
195
|
+
#: (untyped) -> Symbol?
|
167
196
|
def deref_name(key)
|
168
197
|
obj = deref(key)
|
169
198
|
|
@@ -188,6 +217,7 @@ class PDF::Reader
|
|
188
217
|
# expecting an Integer and no other type will do.
|
189
218
|
#
|
190
219
|
# Some effort to cast to an int is made when the reference points to a non-integer.
|
220
|
+
#: (untyped) -> Integer?
|
191
221
|
def deref_integer(key)
|
192
222
|
obj = deref(key)
|
193
223
|
|
@@ -212,6 +242,7 @@ class PDF::Reader
|
|
212
242
|
# expecting a number and no other type will do.
|
213
243
|
#
|
214
244
|
# Some effort to cast to a number is made when the reference points to a non-number.
|
245
|
+
#: (untyped) -> Numeric?
|
215
246
|
def deref_number(key)
|
216
247
|
obj = deref(key)
|
217
248
|
|
@@ -236,6 +267,7 @@ class PDF::Reader
|
|
236
267
|
# Guaranteed to only return a PDF::Reader::Stream or nil. If the dereference results in
|
237
268
|
# any other type then a MalformedPDFError exception will raise. Useful when
|
238
269
|
# expecting a stream and no other type will do.
|
270
|
+
#: (untyped) -> PDF::Reader::Stream?
|
239
271
|
def deref_stream(key)
|
240
272
|
obj = deref(key)
|
241
273
|
|
@@ -243,7 +275,7 @@ class PDF::Reader
|
|
243
275
|
|
244
276
|
obj.tap { |obj|
|
245
277
|
if !obj.is_a?(PDF::Reader::Stream)
|
246
|
-
raise MalformedPDFError, "expected object to be
|
278
|
+
raise MalformedPDFError, "expected object to be a Stream or nil"
|
247
279
|
end
|
248
280
|
}
|
249
281
|
end
|
@@ -256,6 +288,7 @@ class PDF::Reader
|
|
256
288
|
# expecting a string and no other type will do.
|
257
289
|
#
|
258
290
|
# Some effort to cast to a string is made when the reference points to a non-string.
|
291
|
+
#: (untyped) -> String?
|
259
292
|
def deref_string(key)
|
260
293
|
obj = deref(key)
|
261
294
|
|
@@ -278,6 +311,7 @@ class PDF::Reader
|
|
278
311
|
# Guaranteed to only return a PDF Name (symbol), Array or nil. If the dereference results in
|
279
312
|
# any other type then a MalformedPDFError exception will raise. Useful when
|
280
313
|
# expecting a Name or Array and no other type will do.
|
314
|
+
#: (untyped) -> (Symbol | Array[untyped] | nil)
|
281
315
|
def deref_name_or_array(key)
|
282
316
|
obj = deref(key)
|
283
317
|
|
@@ -296,6 +330,7 @@ class PDF::Reader
|
|
296
330
|
# Guaranteed to only return a PDF::Reader::Stream, Array or nil. If the dereference results in
|
297
331
|
# any other type then a MalformedPDFError exception will raise. Useful when
|
298
332
|
# expecting a stream or Array and no other type will do.
|
333
|
+
#: (untyped) -> (PDF::Reader::Stream | Array[untyped] | nil)
|
299
334
|
def deref_stream_or_array(key)
|
300
335
|
obj = deref(key)
|
301
336
|
|
@@ -311,10 +346,12 @@ class PDF::Reader
|
|
311
346
|
# Recursively dereferences the object referred to by +key+. If +key+ is not
|
312
347
|
# a PDF::Reader::Reference, the key is returned unchanged.
|
313
348
|
#
|
349
|
+
#: (untyped) -> untyped
|
314
350
|
def deref!(key)
|
315
351
|
deref_internal!(key, {})
|
316
352
|
end
|
317
353
|
|
354
|
+
#: (untyped) -> Array[untyped]?
|
318
355
|
def deref_array!(key)
|
319
356
|
deref!(key).tap { |obj|
|
320
357
|
if !obj.nil? && !obj.is_a?(Array)
|
@@ -323,6 +360,7 @@ class PDF::Reader
|
|
323
360
|
}
|
324
361
|
end
|
325
362
|
|
363
|
+
#: (untyped) -> Hash[Symbol, untyped]?
|
326
364
|
def deref_hash!(key)
|
327
365
|
deref!(key).tap { |obj|
|
328
366
|
if !obj.nil? && !obj.is_a?(Hash)
|
@@ -343,6 +381,7 @@ class PDF::Reader
|
|
343
381
|
# local_default is the object that will be returned if the requested key doesn't
|
344
382
|
# exist.
|
345
383
|
#
|
384
|
+
#: (untyped, ?untyped) -> untyped
|
346
385
|
def fetch(key, local_default = nil)
|
347
386
|
obj = self[key]
|
348
387
|
if obj
|
@@ -356,6 +395,8 @@ class PDF::Reader
|
|
356
395
|
|
357
396
|
# iterate over each key, value. Just like a ruby hash.
|
358
397
|
#
|
398
|
+
# @override(allow_incompatible: true)
|
399
|
+
#: () { (PDF::Reader::Reference, untyped) -> untyped } -> untyped
|
359
400
|
def each(&block)
|
360
401
|
@xref.each do |ref|
|
361
402
|
yield ref, self[ref]
|
@@ -365,6 +406,7 @@ class PDF::Reader
|
|
365
406
|
|
366
407
|
# iterate over each key. Just like a ruby hash.
|
367
408
|
#
|
409
|
+
#: { (PDF::Reader::Reference) -> untyped } -> untyped
|
368
410
|
def each_key(&block)
|
369
411
|
each do |id, obj|
|
370
412
|
yield id
|
@@ -373,6 +415,7 @@ class PDF::Reader
|
|
373
415
|
|
374
416
|
# iterate over each value. Just like a ruby hash.
|
375
417
|
#
|
418
|
+
#: { (untyped) -> untyped } -> untyped
|
376
419
|
def each_value(&block)
|
377
420
|
each do |id, obj|
|
378
421
|
yield obj
|
@@ -381,6 +424,7 @@ class PDF::Reader
|
|
381
424
|
|
382
425
|
# return the number of objects in the file. An object with multiple generations
|
383
426
|
# is counted once.
|
427
|
+
#: () -> Integer
|
384
428
|
def size
|
385
429
|
xref.size
|
386
430
|
end
|
@@ -388,6 +432,7 @@ class PDF::Reader
|
|
388
432
|
|
389
433
|
# return true if there are no objects in this file
|
390
434
|
#
|
435
|
+
#: () -> bool
|
391
436
|
def empty?
|
392
437
|
size == 0 ? true : false
|
393
438
|
end
|
@@ -395,6 +440,7 @@ class PDF::Reader
|
|
395
440
|
# return true if the specified key exists in the file. key
|
396
441
|
# can be an int or a PDF::Reader::Reference
|
397
442
|
#
|
443
|
+
#: (untyped) -> bool
|
398
444
|
def has_key?(check_key)
|
399
445
|
# TODO update from O(n) to O(1)
|
400
446
|
each_key do |key|
|
@@ -412,6 +458,7 @@ class PDF::Reader
|
|
412
458
|
|
413
459
|
# return true if the specified value exists in the file
|
414
460
|
#
|
461
|
+
#: (untyped) -> bool
|
415
462
|
def has_value?(value)
|
416
463
|
# TODO update from O(n) to O(1)
|
417
464
|
each_value do |obj|
|
@@ -421,12 +468,14 @@ class PDF::Reader
|
|
421
468
|
end
|
422
469
|
alias :value? :has_key?
|
423
470
|
|
471
|
+
#: () -> String
|
424
472
|
def to_s
|
425
473
|
"<PDF::Reader::ObjectHash size: #{self.size}>"
|
426
474
|
end
|
427
475
|
|
428
476
|
# return an array of all keys in the file
|
429
477
|
#
|
478
|
+
#: () -> Array[PDF::Reader::Reference]
|
430
479
|
def keys
|
431
480
|
ret = []
|
432
481
|
each_key { |k| ret << k }
|
@@ -435,6 +484,7 @@ class PDF::Reader
|
|
435
484
|
|
436
485
|
# return an array of all values in the file
|
437
486
|
#
|
487
|
+
#: () -> untyped
|
438
488
|
def values
|
439
489
|
ret = []
|
440
490
|
each_value { |v| ret << v }
|
@@ -443,12 +493,14 @@ class PDF::Reader
|
|
443
493
|
|
444
494
|
# return an array of all values from the specified keys
|
445
495
|
#
|
496
|
+
#: (*untyped) -> untyped
|
446
497
|
def values_at(*ids)
|
447
498
|
ids.map { |id| self[id] }
|
448
499
|
end
|
449
500
|
|
450
501
|
# return an array of arrays. Each sub array contains a key/value pair.
|
451
502
|
#
|
503
|
+
#: () -> untyped
|
452
504
|
def to_a
|
453
505
|
ret = []
|
454
506
|
each do |id, obj|
|
@@ -463,6 +515,7 @@ class PDF::Reader
|
|
463
515
|
#
|
464
516
|
# Useful for apps that want to extract data from specific pages.
|
465
517
|
#
|
518
|
+
#: () -> Array[PDF::Reader::Reference | Hash[Symbol, untyped]]
|
466
519
|
def page_references
|
467
520
|
root = fetch(trailer[:Root])
|
468
521
|
@page_references ||= begin
|
@@ -471,10 +524,12 @@ class PDF::Reader
|
|
471
524
|
end
|
472
525
|
end
|
473
526
|
|
527
|
+
#: () -> bool
|
474
528
|
def encrypted?
|
475
529
|
trailer.has_key?(:Encrypt)
|
476
530
|
end
|
477
531
|
|
532
|
+
#: () -> bool
|
478
533
|
def sec_handler?
|
479
534
|
!!sec_handler
|
480
535
|
end
|
@@ -484,6 +539,17 @@ class PDF::Reader
|
|
484
539
|
# parse a traditional object from the PDF, starting from the byte offset indicated
|
485
540
|
# in the xref table
|
486
541
|
#
|
542
|
+
#: (PDF::Reader::Reference) -> (
|
543
|
+
#| PDF::Reader::Reference |
|
544
|
+
#| PDF::Reader::Token |
|
545
|
+
#| PDF::Reader::Stream |
|
546
|
+
#| Numeric |
|
547
|
+
#| String |
|
548
|
+
#| Symbol |
|
549
|
+
#| Array[untyped] |
|
550
|
+
#| Hash[untyped, untyped] |
|
551
|
+
#| nil
|
552
|
+
#| )
|
487
553
|
def fetch_object(key)
|
488
554
|
if xref[key].is_a?(Integer)
|
489
555
|
buf = new_buffer(xref[key])
|
@@ -493,11 +559,25 @@ class PDF::Reader
|
|
493
559
|
|
494
560
|
# parse an object that's embedded in an object stream in the PDF
|
495
561
|
#
|
562
|
+
#: (PDF::Reader::Reference) -> (
|
563
|
+
#| PDF::Reader::Reference |
|
564
|
+
#| PDF::Reader::Token |
|
565
|
+
#| PDF::Reader::Stream |
|
566
|
+
#| Numeric |
|
567
|
+
#| String |
|
568
|
+
#| Symbol |
|
569
|
+
#| Array[untyped] |
|
570
|
+
#| Hash[untyped, untyped] |
|
571
|
+
#| nil
|
572
|
+
#| )
|
496
573
|
def fetch_object_stream(key)
|
497
574
|
if xref[key].is_a?(PDF::Reader::Reference)
|
498
575
|
container_key = xref[key]
|
499
|
-
|
500
|
-
|
576
|
+
stream = deref_stream(container_key)
|
577
|
+
raise MalformedPDFError, "Object Stream cannot be nil" if stream.nil?
|
578
|
+
if objstream = object_streams[container_key] ||= PDF::Reader::ObjectStream.new(stream)
|
579
|
+
objstream[key.id]
|
580
|
+
end
|
501
581
|
end
|
502
582
|
end
|
503
583
|
|
@@ -505,6 +585,17 @@ class PDF::Reader
|
|
505
585
|
# isn't publicly available. It's used to avoid endless loops in the recursion, and
|
506
586
|
# doesn't need to be part of the public API.
|
507
587
|
#
|
588
|
+
#: (untyped, Hash[Integer, untyped]) -> (
|
589
|
+
#| PDF::Reader::Reference |
|
590
|
+
#| PDF::Reader::Token |
|
591
|
+
#| PDF::Reader::Stream |
|
592
|
+
#| Numeric |
|
593
|
+
#| String |
|
594
|
+
#| Symbol |
|
595
|
+
#| Array[untyped] |
|
596
|
+
#| Hash[untyped, untyped] |
|
597
|
+
#| nil
|
598
|
+
#| )
|
508
599
|
def deref_internal!(key, seen)
|
509
600
|
seen_key = key.is_a?(PDF::Reader::Reference) ? key : key.object_id
|
510
601
|
|
@@ -534,6 +625,17 @@ class PDF::Reader
|
|
534
625
|
end
|
535
626
|
end
|
536
627
|
|
628
|
+
#: (PDF::Reader::Reference, untyped) -> (
|
629
|
+
#| PDF::Reader::Reference |
|
630
|
+
#| PDF::Reader::Token |
|
631
|
+
#| PDF::Reader::Stream |
|
632
|
+
#| Numeric |
|
633
|
+
#| String |
|
634
|
+
#| Symbol |
|
635
|
+
#| Array[untyped] |
|
636
|
+
#| Hash[untyped, untyped] |
|
637
|
+
#| nil
|
638
|
+
#| )
|
537
639
|
def decrypt(ref, obj)
|
538
640
|
case obj
|
539
641
|
when PDF::Reader::Stream then
|
@@ -555,25 +657,33 @@ class PDF::Reader
|
|
555
657
|
end
|
556
658
|
end
|
557
659
|
|
660
|
+
#: (?Integer) -> PDF::Reader::Buffer
|
558
661
|
def new_buffer(offset = 0)
|
559
662
|
PDF::Reader::Buffer.new(@io, :seek => offset)
|
560
663
|
end
|
561
664
|
|
665
|
+
#: () -> PDF::Reader::XRef[PDF::Reader::Reference]
|
562
666
|
def xref
|
563
667
|
@xref
|
564
668
|
end
|
565
669
|
|
670
|
+
#: () -> Hash[PDF::Reader::Reference, PDF::Reader::ObjectStream]
|
566
671
|
def object_streams
|
567
|
-
@
|
672
|
+
@object_streams ||= {}
|
568
673
|
end
|
569
674
|
|
570
675
|
# returns an array of object references for all pages in this object store. The ordering of
|
571
676
|
# the Array is significant and matches the page ordering of the document
|
572
677
|
#
|
678
|
+
#: (PDF::Reader::Reference | Hash[Symbol, untyped]) -> (
|
679
|
+
#| Array[PDF::Reader::Reference | Hash[Symbol, untyped] ]
|
680
|
+
#| )
|
573
681
|
def get_page_objects(obj)
|
574
682
|
derefed_obj = deref_hash(obj)
|
575
683
|
|
576
|
-
if derefed_obj
|
684
|
+
if derefed_obj.nil?
|
685
|
+
raise MalformedPDFError, "Expected Page or Pages object, got nil"
|
686
|
+
elsif derefed_obj[:Type] == :Page
|
577
687
|
[obj]
|
578
688
|
elsif derefed_obj[:Kids]
|
579
689
|
kids = deref_array(derefed_obj[:Kids]) || []
|
@@ -585,23 +695,26 @@ class PDF::Reader
|
|
585
695
|
end
|
586
696
|
end
|
587
697
|
|
698
|
+
#: () -> Float
|
588
699
|
def read_version
|
589
700
|
@io.seek(0)
|
590
|
-
_m, version = *@io.read(10).match(/PDF-(\d.\d)/)
|
701
|
+
_m, version = *@io.read(10).to_s.match(/PDF-(\d.\d)/)
|
591
702
|
@io.seek(0)
|
592
703
|
version.to_f
|
593
704
|
end
|
594
705
|
|
706
|
+
#: (IO | Tempfile | StringIO | String) -> (IO | Tempfile | StringIO)
|
595
707
|
def extract_io_from(input)
|
596
|
-
if input.
|
708
|
+
if input.is_a?(IO) || input.is_a?(StringIO) || input.is_a?(Tempfile)
|
597
709
|
input
|
598
710
|
elsif File.file?(input.to_s)
|
599
|
-
StringIO.new read_as_binary(input)
|
711
|
+
StringIO.new read_as_binary(input.to_s)
|
600
712
|
else
|
601
|
-
raise ArgumentError, "input must be an IO-like object or a filename"
|
713
|
+
raise ArgumentError, "input must be an IO-like object or a filename (#{input.class})"
|
602
714
|
end
|
603
715
|
end
|
604
716
|
|
717
|
+
#: (String) -> (String)
|
605
718
|
def read_as_binary(input)
|
606
719
|
if File.respond_to?(:binread)
|
607
720
|
File.binread(input.to_s)
|
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
-
# typed:
|
2
|
+
# typed: strict
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
class PDF::Reader
|
@@ -8,11 +8,24 @@ class PDF::Reader
|
|
8
8
|
# This is done for added compression and is described as an "Object Stream" in the spec.
|
9
9
|
#
|
10
10
|
class ObjectStream # :nodoc:
|
11
|
+
#: (PDF::Reader::Stream) -> void
|
11
12
|
def initialize(stream)
|
12
|
-
@dict = stream.hash
|
13
|
-
@data = stream.unfiltered_data
|
13
|
+
@dict = stream.hash #: Hash[Symbol, untyped]
|
14
|
+
@data = stream.unfiltered_data #: String
|
15
|
+
@offsets = nil #: Hash[Integer, Integer] | nil
|
16
|
+
@buffer = nil #: PDF::Reader::Buffer | nil
|
14
17
|
end
|
15
18
|
|
19
|
+
#: (Integer) -> (
|
20
|
+
#| PDF::Reader::Reference |
|
21
|
+
#| PDF::Reader::Token |
|
22
|
+
#| Numeric |
|
23
|
+
#| String |
|
24
|
+
#| Symbol |
|
25
|
+
#| Array[untyped] |
|
26
|
+
#| Hash[untyped, untyped] |
|
27
|
+
#| nil
|
28
|
+
#| )
|
16
29
|
def [](objid)
|
17
30
|
if offsets[objid].nil?
|
18
31
|
nil
|
@@ -23,12 +36,14 @@ class PDF::Reader
|
|
23
36
|
end
|
24
37
|
end
|
25
38
|
|
39
|
+
#: () -> Integer
|
26
40
|
def size
|
27
|
-
@dict[:N]
|
41
|
+
TypeCheck.cast_to_int!(@dict[:N])
|
28
42
|
end
|
29
43
|
|
30
44
|
private
|
31
45
|
|
46
|
+
#: () -> Hash[Integer, Integer]
|
32
47
|
def offsets
|
33
48
|
@offsets ||= {}
|
34
49
|
return @offsets if @offsets.keys.size > 0
|
@@ -39,10 +54,12 @@ class PDF::Reader
|
|
39
54
|
@offsets
|
40
55
|
end
|
41
56
|
|
57
|
+
#: () -> Integer
|
42
58
|
def first
|
43
|
-
@dict[:First]
|
59
|
+
TypeCheck.cast_to_int!(@dict[:First])
|
44
60
|
end
|
45
61
|
|
62
|
+
#: () -> PDF::Reader::Buffer
|
46
63
|
def buffer
|
47
64
|
@buffer ||= PDF::Reader::Buffer.new(StringIO.new(@data))
|
48
65
|
end
|