pdf-reader 2.7.0 → 2.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +20 -0
- data/Rakefile +1 -1
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +36 -34
- data/lib/pdf/reader/cmap.rb +64 -51
- data/lib/pdf/reader/error.rb +8 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +1 -1
- data/lib/pdf/reader/filter/depredict.rb +1 -1
- data/lib/pdf/reader/filter/flate.rb +3 -3
- data/lib/pdf/reader/filter/lzw.rb +1 -1
- data/lib/pdf/reader/filter/null.rb +1 -2
- data/lib/pdf/reader/filter/run_length.rb +1 -1
- data/lib/pdf/reader/filter.rb +10 -11
- data/lib/pdf/reader/font.rb +71 -16
- data/lib/pdf/reader/font_descriptor.rb +18 -17
- data/lib/pdf/reader/form_xobject.rb +14 -5
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/null_security_handler.rb +0 -4
- data/lib/pdf/reader/object_hash.rb +251 -44
- data/lib/pdf/reader/page.rb +51 -22
- data/lib/pdf/reader/page_layout.rb +14 -28
- data/lib/pdf/reader/page_state.rb +1 -1
- data/lib/pdf/reader/page_text_receiver.rb +52 -10
- data/lib/pdf/reader/parser.rb +22 -7
- data/lib/pdf/reader/point.rb +1 -1
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +20 -2
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +15 -13
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -95
- data/lib/pdf/reader/stream.rb +2 -2
- data/lib/pdf/reader/text_run.rb +13 -6
- data/lib/pdf/reader/type_check.rb +52 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -1
- data/lib/pdf/reader/xref.rb +20 -3
- data/lib/pdf/reader.rb +32 -11
- data/rbi/pdf-reader.rbi +408 -174
- metadata +16 -9
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -92
@@ -0,0 +1,262 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
module PDF
|
6
|
+
class Reader
|
7
|
+
|
8
|
+
# Page#walk will execute the content stream of a page, calling methods on a receiver class
|
9
|
+
# provided by the user. Each operator has a specific set of parameters it expects, and we
|
10
|
+
# wrap the user's receiver class in this one to verify the PDF uses valid parameters.
|
11
|
+
#
|
12
|
+
# Without these checks, users can't be confident about the number of parameters they'll receive
|
13
|
+
# for an operator, or what the type of those parameters will be. Everyone ends up building their
|
14
|
+
# own type safety guard clauses and it's tedious.
|
15
|
+
#
|
16
|
+
# Not all operators have type safety implemented yet, but we can expand the number over time.
|
17
|
+
class ValidatingReceiver
|
18
|
+
|
19
|
+
def initialize(wrapped)
|
20
|
+
@wrapped = wrapped
|
21
|
+
end
|
22
|
+
|
23
|
+
def page=(page)
|
24
|
+
call_wrapped(:page=, page)
|
25
|
+
end
|
26
|
+
|
27
|
+
#####################################################
|
28
|
+
# Graphics State Operators
|
29
|
+
#####################################################
|
30
|
+
def save_graphics_state(*args)
|
31
|
+
call_wrapped(:save_graphics_state)
|
32
|
+
end
|
33
|
+
|
34
|
+
def restore_graphics_state(*args)
|
35
|
+
call_wrapped(:restore_graphics_state)
|
36
|
+
end
|
37
|
+
|
38
|
+
#####################################################
|
39
|
+
# Matrix Operators
|
40
|
+
#####################################################
|
41
|
+
|
42
|
+
def concatenate_matrix(*args)
|
43
|
+
a, b, c, d, e, f = *args
|
44
|
+
call_wrapped(
|
45
|
+
:concatenate_matrix,
|
46
|
+
TypeCheck.cast_to_numeric!(a),
|
47
|
+
TypeCheck.cast_to_numeric!(b),
|
48
|
+
TypeCheck.cast_to_numeric!(c),
|
49
|
+
TypeCheck.cast_to_numeric!(d),
|
50
|
+
TypeCheck.cast_to_numeric!(e),
|
51
|
+
TypeCheck.cast_to_numeric!(f),
|
52
|
+
)
|
53
|
+
end
|
54
|
+
|
55
|
+
#####################################################
|
56
|
+
# Text Object Operators
|
57
|
+
#####################################################
|
58
|
+
|
59
|
+
def begin_text_object(*args)
|
60
|
+
call_wrapped(:begin_text_object)
|
61
|
+
end
|
62
|
+
|
63
|
+
def end_text_object(*args)
|
64
|
+
call_wrapped(:end_text_object)
|
65
|
+
end
|
66
|
+
|
67
|
+
#####################################################
|
68
|
+
# Text State Operators
|
69
|
+
#####################################################
|
70
|
+
def set_character_spacing(*args)
|
71
|
+
char_spacing, _ = *args
|
72
|
+
call_wrapped(
|
73
|
+
:set_character_spacing,
|
74
|
+
TypeCheck.cast_to_numeric!(char_spacing)
|
75
|
+
)
|
76
|
+
end
|
77
|
+
|
78
|
+
def set_horizontal_text_scaling(*args)
|
79
|
+
h_scaling, _ = *args
|
80
|
+
call_wrapped(
|
81
|
+
:set_horizontal_text_scaling,
|
82
|
+
TypeCheck.cast_to_numeric!(h_scaling)
|
83
|
+
)
|
84
|
+
end
|
85
|
+
|
86
|
+
def set_text_font_and_size(*args)
|
87
|
+
label, size, _ = *args
|
88
|
+
call_wrapped(
|
89
|
+
:set_text_font_and_size,
|
90
|
+
TypeCheck.cast_to_symbol(label),
|
91
|
+
TypeCheck.cast_to_numeric!(size)
|
92
|
+
)
|
93
|
+
end
|
94
|
+
|
95
|
+
def set_text_leading(*args)
|
96
|
+
leading, _ = *args
|
97
|
+
call_wrapped(
|
98
|
+
:set_text_leading,
|
99
|
+
TypeCheck.cast_to_numeric!(leading)
|
100
|
+
)
|
101
|
+
end
|
102
|
+
|
103
|
+
def set_text_rendering_mode(*args)
|
104
|
+
mode, _ = *args
|
105
|
+
call_wrapped(
|
106
|
+
:set_text_rendering_mode,
|
107
|
+
TypeCheck.cast_to_numeric!(mode)
|
108
|
+
)
|
109
|
+
end
|
110
|
+
|
111
|
+
def set_text_rise(*args)
|
112
|
+
rise, _ = *args
|
113
|
+
call_wrapped(
|
114
|
+
:set_text_rise,
|
115
|
+
TypeCheck.cast_to_numeric!(rise)
|
116
|
+
)
|
117
|
+
end
|
118
|
+
|
119
|
+
def set_word_spacing(*args)
|
120
|
+
word_spacing, _ = *args
|
121
|
+
call_wrapped(
|
122
|
+
:set_word_spacing,
|
123
|
+
TypeCheck.cast_to_numeric!(word_spacing)
|
124
|
+
)
|
125
|
+
end
|
126
|
+
|
127
|
+
#####################################################
|
128
|
+
# Text Positioning Operators
|
129
|
+
#####################################################
|
130
|
+
|
131
|
+
def move_text_position(*args) # Td
|
132
|
+
x, y, _ = *args
|
133
|
+
call_wrapped(
|
134
|
+
:move_text_position,
|
135
|
+
TypeCheck.cast_to_numeric!(x),
|
136
|
+
TypeCheck.cast_to_numeric!(y)
|
137
|
+
)
|
138
|
+
end
|
139
|
+
|
140
|
+
def move_text_position_and_set_leading(*args) # TD
|
141
|
+
x, y, _ = *args
|
142
|
+
call_wrapped(
|
143
|
+
:move_text_position_and_set_leading,
|
144
|
+
TypeCheck.cast_to_numeric!(x),
|
145
|
+
TypeCheck.cast_to_numeric!(y)
|
146
|
+
)
|
147
|
+
end
|
148
|
+
|
149
|
+
def set_text_matrix_and_text_line_matrix(*args) # Tm
|
150
|
+
a, b, c, d, e, f = *args
|
151
|
+
call_wrapped(
|
152
|
+
:set_text_matrix_and_text_line_matrix,
|
153
|
+
TypeCheck.cast_to_numeric!(a),
|
154
|
+
TypeCheck.cast_to_numeric!(b),
|
155
|
+
TypeCheck.cast_to_numeric!(c),
|
156
|
+
TypeCheck.cast_to_numeric!(d),
|
157
|
+
TypeCheck.cast_to_numeric!(e),
|
158
|
+
TypeCheck.cast_to_numeric!(f),
|
159
|
+
)
|
160
|
+
end
|
161
|
+
|
162
|
+
def move_to_start_of_next_line(*args) # T*
|
163
|
+
call_wrapped(:move_to_start_of_next_line)
|
164
|
+
end
|
165
|
+
|
166
|
+
#####################################################
|
167
|
+
# Text Showing Operators
|
168
|
+
#####################################################
|
169
|
+
def show_text(*args) # Tj (AWAY)
|
170
|
+
string, _ = *args
|
171
|
+
call_wrapped(
|
172
|
+
:show_text,
|
173
|
+
TypeCheck.cast_to_string!(string)
|
174
|
+
)
|
175
|
+
end
|
176
|
+
|
177
|
+
def show_text_with_positioning(*args) # TJ [(A) 120 (WA) 20 (Y)]
|
178
|
+
params, _ = *args
|
179
|
+
unless params.is_a?(Array)
|
180
|
+
raise MalformedPDFError, "TJ operator expects a single Array argument"
|
181
|
+
end
|
182
|
+
|
183
|
+
call_wrapped(
|
184
|
+
:show_text_with_positioning,
|
185
|
+
params
|
186
|
+
)
|
187
|
+
end
|
188
|
+
|
189
|
+
def move_to_next_line_and_show_text(*args) # '
|
190
|
+
string, _ = *args
|
191
|
+
call_wrapped(
|
192
|
+
:move_to_next_line_and_show_text,
|
193
|
+
TypeCheck.cast_to_string!(string)
|
194
|
+
)
|
195
|
+
end
|
196
|
+
|
197
|
+
def set_spacing_next_line_show_text(*args) # "
|
198
|
+
aw, ac, string = *args
|
199
|
+
call_wrapped(
|
200
|
+
:set_spacing_next_line_show_text,
|
201
|
+
TypeCheck.cast_to_numeric!(aw),
|
202
|
+
TypeCheck.cast_to_numeric!(ac),
|
203
|
+
TypeCheck.cast_to_string!(string)
|
204
|
+
)
|
205
|
+
end
|
206
|
+
|
207
|
+
#####################################################
|
208
|
+
# Form XObject Operators
|
209
|
+
#####################################################
|
210
|
+
|
211
|
+
def invoke_xobject(*args)
|
212
|
+
label, _ = *args
|
213
|
+
|
214
|
+
call_wrapped(
|
215
|
+
:invoke_xobject,
|
216
|
+
TypeCheck.cast_to_symbol(label)
|
217
|
+
)
|
218
|
+
end
|
219
|
+
|
220
|
+
#####################################################
|
221
|
+
# Inline Image Operators
|
222
|
+
#####################################################
|
223
|
+
|
224
|
+
def begin_inline_image(*args)
|
225
|
+
call_wrapped(:begin_inline_image)
|
226
|
+
end
|
227
|
+
|
228
|
+
def begin_inline_image_data(*args)
|
229
|
+
# We can't use call_wrapped() here because sorbet won't allow splat args with a dynamic
|
230
|
+
# number of elements
|
231
|
+
@wrapped.begin_inline_image_data(*args) if @wrapped.respond_to?(:begin_inline_image_data)
|
232
|
+
end
|
233
|
+
|
234
|
+
def end_inline_image(*args)
|
235
|
+
data, _ = *args
|
236
|
+
|
237
|
+
call_wrapped(
|
238
|
+
:end_inline_image,
|
239
|
+
TypeCheck.cast_to_string!(data)
|
240
|
+
)
|
241
|
+
end
|
242
|
+
|
243
|
+
#####################################################
|
244
|
+
# Final safety net for any operators that don't have type checking enabled yet
|
245
|
+
#####################################################
|
246
|
+
|
247
|
+
def respond_to?(meth)
|
248
|
+
@wrapped.respond_to?(meth)
|
249
|
+
end
|
250
|
+
|
251
|
+
def method_missing(methodname, *args)
|
252
|
+
@wrapped.send(methodname, *args)
|
253
|
+
end
|
254
|
+
|
255
|
+
private
|
256
|
+
|
257
|
+
def call_wrapped(methodname, *args)
|
258
|
+
@wrapped.send(methodname, *args) if @wrapped.respond_to?(methodname)
|
259
|
+
end
|
260
|
+
end
|
261
|
+
end
|
262
|
+
end
|
@@ -30,7 +30,7 @@ class PDF::Reader
|
|
30
30
|
|
31
31
|
# in ruby a negative index is valid, and will go from the end of the array
|
32
32
|
# which is undesirable in this case.
|
33
|
-
if @font.first_char <= code_point
|
33
|
+
if @font.first_char && @font.first_char <= code_point
|
34
34
|
@font.widths.fetch(code_point - @font.first_char, @missing_width).to_f
|
35
35
|
else
|
36
36
|
@missing_width.to_f
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -104,13 +104,18 @@ class PDF::Reader
|
|
104
104
|
buf = new_buffer(offset)
|
105
105
|
tok_one = buf.token
|
106
106
|
|
107
|
+
# we have a traditional xref table
|
107
108
|
return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"
|
108
109
|
|
109
110
|
tok_two = buf.token
|
110
111
|
tok_three = buf.token
|
111
112
|
|
113
|
+
# we have an XRef stream
|
112
114
|
if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
|
113
115
|
buf = new_buffer(offset)
|
116
|
+
# Maybe we should be passing the ObjectHash as the second argument to the Parser here,
|
117
|
+
# to handle the case where an XRef Stream has the Length specified via an
|
118
|
+
# indirect object
|
114
119
|
stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
|
115
120
|
return load_xref_stream(stream)
|
116
121
|
end
|
@@ -126,6 +131,10 @@ class PDF::Reader
|
|
126
131
|
|
127
132
|
while !params.include?("trailer") && !params.include?(nil)
|
128
133
|
if params.size == 2
|
134
|
+
unless params[0].to_s.match(/\A\d+\z/)
|
135
|
+
raise MalformedPDFError, "invalid xref table, expected object ID"
|
136
|
+
end
|
137
|
+
|
129
138
|
objid, count = params[0].to_i, params[1].to_i
|
130
139
|
count.times do
|
131
140
|
offset = buf.token.to_i
|
@@ -143,7 +152,7 @@ class PDF::Reader
|
|
143
152
|
params << buf.token
|
144
153
|
end
|
145
154
|
|
146
|
-
trailer = Parser.new(buf
|
155
|
+
trailer = Parser.new(buf).parse_token
|
147
156
|
|
148
157
|
unless trailer.kind_of?(Hash)
|
149
158
|
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary"
|
@@ -168,8 +177,16 @@ class PDF::Reader
|
|
168
177
|
[:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
|
169
178
|
}]
|
170
179
|
|
171
|
-
widths
|
172
|
-
|
180
|
+
widths = stream.hash[:W]
|
181
|
+
|
182
|
+
PDF::Reader::Error.validate_type_as_malformed(widths, "xref stream widths", Array)
|
183
|
+
|
184
|
+
entry_length = widths.inject(0) { |s, w|
|
185
|
+
unless w.is_a?(Integer)
|
186
|
+
w = 0
|
187
|
+
end
|
188
|
+
s + w
|
189
|
+
}
|
173
190
|
raw_data = StringIO.new(stream.unfiltered_data)
|
174
191
|
if stream.hash[:Index]
|
175
192
|
index = stream.hash[:Index]
|
data/lib/pdf/reader.rb
CHANGED
@@ -112,19 +112,27 @@ module PDF
|
|
112
112
|
#
|
113
113
|
# reader = PDF::Reader.new("somefile.pdf", :password => "apples")
|
114
114
|
#
|
115
|
+
# Using this method directly is supported, but it's more common to use
|
116
|
+
# `PDF::Reader.open`
|
117
|
+
#
|
115
118
|
def initialize(input, opts = {})
|
116
119
|
@cache = PDF::Reader::ObjectCache.new
|
117
120
|
opts.merge!(:cache => @cache)
|
118
121
|
@objects = PDF::Reader::ObjectHash.new(input, opts)
|
119
122
|
end
|
120
123
|
|
124
|
+
# Return a Hash with some basic information about the PDF file
|
125
|
+
#
|
121
126
|
def info
|
122
|
-
dict = @objects.
|
127
|
+
dict = @objects.deref_hash(@objects.trailer[:Info]) || {}
|
123
128
|
doc_strings_to_utf8(dict)
|
124
129
|
end
|
125
130
|
|
131
|
+
# Return a Hash with extra metadata provided by the author of the PDF file. Not
|
132
|
+
# always present.
|
133
|
+
#
|
126
134
|
def metadata
|
127
|
-
stream = @objects.
|
135
|
+
stream = @objects.deref_stream(root[:Metadata])
|
128
136
|
if stream.nil?
|
129
137
|
nil
|
130
138
|
else
|
@@ -134,20 +142,24 @@ module PDF
|
|
134
142
|
end
|
135
143
|
end
|
136
144
|
|
145
|
+
# The number of pages in this PDF
|
146
|
+
#
|
137
147
|
def page_count
|
138
|
-
pages = @objects.
|
148
|
+
pages = @objects.deref_hash(root[:Pages])
|
139
149
|
unless pages.kind_of?(::Hash)
|
140
150
|
raise MalformedPDFError, "Pages structure is missing #{pages.class}"
|
141
151
|
end
|
142
|
-
@page_count ||= @objects.
|
152
|
+
@page_count ||= @objects.deref_integer(pages[:Count]) || 0
|
143
153
|
end
|
144
154
|
|
155
|
+
# The PDF version this file uses
|
156
|
+
#
|
145
157
|
def pdf_version
|
146
158
|
@objects.pdf_version
|
147
159
|
end
|
148
160
|
|
149
|
-
# syntactic sugar for opening a PDF file. Accepts the
|
150
|
-
# as new().
|
161
|
+
# syntactic sugar for opening a PDF file and the most common approach. Accepts the
|
162
|
+
# same arguments as new().
|
151
163
|
#
|
152
164
|
# PDF::Reader.open("somefile.pdf") do |reader|
|
153
165
|
# puts reader.pdf_version
|
@@ -178,6 +190,8 @@ module PDF
|
|
178
190
|
# methods available on each page
|
179
191
|
#
|
180
192
|
def pages
|
193
|
+
return [] if page_count <= 0
|
194
|
+
|
181
195
|
(1..self.page_count).map do |num|
|
182
196
|
begin
|
183
197
|
PDF::Reader::Page.new(@objects, num, :cache => @cache)
|
@@ -228,7 +242,7 @@ module PDF
|
|
228
242
|
pdfdoc_to_utf8(obj)
|
229
243
|
end
|
230
244
|
else
|
231
|
-
|
245
|
+
obj
|
232
246
|
end
|
233
247
|
end
|
234
248
|
|
@@ -259,7 +273,7 @@ module PDF
|
|
259
273
|
|
260
274
|
def root
|
261
275
|
@root ||= begin
|
262
|
-
obj = @objects.
|
276
|
+
obj = @objects.deref_hash(@objects.trailer[:Root]) || {}
|
263
277
|
unless obj.kind_of?(::Hash)
|
264
278
|
raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
|
265
279
|
end
|
@@ -271,8 +285,9 @@ module PDF
|
|
271
285
|
end
|
272
286
|
################################################################################
|
273
287
|
|
274
|
-
require 'pdf/reader/
|
288
|
+
require 'pdf/reader/resources'
|
275
289
|
require 'pdf/reader/buffer'
|
290
|
+
require 'pdf/reader/bounding_rectangle_runs_filter'
|
276
291
|
require 'pdf/reader/cid_widths'
|
277
292
|
require 'pdf/reader/cmap'
|
278
293
|
require 'pdf/reader/encoding'
|
@@ -301,13 +316,19 @@ require 'pdf/reader/rectangle'
|
|
301
316
|
require 'pdf/reader/reference'
|
302
317
|
require 'pdf/reader/register_receiver'
|
303
318
|
require 'pdf/reader/null_security_handler'
|
304
|
-
require 'pdf/reader/
|
305
|
-
require 'pdf/reader/
|
319
|
+
require 'pdf/reader/security_handler_factory'
|
320
|
+
require 'pdf/reader/standard_key_builder'
|
321
|
+
require 'pdf/reader/key_builder_v5'
|
322
|
+
require 'pdf/reader/aes_v2_security_handler'
|
323
|
+
require 'pdf/reader/aes_v3_security_handler'
|
324
|
+
require 'pdf/reader/rc4_security_handler'
|
306
325
|
require 'pdf/reader/unimplemented_security_handler'
|
307
326
|
require 'pdf/reader/stream'
|
308
327
|
require 'pdf/reader/text_run'
|
328
|
+
require 'pdf/reader/type_check'
|
309
329
|
require 'pdf/reader/page_state'
|
310
330
|
require 'pdf/reader/page_text_receiver'
|
311
331
|
require 'pdf/reader/token'
|
312
332
|
require 'pdf/reader/xref'
|
313
333
|
require 'pdf/reader/page'
|
334
|
+
require 'pdf/reader/validating_receiver'
|