pdf-reader 2.7.0 → 2.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +20 -0
  3. data/Rakefile +1 -1
  4. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  5. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  6. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  7. data/lib/pdf/reader/buffer.rb +36 -34
  8. data/lib/pdf/reader/cmap.rb +64 -51
  9. data/lib/pdf/reader/error.rb +8 -0
  10. data/lib/pdf/reader/filter/ascii85.rb +1 -1
  11. data/lib/pdf/reader/filter/ascii_hex.rb +1 -1
  12. data/lib/pdf/reader/filter/depredict.rb +1 -1
  13. data/lib/pdf/reader/filter/flate.rb +3 -3
  14. data/lib/pdf/reader/filter/lzw.rb +1 -1
  15. data/lib/pdf/reader/filter/null.rb +1 -2
  16. data/lib/pdf/reader/filter/run_length.rb +1 -1
  17. data/lib/pdf/reader/filter.rb +10 -11
  18. data/lib/pdf/reader/font.rb +71 -16
  19. data/lib/pdf/reader/font_descriptor.rb +18 -17
  20. data/lib/pdf/reader/form_xobject.rb +14 -5
  21. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  22. data/lib/pdf/reader/null_security_handler.rb +0 -4
  23. data/lib/pdf/reader/object_hash.rb +251 -44
  24. data/lib/pdf/reader/page.rb +51 -22
  25. data/lib/pdf/reader/page_layout.rb +14 -28
  26. data/lib/pdf/reader/page_state.rb +1 -1
  27. data/lib/pdf/reader/page_text_receiver.rb +52 -10
  28. data/lib/pdf/reader/parser.rb +22 -7
  29. data/lib/pdf/reader/point.rb +1 -1
  30. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  31. data/lib/pdf/reader/rectangle.rb +20 -2
  32. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +15 -13
  33. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  34. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -95
  35. data/lib/pdf/reader/stream.rb +2 -2
  36. data/lib/pdf/reader/text_run.rb +13 -6
  37. data/lib/pdf/reader/type_check.rb +52 -0
  38. data/lib/pdf/reader/validating_receiver.rb +262 -0
  39. data/lib/pdf/reader/width_calculator/true_type.rb +1 -1
  40. data/lib/pdf/reader/xref.rb +20 -3
  41. data/lib/pdf/reader.rb +32 -11
  42. data/rbi/pdf-reader.rbi +408 -174
  43. metadata +16 -9
  44. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -92
@@ -0,0 +1,262 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ module PDF
6
+ class Reader
7
+
8
+ # Page#walk will execute the content stream of a page, calling methods on a receiver class
9
+ # provided by the user. Each operator has a specific set of parameters it expects, and we
10
+ # wrap the user's receiver class in this one to verify the PDF uses valid parameters.
11
+ #
12
+ # Without these checks, users can't be confident about the number of parameters they'll receive
13
+ # for an operator, or what the type of those parameters will be. Everyone ends up building their
14
+ # own type safety guard clauses and it's tedious.
15
+ #
16
+ # Not all operators have type safety implemented yet, but we can expand the number over time.
17
+ class ValidatingReceiver
18
+
19
+ def initialize(wrapped)
20
+ @wrapped = wrapped
21
+ end
22
+
23
+ def page=(page)
24
+ call_wrapped(:page=, page)
25
+ end
26
+
27
+ #####################################################
28
+ # Graphics State Operators
29
+ #####################################################
30
+ def save_graphics_state(*args)
31
+ call_wrapped(:save_graphics_state)
32
+ end
33
+
34
+ def restore_graphics_state(*args)
35
+ call_wrapped(:restore_graphics_state)
36
+ end
37
+
38
+ #####################################################
39
+ # Matrix Operators
40
+ #####################################################
41
+
42
+ def concatenate_matrix(*args)
43
+ a, b, c, d, e, f = *args
44
+ call_wrapped(
45
+ :concatenate_matrix,
46
+ TypeCheck.cast_to_numeric!(a),
47
+ TypeCheck.cast_to_numeric!(b),
48
+ TypeCheck.cast_to_numeric!(c),
49
+ TypeCheck.cast_to_numeric!(d),
50
+ TypeCheck.cast_to_numeric!(e),
51
+ TypeCheck.cast_to_numeric!(f),
52
+ )
53
+ end
54
+
55
+ #####################################################
56
+ # Text Object Operators
57
+ #####################################################
58
+
59
+ def begin_text_object(*args)
60
+ call_wrapped(:begin_text_object)
61
+ end
62
+
63
+ def end_text_object(*args)
64
+ call_wrapped(:end_text_object)
65
+ end
66
+
67
+ #####################################################
68
+ # Text State Operators
69
+ #####################################################
70
+ def set_character_spacing(*args)
71
+ char_spacing, _ = *args
72
+ call_wrapped(
73
+ :set_character_spacing,
74
+ TypeCheck.cast_to_numeric!(char_spacing)
75
+ )
76
+ end
77
+
78
+ def set_horizontal_text_scaling(*args)
79
+ h_scaling, _ = *args
80
+ call_wrapped(
81
+ :set_horizontal_text_scaling,
82
+ TypeCheck.cast_to_numeric!(h_scaling)
83
+ )
84
+ end
85
+
86
+ def set_text_font_and_size(*args)
87
+ label, size, _ = *args
88
+ call_wrapped(
89
+ :set_text_font_and_size,
90
+ TypeCheck.cast_to_symbol(label),
91
+ TypeCheck.cast_to_numeric!(size)
92
+ )
93
+ end
94
+
95
+ def set_text_leading(*args)
96
+ leading, _ = *args
97
+ call_wrapped(
98
+ :set_text_leading,
99
+ TypeCheck.cast_to_numeric!(leading)
100
+ )
101
+ end
102
+
103
+ def set_text_rendering_mode(*args)
104
+ mode, _ = *args
105
+ call_wrapped(
106
+ :set_text_rendering_mode,
107
+ TypeCheck.cast_to_numeric!(mode)
108
+ )
109
+ end
110
+
111
+ def set_text_rise(*args)
112
+ rise, _ = *args
113
+ call_wrapped(
114
+ :set_text_rise,
115
+ TypeCheck.cast_to_numeric!(rise)
116
+ )
117
+ end
118
+
119
+ def set_word_spacing(*args)
120
+ word_spacing, _ = *args
121
+ call_wrapped(
122
+ :set_word_spacing,
123
+ TypeCheck.cast_to_numeric!(word_spacing)
124
+ )
125
+ end
126
+
127
+ #####################################################
128
+ # Text Positioning Operators
129
+ #####################################################
130
+
131
+ def move_text_position(*args) # Td
132
+ x, y, _ = *args
133
+ call_wrapped(
134
+ :move_text_position,
135
+ TypeCheck.cast_to_numeric!(x),
136
+ TypeCheck.cast_to_numeric!(y)
137
+ )
138
+ end
139
+
140
+ def move_text_position_and_set_leading(*args) # TD
141
+ x, y, _ = *args
142
+ call_wrapped(
143
+ :move_text_position_and_set_leading,
144
+ TypeCheck.cast_to_numeric!(x),
145
+ TypeCheck.cast_to_numeric!(y)
146
+ )
147
+ end
148
+
149
+ def set_text_matrix_and_text_line_matrix(*args) # Tm
150
+ a, b, c, d, e, f = *args
151
+ call_wrapped(
152
+ :set_text_matrix_and_text_line_matrix,
153
+ TypeCheck.cast_to_numeric!(a),
154
+ TypeCheck.cast_to_numeric!(b),
155
+ TypeCheck.cast_to_numeric!(c),
156
+ TypeCheck.cast_to_numeric!(d),
157
+ TypeCheck.cast_to_numeric!(e),
158
+ TypeCheck.cast_to_numeric!(f),
159
+ )
160
+ end
161
+
162
+ def move_to_start_of_next_line(*args) # T*
163
+ call_wrapped(:move_to_start_of_next_line)
164
+ end
165
+
166
+ #####################################################
167
+ # Text Showing Operators
168
+ #####################################################
169
+ def show_text(*args) # Tj (AWAY)
170
+ string, _ = *args
171
+ call_wrapped(
172
+ :show_text,
173
+ TypeCheck.cast_to_string!(string)
174
+ )
175
+ end
176
+
177
+ def show_text_with_positioning(*args) # TJ [(A) 120 (WA) 20 (Y)]
178
+ params, _ = *args
179
+ unless params.is_a?(Array)
180
+ raise MalformedPDFError, "TJ operator expects a single Array argument"
181
+ end
182
+
183
+ call_wrapped(
184
+ :show_text_with_positioning,
185
+ params
186
+ )
187
+ end
188
+
189
+ def move_to_next_line_and_show_text(*args) # '
190
+ string, _ = *args
191
+ call_wrapped(
192
+ :move_to_next_line_and_show_text,
193
+ TypeCheck.cast_to_string!(string)
194
+ )
195
+ end
196
+
197
+ def set_spacing_next_line_show_text(*args) # "
198
+ aw, ac, string = *args
199
+ call_wrapped(
200
+ :set_spacing_next_line_show_text,
201
+ TypeCheck.cast_to_numeric!(aw),
202
+ TypeCheck.cast_to_numeric!(ac),
203
+ TypeCheck.cast_to_string!(string)
204
+ )
205
+ end
206
+
207
+ #####################################################
208
+ # Form XObject Operators
209
+ #####################################################
210
+
211
+ def invoke_xobject(*args)
212
+ label, _ = *args
213
+
214
+ call_wrapped(
215
+ :invoke_xobject,
216
+ TypeCheck.cast_to_symbol(label)
217
+ )
218
+ end
219
+
220
+ #####################################################
221
+ # Inline Image Operators
222
+ #####################################################
223
+
224
+ def begin_inline_image(*args)
225
+ call_wrapped(:begin_inline_image)
226
+ end
227
+
228
+ def begin_inline_image_data(*args)
229
+ # We can't use call_wrapped() here because sorbet won't allow splat args with a dynamic
230
+ # number of elements
231
+ @wrapped.begin_inline_image_data(*args) if @wrapped.respond_to?(:begin_inline_image_data)
232
+ end
233
+
234
+ def end_inline_image(*args)
235
+ data, _ = *args
236
+
237
+ call_wrapped(
238
+ :end_inline_image,
239
+ TypeCheck.cast_to_string!(data)
240
+ )
241
+ end
242
+
243
+ #####################################################
244
+ # Final safety net for any operators that don't have type checking enabled yet
245
+ #####################################################
246
+
247
+ def respond_to?(meth)
248
+ @wrapped.respond_to?(meth)
249
+ end
250
+
251
+ def method_missing(methodname, *args)
252
+ @wrapped.send(methodname, *args)
253
+ end
254
+
255
+ private
256
+
257
+ def call_wrapped(methodname, *args)
258
+ @wrapped.send(methodname, *args) if @wrapped.respond_to?(methodname)
259
+ end
260
+ end
261
+ end
262
+ end
@@ -30,7 +30,7 @@ class PDF::Reader
30
30
 
31
31
  # in ruby a negative index is valid, and will go from the end of the array
32
32
  # which is undesirable in this case.
33
- if @font.first_char <= code_point
33
+ if @font.first_char && @font.first_char <= code_point
34
34
  @font.widths.fetch(code_point - @font.first_char, @missing_width).to_f
35
35
  else
36
36
  @missing_width.to_f
@@ -104,13 +104,18 @@ class PDF::Reader
104
104
  buf = new_buffer(offset)
105
105
  tok_one = buf.token
106
106
 
107
+ # we have a traditional xref table
107
108
  return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"
108
109
 
109
110
  tok_two = buf.token
110
111
  tok_three = buf.token
111
112
 
113
+ # we have an XRef stream
112
114
  if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
113
115
  buf = new_buffer(offset)
116
+ # Maybe we should be passing the ObjectHash as the second argument to the Parser here,
117
+ # to handle the case where an XRef Stream has the Length specified via an
118
+ # indirect object
114
119
  stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
115
120
  return load_xref_stream(stream)
116
121
  end
@@ -126,6 +131,10 @@ class PDF::Reader
126
131
 
127
132
  while !params.include?("trailer") && !params.include?(nil)
128
133
  if params.size == 2
134
+ unless params[0].to_s.match(/\A\d+\z/)
135
+ raise MalformedPDFError, "invalid xref table, expected object ID"
136
+ end
137
+
129
138
  objid, count = params[0].to_i, params[1].to_i
130
139
  count.times do
131
140
  offset = buf.token.to_i
@@ -143,7 +152,7 @@ class PDF::Reader
143
152
  params << buf.token
144
153
  end
145
154
 
146
- trailer = Parser.new(buf, self).parse_token
155
+ trailer = Parser.new(buf).parse_token
147
156
 
148
157
  unless trailer.kind_of?(Hash)
149
158
  raise MalformedPDFError, "PDF malformed, trailer should be a dictionary"
@@ -168,8 +177,16 @@ class PDF::Reader
168
177
  [:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
169
178
  }]
170
179
 
171
- widths = stream.hash[:W]
172
- entry_length = widths.inject(0) { |s, w| s + w }
180
+ widths = stream.hash[:W]
181
+
182
+ PDF::Reader::Error.validate_type_as_malformed(widths, "xref stream widths", Array)
183
+
184
+ entry_length = widths.inject(0) { |s, w|
185
+ unless w.is_a?(Integer)
186
+ w = 0
187
+ end
188
+ s + w
189
+ }
173
190
  raw_data = StringIO.new(stream.unfiltered_data)
174
191
  if stream.hash[:Index]
175
192
  index = stream.hash[:Index]
data/lib/pdf/reader.rb CHANGED
@@ -112,19 +112,27 @@ module PDF
112
112
  #
113
113
  # reader = PDF::Reader.new("somefile.pdf", :password => "apples")
114
114
  #
115
+ # Using this method directly is supported, but it's more common to use
116
+ # `PDF::Reader.open`
117
+ #
115
118
  def initialize(input, opts = {})
116
119
  @cache = PDF::Reader::ObjectCache.new
117
120
  opts.merge!(:cache => @cache)
118
121
  @objects = PDF::Reader::ObjectHash.new(input, opts)
119
122
  end
120
123
 
124
+ # Return a Hash with some basic information about the PDF file
125
+ #
121
126
  def info
122
- dict = @objects.deref(@objects.trailer[:Info])
127
+ dict = @objects.deref_hash(@objects.trailer[:Info]) || {}
123
128
  doc_strings_to_utf8(dict)
124
129
  end
125
130
 
131
+ # Return a Hash with extra metadata provided by the author of the PDF file. Not
132
+ # always present.
133
+ #
126
134
  def metadata
127
- stream = @objects.deref(root[:Metadata])
135
+ stream = @objects.deref_stream(root[:Metadata])
128
136
  if stream.nil?
129
137
  nil
130
138
  else
@@ -134,20 +142,24 @@ module PDF
134
142
  end
135
143
  end
136
144
 
145
+ # The number of pages in this PDF
146
+ #
137
147
  def page_count
138
- pages = @objects.deref(root[:Pages])
148
+ pages = @objects.deref_hash(root[:Pages])
139
149
  unless pages.kind_of?(::Hash)
140
150
  raise MalformedPDFError, "Pages structure is missing #{pages.class}"
141
151
  end
142
- @page_count ||= @objects.deref(pages[:Count])
152
+ @page_count ||= @objects.deref_integer(pages[:Count]) || 0
143
153
  end
144
154
 
155
+ # The PDF version this file uses
156
+ #
145
157
  def pdf_version
146
158
  @objects.pdf_version
147
159
  end
148
160
 
149
- # syntactic sugar for opening a PDF file. Accepts the same arguments
150
- # as new().
161
+ # syntactic sugar for opening a PDF file and the most common approach. Accepts the
162
+ # same arguments as new().
151
163
  #
152
164
  # PDF::Reader.open("somefile.pdf") do |reader|
153
165
  # puts reader.pdf_version
@@ -178,6 +190,8 @@ module PDF
178
190
  # methods available on each page
179
191
  #
180
192
  def pages
193
+ return [] if page_count <= 0
194
+
181
195
  (1..self.page_count).map do |num|
182
196
  begin
183
197
  PDF::Reader::Page.new(@objects, num, :cache => @cache)
@@ -228,7 +242,7 @@ module PDF
228
242
  pdfdoc_to_utf8(obj)
229
243
  end
230
244
  else
231
- @objects.deref(obj)
245
+ obj
232
246
  end
233
247
  end
234
248
 
@@ -259,7 +273,7 @@ module PDF
259
273
 
260
274
  def root
261
275
  @root ||= begin
262
- obj = @objects.deref(@objects.trailer[:Root])
276
+ obj = @objects.deref_hash(@objects.trailer[:Root]) || {}
263
277
  unless obj.kind_of?(::Hash)
264
278
  raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
265
279
  end
@@ -271,8 +285,9 @@ module PDF
271
285
  end
272
286
  ################################################################################
273
287
 
274
- require 'pdf/reader/resource_methods'
288
+ require 'pdf/reader/resources'
275
289
  require 'pdf/reader/buffer'
290
+ require 'pdf/reader/bounding_rectangle_runs_filter'
276
291
  require 'pdf/reader/cid_widths'
277
292
  require 'pdf/reader/cmap'
278
293
  require 'pdf/reader/encoding'
@@ -301,13 +316,19 @@ require 'pdf/reader/rectangle'
301
316
  require 'pdf/reader/reference'
302
317
  require 'pdf/reader/register_receiver'
303
318
  require 'pdf/reader/null_security_handler'
304
- require 'pdf/reader/standard_security_handler'
305
- require 'pdf/reader/standard_security_handler_v5'
319
+ require 'pdf/reader/security_handler_factory'
320
+ require 'pdf/reader/standard_key_builder'
321
+ require 'pdf/reader/key_builder_v5'
322
+ require 'pdf/reader/aes_v2_security_handler'
323
+ require 'pdf/reader/aes_v3_security_handler'
324
+ require 'pdf/reader/rc4_security_handler'
306
325
  require 'pdf/reader/unimplemented_security_handler'
307
326
  require 'pdf/reader/stream'
308
327
  require 'pdf/reader/text_run'
328
+ require 'pdf/reader/type_check'
309
329
  require 'pdf/reader/page_state'
310
330
  require 'pdf/reader/page_text_receiver'
311
331
  require 'pdf/reader/token'
312
332
  require 'pdf/reader/xref'
313
333
  require 'pdf/reader/page'
334
+ require 'pdf/reader/validating_receiver'