pdf-reader 2.9.2 → 2.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +39 -0
  3. data/README.md +33 -33
  4. data/Rakefile +2 -2
  5. data/lib/pdf/reader/advanced_text_run_filter.rb +152 -0
  6. data/lib/pdf/reader/aes_v2_security_handler.rb +30 -0
  7. data/lib/pdf/reader/aes_v3_security_handler.rb +35 -3
  8. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +1 -0
  9. data/lib/pdf/reader/buffer.rb +39 -22
  10. data/lib/pdf/reader/cid_widths.rb +14 -6
  11. data/lib/pdf/reader/cmap.rb +16 -5
  12. data/lib/pdf/reader/encoding.rb +42 -18
  13. data/lib/pdf/reader/error.rb +6 -4
  14. data/lib/pdf/reader/filter/ascii85.rb +2 -0
  15. data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
  16. data/lib/pdf/reader/filter/depredict.rb +6 -2
  17. data/lib/pdf/reader/filter/flate.rb +5 -2
  18. data/lib/pdf/reader/filter/lzw.rb +2 -0
  19. data/lib/pdf/reader/filter/null.rb +2 -0
  20. data/lib/pdf/reader/filter/run_length.rb +2 -0
  21. data/lib/pdf/reader/filter.rb +1 -0
  22. data/lib/pdf/reader/font.rb +99 -32
  23. data/lib/pdf/reader/font_descriptor.rb +79 -24
  24. data/lib/pdf/reader/form_xobject.rb +15 -1
  25. data/lib/pdf/reader/glyph_hash.rb +41 -8
  26. data/lib/pdf/reader/key_builder_v5.rb +17 -9
  27. data/lib/pdf/reader/lzw.rb +42 -16
  28. data/lib/pdf/reader/no_text_filter.rb +15 -0
  29. data/lib/pdf/reader/null_security_handler.rb +1 -0
  30. data/lib/pdf/reader/object_cache.rb +7 -2
  31. data/lib/pdf/reader/object_hash.rb +129 -16
  32. data/lib/pdf/reader/object_stream.rb +22 -5
  33. data/lib/pdf/reader/overlapping_runs_filter.rb +8 -2
  34. data/lib/pdf/reader/page.rb +66 -13
  35. data/lib/pdf/reader/page_layout.rb +26 -9
  36. data/lib/pdf/reader/page_state.rb +12 -3
  37. data/lib/pdf/reader/page_text_receiver.rb +16 -2
  38. data/lib/pdf/reader/pages_strategy.rb +1 -1
  39. data/lib/pdf/reader/parser.rb +52 -13
  40. data/lib/pdf/reader/point.rb +9 -2
  41. data/lib/pdf/reader/print_receiver.rb +2 -6
  42. data/lib/pdf/reader/rc4_security_handler.rb +2 -0
  43. data/lib/pdf/reader/rectangle.rb +24 -1
  44. data/lib/pdf/reader/reference.rb +13 -3
  45. data/lib/pdf/reader/register_receiver.rb +15 -2
  46. data/lib/pdf/reader/resources.rb +12 -2
  47. data/lib/pdf/reader/security_handler_factory.rb +13 -0
  48. data/lib/pdf/reader/standard_key_builder.rb +37 -23
  49. data/lib/pdf/reader/stream.rb +9 -3
  50. data/lib/pdf/reader/synchronized_cache.rb +6 -3
  51. data/lib/pdf/reader/text_run.rb +33 -3
  52. data/lib/pdf/reader/token.rb +1 -0
  53. data/lib/pdf/reader/transformation_matrix.rb +41 -10
  54. data/lib/pdf/reader/type_check.rb +53 -0
  55. data/lib/pdf/reader/unimplemented_security_handler.rb +2 -0
  56. data/lib/pdf/reader/validating_receiver.rb +29 -0
  57. data/lib/pdf/reader/width_calculator/built_in.rb +13 -5
  58. data/lib/pdf/reader/width_calculator/composite.rb +11 -3
  59. data/lib/pdf/reader/width_calculator/true_type.rb +14 -12
  60. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +8 -5
  61. data/lib/pdf/reader/width_calculator/type_zero.rb +8 -3
  62. data/lib/pdf/reader/xref.rb +31 -10
  63. data/lib/pdf/reader/zero_width_runs_filter.rb +1 -0
  64. data/lib/pdf/reader.rb +24 -12
  65. data/rbi/pdf-reader.rbi +1504 -1480
  66. metadata +34 -17
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -46,8 +46,11 @@ class PDF::Reader
46
46
  # the Enumerable mixin. The key difference is no []= method - the hash
47
47
  # is read only.
48
48
  #
49
+ #: [Elem]
49
50
  class XRef
50
51
  include Enumerable
52
+
53
+ #: Hash[Symbol, untyped]
51
54
  attr_reader :trailer
52
55
 
53
56
  ################################################################################
@@ -55,16 +58,19 @@ class PDF::Reader
55
58
  #
56
59
  # io - must be an IO object, generally either a file or a StringIO
57
60
  #
61
+ #: (IO | Tempfile | StringIO) -> void
58
62
  def initialize(io)
59
63
  @io = io
60
- @junk_offset = calc_junk_offset(io) || 0
61
- @xref = {}
62
- @trailer = load_offsets
64
+ @junk_offset = calc_junk_offset(io) || 0 #: Integer
65
+ @xref = {} #: Hash[Integer, Hash[Integer, Integer | PDF::Reader::Reference]]
66
+ @trailer = load_offsets #: Hash[Symbol, untyped]
63
67
  end
64
68
 
65
69
  ################################################################################
66
70
  # return the number of objects in this file. Objects with multiple generations are
67
71
  # only counted once.
72
+ #
73
+ #: () -> untyped
68
74
  def size
69
75
  @xref.size
70
76
  end
@@ -72,18 +78,22 @@ class PDF::Reader
72
78
  # returns the byte offset for the specified PDF object.
73
79
  #
74
80
  # ref - a PDF::Reader::Reference object containing an object ID and revision number
81
+ #: (untyped) -> untyped
75
82
  def [](ref)
76
- @xref[ref.id][ref.gen]
83
+ @xref.fetch(ref.id, {}).fetch(ref.gen)
77
84
  rescue
78
85
  raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
79
86
  end
80
87
  ################################################################################
81
88
  # iterate over each object in the xref table
89
+ #
90
+ # @override(allow_incompatible: true)
91
+ #: () { (PDF::Reader::Reference) -> untyped } -> void
82
92
  def each(&block)
83
93
  ids = @xref.keys.sort
84
94
  ids.each do |id|
85
- gen = @xref[id].keys.sort[-1]
86
- yield PDF::Reader::Reference.new(id, gen)
95
+ gen = @xref.fetch(id, {}).keys.sort[-1]
96
+ yield PDF::Reader::Reference.new(id, gen.to_i)
87
97
  end
88
98
  end
89
99
  ################################################################################
@@ -97,6 +107,7 @@ class PDF::Reader
97
107
  # After seeking to the offset, processing is handed off to either load_xref_table()
98
108
  # or load_xref_stream() based on what we find there.
99
109
  #
110
+ #: (?Integer?) -> Hash[Symbol, untyped]
100
111
  def load_offsets(offset = nil)
101
112
  offset ||= new_buffer.find_first_xref_offset
102
113
  offset += @junk_offset
@@ -117,7 +128,9 @@ class PDF::Reader
117
128
  # to handle the case where an XRef Stream has the Length specified via an
118
129
  # indirect object
119
130
  stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
120
- return load_xref_stream(stream)
131
+ if stream.is_a?(PDF::Reader::Stream)
132
+ return load_xref_stream(stream)
133
+ end
121
134
  end
122
135
 
123
136
  raise PDF::Reader::MalformedPDFError,
@@ -126,6 +139,8 @@ class PDF::Reader
126
139
  ################################################################################
127
140
  # Assumes the underlying buffer is positioned at the start of a traditional
128
141
  # Xref table and processes it into memory.
142
+ #
143
+ #: (PDF::Reader::Buffer) -> Hash[Symbol, untyped]
129
144
  def load_xref_table(buf)
130
145
  params = []
131
146
 
@@ -169,8 +184,9 @@ class PDF::Reader
169
184
  ################################################################################
170
185
  # Read an XRef stream from the underlying buffer instead of a traditional xref table.
171
186
  #
187
+ #: (PDF::Reader::Stream) -> Hash[Symbol, untyped]
172
188
  def load_xref_stream(stream)
173
- unless stream.is_a?(PDF::Reader::Stream) && stream.hash[:Type] == :XRef
189
+ unless stream.hash[:Type] == :XRef
174
190
  raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
175
191
  end
176
192
  trailer = Hash[stream.hash.select { |key, value|
@@ -216,8 +232,9 @@ class PDF::Reader
216
232
  # XRef streams pack info into integers 1-N bytes wide. Depending on the number of
217
233
  # bytes they need to be converted to an int in different ways.
218
234
  #
235
+ #: (String?) -> Integer
219
236
  def unpack_bytes(bytes)
220
- if bytes.to_s.size == 0
237
+ res = if bytes.nil? || bytes == ""
221
238
  0
222
239
  elsif bytes.size == 1
223
240
  bytes.unpack("C")[0]
@@ -232,6 +249,7 @@ class PDF::Reader
232
249
  else
233
250
  raise UnsupportedFeatureError, "Unable to unpack xref stream entries of #{bytes.size} bytes"
234
251
  end
252
+ TypeCheck.cast_to_int!(res)
235
253
  end
236
254
  ################################################################################
237
255
  # Wrap the io stream we're working with in a buffer that can tokenise it for us.
@@ -239,12 +257,14 @@ class PDF::Reader
239
257
  # We create multiple buffers so we can be tokenising multiple sections of the file
240
258
  # at the same time without worrying about clearing the buffers contents.
241
259
  #
260
+ #: (?Integer) -> PDF::Reader::Buffer
242
261
  def new_buffer(offset = 0)
243
262
  PDF::Reader::Buffer.new(@io, :seek => offset)
244
263
  end
245
264
  ################################################################################
246
265
  # Stores an offset value for a particular PDF object ID and revision number
247
266
  #
267
+ #: (Integer, Integer, Integer | PDF::Reader::Reference) -> (Integer | PDF::Reader::Reference)
248
268
  def store(id, gen, offset)
249
269
  (@xref[id] ||= {})[gen] ||= offset
250
270
  end
@@ -258,6 +278,7 @@ class PDF::Reader
258
278
  # Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
259
279
  # header appear somewhere within the first 1024 bytes of the file
260
280
  #
281
+ #: (IO | Tempfile | StringIO) -> Integer?
261
282
  def calc_junk_offset(io)
262
283
  io.rewind
263
284
  offset = io.pos
@@ -6,6 +6,7 @@ class PDF::Reader
6
6
  # There's no point rendering zero-width characters
7
7
  class ZeroWidthRunsFilter
8
8
 
9
+ #: (Array[PDF::Reader::TextRun]) -> Array[PDF::Reader::TextRun]
9
10
  def self.exclude_zero_width_runs(runs)
10
11
  runs.reject { |run| run.width == 0 }
11
12
  end
data/lib/pdf/reader.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -95,6 +95,7 @@ module PDF
95
95
  class Reader
96
96
 
97
97
  # lowlevel hash-like access to all objects in the underlying PDF
98
+ #: PDF::Reader::ObjectHash
98
99
  attr_reader :objects
99
100
 
100
101
  # creates a new document reader for the provided PDF.
@@ -115,22 +116,27 @@ module PDF
115
116
  # Using this method directly is supported, but it's more common to use
116
117
  # `PDF::Reader.open`
117
118
  #
119
+ #: (String | Tempfile | IO | StringIO, ?Hash[untyped, untyped]) -> void
118
120
  def initialize(input, opts = {})
119
- @cache = PDF::Reader::ObjectCache.new
121
+ @cache = PDF::Reader::ObjectCache.new #: PDF::Reader::ObjectCache
120
122
  opts.merge!(:cache => @cache)
121
- @objects = PDF::Reader::ObjectHash.new(input, opts)
123
+ @objects = PDF::Reader::ObjectHash.new(input, opts) #: PDF::Reader::ObjectHash
124
+ @page_count = nil #: Integer | nil
125
+ @root = nil #: Hash[Symbol, untyped] | nil
122
126
  end
123
127
 
124
128
  # Return a Hash with some basic information about the PDF file
125
129
  #
130
+ #: () -> Hash[untyped, untyped]?
126
131
  def info
127
132
  dict = @objects.deref_hash(@objects.trailer[:Info]) || {}
128
133
  doc_strings_to_utf8(dict)
129
134
  end
130
135
 
131
- # Return a Hash with extra metadata provided by the author of the PDF file. Not
136
+ # Return a String with extra XML metadata provided by the author of the PDF file. Not
132
137
  # always present.
133
138
  #
139
+ #: () -> String?
134
140
  def metadata
135
141
  stream = @objects.deref_stream(root[:Metadata])
136
142
  if stream.nil?
@@ -144,6 +150,7 @@ module PDF
144
150
 
145
151
  # The number of pages in this PDF
146
152
  #
153
+ #: () -> Integer
147
154
  def page_count
148
155
  pages = @objects.deref_hash(root[:Pages])
149
156
  unless pages.kind_of?(::Hash)
@@ -154,6 +161,7 @@ module PDF
154
161
 
155
162
  # The PDF version this file uses
156
163
  #
164
+ #: () -> Float
157
165
  def pdf_version
158
166
  @objects.pdf_version
159
167
  end
@@ -171,6 +179,7 @@ module PDF
171
179
  # puts reader.pdf_version
172
180
  # end
173
181
  #
182
+ #: (String | Tempfile | IO, ?Hash[untyped, untyped]) { (PDF::Reader) -> void } -> untyped
174
183
  def self.open(input, opts = {}, &block)
175
184
  yield PDF::Reader.new(input, opts)
176
185
  end
@@ -182,13 +191,14 @@ module PDF
182
191
  #
183
192
  # reader.pages.each do |page|
184
193
  # puts page.fonts
185
- # puts page.images
194
+ # puts page.rectangles
186
195
  # puts page.text
187
196
  # end
188
197
  #
189
198
  # See the docs for PDF::Reader::Page to read more about the
190
199
  # methods available on each page
191
200
  #
201
+ #: () -> Array[PDF::Reader::Page]
192
202
  def pages
193
203
  return [] if page_count <= 0
194
204
 
@@ -213,6 +223,7 @@ module PDF
213
223
  # See the docs for PDF::Reader::Page to read more about the
214
224
  # methods available on each page
215
225
  #
226
+ #: (Integer) -> PDF::Reader::Page
216
227
  def page(num)
217
228
  num = num.to_i
218
229
  if num < 1 || num > self.page_count
@@ -225,6 +236,7 @@ module PDF
225
236
 
226
237
  # recursively convert strings from outside a content stream into UTF-8
227
238
  #
239
+ #: (untyped) -> untyped
228
240
  def doc_strings_to_utf8(obj)
229
241
  case obj
230
242
  when ::Hash then
@@ -246,6 +258,7 @@ module PDF
246
258
  end
247
259
  end
248
260
 
261
+ #: (String) -> bool
249
262
  def has_utf16_bom?(str)
250
263
  first_bytes = str[0,2]
251
264
 
@@ -256,6 +269,7 @@ module PDF
256
269
 
257
270
  # TODO find a PDF I can use to spec this behaviour
258
271
  #
272
+ #: (String) -> String
259
273
  def pdfdoc_to_utf8(obj)
260
274
  obj.force_encoding("utf-8")
261
275
  obj
@@ -264,6 +278,7 @@ module PDF
264
278
  # one day we'll all run on a 1.9 compatible VM and I can just do this with
265
279
  # String#encode
266
280
  #
281
+ #: (String) -> String
267
282
  def utf16_to_utf8(obj)
268
283
  str = obj[2, obj.size].to_s
269
284
  str = str.unpack("n*").pack("U*")
@@ -271,14 +286,9 @@ module PDF
271
286
  str
272
287
  end
273
288
 
289
+ #: () -> Hash[Symbol, untyped]
274
290
  def root
275
- @root ||= begin
276
- obj = @objects.deref_hash(@objects.trailer[:Root]) || {}
277
- unless obj.kind_of?(::Hash)
278
- raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
279
- end
280
- obj
281
- end
291
+ @root ||= @objects.deref_hash(@objects.trailer[:Root]) || {}
282
292
  end
283
293
 
284
294
  end
@@ -286,6 +296,7 @@ end
286
296
  ################################################################################
287
297
 
288
298
  require 'pdf/reader/resources'
299
+ require 'pdf/reader/advanced_text_run_filter'
289
300
  require 'pdf/reader/buffer'
290
301
  require 'pdf/reader/bounding_rectangle_runs_filter'
291
302
  require 'pdf/reader/cid_widths'
@@ -315,6 +326,7 @@ require 'pdf/reader/print_receiver'
315
326
  require 'pdf/reader/rectangle'
316
327
  require 'pdf/reader/reference'
317
328
  require 'pdf/reader/register_receiver'
329
+ require 'pdf/reader/no_text_filter'
318
330
  require 'pdf/reader/null_security_handler'
319
331
  require 'pdf/reader/security_handler_factory'
320
332
  require 'pdf/reader/standard_key_builder'