pdf-reader 2.14.1 → 2.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +19 -0
  3. data/lib/pdf/reader/advanced_text_run_filter.rb +17 -2
  4. data/lib/pdf/reader/aes_v2_security_handler.rb +30 -0
  5. data/lib/pdf/reader/aes_v3_security_handler.rb +35 -3
  6. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +1 -0
  7. data/lib/pdf/reader/buffer.rb +35 -17
  8. data/lib/pdf/reader/cid_widths.rb +7 -1
  9. data/lib/pdf/reader/cmap.rb +22 -6
  10. data/lib/pdf/reader/encoding.rb +37 -12
  11. data/lib/pdf/reader/error.rb +6 -0
  12. data/lib/pdf/reader/filter/ascii85.rb +2 -0
  13. data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
  14. data/lib/pdf/reader/filter/depredict.rb +4 -0
  15. data/lib/pdf/reader/filter/flate.rb +5 -2
  16. data/lib/pdf/reader/filter/lzw.rb +2 -0
  17. data/lib/pdf/reader/filter/null.rb +2 -0
  18. data/lib/pdf/reader/filter/run_length.rb +2 -0
  19. data/lib/pdf/reader/filter.rb +1 -0
  20. data/lib/pdf/reader/font.rb +101 -25
  21. data/lib/pdf/reader/font_descriptor.rb +76 -23
  22. data/lib/pdf/reader/form_xobject.rb +11 -0
  23. data/lib/pdf/reader/glyph_hash.rb +34 -9
  24. data/lib/pdf/reader/key_builder_v5.rb +17 -9
  25. data/lib/pdf/reader/lzw.rb +17 -6
  26. data/lib/pdf/reader/no_text_filter.rb +1 -0
  27. data/lib/pdf/reader/null_security_handler.rb +1 -0
  28. data/lib/pdf/reader/object_cache.rb +7 -2
  29. data/lib/pdf/reader/object_hash.rb +116 -9
  30. data/lib/pdf/reader/object_stream.rb +19 -2
  31. data/lib/pdf/reader/overlapping_runs_filter.rb +7 -1
  32. data/lib/pdf/reader/page.rb +41 -7
  33. data/lib/pdf/reader/page_layout.rb +25 -8
  34. data/lib/pdf/reader/page_state.rb +5 -2
  35. data/lib/pdf/reader/page_text_receiver.rb +6 -2
  36. data/lib/pdf/reader/pages_strategy.rb +1 -1
  37. data/lib/pdf/reader/parser.rb +51 -10
  38. data/lib/pdf/reader/point.rb +9 -2
  39. data/lib/pdf/reader/print_receiver.rb +2 -6
  40. data/lib/pdf/reader/rc4_security_handler.rb +2 -0
  41. data/lib/pdf/reader/rectangle.rb +24 -1
  42. data/lib/pdf/reader/reference.rb +10 -1
  43. data/lib/pdf/reader/register_receiver.rb +15 -2
  44. data/lib/pdf/reader/resources.rb +9 -0
  45. data/lib/pdf/reader/security_handler_factory.rb +13 -0
  46. data/lib/pdf/reader/standard_key_builder.rb +37 -23
  47. data/lib/pdf/reader/stream.rb +9 -3
  48. data/lib/pdf/reader/synchronized_cache.rb +5 -2
  49. data/lib/pdf/reader/text_run.rb +28 -1
  50. data/lib/pdf/reader/token.rb +1 -0
  51. data/lib/pdf/reader/transformation_matrix.rb +33 -2
  52. data/lib/pdf/reader/type_check.rb +10 -3
  53. data/lib/pdf/reader/unimplemented_security_handler.rb +2 -0
  54. data/lib/pdf/reader/validating_receiver.rb +29 -0
  55. data/lib/pdf/reader/width_calculator/built_in.rb +10 -3
  56. data/lib/pdf/reader/width_calculator/composite.rb +5 -1
  57. data/lib/pdf/reader/width_calculator/true_type.rb +5 -1
  58. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +3 -1
  59. data/lib/pdf/reader/width_calculator/type_zero.rb +2 -0
  60. data/lib/pdf/reader/xref.rb +28 -7
  61. data/lib/pdf/reader/zero_width_runs_filter.rb +1 -0
  62. data/lib/pdf/reader.rb +18 -2
  63. data/rbi/pdf-reader.rbi +1511 -1594
  64. metadata +17 -11
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 38765d176ae7b8f4cff7ea6f10fff00b811f6812629d76a2b966f36139c23188
4
- data.tar.gz: a406d525e4fccb84cc9e86b28aab06a12854c6f0f297a1a479d26b3f845267f6
3
+ metadata.gz: 680d773fb89a823854ce986d7e35c5313df55087d0b4e8bfc3c70c51d97a8130
4
+ data.tar.gz: 7484cc4e28a01b9a74b869c2dede32a47bfd2519cded7ff6fc421c99d546a406
5
5
  SHA512:
6
- metadata.gz: 010c16b1528d4c46d0175737c9694e2e326092b5e7091cbdd0e0ca41567e662b1adabe989c33b0b919a021bee9f985fa4f2862058bd144762c090e718b3089cc
7
- data.tar.gz: 996fe5b0761280edd67c5523d00c04519b7c682c5ededd86d8dfd412df6e11d554d162ab5b4eb231709f4d3013c5963129b32358ef0b49a4521e8ba72dcf490b
6
+ metadata.gz: e9b2ad3cfb37fb76f731d646bf5097e0334d88b3f7e5ba39abfe9530e51a7dbcfcf08a75d81b6cc1e17f2b18015a25f3ef9ef0d9b75b0d5763eb38470263ee49
7
+ data.tar.gz: 7acdc84e89045708ac4c983deefa2bb1a6246f1a57d8d9a809efeb3b1477d69921238067b797f78d2b7a33eff9bd54c03e139fd4bed10102f365a4d48e8899a0
data/CHANGELOG CHANGED
@@ -1,3 +1,22 @@
1
+ v2.15.1 (28th December 2025)
2
+ - Add ruby 4.0 to the CI matrix (https://github.com/yob/pdf-reader/pull/575)
3
+ - Avoid raising an error when ToUnicode points to the wrong object type (https://github.com/yob/pdf-reader/pull/573)
4
+ - Skip invalid UTF-16 surrogate pairs in CMaps (https://github.com/yob/pdf-reader/pull/574)
5
+
6
+ v2.15.0 (13th August 2025)
7
+ - Overhaul sorbet types, moving from an external RBI file to inline comments in RBS syntax
8
+ - multiple PRs, but mainly https://github.com/yob/pdf-reader/pull/562
9
+ - See https://railsatscale.com/2025-04-23-rbs-support-for-sorbet/
10
+ - No impact expected for most users, but projects that use sorbet may find subtle changes in
11
+ the RBI file that is shipped with the gem
12
+ - Relax version requirements for dependency `afm`, allow 1.x (https://github.com/yob/pdf-reader/pull/557)
13
+ - Improve text positioning logic in some PDFs (https://github.com/yob/pdf-reader/pull/554)
14
+ - Multiple fixes for encrypted files
15
+ - Some files with passwords > 32 bytes long (https://github.com/yob/pdf-reader/pull/555)
16
+ - Some files that contain cipher text with a 16 byte IV and no further blocks (https://github.com/yob/pdf-reader/pull/561)
17
+ - Some files that encrypted data with no padding (https://github.com/yob/pdf-reader/pull/564)
18
+ - Add jruby 10 to CI matrix (https://github.com/yob/pdf-reader/pull/552)
19
+
1
20
  v2.14.1 (4th February 2025)
2
21
  - Fix issue in RBI signatures, introduced in v2.14.0 (https://github.com/yob/pdf-reader/pull/550)
3
22
 
@@ -46,28 +46,37 @@ class PDF::Reader
46
46
  less_than_or_equal
47
47
  include
48
48
  exclude
49
- ]
49
+ ] #: Array[Symbol]
50
50
 
51
+ #: (Array[PDF::Reader::TextRun], Hash[Symbol, untyped]) -> Array[PDF::Reader::TextRun]
51
52
  def self.only(text_runs, filter_hash)
52
53
  new(text_runs, filter_hash).only
53
54
  end
54
55
 
56
+ #: (Array[PDF::Reader::TextRun], Hash[Symbol, untyped]) -> Array[PDF::Reader::TextRun]
55
57
  def self.exclude(text_runs, filter_hash)
56
58
  new(text_runs, filter_hash).exclude
57
59
  end
58
60
 
59
- attr_reader :text_runs, :filter_hash
61
+ #: Array[PDF::Reader::TextRun]
62
+ attr_reader :text_runs
60
63
 
64
+ #: Hash[Symbol, untyped]
65
+ attr_reader :filter_hash
66
+
67
+ #: (Array[PDF::Reader::TextRun], Hash[Symbol, untyped]) -> void
61
68
  def initialize(text_runs, filter_hash)
62
69
  @text_runs = text_runs
63
70
  @filter_hash = filter_hash
64
71
  end
65
72
 
73
+ #: () -> Array[PDF::Reader::TextRun]
66
74
  def only
67
75
  return text_runs if filter_hash.empty?
68
76
  text_runs.select { |text_run| evaluate_filter(text_run) }
69
77
  end
70
78
 
79
+ #: () -> Array[PDF::Reader::TextRun]
71
80
  def exclude
72
81
  return text_runs if filter_hash.empty?
73
82
  text_runs.reject { |text_run| evaluate_filter(text_run) }
@@ -75,6 +84,7 @@ class PDF::Reader
75
84
 
76
85
  private
77
86
 
87
+ #: (PDF::Reader::TextRun) -> bool
78
88
  def evaluate_filter(text_run)
79
89
  if filter_hash[:or]
80
90
  evaluate_or_filters(text_run, filter_hash[:or])
@@ -85,24 +95,28 @@ class PDF::Reader
85
95
  end
86
96
  end
87
97
 
98
+ #: (PDF::Reader::TextRun, Array[Hash[Symbol, untyped]]) -> bool
88
99
  def evaluate_or_filters(text_run, conditions)
89
100
  conditions.any? do |condition|
90
101
  evaluate_filters(text_run, condition)
91
102
  end
92
103
  end
93
104
 
105
+ #: (PDF::Reader::TextRun, Array[Hash[Symbol, untyped]]) -> bool
94
106
  def evaluate_and_filters(text_run, conditions)
95
107
  conditions.all? do |condition|
96
108
  evaluate_filters(text_run, condition)
97
109
  end
98
110
  end
99
111
 
112
+ #: (PDF::Reader::TextRun, Hash[Symbol, untyped]) -> bool
100
113
  def evaluate_filters(text_run, filter_hash)
101
114
  filter_hash.all? do |attribute, conditions|
102
115
  evaluate_attribute_conditions(text_run, attribute, conditions)
103
116
  end
104
117
  end
105
118
 
119
+ #: (PDF::Reader::TextRun, Symbol, Hash[Symbol, untyped]) -> bool
106
120
  def evaluate_attribute_conditions(text_run, attribute, conditions)
107
121
  conditions.all? do |operator, value|
108
122
  unless VALID_OPERATORS.include?(operator)
@@ -113,6 +127,7 @@ class PDF::Reader
113
127
  end
114
128
  end
115
129
 
130
+ #: (untyped, Symbol, untyped) -> bool
116
131
  def apply_operator(attribute_value, operator, filter_value)
117
132
  case operator
118
133
  when :equal
@@ -11,6 +11,7 @@ class PDF::Reader
11
11
  #
12
12
  class AesV2SecurityHandler
13
13
 
14
+ #: (String) -> void
14
15
  def initialize(key)
15
16
  @encrypt_key = key
16
17
  end
@@ -21,10 +22,38 @@ class PDF::Reader
21
22
  #
22
23
  # version == 4 and CFM == AESV2
23
24
  #
25
+ # used to decrypt PDF streams (buf). Input data should be in bytesizes of
26
+ # a multiple of 16, anything else is an error. The first 16 bytes are the initialization
27
+ # vector, so any input of exactly 16 bytes decrypts to an empty string
28
+ #
24
29
  # buf - a string to decrypt
25
30
  # ref - a PDF::Reader::Reference for the object to decrypt
26
31
  #
32
+ #: (String, PDF::Reader::Reference) -> String
27
33
  def decrypt( buf, ref )
34
+ if buf.bytesize % 16 > 0
35
+ raise PDF::Reader::MalformedPDFError.new("Ciphertext not a multiple of 16")
36
+ elsif buf.bytesize == 16
37
+ return ""
38
+ else
39
+ begin
40
+ internal_decrypt(buf, ref)
41
+ rescue OpenSSL::Cipher::CipherError
42
+ # If we failed to decrypt it might be a padding error, so try again
43
+ # and assume no padding in the ciphertext. This will "suceed" but might
44
+ # return garbage if the key is incorrect but that's OK - well before this
45
+ # class is used we have confirmed the user provided key is correct so if
46
+ # this works without error we can be confident the returned plaintext is
47
+ # correct
48
+ internal_decrypt(buf, ref, false)
49
+ end
50
+ end
51
+ end
52
+
53
+ private
54
+
55
+ #: (String, PDF::Reader::Reference, ?bool) -> String
56
+ def internal_decrypt(buf, ref, padding = true)
28
57
  objKey = @encrypt_key.dup
29
58
  (0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
30
59
  (0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
@@ -32,6 +61,7 @@ class PDF::Reader
32
61
  length = objKey.length < 16 ? objKey.length : 16
33
62
  cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
34
63
  cipher.decrypt
64
+ cipher.padding = 0 unless padding
35
65
  cipher.key = Digest::MD5.digest(objKey)[0,length]
36
66
  cipher.iv = buf[0..15]
37
67
  cipher.update(buf[16..-1]) + cipher.final
@@ -12,27 +12,59 @@ class PDF::Reader
12
12
  #
13
13
  class AesV3SecurityHandler
14
14
 
15
+ #: (String) -> void
15
16
  def initialize(key)
17
+ if key.bytesize != 32
18
+ raise PDF::Reader::MalformedPDFError.new(
19
+ "AES-256 key must be exactly 32 bytes, got #{key.bytesize}"
20
+ )
21
+ end
16
22
  @encrypt_key = key
17
- @cipher = "AES-256-CBC"
23
+ @cipher = "AES-256-CBC" #: String
18
24
  end
19
25
 
20
26
  ##7.6.2 General Encryption Algorithm
21
27
  #
22
28
  # Algorithm 1: Encryption of data using the RC4 or AES algorithms
23
29
  #
24
- # used to decrypt RC4/AES encrypted PDF streams (buf)
30
+ # used to decrypt RC4/AES encrypted PDF streams (buf). Input data should be in bytesizes of
31
+ # a multiple of 16, anything else is an error. The first 16 bytes are the initialization
32
+ # vector, so any input of exactly 16 bytes decrypts to an empty string
25
33
  #
26
34
  # buf - a string to decrypt
27
35
  # ref - a PDF::Reader::Reference for the object to decrypt
28
36
  #
37
+ #: (String, PDF::Reader::Reference) -> String
29
38
  def decrypt( buf, ref )
39
+ if buf.bytesize % 16 > 0
40
+ raise PDF::Reader::MalformedPDFError.new("Ciphertext not a multiple of 16")
41
+ elsif buf.bytesize == 16
42
+ return ""
43
+ else
44
+ begin
45
+ internal_decrypt(buf, ref)
46
+ rescue OpenSSL::Cipher::CipherError
47
+ # If we failed to decrypt it might be a padding error, so try again
48
+ # and assume no padding in the ciphertext. This will "succeed" but might
49
+ # return garbage if the key is incorrect but that's OK - well before this
50
+ # class is used we have confirmed the user provided key is correct so if
51
+ # this works without error we can be confident the returned plaintext is
52
+ # correct
53
+ internal_decrypt(buf, ref, false)
54
+ end
55
+ end
56
+ end
57
+
58
+ private
59
+
60
+ #: (String, PDF::Reader::Reference, ?bool) -> String
61
+ def internal_decrypt(buf, ref, padding = true)
30
62
  cipher = OpenSSL::Cipher.new(@cipher)
31
63
  cipher.decrypt
64
+ cipher.padding = 0 unless padding
32
65
  cipher.key = @encrypt_key.dup
33
66
  cipher.iv = buf[0..15]
34
67
  cipher.update(buf[16..-1]) + cipher.final
35
68
  end
36
-
37
69
  end
38
70
  end
@@ -8,6 +8,7 @@ class PDF::Reader
8
8
  # MediaBox or CropBox, but could be a user specified rectangle too
9
9
  class BoundingRectangleRunsFilter
10
10
 
11
+ #: (Array[PDF::Reader::TextRun], PDF::Reader::Rectangle) -> Array[PDF::Reader::TextRun]
11
12
  def self.runs_within_rect(runs, rect)
12
13
  runs.select { |run| rect.contains?(run.origin) }
13
14
  end
@@ -38,30 +38,31 @@ class PDF::Reader
38
38
  # the raw tokens into objects we can work with (strings, ints, arrays, etc)
39
39
  #
40
40
  class Buffer
41
- TOKEN_WHITESPACE=[0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20]
42
- TOKEN_DELIMITER=[0x25, 0x3C, 0x3E, 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D, 0x2F]
41
+ TOKEN_WHITESPACE=[0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20] #: Array[Integer]
42
+ TOKEN_DELIMITER=[0x25, 0x3C, 0x3E, 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D, 0x2F] #: Array[Integer]
43
43
 
44
44
  # some strings for comparisons. Declaring them here avoids creating new
45
45
  # strings that need GC over and over
46
- LEFT_PAREN = "("
47
- LESS_THAN = "<"
48
- STREAM = "stream"
49
- ID = "ID"
50
- FWD_SLASH = "/"
51
- NULL_BYTE = "\x00"
52
- CR = "\r"
53
- LF = "\n"
54
- CRLF = "\r\n"
55
- WHITE_SPACE = ["\n", "\r", ' ']
46
+ LEFT_PAREN = "(" #: String
47
+ LESS_THAN = "<" #: String
48
+ STREAM = "stream" #: String
49
+ ID = "ID" #: String
50
+ FWD_SLASH = "/" #: String
51
+ NULL_BYTE = "\x00" #: String
52
+ CR = "\r" #: String
53
+ LF = "\n" #: String
54
+ CRLF = "\r\n" #: String
55
+ WHITE_SPACE = ["\n", "\r", ' '] #: Array[String]
56
56
 
57
57
  # Quite a few PDFs have trailing junk.
58
58
  # This can be several k of nuls in some cases
59
59
  # Allow for this here
60
- TRAILING_BYTECOUNT = 5000
60
+ TRAILING_BYTECOUNT = 5000 #: Integer
61
61
 
62
62
  # must match whole tokens
63
- DIGITS_ONLY = %r{\A\d+\z}
63
+ DIGITS_ONLY = %r{\A\d+\z} #: Regexp
64
64
 
65
+ #: Integer
65
66
  attr_reader :pos
66
67
 
67
68
  # Creates a new buffer.
@@ -76,17 +77,19 @@ class PDF::Reader
76
77
  # :content_stream - set to true if buffer will be tokenising a
77
78
  # content stream. Defaults to false
78
79
  #
80
+ #: ((StringIO | Tempfile | IO), ?Hash[Symbol, untyped]) -> void
79
81
  def initialize(io, opts = {})
80
82
  @io = io
81
- @tokens = []
82
- @in_content_stream = opts[:content_stream]
83
+ @tokens = [] #: Array[String | PDF::Reader::Reference]
84
+ @in_content_stream = opts[:content_stream] #: bool
83
85
 
84
86
  @io.seek(opts[:seek]) if opts[:seek]
85
- @pos = @io.pos
87
+ @pos = @io.pos #: Integer
86
88
  end
87
89
 
88
90
  # return true if there are no more tokens left
89
91
  #
92
+ #: () -> bool
90
93
  def empty?
91
94
  prepare_tokens if @tokens.size < 3
92
95
 
@@ -105,6 +108,7 @@ class PDF::Reader
105
108
  # Skipping a bare CR is not spec-compliant.
106
109
  # This is because the data may start with LF.
107
110
  # However we check for CRLF first, so the ambiguity is avoided.
111
+ #: (Integer, ?Hash[Symbol, untyped]) -> String?
108
112
  def read(bytes, opts = {})
109
113
  reset_pos
110
114
 
@@ -130,6 +134,7 @@ class PDF::Reader
130
134
  # return the next token from the source. Returns a string if a token
131
135
  # is found, nil if there are no tokens left.
132
136
  #
137
+ #: () -> (nil | String | PDF::Reader::Reference)
133
138
  def token
134
139
  reset_pos
135
140
  prepare_tokens if @tokens.size < 3
@@ -141,6 +146,7 @@ class PDF::Reader
141
146
 
142
147
  # return the byte offset where the first XRef table in the source can be found.
143
148
  #
149
+ #: () -> Integer
144
150
  def find_first_xref_offset
145
151
  check_size_is_non_zero
146
152
  @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
@@ -164,6 +170,7 @@ class PDF::Reader
164
170
 
165
171
  private
166
172
 
173
+ #: () -> void
167
174
  def check_size_is_non_zero
168
175
  @io.seek(-1, IO::SEEK_END)
169
176
  @io.seek(0)
@@ -173,12 +180,14 @@ class PDF::Reader
173
180
 
174
181
  # Returns true if this buffer is parsing a content stream
175
182
  #
183
+ #: () -> bool
176
184
  def in_content_stream?
177
185
  @in_content_stream ? true : false
178
186
  end
179
187
 
180
188
  # Some bastard moved our IO stream cursor. Restore it.
181
189
  #
190
+ #: () -> void
182
191
  def reset_pos
183
192
  @io.seek(@pos) if @io.pos != @pos
184
193
  end
@@ -186,12 +195,14 @@ class PDF::Reader
186
195
  # save the current position of the source IO stream. If someone else (like another buffer)
187
196
  # moves the cursor, we can then restore it.
188
197
  #
198
+ #: () -> void
189
199
  def save_pos
190
200
  @pos = @io.pos
191
201
  end
192
202
 
193
203
  # attempt to prime the buffer with the next few tokens.
194
204
  #
205
+ #: () -> void
195
206
  def prepare_tokens
196
207
  10.times do
197
208
  case state
@@ -208,6 +219,7 @@ class PDF::Reader
208
219
  # tokenising behaves slightly differently based on the current context.
209
220
  # Determine the current context/state by examining the last token we found
210
221
  #
222
+ #: () -> Symbol
211
223
  def state
212
224
  case @tokens.last
213
225
  when LEFT_PAREN then :literal_string
@@ -236,6 +248,7 @@ class PDF::Reader
236
248
  # indirect reference, so test for that case first and avoid the relatively
237
249
  # expensive regexp checks if possible.
238
250
  #
251
+ #: () -> void
239
252
  def merge_indirect_reference
240
253
  return if @tokens.size < 3
241
254
  return if @tokens[2] != "R"
@@ -253,6 +266,7 @@ class PDF::Reader
253
266
  # If the EI follows white-space the space is dropped from the data
254
267
  # The EI must be followed by white-space or end of buffer
255
268
  # This is to reduce the chance of accidentally matching an embedded EI
269
+ #: () -> void
256
270
  def prepare_inline_token
257
271
  idstart = @io.pos
258
272
  prevchr = ''
@@ -299,6 +313,7 @@ class PDF::Reader
299
313
  # if we're currently inside a hex string, read hex nibbles until
300
314
  # we find a closing >
301
315
  #
316
+ #: () -> void
302
317
  def prepare_hex_token
303
318
  str = "".dup
304
319
 
@@ -328,6 +343,7 @@ class PDF::Reader
328
343
  # processing to fix things like escaped new lines, but that's someone else's
329
344
  # problem.
330
345
  #
346
+ #: () -> void
331
347
  def prepare_literal_token
332
348
  str = "".dup
333
349
  count = 1
@@ -358,6 +374,7 @@ class PDF::Reader
358
374
  # What each byte means is complex, check out section "3.1.1 Character Set" of the 1.7 spec
359
375
  # to read up on it.
360
376
  #
377
+ #: () -> void
361
378
  def prepare_regular_token
362
379
  tok = "".dup
363
380
 
@@ -435,6 +452,7 @@ class PDF::Reader
435
452
  # peek at the next character in the io stream, leaving the stream position
436
453
  # untouched
437
454
  #
455
+ #: () -> (Integer | nil)
438
456
  def peek_byte
439
457
  byte = @io.getbyte
440
458
  @io.seek(-1, IO::SEEK_CUR) if byte
@@ -18,12 +18,14 @@ class PDF::Reader
18
18
  # Graphics State Operators
19
19
  def_delegators :@widths, :[], :fetch
20
20
 
21
+ #: (Numeric, Array[Numeric]) -> void
21
22
  def initialize(default, array)
22
- @widths = parse_array(default, array.dup)
23
+ @widths = parse_array(default, array.dup) #: Hash[Numeric, Numeric]
23
24
  end
24
25
 
25
26
  private
26
27
 
28
+ #: (Numeric, Array[Numeric]) -> Hash[Numeric, Numeric]
27
29
  def parse_array(default, array)
28
30
  widths = Hash.new(default)
29
31
  params = []
@@ -43,6 +45,8 @@ class PDF::Reader
43
45
 
44
46
  # this is the form 10 [234 63 234 346 47 234] where width of index 10 is
45
47
  # 234, index 11 is 63, etc
48
+ #
49
+ #: (Integer, Array[Numeric]) -> Hash[Numeric, Numeric]
46
50
  def parse_first_form(first, widths)
47
51
  widths.inject({}) { |accum, glyph_width|
48
52
  accum[first + accum.size] = glyph_width
@@ -51,6 +55,8 @@ class PDF::Reader
51
55
  end
52
56
 
53
57
  # this is the form 10 20 123 where all index between 10 and 20 have width 123
58
+ #
59
+ #: (Integer, Integer, Numeric) -> Hash[Numeric, Numeric]
54
60
  def parse_second_form(first, final, width)
55
61
  if first > final
56
62
  raise MalformedPDFError, "CidWidths: #{first} must be less than #{final}"
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -44,15 +44,22 @@ class PDF::Reader
44
44
  "begin" => :noop,
45
45
  "begincmap" => :noop,
46
46
  "def" => :noop
47
- }
47
+ } #: Hash[String, Symbol]
48
48
 
49
+ # Indicates the start of a UTF-16 surrogate pair, see
50
+ # https://en.wikipedia.org/wiki/Universal_Character_Set_characters
51
+ HIGH_SURROGATE_RANGE = (0xD800..0xDBFF) #: Range[Integer]
52
+
53
+ #: Hash[Integer, Array[Integer]]
49
54
  attr_reader :map
50
55
 
56
+ #: (String) -> void
51
57
  def initialize(data)
52
- @map = {}
58
+ @map = {} #: Hash[Integer, Array[Integer]]
53
59
  process_data(data)
54
60
  end
55
61
 
62
+ #: () -> Integer
56
63
  def size
57
64
  @map.size
58
65
  end
@@ -61,12 +68,14 @@ class PDF::Reader
61
68
  #
62
69
  # Returns an array of Integers.
63
70
  #
71
+ #: (Integer) -> Array[Integer]
64
72
  def decode(c)
65
73
  @map.fetch(c, [])
66
74
  end
67
75
 
68
76
  private
69
77
 
78
+ #: (String, ?Symbol) -> void
70
79
  def process_data(data, initial_mode = :none)
71
80
  parser = build_parser(data)
72
81
  mode = initial_mode
@@ -96,6 +105,7 @@ class PDF::Reader
96
105
  end
97
106
 
98
107
 
108
+ #: (String) -> PDF::Reader::Parser
99
109
  def build_parser(instructions)
100
110
  buffer = Buffer.new(StringIO.new(instructions))
101
111
  Parser.new(buffer)
@@ -109,6 +119,7 @@ class PDF::Reader
109
119
  # However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
110
120
  # exception when we try converting broken UTF-16 to UTF-8
111
121
  #
122
+ #: (String) -> Array[Integer]
112
123
  def str_to_int(str)
113
124
  unpacked_string = if str.bytesize == 1 # UTF-8
114
125
  str.unpack("C*")
@@ -117,15 +128,16 @@ class PDF::Reader
117
128
  end
118
129
  result = []
119
130
  while unpacked_string.any? do
120
- if unpacked_string.size >= 2 &&
121
- unpacked_string.first.to_i >= 0xD800 &&
122
- unpacked_string.first.to_i <= 0xDBFF
131
+ if unpacked_string.size >= 2 && HIGH_SURROGATE_RANGE.include?(unpacked_string.first.to_i)
123
132
  # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
124
133
  # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
125
134
  # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
126
135
  point_one = unpacked_string.shift.to_i
127
136
  point_two = unpacked_string.shift.to_i
128
137
  result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
138
+ elsif unpacked_string.size == 1 && HIGH_SURROGATE_RANGE.include?(unpacked_string.first.to_i)
139
+ # the start of a surrogate pair but the pair is missing. Skip it
140
+ unpacked_string.shift
129
141
  else
130
142
  result << unpacked_string.shift
131
143
  end
@@ -133,6 +145,7 @@ class PDF::Reader
133
145
  result
134
146
  end
135
147
 
148
+ #: (Array[String]) -> void
136
149
  def process_bfchar_instructions(instructions)
137
150
  instructions.each_slice(2) do |one, two|
138
151
  find = str_to_int(one.to_s)
@@ -143,6 +156,7 @@ class PDF::Reader
143
156
  end
144
157
  end
145
158
 
159
+ #: (Array[Array[String] | String]) -> void
146
160
  def process_bfrange_instructions(instructions)
147
161
  instructions.each_slice(3) do |start, finish, to|
148
162
  if start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(String)
@@ -155,6 +169,7 @@ class PDF::Reader
155
169
  end
156
170
  end
157
171
 
172
+ #: (String, String, String) -> void
158
173
  def bfrange_type_one(start_code, end_code, dst)
159
174
  start_code = str_to_int(start_code).first
160
175
  end_code = str_to_int(end_code).first
@@ -168,6 +183,7 @@ class PDF::Reader
168
183
  end
169
184
  end
170
185
 
186
+ #: (String, String, Array[String]) -> void
171
187
  def bfrange_type_two(start_code, end_code, dst)
172
188
  start_code = str_to_int(start_code).first
173
189
  end_code = str_to_int(end_code).first