pdf-reader 2.6.0 → 2.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +30 -1
  3. data/Rakefile +1 -1
  4. data/examples/rspec.rb +1 -0
  5. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  6. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  7. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  8. data/lib/pdf/reader/buffer.rb +36 -33
  9. data/lib/pdf/reader/cid_widths.rb +1 -0
  10. data/lib/pdf/reader/cmap.rb +65 -50
  11. data/lib/pdf/reader/encoding.rb +2 -1
  12. data/lib/pdf/reader/error.rb +16 -0
  13. data/lib/pdf/reader/filter/ascii85.rb +2 -0
  14. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  15. data/lib/pdf/reader/filter/depredict.rb +8 -6
  16. data/lib/pdf/reader/filter/flate.rb +4 -2
  17. data/lib/pdf/reader/filter/lzw.rb +2 -0
  18. data/lib/pdf/reader/filter/null.rb +1 -1
  19. data/lib/pdf/reader/filter/run_length.rb +19 -13
  20. data/lib/pdf/reader/filter.rb +11 -11
  21. data/lib/pdf/reader/font.rb +72 -16
  22. data/lib/pdf/reader/font_descriptor.rb +19 -17
  23. data/lib/pdf/reader/form_xobject.rb +15 -5
  24. data/lib/pdf/reader/glyph_hash.rb +1 -0
  25. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  26. data/lib/pdf/reader/lzw.rb +4 -2
  27. data/lib/pdf/reader/null_security_handler.rb +1 -4
  28. data/lib/pdf/reader/object_cache.rb +1 -0
  29. data/lib/pdf/reader/object_hash.rb +252 -44
  30. data/lib/pdf/reader/object_stream.rb +1 -0
  31. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  32. data/lib/pdf/reader/page.rb +99 -19
  33. data/lib/pdf/reader/page_layout.rb +28 -32
  34. data/lib/pdf/reader/page_state.rb +12 -11
  35. data/lib/pdf/reader/page_text_receiver.rb +57 -10
  36. data/lib/pdf/reader/pages_strategy.rb +1 -0
  37. data/lib/pdf/reader/parser.rb +26 -8
  38. data/lib/pdf/reader/point.rb +25 -0
  39. data/lib/pdf/reader/print_receiver.rb +1 -0
  40. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  41. data/lib/pdf/reader/rectangle.rb +113 -0
  42. data/lib/pdf/reader/reference.rb +1 -0
  43. data/lib/pdf/reader/register_receiver.rb +1 -0
  44. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +16 -9
  45. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  46. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
  47. data/lib/pdf/reader/stream.rb +2 -1
  48. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  49. data/lib/pdf/reader/text_run.rb +14 -6
  50. data/lib/pdf/reader/token.rb +1 -0
  51. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  52. data/lib/pdf/reader/type_check.rb +52 -0
  53. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  54. data/lib/pdf/reader/validating_receiver.rb +262 -0
  55. data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
  56. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  57. data/lib/pdf/reader/width_calculator/true_type.rb +2 -1
  58. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  59. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  60. data/lib/pdf/reader/width_calculator.rb +1 -0
  61. data/lib/pdf/reader/xref.rb +21 -3
  62. data/lib/pdf/reader/zero_width_runs_filter.rb +2 -0
  63. data/lib/pdf/reader.rb +46 -15
  64. data/lib/pdf-reader.rb +1 -0
  65. data/rbi/pdf-reader.rbi +1978 -0
  66. metadata +22 -13
  67. data/lib/pdf/reader/orientation_detector.rb +0 -34
  68. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ccc4d14f5820ca798f6eafa1c0978207759ec1668c6f6307acb7cd43bcd0626e
4
- data.tar.gz: 466bfe0a91f57463a56d9697ccd2529f981c6917e4ed578b4103f2bc87065522
3
+ metadata.gz: 07c734cf3cfc0abf1102f813976d4936d33b57815f114ce92224bbd605fe16a2
4
+ data.tar.gz: f52b1751f83717a7bc96c56e8d830559d387fb430cfa6fa2a78604d98c7476f4
5
5
  SHA512:
6
- metadata.gz: 45d6c16b3d9ed029e6eb5a45cc64aa95e7ada2950e052053cbe0b6f5aae632f824a86f0505a5cee660abd1cd896177a0637a2f2f5a3f3633e829e8d46fb59817
7
- data.tar.gz: e3e566344bd5560387577597dea20b2f7da40aed2a7fa8b8d074c0742486db59d7e349f6c38c91c8dcd9b0a8cf2aa4c19a00d0ee097003449504b3f06f18ca3c
6
+ metadata.gz: 72fda8f6b32c20782adca6cca44d291c7cbe4ac9d858da5ed1c815af2a7d6680e3906cac47a8414923c8db639fd51365d9da8612c1c7f79a674b22448bb35cae
7
+ data.tar.gz: fa79a29d80a36d37e1188769bf7991d5108bbe08b11711a7c9bb1741cedd3682b77afe219a24ae7844fdbf10b23ca3eb5434f4b9418d7002f07fb8edf9dd6e26
data/CHANGELOG CHANGED
@@ -1,6 +1,35 @@
1
+ v2.9.1 (4th February 2022)
2
+ - Fix exception in Page#walk introduced in 2.9.0 (http://github.com/yob/pdf-reader/pull/442)
3
+ - Other small bug fixes
4
+
5
+ v2.9.0 (24th January 2022)
6
+ - Support additional encryption standards (http://github.com/yob/pdf-reader/pull/419)
7
+ - Return CropBox correctly from Page#rectangles (https://github.com/yob/pdf-reader/pull/420)
8
+ - For sorbet users, additional type annotations are included in the gem
9
+
10
+ v2.8.0 (28th December 2021)
11
+ - Add PDF::Reader::Page#runs for extracting text from a page with positioning metadata (http://github.com/yob/pdf-reader/pull/411)
12
+ - Add options to PDF::Reader::Page#text to make some behaviour configurable (http://github.com/yob/pdf-reader/pull/411)
13
+ - including extracting the text for only part of the page
14
+ - Improve text positioning and extraction for Type3 fonts (http://github.com/yob/pdf-reader/pull/412)
15
+ - Skip extracting text that is positioned outside the page (http://github.com/yob/pdf-reader/pull/413)
16
+ - Fix occasional crash when reading some streams (http://github.com/yob/pdf-reader/pull/405)
17
+
18
+ v2.7.0 (13th December 2021)
19
+ - Include RBI type files in the gem
20
+ - Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
21
+ now be type checked by sorbet
22
+ - Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
23
+ - Improved text extraction on some rotated pages, and rotated text on normal pages
24
+ - Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
25
+ - Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
26
+ - Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
27
+ - Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
28
+ - Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
29
+
1
30
  v2.6.0 (12th November 2021)
2
31
  - Text extraction improvements
3
- - Improved text layout on pages with a variery of font sizes (http://github.com/yob/pdf-reader/pull/355)
32
+ - Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
4
33
  - Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
5
34
  - Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
6
35
  - Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
14
14
  Cane::RakeTask.new(:quality) do |cane|
15
15
  cane.abc_max = 20
16
16
  cane.style_measure = 100
17
- cane.max_violations = 32
17
+ cane.max_violations = 28
18
18
 
19
19
  cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
20
20
  end
data/examples/rspec.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # coding: utf-8
3
+ # typed: ignore
3
4
 
4
5
  # Basic RSpec of a generated PDF
5
6
  #
@@ -0,0 +1,41 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ require 'digest/md5'
6
+
7
+ class PDF::Reader
8
+
9
+ # Decrypts data using the AESV2 algorithm defined in the PDF spec. Requires
10
+ # a decryption key, which is usually generated by PDF::Reader::StandardKeyBuilder
11
+ #
12
+ class AesV2SecurityHandler
13
+
14
+ def initialize(key)
15
+ @encrypt_key = key
16
+ end
17
+
18
+ ##7.6.2 General Encryption Algorithm
19
+ #
20
+ # Algorithm 1: Encryption of data using the AES-128-CBC algorithm
21
+ #
22
+ # version == 4 and CFM == AESV2
23
+ #
24
+ # buf - a string to decrypt
25
+ # ref - a PDF::Reader::Reference for the object to decrypt
26
+ #
27
+ def decrypt( buf, ref )
28
+ objKey = @encrypt_key.dup
29
+ (0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
30
+ (0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
31
+ objKey << 'sAlT' # Algorithm 1, b)
32
+ length = objKey.length < 16 ? objKey.length : 16
33
+ cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
34
+ cipher.decrypt
35
+ cipher.key = Digest::MD5.digest(objKey)[0,length]
36
+ cipher.iv = buf[0..15]
37
+ cipher.update(buf[16..-1]) + cipher.final
38
+ end
39
+
40
+ end
41
+ end
@@ -0,0 +1,38 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ require 'digest'
6
+ require 'openssl'
7
+
8
+ class PDF::Reader
9
+
10
+ # Decrypts data using the AESV3 algorithm defined in the PDF 1.7, Extension Level 3 spec.
11
+ # Requires a decryption key, which is usually generated by PDF::Reader::KeyBuilderV5
12
+ #
13
+ class AesV3SecurityHandler
14
+
15
+ def initialize(key)
16
+ @encrypt_key = key
17
+ @cipher = "AES-256-CBC"
18
+ end
19
+
20
+ ##7.6.2 General Encryption Algorithm
21
+ #
22
+ # Algorithm 1: Encryption of data using the RC4 or AES algorithms
23
+ #
24
+ # used to decrypt RC4/AES encrypted PDF streams (buf)
25
+ #
26
+ # buf - a string to decrypt
27
+ # ref - a PDF::Reader::Reference for the object to decrypt
28
+ #
29
+ def decrypt( buf, ref )
30
+ cipher = OpenSSL::Cipher.new(@cipher)
31
+ cipher.decrypt
32
+ cipher.key = @encrypt_key.dup
33
+ cipher.iv = buf[0..15]
34
+ cipher.update(buf[16..-1]) + cipher.final
35
+ end
36
+
37
+ end
38
+ end
@@ -0,0 +1,16 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ class PDF::Reader
6
+
7
+ # Filter out text/characters that are positioned outside a rectangle. Usually the page
8
+ # MediaBox or CropBox, but could be a user specified rectangle too
9
+ class BoundingRectangleRunsFilter
10
+
11
+ def self.runs_within_rect(runs, rect)
12
+ runs.select { |run| rect.contains?(run.origin) }
13
+ end
14
+ end
15
+ end
16
+
@@ -1,4 +1,5 @@
1
1
  # coding: ASCII-8BIT
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -58,6 +59,9 @@ class PDF::Reader
58
59
  # Allow for this here
59
60
  TRAILING_BYTECOUNT = 5000
60
61
 
62
+ # must match whole tokens
63
+ DIGITS_ONLY = %r{\A\d+\z}
64
+
61
65
  attr_reader :pos
62
66
 
63
67
  # Creates a new buffer.
@@ -142,13 +146,20 @@ class PDF::Reader
142
146
  @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
143
147
  data = @io.read(TRAILING_BYTECOUNT)
144
148
 
149
+ raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?
150
+
145
151
  # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
146
152
  lines = data.split(/[\n\r]+/).reverse
147
153
  eof_index = lines.index { |l| l.strip[/^%%EOF/] }
148
154
 
149
155
  raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
150
156
  raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
151
- lines[eof_index+1].to_i
157
+ offset = lines[eof_index+1].to_i
158
+
159
+ # a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
160
+ # corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
161
+ raise MalformedPDFError, "invalid xref offset" if offset < 0
162
+ offset
152
163
  end
153
164
 
154
165
  private
@@ -229,13 +240,12 @@ class PDF::Reader
229
240
  return if @tokens.size < 3
230
241
  return if @tokens[2] != "R"
231
242
 
232
- # must match whole tokens
233
- digits_only = %r{\A\d+\z}
234
- if @tokens[0].match(digits_only) && @tokens[1].match(digits_only)
235
- @tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
236
- @tokens[1] = nil
237
- @tokens[2] = nil
238
- @tokens.compact!
243
+ token_one = @tokens[0]
244
+ token_two = @tokens[1]
245
+ if token_one.is_a?(String) && token_two.is_a?(String) && token_one.match(DIGITS_ONLY) && token_two.match(DIGITS_ONLY)
246
+ @tokens[0] = PDF::Reader::Reference.new(token_one.to_i, token_two.to_i)
247
+ @tokens.delete_at(2)
248
+ @tokens.delete_at(1)
239
249
  end
240
250
  end
241
251
 
@@ -245,7 +255,7 @@ class PDF::Reader
245
255
  # This is to reduce the chance of accidentally matching an embedded EI
246
256
  def prepare_inline_token
247
257
  idstart = @io.pos
248
- chr = prevchr = nil
258
+ prevchr = ''
249
259
  eisize = 0 # how many chars in the end marker
250
260
  seeking = 'E' # what are we looking for now?
251
261
  loop do
@@ -263,11 +273,11 @@ class PDF::Reader
263
273
  end
264
274
  when 'I'
265
275
  if chr == 'I'
266
- seeking = :END
276
+ seeking = ''
267
277
  else
268
278
  seeking = 'E'
269
279
  end
270
- when :END
280
+ when ''
271
281
  if WHITE_SPACE.include? chr
272
282
  eisize += 1 # Drop trailer
273
283
  break
@@ -275,28 +285,28 @@ class PDF::Reader
275
285
  seeking = 'E'
276
286
  end
277
287
  end
278
- prevchr = chr
288
+ prevchr = chr.is_a?(String) ? chr : ''
279
289
  end
280
- unless seeking == :END
290
+ unless seeking == ''
281
291
  raise MalformedPDFError, "EI terminator not found"
282
292
  end
283
293
  eiend = @io.pos
284
294
  @io.seek(idstart, IO::SEEK_SET)
285
295
  str = @io.read(eiend - eisize - idstart) # get the ID content
286
- @tokens << string_token(str)
296
+ @tokens << str.freeze if str
287
297
  end
288
298
 
289
299
  # if we're currently inside a hex string, read hex nibbles until
290
300
  # we find a closing >
291
301
  #
292
302
  def prepare_hex_token
303
+ finished = :false
293
304
  str = "".dup
294
- finished = false
295
305
 
296
- while !finished
306
+ until finished == :true
297
307
  byte = @io.getbyte
298
308
  if byte.nil?
299
- finished = true # unbalanced params
309
+ finished = :true # unbalanced params
300
310
  elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
301
311
  str << byte
302
312
  elsif byte <= 32
@@ -305,7 +315,7 @@ class PDF::Reader
305
315
  @tokens << str if str.size > 0
306
316
  @tokens << ">" if byte != 0x3E # '>'
307
317
  @tokens << byte.chr
308
- finished = true
318
+ finished = :true
309
319
  end
310
320
  end
311
321
  end
@@ -352,14 +362,17 @@ class PDF::Reader
352
362
  def prepare_regular_token
353
363
  tok = "".dup
354
364
 
355
- while byte = @io.getbyte
365
+ loop do
366
+ byte = @io.getbyte
367
+
356
368
  case byte
369
+ when nil
370
+ break
357
371
  when 0x25
358
372
  # comment, ignore everything until the next EOL char
359
- done = false
360
- while !done
361
- byte = @io.getbyte
362
- done = true if byte.nil? || byte == 0x0A || byte == 0x0D
373
+ loop do
374
+ commentbyte = @io.getbyte
375
+ break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
363
376
  end
364
377
  when *TOKEN_WHITESPACE
365
378
  # white space, token finished
@@ -429,15 +442,5 @@ class PDF::Reader
429
442
  byte
430
443
  end
431
444
 
432
- # for a handful of tokens we want to tell the parser how to convert them
433
- # into higher level tokens. This methods adds a to_token() method
434
- # to tokens that should remain as strings.
435
- #
436
- def string_token(token)
437
- def token.to_token
438
- to_s
439
- end
440
- token
441
- end
442
445
  end
443
446
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -32,16 +33,17 @@ class PDF::Reader
32
33
  # extracting various useful information.
33
34
  #
34
35
  class CMap # :nodoc:
36
+
35
37
  CMAP_KEYWORDS = {
36
- "begincodespacerange" => 1,
37
- "endcodespacerange" => 1,
38
- "beginbfchar" => 1,
39
- "endbfchar" => 1,
40
- "beginbfrange" => 1,
41
- "endbfrange" => 1,
42
- "begin" => 1,
43
- "begincmap" => 1,
44
- "def" => 1
38
+ "begincodespacerange" => :noop,
39
+ "endcodespacerange" => :noop,
40
+ "beginbfchar" => :noop,
41
+ "endbfchar" => :noop,
42
+ "beginbfrange" => :noop,
43
+ "endbfrange" => :noop,
44
+ "begin" => :noop,
45
+ "begincmap" => :noop,
46
+ "def" => :noop
45
47
  }
46
48
 
47
49
  attr_reader :map
@@ -51,30 +53,6 @@ class PDF::Reader
51
53
  process_data(data)
52
54
  end
53
55
 
54
- def process_data(data)
55
- parser = build_parser(data)
56
- mode = nil
57
- instructions = []
58
-
59
- while token = parser.parse_token(CMAP_KEYWORDS)
60
- if token == "beginbfchar"
61
- mode = :char
62
- elsif token == "endbfchar"
63
- process_bfchar_instructions(instructions)
64
- instructions = []
65
- mode = nil
66
- elsif token == "beginbfrange"
67
- mode = :range
68
- elsif token == "endbfrange"
69
- process_bfrange_instructions(instructions)
70
- instructions = []
71
- mode = nil
72
- elsif mode == :char || mode == :range
73
- instructions << token
74
- end
75
- end
76
- end
77
-
78
56
  def size
79
57
  @map.size
80
58
  end
@@ -84,13 +62,40 @@ class PDF::Reader
84
62
  # Returns an array of Integers.
85
63
  #
86
64
  def decode(c)
87
- # TODO: implement the conversion
88
- return c unless Integer === c
89
- @map[c]
65
+ @map.fetch(c, [])
90
66
  end
91
67
 
92
68
  private
93
69
 
70
+ def process_data(data, initial_mode = :none)
71
+ parser = build_parser(data)
72
+ mode = initial_mode
73
+ instructions = []
74
+
75
+ while token = parser.parse_token(CMAP_KEYWORDS)
76
+ if token.is_a?(String) || token.is_a?(Array)
77
+ if token == "beginbfchar"
78
+ mode = :char
79
+ elsif token == "endbfchar"
80
+ process_bfchar_instructions(instructions)
81
+ instructions = []
82
+ mode = :none
83
+ elsif token == "beginbfrange"
84
+ mode = :range
85
+ elsif token == "endbfrange"
86
+ process_bfrange_instructions(instructions)
87
+ instructions = []
88
+ mode = :none
89
+ elsif mode == :char
90
+ instructions << token.to_s
91
+ elsif mode == :range
92
+ instructions << token
93
+ end
94
+ end
95
+ end
96
+ end
97
+
98
+
94
99
  def build_parser(instructions)
95
100
  buffer = Buffer.new(StringIO.new(instructions))
96
101
  Parser.new(buffer)
@@ -105,7 +110,6 @@ class PDF::Reader
105
110
  # exception when we try converting broken UTF-16 to UTF-8
106
111
  #
107
112
  def str_to_int(str)
108
- return nil if str.nil? || str.size == 0
109
113
  unpacked_string = if str.bytesize == 1 # UTF-8
110
114
  str.unpack("C*")
111
115
  else # UTF-16
@@ -113,12 +117,15 @@ class PDF::Reader
113
117
  end
114
118
  result = []
115
119
  while unpacked_string.any? do
116
- if unpacked_string.size >= 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
120
+ if unpacked_string.size >= 2 &&
121
+ unpacked_string.first.to_i > 0xD800 &&
122
+ unpacked_string.first.to_i < 0xDBFF
117
123
  # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
118
124
  # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
119
125
  # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
120
- points = [unpacked_string.shift, unpacked_string.shift]
121
- result << (points[0] - 0xD800) * 0x400 + (points[1] - 0xDC00) + 0x10000
126
+ point_one = unpacked_string.shift.to_i
127
+ point_two = unpacked_string.shift.to_i
128
+ result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
122
129
  else
123
130
  result << unpacked_string.shift
124
131
  end
@@ -128,9 +135,11 @@ class PDF::Reader
128
135
 
129
136
  def process_bfchar_instructions(instructions)
130
137
  instructions.each_slice(2) do |one, two|
131
- find = str_to_int(one)
132
- replace = str_to_int(two)
133
- @map[find.first] = replace
138
+ find = str_to_int(one.to_s)
139
+ replace = str_to_int(two.to_s)
140
+ if find.any? && replace.any?
141
+ @map[find.first.to_i] = replace
142
+ end
134
143
  end
135
144
  end
136
145
 
@@ -141,30 +150,36 @@ class PDF::Reader
141
150
  elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
142
151
  bfrange_type_two(start, finish, to)
143
152
  else
144
- raise "invalid bfrange section"
153
+ raise MalformedPDFError, "invalid bfrange section"
145
154
  end
146
155
  end
147
156
  end
148
157
 
149
158
  def bfrange_type_one(start_code, end_code, dst)
150
- start_code = str_to_int(start_code)[0]
151
- end_code = str_to_int(end_code)[0]
159
+ start_code = str_to_int(start_code).first
160
+ end_code = str_to_int(end_code).first
152
161
  dst = str_to_int(dst)
153
162
 
163
+ return if start_code.nil? || end_code.nil?
164
+
154
165
  # add all values in the range to our mapping
155
166
  (start_code..end_code).each_with_index do |val, idx|
156
- @map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
167
+ @map[val] = dst.length == 1 ? [dst[0].to_i + idx] : [dst[0].to_i, dst[1].to_i + 1]
157
168
  end
158
169
  end
159
170
 
160
171
  def bfrange_type_two(start_code, end_code, dst)
161
- start_code = str_to_int(start_code)[0]
162
- end_code = str_to_int(end_code)[0]
172
+ start_code = str_to_int(start_code).first
173
+ end_code = str_to_int(end_code).first
174
+
175
+ return if start_code.nil? || end_code.nil?
176
+
163
177
  from_range = (start_code..end_code)
164
178
 
165
179
  # add all values in the range to our mapping
166
180
  from_range.each_with_index do |val, idx|
167
- @map[val] = str_to_int(dst[idx])
181
+ dst_char = dst[idx]
182
+ @map[val.to_i] = str_to_int(dst_char) if dst_char
168
183
  end
169
184
  end
170
185
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -68,7 +69,7 @@ class PDF::Reader
68
69
  #
69
70
  # [25, :A, :B]
70
71
  def differences=(diff)
71
- raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)
72
+ PDF::Reader::Error.validate_type(diff, "diff", Array)
72
73
 
73
74
  @differences = {}
74
75
  byte = 0
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -46,6 +47,21 @@ class PDF::Reader
46
47
  raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
47
48
  end
48
49
  ################################################################################
50
+ def self.validate_type(object, name, klass)
51
+ raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
52
+ end
53
+ ################################################################################
54
+ def self.validate_type_as_malformed(object, name, klass)
55
+ raise MalformedPDFError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
56
+ end
57
+ ################################################################################
58
+ def self.validate_not_nil(object, name)
59
+ raise ArgumentError, "#{object} must not be nil" if object.nil?
60
+ end
61
+ ################################################################################
62
+ def self.validate_not_nil_as_malformed(object, name)
63
+ raise MalformedPDFError, "#{object} must not be nil" if object.nil?
64
+ end
49
65
  end
50
66
 
51
67
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'ascii85'
@@ -7,6 +8,7 @@ class PDF::Reader
7
8
  module Filter # :nodoc:
8
9
  # implementation of the Ascii85 filter
9
10
  class Ascii85
11
+
10
12
  def initialize(options = {})
11
13
  @options = options
12
14
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  module Filter # :nodoc:
7
8
  # implementation of the AsciiHex stream filter
8
9
  class AsciiHex
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -16,9 +18,12 @@ class PDF::Reader
16
18
  def filter(data)
17
19
  data.chop! if data[-1,1] == ">"
18
20
  data = data[1,data.size] if data[0,1] == "<"
21
+
22
+ return "" if data.nil?
23
+
19
24
  data.gsub!(/[^A-Fa-f0-9]/,"")
20
25
  data << "0" if data.size % 2 == 1
21
- data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
26
+ data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
22
27
  rescue Exception => e
23
28
  # Oops, there was a problem decoding the stream
24
29
  raise MalformedPDFError,
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -6,8 +7,9 @@ class PDF::Reader
6
7
  # some filter implementations support preprocessing of the data to
7
8
  # improve compression
8
9
  class Depredict
10
+
9
11
  def initialize(options = {})
10
- @options = options || {}
12
+ @options = options
11
13
  end
12
14
 
13
15
  ################################################################################
@@ -67,7 +69,7 @@ class PDF::Reader
67
69
  scanline_length = (pixel_bytes * @options[:Columns]) + 1
68
70
  row = 0
69
71
  pixels = []
70
- paeth, pa, pb, pc = nil
72
+ paeth, pa, pb, pc = 0, 0, 0, 0
71
73
  until data.empty? do
72
74
  row_data = data.slice! 0, scanline_length
73
75
  filter = row_data.shift
@@ -94,17 +96,17 @@ class PDF::Reader
94
96
  row_data[index] = (byte + ((left + upper)/2).floor) % 256
95
97
  end
96
98
  when 4 # Paeth
97
- left = upper = upper_left = nil
99
+ left = upper = upper_left = 0
98
100
  row_data.each_with_index do |byte, index|
99
101
  col = index / pixel_bytes
100
102
 
101
- left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
103
+ left = index < pixel_bytes ? 0 : Integer(row_data[index - pixel_bytes])
102
104
  if row.zero?
103
105
  upper = upper_left = 0
104
106
  else
105
- upper = pixels[row-1][col][index % pixel_bytes]
107
+ upper = Integer(pixels[row-1][col][index % pixel_bytes])
106
108
  upper_left = col.zero? ? 0 :
107
- pixels[row-1][col-1][index % pixel_bytes]
109
+ Integer(pixels[row-1][col-1][index % pixel_bytes])
108
110
  end
109
111
 
110
112
  p = left + upper - upper_left