pdf-reader 2.6.0 → 2.9.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +30 -1
  3. data/Rakefile +1 -1
  4. data/examples/rspec.rb +1 -0
  5. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  6. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  7. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  8. data/lib/pdf/reader/buffer.rb +36 -33
  9. data/lib/pdf/reader/cid_widths.rb +1 -0
  10. data/lib/pdf/reader/cmap.rb +65 -50
  11. data/lib/pdf/reader/encoding.rb +2 -1
  12. data/lib/pdf/reader/error.rb +16 -0
  13. data/lib/pdf/reader/filter/ascii85.rb +2 -0
  14. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  15. data/lib/pdf/reader/filter/depredict.rb +8 -6
  16. data/lib/pdf/reader/filter/flate.rb +4 -2
  17. data/lib/pdf/reader/filter/lzw.rb +2 -0
  18. data/lib/pdf/reader/filter/null.rb +1 -1
  19. data/lib/pdf/reader/filter/run_length.rb +19 -13
  20. data/lib/pdf/reader/filter.rb +11 -11
  21. data/lib/pdf/reader/font.rb +72 -16
  22. data/lib/pdf/reader/font_descriptor.rb +19 -17
  23. data/lib/pdf/reader/form_xobject.rb +15 -5
  24. data/lib/pdf/reader/glyph_hash.rb +1 -0
  25. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  26. data/lib/pdf/reader/lzw.rb +4 -2
  27. data/lib/pdf/reader/null_security_handler.rb +1 -4
  28. data/lib/pdf/reader/object_cache.rb +1 -0
  29. data/lib/pdf/reader/object_hash.rb +252 -44
  30. data/lib/pdf/reader/object_stream.rb +1 -0
  31. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  32. data/lib/pdf/reader/page.rb +99 -19
  33. data/lib/pdf/reader/page_layout.rb +28 -32
  34. data/lib/pdf/reader/page_state.rb +12 -11
  35. data/lib/pdf/reader/page_text_receiver.rb +57 -10
  36. data/lib/pdf/reader/pages_strategy.rb +1 -0
  37. data/lib/pdf/reader/parser.rb +26 -8
  38. data/lib/pdf/reader/point.rb +25 -0
  39. data/lib/pdf/reader/print_receiver.rb +1 -0
  40. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  41. data/lib/pdf/reader/rectangle.rb +113 -0
  42. data/lib/pdf/reader/reference.rb +1 -0
  43. data/lib/pdf/reader/register_receiver.rb +1 -0
  44. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +16 -9
  45. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  46. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
  47. data/lib/pdf/reader/stream.rb +2 -1
  48. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  49. data/lib/pdf/reader/text_run.rb +14 -6
  50. data/lib/pdf/reader/token.rb +1 -0
  51. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  52. data/lib/pdf/reader/type_check.rb +52 -0
  53. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  54. data/lib/pdf/reader/validating_receiver.rb +262 -0
  55. data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
  56. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  57. data/lib/pdf/reader/width_calculator/true_type.rb +2 -1
  58. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  59. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  60. data/lib/pdf/reader/width_calculator.rb +1 -0
  61. data/lib/pdf/reader/xref.rb +21 -3
  62. data/lib/pdf/reader/zero_width_runs_filter.rb +2 -0
  63. data/lib/pdf/reader.rb +46 -15
  64. data/lib/pdf-reader.rb +1 -0
  65. data/rbi/pdf-reader.rbi +1978 -0
  66. metadata +22 -13
  67. data/lib/pdf/reader/orientation_detector.rb +0 -34
  68. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ccc4d14f5820ca798f6eafa1c0978207759ec1668c6f6307acb7cd43bcd0626e
4
- data.tar.gz: 466bfe0a91f57463a56d9697ccd2529f981c6917e4ed578b4103f2bc87065522
3
+ metadata.gz: 07c734cf3cfc0abf1102f813976d4936d33b57815f114ce92224bbd605fe16a2
4
+ data.tar.gz: f52b1751f83717a7bc96c56e8d830559d387fb430cfa6fa2a78604d98c7476f4
5
5
  SHA512:
6
- metadata.gz: 45d6c16b3d9ed029e6eb5a45cc64aa95e7ada2950e052053cbe0b6f5aae632f824a86f0505a5cee660abd1cd896177a0637a2f2f5a3f3633e829e8d46fb59817
7
- data.tar.gz: e3e566344bd5560387577597dea20b2f7da40aed2a7fa8b8d074c0742486db59d7e349f6c38c91c8dcd9b0a8cf2aa4c19a00d0ee097003449504b3f06f18ca3c
6
+ metadata.gz: 72fda8f6b32c20782adca6cca44d291c7cbe4ac9d858da5ed1c815af2a7d6680e3906cac47a8414923c8db639fd51365d9da8612c1c7f79a674b22448bb35cae
7
+ data.tar.gz: fa79a29d80a36d37e1188769bf7991d5108bbe08b11711a7c9bb1741cedd3682b77afe219a24ae7844fdbf10b23ca3eb5434f4b9418d7002f07fb8edf9dd6e26
data/CHANGELOG CHANGED
@@ -1,6 +1,35 @@
1
+ v2.9.1 (4th February 2022)
2
+ - Fix exception in Page#walk introduced in 2.9.0 (http://github.com/yob/pdf-reader/pull/442)
3
+ - Other small bug fixes
4
+
5
+ v2.9.0 (24th January 2022)
6
+ - Support additional encryption standards (http://github.com/yob/pdf-reader/pull/419)
7
+ - Return CropBox correctly from Page#rectangles (https://github.com/yob/pdf-reader/pull/420)
8
+ - For sorbet users, additional type annotations are included in the gem
9
+
10
+ v2.8.0 (28th December 2021)
11
+ - Add PDF::Reader::Page#runs for extracting text from a page with positioning metadata (http://github.com/yob/pdf-reader/pull/411)
12
+ - Add options to PDF::Reader::Page#text to make some behaviour configurable (http://github.com/yob/pdf-reader/pull/411)
13
+ - including extracting the text for only part of the page
14
+ - Improve text positioning and extraction for Type3 fonts (http://github.com/yob/pdf-reader/pull/412)
15
+ - Skip extracting text that is positioned outside the page (http://github.com/yob/pdf-reader/pull/413)
16
+ - Fix occasional crash when reading some streams (http://github.com/yob/pdf-reader/pull/405)
17
+
18
+ v2.7.0 (13th December 2021)
19
+ - Include RBI type files in the gem
20
+ - Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
21
+ now be type checked by sorbet
22
+ - Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
23
+ - Improved text extraction on some rotated pages, and rotated text on normal pages
24
+ - Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
25
+ - Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
26
+ - Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
27
+ - Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
28
+ - Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
29
+
1
30
  v2.6.0 (12th November 2021)
2
31
  - Text extraction improvements
3
- - Improved text layout on pages with a variery of font sizes (http://github.com/yob/pdf-reader/pull/355)
32
+ - Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
4
33
  - Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
5
34
  - Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
6
35
  - Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
14
14
  Cane::RakeTask.new(:quality) do |cane|
15
15
  cane.abc_max = 20
16
16
  cane.style_measure = 100
17
- cane.max_violations = 32
17
+ cane.max_violations = 28
18
18
 
19
19
  cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
20
20
  end
data/examples/rspec.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # coding: utf-8
3
+ # typed: ignore
3
4
 
4
5
  # Basic RSpec of a generated PDF
5
6
  #
@@ -0,0 +1,41 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ require 'digest/md5'
6
+
7
+ class PDF::Reader
8
+
9
+ # Decrypts data using the AESV2 algorithm defined in the PDF spec. Requires
10
+ # a decryption key, which is usually generated by PDF::Reader::StandardKeyBuilder
11
+ #
12
+ class AesV2SecurityHandler
13
+
14
+ def initialize(key)
15
+ @encrypt_key = key
16
+ end
17
+
18
+ ##7.6.2 General Encryption Algorithm
19
+ #
20
+ # Algorithm 1: Encryption of data using the AES-128-CBC algorithm
21
+ #
22
+ # version == 4 and CFM == AESV2
23
+ #
24
+ # buf - a string to decrypt
25
+ # ref - a PDF::Reader::Reference for the object to decrypt
26
+ #
27
+ def decrypt( buf, ref )
28
+ objKey = @encrypt_key.dup
29
+ (0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
30
+ (0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
31
+ objKey << 'sAlT' # Algorithm 1, b)
32
+ length = objKey.length < 16 ? objKey.length : 16
33
+ cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
34
+ cipher.decrypt
35
+ cipher.key = Digest::MD5.digest(objKey)[0,length]
36
+ cipher.iv = buf[0..15]
37
+ cipher.update(buf[16..-1]) + cipher.final
38
+ end
39
+
40
+ end
41
+ end
@@ -0,0 +1,38 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ require 'digest'
6
+ require 'openssl'
7
+
8
+ class PDF::Reader
9
+
10
+ # Decrypts data using the AESV3 algorithm defined in the PDF 1.7, Extension Level 3 spec.
11
+ # Requires a decryption key, which is usually generated by PDF::Reader::KeyBuilderV5
12
+ #
13
+ class AesV3SecurityHandler
14
+
15
+ def initialize(key)
16
+ @encrypt_key = key
17
+ @cipher = "AES-256-CBC"
18
+ end
19
+
20
+ ##7.6.2 General Encryption Algorithm
21
+ #
22
+ # Algorithm 1: Encryption of data using the RC4 or AES algorithms
23
+ #
24
+ # used to decrypt RC4/AES encrypted PDF streams (buf)
25
+ #
26
+ # buf - a string to decrypt
27
+ # ref - a PDF::Reader::Reference for the object to decrypt
28
+ #
29
+ def decrypt( buf, ref )
30
+ cipher = OpenSSL::Cipher.new(@cipher)
31
+ cipher.decrypt
32
+ cipher.key = @encrypt_key.dup
33
+ cipher.iv = buf[0..15]
34
+ cipher.update(buf[16..-1]) + cipher.final
35
+ end
36
+
37
+ end
38
+ end
@@ -0,0 +1,16 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ class PDF::Reader
6
+
7
+ # Filter out text/characters that are positioned outside a rectangle. Usually the page
8
+ # MediaBox or CropBox, but could be a user specified rectangle too
9
+ class BoundingRectangleRunsFilter
10
+
11
+ def self.runs_within_rect(runs, rect)
12
+ runs.select { |run| rect.contains?(run.origin) }
13
+ end
14
+ end
15
+ end
16
+
@@ -1,4 +1,5 @@
1
1
  # coding: ASCII-8BIT
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -58,6 +59,9 @@ class PDF::Reader
58
59
  # Allow for this here
59
60
  TRAILING_BYTECOUNT = 5000
60
61
 
62
+ # must match whole tokens
63
+ DIGITS_ONLY = %r{\A\d+\z}
64
+
61
65
  attr_reader :pos
62
66
 
63
67
  # Creates a new buffer.
@@ -142,13 +146,20 @@ class PDF::Reader
142
146
  @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
143
147
  data = @io.read(TRAILING_BYTECOUNT)
144
148
 
149
+ raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?
150
+
145
151
  # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
146
152
  lines = data.split(/[\n\r]+/).reverse
147
153
  eof_index = lines.index { |l| l.strip[/^%%EOF/] }
148
154
 
149
155
  raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
150
156
  raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
151
- lines[eof_index+1].to_i
157
+ offset = lines[eof_index+1].to_i
158
+
159
+ # a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
160
+ # corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
161
+ raise MalformedPDFError, "invalid xref offset" if offset < 0
162
+ offset
152
163
  end
153
164
 
154
165
  private
@@ -229,13 +240,12 @@ class PDF::Reader
229
240
  return if @tokens.size < 3
230
241
  return if @tokens[2] != "R"
231
242
 
232
- # must match whole tokens
233
- digits_only = %r{\A\d+\z}
234
- if @tokens[0].match(digits_only) && @tokens[1].match(digits_only)
235
- @tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
236
- @tokens[1] = nil
237
- @tokens[2] = nil
238
- @tokens.compact!
243
+ token_one = @tokens[0]
244
+ token_two = @tokens[1]
245
+ if token_one.is_a?(String) && token_two.is_a?(String) && token_one.match(DIGITS_ONLY) && token_two.match(DIGITS_ONLY)
246
+ @tokens[0] = PDF::Reader::Reference.new(token_one.to_i, token_two.to_i)
247
+ @tokens.delete_at(2)
248
+ @tokens.delete_at(1)
239
249
  end
240
250
  end
241
251
 
@@ -245,7 +255,7 @@ class PDF::Reader
245
255
  # This is to reduce the chance of accidentally matching an embedded EI
246
256
  def prepare_inline_token
247
257
  idstart = @io.pos
248
- chr = prevchr = nil
258
+ prevchr = ''
249
259
  eisize = 0 # how many chars in the end marker
250
260
  seeking = 'E' # what are we looking for now?
251
261
  loop do
@@ -263,11 +273,11 @@ class PDF::Reader
263
273
  end
264
274
  when 'I'
265
275
  if chr == 'I'
266
- seeking = :END
276
+ seeking = ''
267
277
  else
268
278
  seeking = 'E'
269
279
  end
270
- when :END
280
+ when ''
271
281
  if WHITE_SPACE.include? chr
272
282
  eisize += 1 # Drop trailer
273
283
  break
@@ -275,28 +285,28 @@ class PDF::Reader
275
285
  seeking = 'E'
276
286
  end
277
287
  end
278
- prevchr = chr
288
+ prevchr = chr.is_a?(String) ? chr : ''
279
289
  end
280
- unless seeking == :END
290
+ unless seeking == ''
281
291
  raise MalformedPDFError, "EI terminator not found"
282
292
  end
283
293
  eiend = @io.pos
284
294
  @io.seek(idstart, IO::SEEK_SET)
285
295
  str = @io.read(eiend - eisize - idstart) # get the ID content
286
- @tokens << string_token(str)
296
+ @tokens << str.freeze if str
287
297
  end
288
298
 
289
299
  # if we're currently inside a hex string, read hex nibbles until
290
300
  # we find a closing >
291
301
  #
292
302
  def prepare_hex_token
303
+ finished = :false
293
304
  str = "".dup
294
- finished = false
295
305
 
296
- while !finished
306
+ until finished == :true
297
307
  byte = @io.getbyte
298
308
  if byte.nil?
299
- finished = true # unbalanced params
309
+ finished = :true # unbalanced params
300
310
  elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
301
311
  str << byte
302
312
  elsif byte <= 32
@@ -305,7 +315,7 @@ class PDF::Reader
305
315
  @tokens << str if str.size > 0
306
316
  @tokens << ">" if byte != 0x3E # '>'
307
317
  @tokens << byte.chr
308
- finished = true
318
+ finished = :true
309
319
  end
310
320
  end
311
321
  end
@@ -352,14 +362,17 @@ class PDF::Reader
352
362
  def prepare_regular_token
353
363
  tok = "".dup
354
364
 
355
- while byte = @io.getbyte
365
+ loop do
366
+ byte = @io.getbyte
367
+
356
368
  case byte
369
+ when nil
370
+ break
357
371
  when 0x25
358
372
  # comment, ignore everything until the next EOL char
359
- done = false
360
- while !done
361
- byte = @io.getbyte
362
- done = true if byte.nil? || byte == 0x0A || byte == 0x0D
373
+ loop do
374
+ commentbyte = @io.getbyte
375
+ break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
363
376
  end
364
377
  when *TOKEN_WHITESPACE
365
378
  # white space, token finished
@@ -429,15 +442,5 @@ class PDF::Reader
429
442
  byte
430
443
  end
431
444
 
432
- # for a handful of tokens we want to tell the parser how to convert them
433
- # into higher level tokens. This methods adds a to_token() method
434
- # to tokens that should remain as strings.
435
- #
436
- def string_token(token)
437
- def token.to_token
438
- to_s
439
- end
440
- token
441
- end
442
445
  end
443
446
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -32,16 +33,17 @@ class PDF::Reader
32
33
  # extracting various useful information.
33
34
  #
34
35
  class CMap # :nodoc:
36
+
35
37
  CMAP_KEYWORDS = {
36
- "begincodespacerange" => 1,
37
- "endcodespacerange" => 1,
38
- "beginbfchar" => 1,
39
- "endbfchar" => 1,
40
- "beginbfrange" => 1,
41
- "endbfrange" => 1,
42
- "begin" => 1,
43
- "begincmap" => 1,
44
- "def" => 1
38
+ "begincodespacerange" => :noop,
39
+ "endcodespacerange" => :noop,
40
+ "beginbfchar" => :noop,
41
+ "endbfchar" => :noop,
42
+ "beginbfrange" => :noop,
43
+ "endbfrange" => :noop,
44
+ "begin" => :noop,
45
+ "begincmap" => :noop,
46
+ "def" => :noop
45
47
  }
46
48
 
47
49
  attr_reader :map
@@ -51,30 +53,6 @@ class PDF::Reader
51
53
  process_data(data)
52
54
  end
53
55
 
54
- def process_data(data)
55
- parser = build_parser(data)
56
- mode = nil
57
- instructions = []
58
-
59
- while token = parser.parse_token(CMAP_KEYWORDS)
60
- if token == "beginbfchar"
61
- mode = :char
62
- elsif token == "endbfchar"
63
- process_bfchar_instructions(instructions)
64
- instructions = []
65
- mode = nil
66
- elsif token == "beginbfrange"
67
- mode = :range
68
- elsif token == "endbfrange"
69
- process_bfrange_instructions(instructions)
70
- instructions = []
71
- mode = nil
72
- elsif mode == :char || mode == :range
73
- instructions << token
74
- end
75
- end
76
- end
77
-
78
56
  def size
79
57
  @map.size
80
58
  end
@@ -84,13 +62,40 @@ class PDF::Reader
84
62
  # Returns an array of Integers.
85
63
  #
86
64
  def decode(c)
87
- # TODO: implement the conversion
88
- return c unless Integer === c
89
- @map[c]
65
+ @map.fetch(c, [])
90
66
  end
91
67
 
92
68
  private
93
69
 
70
+ def process_data(data, initial_mode = :none)
71
+ parser = build_parser(data)
72
+ mode = initial_mode
73
+ instructions = []
74
+
75
+ while token = parser.parse_token(CMAP_KEYWORDS)
76
+ if token.is_a?(String) || token.is_a?(Array)
77
+ if token == "beginbfchar"
78
+ mode = :char
79
+ elsif token == "endbfchar"
80
+ process_bfchar_instructions(instructions)
81
+ instructions = []
82
+ mode = :none
83
+ elsif token == "beginbfrange"
84
+ mode = :range
85
+ elsif token == "endbfrange"
86
+ process_bfrange_instructions(instructions)
87
+ instructions = []
88
+ mode = :none
89
+ elsif mode == :char
90
+ instructions << token.to_s
91
+ elsif mode == :range
92
+ instructions << token
93
+ end
94
+ end
95
+ end
96
+ end
97
+
98
+
94
99
  def build_parser(instructions)
95
100
  buffer = Buffer.new(StringIO.new(instructions))
96
101
  Parser.new(buffer)
@@ -105,7 +110,6 @@ class PDF::Reader
105
110
  # exception when we try converting broken UTF-16 to UTF-8
106
111
  #
107
112
  def str_to_int(str)
108
- return nil if str.nil? || str.size == 0
109
113
  unpacked_string = if str.bytesize == 1 # UTF-8
110
114
  str.unpack("C*")
111
115
  else # UTF-16
@@ -113,12 +117,15 @@ class PDF::Reader
113
117
  end
114
118
  result = []
115
119
  while unpacked_string.any? do
116
- if unpacked_string.size >= 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
120
+ if unpacked_string.size >= 2 &&
121
+ unpacked_string.first.to_i > 0xD800 &&
122
+ unpacked_string.first.to_i < 0xDBFF
117
123
  # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
118
124
  # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
119
125
  # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
120
- points = [unpacked_string.shift, unpacked_string.shift]
121
- result << (points[0] - 0xD800) * 0x400 + (points[1] - 0xDC00) + 0x10000
126
+ point_one = unpacked_string.shift.to_i
127
+ point_two = unpacked_string.shift.to_i
128
+ result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
122
129
  else
123
130
  result << unpacked_string.shift
124
131
  end
@@ -128,9 +135,11 @@ class PDF::Reader
128
135
 
129
136
  def process_bfchar_instructions(instructions)
130
137
  instructions.each_slice(2) do |one, two|
131
- find = str_to_int(one)
132
- replace = str_to_int(two)
133
- @map[find.first] = replace
138
+ find = str_to_int(one.to_s)
139
+ replace = str_to_int(two.to_s)
140
+ if find.any? && replace.any?
141
+ @map[find.first.to_i] = replace
142
+ end
134
143
  end
135
144
  end
136
145
 
@@ -141,30 +150,36 @@ class PDF::Reader
141
150
  elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
142
151
  bfrange_type_two(start, finish, to)
143
152
  else
144
- raise "invalid bfrange section"
153
+ raise MalformedPDFError, "invalid bfrange section"
145
154
  end
146
155
  end
147
156
  end
148
157
 
149
158
  def bfrange_type_one(start_code, end_code, dst)
150
- start_code = str_to_int(start_code)[0]
151
- end_code = str_to_int(end_code)[0]
159
+ start_code = str_to_int(start_code).first
160
+ end_code = str_to_int(end_code).first
152
161
  dst = str_to_int(dst)
153
162
 
163
+ return if start_code.nil? || end_code.nil?
164
+
154
165
  # add all values in the range to our mapping
155
166
  (start_code..end_code).each_with_index do |val, idx|
156
- @map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
167
+ @map[val] = dst.length == 1 ? [dst[0].to_i + idx] : [dst[0].to_i, dst[1].to_i + 1]
157
168
  end
158
169
  end
159
170
 
160
171
  def bfrange_type_two(start_code, end_code, dst)
161
- start_code = str_to_int(start_code)[0]
162
- end_code = str_to_int(end_code)[0]
172
+ start_code = str_to_int(start_code).first
173
+ end_code = str_to_int(end_code).first
174
+
175
+ return if start_code.nil? || end_code.nil?
176
+
163
177
  from_range = (start_code..end_code)
164
178
 
165
179
  # add all values in the range to our mapping
166
180
  from_range.each_with_index do |val, idx|
167
- @map[val] = str_to_int(dst[idx])
181
+ dst_char = dst[idx]
182
+ @map[val.to_i] = str_to_int(dst_char) if dst_char
168
183
  end
169
184
  end
170
185
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -68,7 +69,7 @@ class PDF::Reader
68
69
  #
69
70
  # [25, :A, :B]
70
71
  def differences=(diff)
71
- raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)
72
+ PDF::Reader::Error.validate_type(diff, "diff", Array)
72
73
 
73
74
  @differences = {}
74
75
  byte = 0
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -46,6 +47,21 @@ class PDF::Reader
46
47
  raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
47
48
  end
48
49
  ################################################################################
50
+ def self.validate_type(object, name, klass)
51
+ raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
52
+ end
53
+ ################################################################################
54
+ def self.validate_type_as_malformed(object, name, klass)
55
+ raise MalformedPDFError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
56
+ end
57
+ ################################################################################
58
+ def self.validate_not_nil(object, name)
59
+ raise ArgumentError, "#{object} must not be nil" if object.nil?
60
+ end
61
+ ################################################################################
62
+ def self.validate_not_nil_as_malformed(object, name)
63
+ raise MalformedPDFError, "#{object} must not be nil" if object.nil?
64
+ end
49
65
  end
50
66
 
51
67
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'ascii85'
@@ -7,6 +8,7 @@ class PDF::Reader
7
8
  module Filter # :nodoc:
8
9
  # implementation of the Ascii85 filter
9
10
  class Ascii85
11
+
10
12
  def initialize(options = {})
11
13
  @options = options
12
14
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  module Filter # :nodoc:
7
8
  # implementation of the AsciiHex stream filter
8
9
  class AsciiHex
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -16,9 +18,12 @@ class PDF::Reader
16
18
  def filter(data)
17
19
  data.chop! if data[-1,1] == ">"
18
20
  data = data[1,data.size] if data[0,1] == "<"
21
+
22
+ return "" if data.nil?
23
+
19
24
  data.gsub!(/[^A-Fa-f0-9]/,"")
20
25
  data << "0" if data.size % 2 == 1
21
- data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
26
+ data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
22
27
  rescue Exception => e
23
28
  # Oops, there was a problem decoding the stream
24
29
  raise MalformedPDFError,
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -6,8 +7,9 @@ class PDF::Reader
6
7
  # some filter implementations support preprocessing of the data to
7
8
  # improve compression
8
9
  class Depredict
10
+
9
11
  def initialize(options = {})
10
- @options = options || {}
12
+ @options = options
11
13
  end
12
14
 
13
15
  ################################################################################
@@ -67,7 +69,7 @@ class PDF::Reader
67
69
  scanline_length = (pixel_bytes * @options[:Columns]) + 1
68
70
  row = 0
69
71
  pixels = []
70
- paeth, pa, pb, pc = nil
72
+ paeth, pa, pb, pc = 0, 0, 0, 0
71
73
  until data.empty? do
72
74
  row_data = data.slice! 0, scanline_length
73
75
  filter = row_data.shift
@@ -94,17 +96,17 @@ class PDF::Reader
94
96
  row_data[index] = (byte + ((left + upper)/2).floor) % 256
95
97
  end
96
98
  when 4 # Paeth
97
- left = upper = upper_left = nil
99
+ left = upper = upper_left = 0
98
100
  row_data.each_with_index do |byte, index|
99
101
  col = index / pixel_bytes
100
102
 
101
- left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
103
+ left = index < pixel_bytes ? 0 : Integer(row_data[index - pixel_bytes])
102
104
  if row.zero?
103
105
  upper = upper_left = 0
104
106
  else
105
- upper = pixels[row-1][col][index % pixel_bytes]
107
+ upper = Integer(pixels[row-1][col][index % pixel_bytes])
106
108
  upper_left = col.zero? ? 0 :
107
- pixels[row-1][col-1][index % pixel_bytes]
109
+ Integer(pixels[row-1][col-1][index % pixel_bytes])
108
110
  end
109
111
 
110
112
  p = left + upper - upper_left