pdf-reader 2.2.0 → 2.11.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (90) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +90 -0
  3. data/README.md +18 -3
  4. data/Rakefile +1 -1
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_text +1 -1
  7. data/examples/extract_fonts.rb +12 -7
  8. data/examples/rspec.rb +1 -0
  9. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  10. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  14. data/lib/pdf/reader/afm/Courier.afm +342 -342
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  26. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  27. data/lib/pdf/reader/buffer.rb +91 -47
  28. data/lib/pdf/reader/cid_widths.rb +7 -4
  29. data/lib/pdf/reader/cmap.rb +83 -59
  30. data/lib/pdf/reader/encoding.rb +17 -14
  31. data/lib/pdf/reader/error.rb +15 -3
  32. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  33. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  34. data/lib/pdf/reader/filter/depredict.rb +12 -10
  35. data/lib/pdf/reader/filter/flate.rb +30 -16
  36. data/lib/pdf/reader/filter/lzw.rb +2 -0
  37. data/lib/pdf/reader/filter/null.rb +1 -1
  38. data/lib/pdf/reader/filter/run_length.rb +19 -13
  39. data/lib/pdf/reader/filter.rb +11 -11
  40. data/lib/pdf/reader/font.rb +89 -26
  41. data/lib/pdf/reader/font_descriptor.rb +22 -18
  42. data/lib/pdf/reader/form_xobject.rb +18 -5
  43. data/lib/pdf/reader/glyph_hash.rb +28 -13
  44. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  45. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  46. data/lib/pdf/reader/lzw.rb +28 -11
  47. data/lib/pdf/reader/no_text_filter.rb +14 -0
  48. data/lib/pdf/reader/null_security_handler.rb +1 -4
  49. data/lib/pdf/reader/object_cache.rb +1 -0
  50. data/lib/pdf/reader/object_hash.rb +292 -63
  51. data/lib/pdf/reader/object_stream.rb +3 -2
  52. data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
  53. data/lib/pdf/reader/page.rb +143 -16
  54. data/lib/pdf/reader/page_layout.rb +43 -39
  55. data/lib/pdf/reader/page_state.rb +26 -17
  56. data/lib/pdf/reader/page_text_receiver.rb +74 -4
  57. data/lib/pdf/reader/pages_strategy.rb +1 -0
  58. data/lib/pdf/reader/parser.rb +34 -14
  59. data/lib/pdf/reader/point.rb +25 -0
  60. data/lib/pdf/reader/print_receiver.rb +1 -0
  61. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  62. data/lib/pdf/reader/rectangle.rb +113 -0
  63. data/lib/pdf/reader/reference.rb +3 -1
  64. data/lib/pdf/reader/register_receiver.rb +1 -0
  65. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
  66. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  67. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
  68. data/lib/pdf/reader/stream.rb +3 -2
  69. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  70. data/lib/pdf/reader/text_run.rb +40 -5
  71. data/lib/pdf/reader/token.rb +1 -0
  72. data/lib/pdf/reader/transformation_matrix.rb +8 -7
  73. data/lib/pdf/reader/type_check.rb +98 -0
  74. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  75. data/lib/pdf/reader/validating_receiver.rb +262 -0
  76. data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
  77. data/lib/pdf/reader/width_calculator/composite.rb +6 -1
  78. data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
  79. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
  80. data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
  81. data/lib/pdf/reader/width_calculator.rb +1 -0
  82. data/lib/pdf/reader/xref.rb +37 -11
  83. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  84. data/lib/pdf/reader.rb +49 -24
  85. data/lib/pdf-reader.rb +1 -0
  86. data/rbi/pdf-reader.rbi +2048 -0
  87. metadata +39 -23
  88. data/lib/pdf/hash.rb +0 -20
  89. data/lib/pdf/reader/orientation_detector.rb +0 -34
  90. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
@@ -1,4 +1,5 @@
1
1
  # coding: ASCII-8BIT
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -48,6 +49,18 @@ class PDF::Reader
48
49
  ID = "ID"
49
50
  FWD_SLASH = "/"
50
51
  NULL_BYTE = "\x00"
52
+ CR = "\r"
53
+ LF = "\n"
54
+ CRLF = "\r\n"
55
+ WHITE_SPACE = [LF, CR, ' ']
56
+
57
+ # Quite a few PDFs have trailing junk.
58
+ # This can be several kilobytes of NUL bytes in some cases
59
+ # Allow for this here
60
+ TRAILING_BYTECOUNT = 5000
61
+
62
+ # must match whole tokens
63
+ DIGITS_ONLY = %r{\A\d+\z}
51
64
 
52
65
  attr_reader :pos
53
66
 
@@ -55,7 +68,7 @@ class PDF::Reader
55
68
  #
56
69
  # Params:
57
70
  #
58
- # io - an IO stream or string with the raw data to tokenise
71
+ # io - an IO stream (usually a StringIO) with the raw data to tokenise
59
72
  #
60
73
  # options:
61
74
  #
@@ -86,9 +99,12 @@ class PDF::Reader
86
99
  #
87
100
  # options:
88
101
  #
89
- # :skip_eol - if true, the IO stream is advanced past a CRLF or LF that
90
- # is sitting under the io cursor.
91
- #
102
+ # :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
103
+ # that is sitting under the io cursor.
104
+ # Note:
105
+ # Skipping a bare CR is not spec-compliant.
106
+ # This is because the data may start with LF.
107
+ # However we check for CRLF first, so the ambiguity is avoided.
92
108
  def read(bytes, opts = {})
93
109
  reset_pos
94
110
 
@@ -97,9 +113,9 @@ class PDF::Reader
97
113
  str = @io.read(2)
98
114
  if str.nil?
99
115
  return nil
100
- elsif str == "\r\n"
116
+ elsif str == CRLF # This MUST be done before checking for CR alone
101
117
  # do nothing
102
- elsif str[0,1] == "\n"
118
+ elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
103
119
  @io.seek(-1, IO::SEEK_CUR)
104
120
  else
105
121
  @io.seek(-2, IO::SEEK_CUR)
@@ -127,8 +143,10 @@ class PDF::Reader
127
143
  #
128
144
  def find_first_xref_offset
129
145
  check_size_is_non_zero
130
- @io.seek(-1024, IO::SEEK_END) rescue @io.seek(0)
131
- data = @io.read(1024)
146
+ @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
147
+ data = @io.read(TRAILING_BYTECOUNT)
148
+
149
+ raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?
132
150
 
133
151
  # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
134
152
  lines = data.split(/[\n\r]+/).reverse
@@ -136,7 +154,12 @@ class PDF::Reader
136
154
 
137
155
  raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
138
156
  raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
139
- lines[eof_index+1].to_i
157
+ offset = lines[eof_index+1].to_i
158
+
159
+ # a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
160
+ # corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
161
+ raise MalformedPDFError, "invalid xref offset" if offset < 0
162
+ offset
140
163
  end
141
164
 
142
165
  private
@@ -217,45 +240,73 @@ class PDF::Reader
217
240
  return if @tokens.size < 3
218
241
  return if @tokens[2] != "R"
219
242
 
220
- if @tokens[0].match(/\d+/) && @tokens[1].match(/\d+/)
221
- @tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
222
- @tokens[1] = nil
223
- @tokens[2] = nil
224
- @tokens.compact!
243
+ token_one = @tokens[0]
244
+ token_two = @tokens[1]
245
+ if token_one.is_a?(String) && token_two.is_a?(String) && token_one.match(DIGITS_ONLY) && token_two.match(DIGITS_ONLY)
246
+ @tokens[0] = PDF::Reader::Reference.new(token_one.to_i, token_two.to_i)
247
+ @tokens.delete_at(2)
248
+ @tokens.delete_at(1)
225
249
  end
226
250
  end
227
251
 
252
+ # Extract data between ID and EI
253
+ # If the EI follows white-space the space is dropped from the data
254
+ # The EI must followed by white-space or end of buffer
255
+ # This is to reduce the chance of accidentally matching an embedded EI
228
256
  def prepare_inline_token
229
- str = "".dup
230
-
231
- buffer = []
232
-
233
- until buffer[0] =~ /\s|\0/ && buffer[1, 2] == ["E", "I"]
257
+ idstart = @io.pos
258
+ prevchr = ''
259
+ eisize = 0 # how many chars in the end marker
260
+ seeking = 'E' # what are we looking for now?
261
+ loop do
234
262
  chr = @io.read(1)
235
- buffer << chr
236
-
237
- if buffer.length > 3
238
- str << buffer.shift
263
+ break if chr.nil?
264
+ case seeking
265
+ when 'E'
266
+ if chr == 'E'
267
+ seeking = 'I'
268
+ if WHITE_SPACE.include? prevchr
269
+ eisize = 3 # include whitespace in delimiter, i.e. drop from data
270
+ else # assume the EI immediately follows the data
271
+ eisize = 2 # leave prevchr in data
272
+ end
273
+ end
274
+ when 'I'
275
+ if chr == 'I'
276
+ seeking = ''
277
+ else
278
+ seeking = 'E'
279
+ end
280
+ when ''
281
+ if WHITE_SPACE.include? chr
282
+ eisize += 1 # Drop trailer
283
+ break
284
+ else
285
+ seeking = 'E'
286
+ end
239
287
  end
288
+ prevchr = chr.is_a?(String) ? chr : ''
240
289
  end
241
-
242
- str << NULL_BYTE if buffer.first == NULL_BYTE
243
-
244
- @tokens << string_token(str)
245
- @io.seek(-3, IO::SEEK_CUR) unless chr.nil?
290
+ unless seeking == ''
291
+ raise MalformedPDFError, "EI terminator not found"
292
+ end
293
+ eiend = @io.pos
294
+ @io.seek(idstart, IO::SEEK_SET)
295
+ str = @io.read(eiend - eisize - idstart) # get the ID content
296
+ @tokens << str.freeze if str
246
297
  end
247
298
 
248
299
  # if we're currently inside a hex string, read hex nibbles until
249
300
  # we find a closing >
250
301
  #
251
302
  def prepare_hex_token
303
+ finished = :false
252
304
  str = "".dup
253
- finished = false
254
305
 
255
- while !finished
306
+ until finished == :true
256
307
  byte = @io.getbyte
257
308
  if byte.nil?
258
- finished = true # unbalanced params
309
+ finished = :true # unbalanced params
259
310
  elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
260
311
  str << byte
261
312
  elsif byte <= 32
@@ -264,7 +315,7 @@ class PDF::Reader
264
315
  @tokens << str if str.size > 0
265
316
  @tokens << ">" if byte != 0x3E # '>'
266
317
  @tokens << byte.chr
267
- finished = true
318
+ finished = :true
268
319
  end
269
320
  end
270
321
  end
@@ -311,14 +362,17 @@ class PDF::Reader
311
362
  def prepare_regular_token
312
363
  tok = "".dup
313
364
 
314
- while byte = @io.getbyte
365
+ loop do
366
+ byte = @io.getbyte
367
+
315
368
  case byte
369
+ when nil
370
+ break
316
371
  when 0x25
317
372
  # comment, ignore everything until the next EOL char
318
- done = false
319
- while !done
320
- byte = @io.getbyte
321
- done = true if byte.nil? || byte == 0x0A || byte == 0x0D
373
+ loop do
374
+ commentbyte = @io.getbyte
375
+ break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
322
376
  end
323
377
  when *TOKEN_WHITESPACE
324
378
  # white space, token finished
@@ -388,15 +442,5 @@ class PDF::Reader
388
442
  byte
389
443
  end
390
444
 
391
- # for a handful of tokens we want to tell the parser how to convert them
392
- # into higher level tokens. This methods adds a to_token() method
393
- # to tokens that should remain as strings.
394
- #
395
- def string_token(token)
396
- def token.to_token
397
- to_s
398
- end
399
- token
400
- end
401
445
  end
402
446
  end
@@ -1,8 +1,7 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
- #
5
-
6
5
  require 'forwardable'
7
6
 
8
7
  class PDF::Reader
@@ -32,10 +31,10 @@ class PDF::Reader
32
31
  params << array.shift
33
32
 
34
33
  if params.size == 2 && params.last.is_a?(Array)
35
- widths.merge! parse_first_form(params.first, params.last)
34
+ widths.merge! parse_first_form(params.first.to_i, Array(params.last))
36
35
  params = []
37
36
  elsif params.size == 3
38
- widths.merge! parse_second_form(params[0], params[1], params[2])
37
+ widths.merge! parse_second_form(params[0].to_i, params[1].to_i, params[2].to_i)
39
38
  params = []
40
39
  end
41
40
  end
@@ -53,6 +52,10 @@ class PDF::Reader
53
52
 
54
53
  # this is the form 10 20 123 where all indexes between 10 and 20 have width 123
55
54
  def parse_second_form(first, final, width)
55
+ if first > final
56
+ raise MalformedPDFError, "CidWidths: #{first} must be less than #{final}"
57
+ end
58
+
56
59
  (first..final).inject({}) { |accum, index|
57
60
  accum[index] = width
58
61
  accum
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -32,16 +33,17 @@ class PDF::Reader
32
33
  # extracting various useful information.
33
34
  #
34
35
  class CMap # :nodoc:
36
+
35
37
  CMAP_KEYWORDS = {
36
- "begincodespacerange" => 1,
37
- "endcodespacerange" => 1,
38
- "beginbfchar" => 1,
39
- "endbfchar" => 1,
40
- "beginbfrange" => 1,
41
- "endbfrange" => 1,
42
- "begin" => 1,
43
- "begincmap" => 1,
44
- "def" => 1
38
+ "begincodespacerange" => :noop,
39
+ "endcodespacerange" => :noop,
40
+ "beginbfchar" => :noop,
41
+ "endbfchar" => :noop,
42
+ "beginbfrange" => :noop,
43
+ "endbfrange" => :noop,
44
+ "begin" => :noop,
45
+ "begincmap" => :noop,
46
+ "def" => :noop
45
47
  }
46
48
 
47
49
  attr_reader :map
@@ -51,30 +53,6 @@ class PDF::Reader
51
53
  process_data(data)
52
54
  end
53
55
 
54
- def process_data(data)
55
- parser = build_parser(data)
56
- mode = nil
57
- instructions = []
58
-
59
- while token = parser.parse_token(CMAP_KEYWORDS)
60
- if token == "beginbfchar"
61
- mode = :char
62
- elsif token == "endbfchar"
63
- process_bfchar_instructions(instructions)
64
- instructions = []
65
- mode = nil
66
- elsif token == "beginbfrange"
67
- mode = :range
68
- elsif token == "endbfrange"
69
- process_bfrange_instructions(instructions)
70
- instructions = []
71
- mode = nil
72
- elsif mode == :char || mode == :range
73
- instructions << token
74
- end
75
- end
76
- end
77
-
78
56
  def size
79
57
  @map.size
80
58
  end
@@ -84,44 +62,84 @@ class PDF::Reader
84
62
  # Returns an array of Integers.
85
63
  #
86
64
  def decode(c)
87
- # TODO: implement the conversion
88
- return c unless Integer === c
89
- @map[c]
65
+ @map.fetch(c, [])
90
66
  end
91
67
 
92
68
  private
93
69
 
70
+ def process_data(data, initial_mode = :none)
71
+ parser = build_parser(data)
72
+ mode = initial_mode
73
+ instructions = []
74
+
75
+ while token = parser.parse_token(CMAP_KEYWORDS)
76
+ if token.is_a?(String) || token.is_a?(Array)
77
+ if token == "beginbfchar"
78
+ mode = :char
79
+ elsif token == "endbfchar"
80
+ process_bfchar_instructions(instructions)
81
+ instructions = []
82
+ mode = :none
83
+ elsif token == "beginbfrange"
84
+ mode = :range
85
+ elsif token == "endbfrange"
86
+ process_bfrange_instructions(instructions)
87
+ instructions = []
88
+ mode = :none
89
+ elsif mode == :char
90
+ instructions << token.to_s
91
+ elsif mode == :range
92
+ instructions << token
93
+ end
94
+ end
95
+ end
96
+ end
97
+
98
+
94
99
  def build_parser(instructions)
95
100
  buffer = Buffer.new(StringIO.new(instructions))
96
101
  Parser.new(buffer)
97
102
  end
98
103
 
104
+ # The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
105
+ # theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
106
+ #
107
+ # str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
108
+ #
109
+ # However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
110
+ # exception when we try converting broken UTF-16 to UTF-8
111
+ #
99
112
  def str_to_int(str)
100
- return nil if str.nil? || str.size == 0
101
- unpacked_string = if str.size == 1 # UTF-8
113
+ unpacked_string = if str.bytesize == 1 # UTF-8
102
114
  str.unpack("C*")
103
115
  else # UTF-16
104
116
  str.unpack("n*")
105
117
  end
106
- if unpacked_string.size == 1
107
- unpacked_string
108
- elsif unpacked_string.size == 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
109
- # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
110
- # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
111
- # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
112
- [(unpacked_string[0] - 0xD800) * 0x400 + (unpacked_string[1] - 0xDC00) + 0x10000]
113
- else
114
- # it is a bad idea to just return the first 16 bits, as this doesn't allow
115
- # for ligatures for example fi (U+0066 U+0069)
116
- unpacked_string
118
+ result = []
119
+ while unpacked_string.any? do
120
+ if unpacked_string.size >= 2 &&
121
+ unpacked_string.first.to_i > 0xD800 &&
122
+ unpacked_string.first.to_i < 0xDBFF
123
+ # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
124
+ # let's convert to a UTF-32 codepoint. (the high surrogate is between 0xD800-0xDBFF, the
125
+ # low surrogate is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
126
+ point_one = unpacked_string.shift.to_i
127
+ point_two = unpacked_string.shift.to_i
128
+ result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
129
+ else
130
+ result << unpacked_string.shift
131
+ end
117
132
  end
133
+ result
118
134
  end
119
135
 
120
136
  def process_bfchar_instructions(instructions)
121
137
  instructions.each_slice(2) do |one, two|
122
- find = str_to_int(one)
123
- replace = str_to_int(two)
124
- @map[find.first] = replace
138
+ find = str_to_int(one.to_s)
139
+ replace = str_to_int(two.to_s)
140
+ if find.any? && replace.any?
141
+ @map[find.first.to_i] = replace
142
+ end
125
143
  end
126
144
  end
127
145
 
@@ -132,30 +150,36 @@ class PDF::Reader
132
150
  elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
133
151
  bfrange_type_two(start, finish, to)
134
152
  else
135
- raise "invalid bfrange section"
153
+ raise MalformedPDFError, "invalid bfrange section"
136
154
  end
137
155
  end
138
156
  end
139
157
 
140
158
  def bfrange_type_one(start_code, end_code, dst)
141
- start_code = str_to_int(start_code)[0]
142
- end_code = str_to_int(end_code)[0]
159
+ start_code = str_to_int(start_code).first
160
+ end_code = str_to_int(end_code).first
143
161
  dst = str_to_int(dst)
144
162
 
163
+ return if start_code.nil? || end_code.nil?
164
+
145
165
  # add all values in the range to our mapping
146
166
  (start_code..end_code).each_with_index do |val, idx|
147
- @map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
167
+ @map[val] = dst.length == 1 ? [dst[0].to_i + idx] : [dst[0].to_i, dst[1].to_i + 1]
148
168
  end
149
169
  end
150
170
 
151
171
  def bfrange_type_two(start_code, end_code, dst)
152
- start_code = str_to_int(start_code)[0]
153
- end_code = str_to_int(end_code)[0]
172
+ start_code = str_to_int(start_code).first
173
+ end_code = str_to_int(end_code).first
174
+
175
+ return if start_code.nil? || end_code.nil?
176
+
154
177
  from_range = (start_code..end_code)
155
178
 
156
179
  # add all values in the range to our mapping
157
180
  from_range.each_with_index do |val, idx|
158
- @map[val] = str_to_int(dst[idx])
181
+ dst_char = dst[idx]
182
+ @map[val.to_i] = str_to_int(dst_char) if dst_char
159
183
  end
160
184
  end
161
185
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -40,20 +41,22 @@ class PDF::Reader
40
41
  @mapping = default_mapping # maps from character codes to Unicode codepoints
41
42
  @string_cache = {} # maps from character codes to UTF-8 strings.
42
43
 
43
- if enc.kind_of?(Hash)
44
- self.differences = enc[:Differences] if enc[:Differences]
45
- enc = enc[:Encoding] || enc[:BaseEncoding]
46
- elsif enc != nil
47
- enc = enc.to_sym
44
+ @enc_name = if enc.kind_of?(Hash)
45
+ enc[:Encoding] || enc[:BaseEncoding]
46
+ elsif enc && enc.respond_to?(:to_sym)
47
+ enc.to_sym
48
48
  else
49
- enc = nil
49
+ :StandardEncoding
50
50
  end
51
51
 
52
- @enc_name = enc
53
- @unpack = get_unpack(enc)
54
- @map_file = get_mapping_file(enc)
52
+ @unpack = get_unpack(@enc_name)
53
+ @map_file = get_mapping_file(@enc_name)
55
54
 
56
55
  load_mapping(@map_file) if @map_file
56
+
57
+ if enc.is_a?(Hash) && enc[:Differences]
58
+ self.differences = enc[:Differences]
59
+ end
57
60
  end
58
61
 
59
62
  # set the differences table for this encoding. should be an array in the following format:
@@ -66,16 +69,16 @@ class PDF::Reader
66
69
  #
67
70
  # [25, :A, :B]
68
71
  def differences=(diff)
69
- raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)
72
+ PDF::Reader::Error.validate_type(diff, "diff", Array)
70
73
 
71
74
  @differences = {}
72
75
  byte = 0
73
76
  diff.each do |val|
74
77
  if val.kind_of?(Numeric)
75
78
  byte = val.to_i
76
- else
79
+ elsif codepoint = glyphlist.name_to_unicode(val)
77
80
  @differences[byte] = val
78
- @mapping[byte] = glyphlist.name_to_unicode(val)
81
+ @mapping[byte] = codepoint
79
82
  byte += 1
80
83
  end
81
84
  end
@@ -164,7 +167,7 @@ class PDF::Reader
164
167
  end
165
168
 
166
169
  def convert_to_utf8(str)
167
- ret = str.unpack(unpack).map! { |c| @mapping[c] || c }.pack("U*")
170
+ ret = str.unpack(unpack).map! { |c| @mapping[c.to_i] || c }.pack("U*")
168
171
  ret.force_encoding("UTF-8")
169
172
  ret
170
173
  end
@@ -206,7 +209,7 @@ class PDF::Reader
206
209
  def load_mapping(file)
207
210
  File.open(file, "r:BINARY") do |f|
208
211
  f.each do |l|
209
- _m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
212
+ _m, single_byte, unicode = *l.match(/\A([0-9A-Za-z]+);([0-9A-F]{4})/)
210
213
  @mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
211
214
  end
212
215
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -33,19 +34,30 @@ class PDF::Reader
33
34
  def self.str_assert(lvalue, rvalue, chars=nil)
34
35
  raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
35
36
  lvalue = lvalue[0,chars] if chars
36
- raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue != rvalue
37
+ raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
37
38
  end
38
39
  ################################################################################
39
40
  def self.str_assert_not(lvalue, rvalue, chars=nil)
40
41
  raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
41
42
  lvalue = lvalue[0,chars] if chars
42
- raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue == rvalue
43
+ raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue == rvalue
43
44
  end
44
45
  ################################################################################
45
46
  def self.assert_equal(lvalue, rvalue)
46
- raise MalformedPDFError, "PDF malformed, expected #{rvalue} but found #{lvalue} instead" if lvalue != rvalue
47
+ raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
47
48
  end
48
49
  ################################################################################
50
+ def self.validate_type(object, name, klass)
51
+ raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
52
+ end
53
+ ################################################################################
54
+ def self.validate_type_as_malformed(object, name, klass)
55
+ raise MalformedPDFError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
56
+ end
57
+ ################################################################################
58
+ def self.validate_not_nil(object, name)
59
+ raise ArgumentError, "#{object} must not be nil" if object.nil?
60
+ end
49
61
  end
50
62
 
51
63
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'ascii85'
@@ -7,6 +8,7 @@ class PDF::Reader
7
8
  module Filter # :nodoc:
8
9
  # implementation of the Ascii85 filter
9
10
  class Ascii85
11
+
10
12
  def initialize(options = {})
11
13
  @options = options
12
14
  end
@@ -17,7 +19,11 @@ class PDF::Reader
17
19
  #
18
20
  def filter(data)
19
21
  data = "<~#{data}" unless data.to_s[0,2] == "<~"
20
- ::Ascii85::decode(data)
22
+ if defined?(::Ascii85Native)
23
+ ::Ascii85Native::decode(data)
24
+ else
25
+ ::Ascii85::decode(data)
26
+ end
21
27
  rescue Exception => e
22
28
  # Oops, there was a problem decoding the stream
23
29
  raise MalformedPDFError,
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  module Filter # :nodoc:
7
8
  # implementation of the AsciiHex stream filter
8
9
  class AsciiHex
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -16,9 +18,12 @@ class PDF::Reader
16
18
  def filter(data)
17
19
  data.chop! if data[-1,1] == ">"
18
20
  data = data[1,data.size] if data[0,1] == "<"
21
+
22
+ return "" if data.nil?
23
+
19
24
  data.gsub!(/[^A-Fa-f0-9]/,"")
20
25
  data << "0" if data.size % 2 == 1
21
- data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
26
+ data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
22
27
  rescue Exception => e
23
28
  # Oops, there was a problem decoding the stream
24
29
  raise MalformedPDFError,