pdf-reader 2.2.0 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +90 -0
  3. data/README.md +18 -3
  4. data/Rakefile +1 -1
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_text +1 -1
  7. data/examples/extract_fonts.rb +12 -7
  8. data/examples/rspec.rb +1 -0
  9. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  10. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  14. data/lib/pdf/reader/afm/Courier.afm +342 -342
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  26. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  27. data/lib/pdf/reader/buffer.rb +91 -47
  28. data/lib/pdf/reader/cid_widths.rb +7 -4
  29. data/lib/pdf/reader/cmap.rb +83 -59
  30. data/lib/pdf/reader/encoding.rb +17 -14
  31. data/lib/pdf/reader/error.rb +15 -3
  32. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  33. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  34. data/lib/pdf/reader/filter/depredict.rb +12 -10
  35. data/lib/pdf/reader/filter/flate.rb +30 -16
  36. data/lib/pdf/reader/filter/lzw.rb +2 -0
  37. data/lib/pdf/reader/filter/null.rb +1 -1
  38. data/lib/pdf/reader/filter/run_length.rb +19 -13
  39. data/lib/pdf/reader/filter.rb +11 -11
  40. data/lib/pdf/reader/font.rb +89 -26
  41. data/lib/pdf/reader/font_descriptor.rb +22 -18
  42. data/lib/pdf/reader/form_xobject.rb +18 -5
  43. data/lib/pdf/reader/glyph_hash.rb +28 -13
  44. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  45. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  46. data/lib/pdf/reader/lzw.rb +28 -11
  47. data/lib/pdf/reader/no_text_filter.rb +14 -0
  48. data/lib/pdf/reader/null_security_handler.rb +1 -4
  49. data/lib/pdf/reader/object_cache.rb +1 -0
  50. data/lib/pdf/reader/object_hash.rb +292 -63
  51. data/lib/pdf/reader/object_stream.rb +3 -2
  52. data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
  53. data/lib/pdf/reader/page.rb +143 -16
  54. data/lib/pdf/reader/page_layout.rb +43 -39
  55. data/lib/pdf/reader/page_state.rb +26 -17
  56. data/lib/pdf/reader/page_text_receiver.rb +74 -4
  57. data/lib/pdf/reader/pages_strategy.rb +1 -0
  58. data/lib/pdf/reader/parser.rb +34 -14
  59. data/lib/pdf/reader/point.rb +25 -0
  60. data/lib/pdf/reader/print_receiver.rb +1 -0
  61. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  62. data/lib/pdf/reader/rectangle.rb +113 -0
  63. data/lib/pdf/reader/reference.rb +3 -1
  64. data/lib/pdf/reader/register_receiver.rb +1 -0
  65. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
  66. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  67. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
  68. data/lib/pdf/reader/stream.rb +3 -2
  69. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  70. data/lib/pdf/reader/text_run.rb +40 -5
  71. data/lib/pdf/reader/token.rb +1 -0
  72. data/lib/pdf/reader/transformation_matrix.rb +8 -7
  73. data/lib/pdf/reader/type_check.rb +98 -0
  74. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  75. data/lib/pdf/reader/validating_receiver.rb +262 -0
  76. data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
  77. data/lib/pdf/reader/width_calculator/composite.rb +6 -1
  78. data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
  79. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
  80. data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
  81. data/lib/pdf/reader/width_calculator.rb +1 -0
  82. data/lib/pdf/reader/xref.rb +37 -11
  83. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  84. data/lib/pdf/reader.rb +49 -24
  85. data/lib/pdf-reader.rb +1 -0
  86. data/rbi/pdf-reader.rbi +2048 -0
  87. metadata +39 -23
  88. data/lib/pdf/hash.rb +0 -20
  89. data/lib/pdf/reader/orientation_detector.rb +0 -34
  90. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
@@ -1,4 +1,5 @@
1
1
  # coding: ASCII-8BIT
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -48,6 +49,18 @@ class PDF::Reader
48
49
  ID = "ID"
49
50
  FWD_SLASH = "/"
50
51
  NULL_BYTE = "\x00"
52
+ CR = "\r"
53
+ LF = "\n"
54
+ CRLF = "\r\n"
55
+ WHITE_SPACE = [LF, CR, ' ']
56
+
57
+ # Quite a few PDFs have trailing junk.
58
+ # This can be several kilobytes of NUL bytes in some cases
59
+ # Allow for this here
60
+ TRAILING_BYTECOUNT = 5000
61
+
62
+ # must match whole tokens
63
+ DIGITS_ONLY = %r{\A\d+\z}
51
64
 
52
65
  attr_reader :pos
53
66
 
@@ -55,7 +68,7 @@ class PDF::Reader
55
68
  #
56
69
  # Params:
57
70
  #
58
- # io - an IO stream or string with the raw data to tokenise
71
+ # io - an IO stream (usually a StringIO) with the raw data to tokenise
59
72
  #
60
73
  # options:
61
74
  #
@@ -86,9 +99,12 @@ class PDF::Reader
86
99
  #
87
100
  # options:
88
101
  #
89
- # :skip_eol - if true, the IO stream is advanced past a CRLF or LF that
90
- # is sitting under the io cursor.
91
- #
102
+ # :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
103
+ # that is sitting under the io cursor.
104
+ # Note:
105
+ # Skipping a bare CR is not spec-compliant.
106
+ # This is because the data may start with LF.
107
+ # However we check for CRLF first, so the ambiguity is avoided.
92
108
  def read(bytes, opts = {})
93
109
  reset_pos
94
110
 
@@ -97,9 +113,9 @@ class PDF::Reader
97
113
  str = @io.read(2)
98
114
  if str.nil?
99
115
  return nil
100
- elsif str == "\r\n"
116
+ elsif str == CRLF # This MUST be done before checking for CR alone
101
117
  # do nothing
102
- elsif str[0,1] == "\n"
118
+ elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
103
119
  @io.seek(-1, IO::SEEK_CUR)
104
120
  else
105
121
  @io.seek(-2, IO::SEEK_CUR)
@@ -127,8 +143,10 @@ class PDF::Reader
127
143
  #
128
144
  def find_first_xref_offset
129
145
  check_size_is_non_zero
130
- @io.seek(-1024, IO::SEEK_END) rescue @io.seek(0)
131
- data = @io.read(1024)
146
+ @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
147
+ data = @io.read(TRAILING_BYTECOUNT)
148
+
149
+ raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?
132
150
 
133
151
  # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
134
152
  lines = data.split(/[\n\r]+/).reverse
@@ -136,7 +154,12 @@ class PDF::Reader
136
154
 
137
155
  raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
138
156
  raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
139
- lines[eof_index+1].to_i
157
+ offset = lines[eof_index+1].to_i
158
+
159
+ # a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
160
+ # corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
161
+ raise MalformedPDFError, "invalid xref offset" if offset < 0
162
+ offset
140
163
  end
141
164
 
142
165
  private
@@ -217,45 +240,73 @@ class PDF::Reader
217
240
  return if @tokens.size < 3
218
241
  return if @tokens[2] != "R"
219
242
 
220
- if @tokens[0].match(/\d+/) && @tokens[1].match(/\d+/)
221
- @tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
222
- @tokens[1] = nil
223
- @tokens[2] = nil
224
- @tokens.compact!
243
+ token_one = @tokens[0]
244
+ token_two = @tokens[1]
245
+ if token_one.is_a?(String) && token_two.is_a?(String) && token_one.match(DIGITS_ONLY) && token_two.match(DIGITS_ONLY)
246
+ @tokens[0] = PDF::Reader::Reference.new(token_one.to_i, token_two.to_i)
247
+ @tokens.delete_at(2)
248
+ @tokens.delete_at(1)
225
249
  end
226
250
  end
227
251
 
252
+ # Extract data between ID and EI
253
+ # If the EI follows white-space the space is dropped from the data
254
+ # The EI must be followed by white-space or end of buffer
255
+ # This is to reduce the chance of accidentally matching an embedded EI
228
256
  def prepare_inline_token
229
- str = "".dup
230
-
231
- buffer = []
232
-
233
- until buffer[0] =~ /\s|\0/ && buffer[1, 2] == ["E", "I"]
257
+ idstart = @io.pos
258
+ prevchr = ''
259
+ eisize = 0 # how many chars in the end marker
260
+ seeking = 'E' # what are we looking for now?
261
+ loop do
234
262
  chr = @io.read(1)
235
- buffer << chr
236
-
237
- if buffer.length > 3
238
- str << buffer.shift
263
+ break if chr.nil?
264
+ case seeking
265
+ when 'E'
266
+ if chr == 'E'
267
+ seeking = 'I'
268
+ if WHITE_SPACE.include? prevchr
269
+ eisize = 3 # include whitespace in delimiter, i.e. drop from data
270
+ else # assume the EI immediately follows the data
271
+ eisize = 2 # leave prevchr in data
272
+ end
273
+ end
274
+ when 'I'
275
+ if chr == 'I'
276
+ seeking = ''
277
+ else
278
+ seeking = 'E'
279
+ end
280
+ when ''
281
+ if WHITE_SPACE.include? chr
282
+ eisize += 1 # Drop trailer
283
+ break
284
+ else
285
+ seeking = 'E'
286
+ end
239
287
  end
288
+ prevchr = chr.is_a?(String) ? chr : ''
240
289
  end
241
-
242
- str << NULL_BYTE if buffer.first == NULL_BYTE
243
-
244
- @tokens << string_token(str)
245
- @io.seek(-3, IO::SEEK_CUR) unless chr.nil?
290
+ unless seeking == ''
291
+ raise MalformedPDFError, "EI terminator not found"
292
+ end
293
+ eiend = @io.pos
294
+ @io.seek(idstart, IO::SEEK_SET)
295
+ str = @io.read(eiend - eisize - idstart) # get the ID content
296
+ @tokens << str.freeze if str
246
297
  end
247
298
 
248
299
  # if we're currently inside a hex string, read hex nibbles until
249
300
  # we find a closing >
250
301
  #
251
302
  def prepare_hex_token
303
+ finished = :false
252
304
  str = "".dup
253
- finished = false
254
305
 
255
- while !finished
306
+ until finished == :true
256
307
  byte = @io.getbyte
257
308
  if byte.nil?
258
- finished = true # unbalanced params
309
+ finished = :true # unbalanced params
259
310
  elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
260
311
  str << byte
261
312
  elsif byte <= 32
@@ -264,7 +315,7 @@ class PDF::Reader
264
315
  @tokens << str if str.size > 0
265
316
  @tokens << ">" if byte != 0x3E # '>'
266
317
  @tokens << byte.chr
267
- finished = true
318
+ finished = :true
268
319
  end
269
320
  end
270
321
  end
@@ -311,14 +362,17 @@ class PDF::Reader
311
362
  def prepare_regular_token
312
363
  tok = "".dup
313
364
 
314
- while byte = @io.getbyte
365
+ loop do
366
+ byte = @io.getbyte
367
+
315
368
  case byte
369
+ when nil
370
+ break
316
371
  when 0x25
317
372
  # comment, ignore everything until the next EOL char
318
- done = false
319
- while !done
320
- byte = @io.getbyte
321
- done = true if byte.nil? || byte == 0x0A || byte == 0x0D
373
+ loop do
374
+ commentbyte = @io.getbyte
375
+ break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
322
376
  end
323
377
  when *TOKEN_WHITESPACE
324
378
  # white space, token finished
@@ -388,15 +442,5 @@ class PDF::Reader
388
442
  byte
389
443
  end
390
444
 
391
- # for a handful of tokens we want to tell the parser how to convert them
392
- # into higher level tokens. This methods adds a to_token() method
393
- # to tokens that should remain as strings.
394
- #
395
- def string_token(token)
396
- def token.to_token
397
- to_s
398
- end
399
- token
400
- end
401
445
  end
402
446
  end
@@ -1,8 +1,7 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
- #
5
-
6
5
  require 'forwardable'
7
6
 
8
7
  class PDF::Reader
@@ -32,10 +31,10 @@ class PDF::Reader
32
31
  params << array.shift
33
32
 
34
33
  if params.size == 2 && params.last.is_a?(Array)
35
- widths.merge! parse_first_form(params.first, params.last)
34
+ widths.merge! parse_first_form(params.first.to_i, Array(params.last))
36
35
  params = []
37
36
  elsif params.size == 3
38
- widths.merge! parse_second_form(params[0], params[1], params[2])
37
+ widths.merge! parse_second_form(params[0].to_i, params[1].to_i, params[2].to_i)
39
38
  params = []
40
39
  end
41
40
  end
@@ -53,6 +52,10 @@ class PDF::Reader
53
52
 
54
53
  # this is the form 10 20 123 where all index between 10 and 20 have width 123
55
54
  def parse_second_form(first, final, width)
55
+ if first > final
56
+ raise MalformedPDFError, "CidWidths: #{first} must be less than #{final}"
57
+ end
58
+
56
59
  (first..final).inject({}) { |accum, index|
57
60
  accum[index] = width
58
61
  accum
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -32,16 +33,17 @@ class PDF::Reader
32
33
  # extracting various useful information.
33
34
  #
34
35
  class CMap # :nodoc:
36
+
35
37
  CMAP_KEYWORDS = {
36
- "begincodespacerange" => 1,
37
- "endcodespacerange" => 1,
38
- "beginbfchar" => 1,
39
- "endbfchar" => 1,
40
- "beginbfrange" => 1,
41
- "endbfrange" => 1,
42
- "begin" => 1,
43
- "begincmap" => 1,
44
- "def" => 1
38
+ "begincodespacerange" => :noop,
39
+ "endcodespacerange" => :noop,
40
+ "beginbfchar" => :noop,
41
+ "endbfchar" => :noop,
42
+ "beginbfrange" => :noop,
43
+ "endbfrange" => :noop,
44
+ "begin" => :noop,
45
+ "begincmap" => :noop,
46
+ "def" => :noop
45
47
  }
46
48
 
47
49
  attr_reader :map
@@ -51,30 +53,6 @@ class PDF::Reader
51
53
  process_data(data)
52
54
  end
53
55
 
54
- def process_data(data)
55
- parser = build_parser(data)
56
- mode = nil
57
- instructions = []
58
-
59
- while token = parser.parse_token(CMAP_KEYWORDS)
60
- if token == "beginbfchar"
61
- mode = :char
62
- elsif token == "endbfchar"
63
- process_bfchar_instructions(instructions)
64
- instructions = []
65
- mode = nil
66
- elsif token == "beginbfrange"
67
- mode = :range
68
- elsif token == "endbfrange"
69
- process_bfrange_instructions(instructions)
70
- instructions = []
71
- mode = nil
72
- elsif mode == :char || mode == :range
73
- instructions << token
74
- end
75
- end
76
- end
77
-
78
56
  def size
79
57
  @map.size
80
58
  end
@@ -84,44 +62,84 @@ class PDF::Reader
84
62
  # Returns an array of Integers.
85
63
  #
86
64
  def decode(c)
87
- # TODO: implement the conversion
88
- return c unless Integer === c
89
- @map[c]
65
+ @map.fetch(c, [])
90
66
  end
91
67
 
92
68
  private
93
69
 
70
+ def process_data(data, initial_mode = :none)
71
+ parser = build_parser(data)
72
+ mode = initial_mode
73
+ instructions = []
74
+
75
+ while token = parser.parse_token(CMAP_KEYWORDS)
76
+ if token.is_a?(String) || token.is_a?(Array)
77
+ if token == "beginbfchar"
78
+ mode = :char
79
+ elsif token == "endbfchar"
80
+ process_bfchar_instructions(instructions)
81
+ instructions = []
82
+ mode = :none
83
+ elsif token == "beginbfrange"
84
+ mode = :range
85
+ elsif token == "endbfrange"
86
+ process_bfrange_instructions(instructions)
87
+ instructions = []
88
+ mode = :none
89
+ elsif mode == :char
90
+ instructions << token.to_s
91
+ elsif mode == :range
92
+ instructions << token
93
+ end
94
+ end
95
+ end
96
+ end
97
+
98
+
94
99
  def build_parser(instructions)
95
100
  buffer = Buffer.new(StringIO.new(instructions))
96
101
  Parser.new(buffer)
97
102
  end
98
103
 
104
+ # The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
105
+ # theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
106
+ #
107
+ # str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
108
+ #
109
+ # However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
110
+ # exception when we try converting broken UTF-16 to UTF-8
111
+ #
99
112
  def str_to_int(str)
100
- return nil if str.nil? || str.size == 0
101
- unpacked_string = if str.size == 1 # UTF-8
113
+ unpacked_string = if str.bytesize == 1 # UTF-8
102
114
  str.unpack("C*")
103
115
  else # UTF-16
104
116
  str.unpack("n*")
105
117
  end
106
- if unpacked_string.size == 1
107
- unpacked_string
108
- elsif unpacked_string.size == 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
109
- # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
110
- # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
111
- # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
112
- [(unpacked_string[0] - 0xD800) * 0x400 + (unpacked_string[1] - 0xDC00) + 0x10000]
113
- else
114
- # it is a bad idea to just return the first 16 bits, as this doesn't allow
115
- # for ligatures for example fi (U+0066 U+0069)
116
- unpacked_string
118
+ result = []
119
+ while unpacked_string.any? do
120
+ if unpacked_string.size >= 2 &&
121
+ unpacked_string.first.to_i > 0xD800 &&
122
+ unpacked_string.first.to_i < 0xDBFF
123
+ # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
124
+ # let's convert it to a UTF-32 codepoint. (the high surrogate is between 0xD800-0xDBFF, the
125
+ # low surrogate is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
126
+ point_one = unpacked_string.shift.to_i
127
+ point_two = unpacked_string.shift.to_i
128
+ result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
129
+ else
130
+ result << unpacked_string.shift
131
+ end
117
132
  end
133
+ result
118
134
  end
119
135
 
120
136
  def process_bfchar_instructions(instructions)
121
137
  instructions.each_slice(2) do |one, two|
122
- find = str_to_int(one)
123
- replace = str_to_int(two)
124
- @map[find.first] = replace
138
+ find = str_to_int(one.to_s)
139
+ replace = str_to_int(two.to_s)
140
+ if find.any? && replace.any?
141
+ @map[find.first.to_i] = replace
142
+ end
125
143
  end
126
144
  end
127
145
 
@@ -132,30 +150,36 @@ class PDF::Reader
132
150
  elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
133
151
  bfrange_type_two(start, finish, to)
134
152
  else
135
- raise "invalid bfrange section"
153
+ raise MalformedPDFError, "invalid bfrange section"
136
154
  end
137
155
  end
138
156
  end
139
157
 
140
158
  def bfrange_type_one(start_code, end_code, dst)
141
- start_code = str_to_int(start_code)[0]
142
- end_code = str_to_int(end_code)[0]
159
+ start_code = str_to_int(start_code).first
160
+ end_code = str_to_int(end_code).first
143
161
  dst = str_to_int(dst)
144
162
 
163
+ return if start_code.nil? || end_code.nil?
164
+
145
165
  # add all values in the range to our mapping
146
166
  (start_code..end_code).each_with_index do |val, idx|
147
- @map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
167
+ @map[val] = dst.length == 1 ? [dst[0].to_i + idx] : [dst[0].to_i, dst[1].to_i + 1]
148
168
  end
149
169
  end
150
170
 
151
171
  def bfrange_type_two(start_code, end_code, dst)
152
- start_code = str_to_int(start_code)[0]
153
- end_code = str_to_int(end_code)[0]
172
+ start_code = str_to_int(start_code).first
173
+ end_code = str_to_int(end_code).first
174
+
175
+ return if start_code.nil? || end_code.nil?
176
+
154
177
  from_range = (start_code..end_code)
155
178
 
156
179
  # add all values in the range to our mapping
157
180
  from_range.each_with_index do |val, idx|
158
- @map[val] = str_to_int(dst[idx])
181
+ dst_char = dst[idx]
182
+ @map[val.to_i] = str_to_int(dst_char) if dst_char
159
183
  end
160
184
  end
161
185
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -40,20 +41,22 @@ class PDF::Reader
40
41
  @mapping = default_mapping # maps from character codes to Unicode codepoints
41
42
  @string_cache = {} # maps from character codes to UTF-8 strings.
42
43
 
43
- if enc.kind_of?(Hash)
44
- self.differences = enc[:Differences] if enc[:Differences]
45
- enc = enc[:Encoding] || enc[:BaseEncoding]
46
- elsif enc != nil
47
- enc = enc.to_sym
44
+ @enc_name = if enc.kind_of?(Hash)
45
+ enc[:Encoding] || enc[:BaseEncoding]
46
+ elsif enc && enc.respond_to?(:to_sym)
47
+ enc.to_sym
48
48
  else
49
- enc = nil
49
+ :StandardEncoding
50
50
  end
51
51
 
52
- @enc_name = enc
53
- @unpack = get_unpack(enc)
54
- @map_file = get_mapping_file(enc)
52
+ @unpack = get_unpack(@enc_name)
53
+ @map_file = get_mapping_file(@enc_name)
55
54
 
56
55
  load_mapping(@map_file) if @map_file
56
+
57
+ if enc.is_a?(Hash) && enc[:Differences]
58
+ self.differences = enc[:Differences]
59
+ end
57
60
  end
58
61
 
59
62
  # set the differences table for this encoding. should be an array in the following format:
@@ -66,16 +69,16 @@ class PDF::Reader
66
69
  #
67
70
  # [25, :A, :B]
68
71
  def differences=(diff)
69
- raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)
72
+ PDF::Reader::Error.validate_type(diff, "diff", Array)
70
73
 
71
74
  @differences = {}
72
75
  byte = 0
73
76
  diff.each do |val|
74
77
  if val.kind_of?(Numeric)
75
78
  byte = val.to_i
76
- else
79
+ elsif codepoint = glyphlist.name_to_unicode(val)
77
80
  @differences[byte] = val
78
- @mapping[byte] = glyphlist.name_to_unicode(val)
81
+ @mapping[byte] = codepoint
79
82
  byte += 1
80
83
  end
81
84
  end
@@ -164,7 +167,7 @@ class PDF::Reader
164
167
  end
165
168
 
166
169
  def convert_to_utf8(str)
167
- ret = str.unpack(unpack).map! { |c| @mapping[c] || c }.pack("U*")
170
+ ret = str.unpack(unpack).map! { |c| @mapping[c.to_i] || c }.pack("U*")
168
171
  ret.force_encoding("UTF-8")
169
172
  ret
170
173
  end
@@ -206,7 +209,7 @@ class PDF::Reader
206
209
  def load_mapping(file)
207
210
  File.open(file, "r:BINARY") do |f|
208
211
  f.each do |l|
209
- _m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
212
+ _m, single_byte, unicode = *l.match(/\A([0-9A-Za-z]+);([0-9A-F]{4})/)
210
213
  @mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
211
214
  end
212
215
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -33,19 +34,30 @@ class PDF::Reader
33
34
  def self.str_assert(lvalue, rvalue, chars=nil)
34
35
  raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
35
36
  lvalue = lvalue[0,chars] if chars
36
- raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue != rvalue
37
+ raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
37
38
  end
38
39
  ################################################################################
39
40
  def self.str_assert_not(lvalue, rvalue, chars=nil)
40
41
  raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
41
42
  lvalue = lvalue[0,chars] if chars
42
- raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue == rvalue
43
+ raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue == rvalue
43
44
  end
44
45
  ################################################################################
45
46
  def self.assert_equal(lvalue, rvalue)
46
- raise MalformedPDFError, "PDF malformed, expected #{rvalue} but found #{lvalue} instead" if lvalue != rvalue
47
+ raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
47
48
  end
48
49
  ################################################################################
50
+ def self.validate_type(object, name, klass)
51
+ raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
52
+ end
53
+ ################################################################################
54
+ def self.validate_type_as_malformed(object, name, klass)
55
+ raise MalformedPDFError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
56
+ end
57
+ ################################################################################
58
+ def self.validate_not_nil(object, name)
59
+ raise ArgumentError, "#{object} must not be nil" if object.nil?
60
+ end
49
61
  end
50
62
 
51
63
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'ascii85'
@@ -7,6 +8,7 @@ class PDF::Reader
7
8
  module Filter # :nodoc:
8
9
  # implementation of the Ascii85 filter
9
10
  class Ascii85
11
+
10
12
  def initialize(options = {})
11
13
  @options = options
12
14
  end
@@ -17,7 +19,11 @@ class PDF::Reader
17
19
  #
18
20
  def filter(data)
19
21
  data = "<~#{data}" unless data.to_s[0,2] == "<~"
20
- ::Ascii85::decode(data)
22
+ if defined?(::Ascii85Native)
23
+ ::Ascii85Native::decode(data)
24
+ else
25
+ ::Ascii85::decode(data)
26
+ end
21
27
  rescue Exception => e
22
28
  # Oops, there was a problem decoding the stream
23
29
  raise MalformedPDFError,
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  module Filter # :nodoc:
7
8
  # implementation of the AsciiHex stream filter
8
9
  class AsciiHex
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -16,9 +18,12 @@ class PDF::Reader
16
18
  def filter(data)
17
19
  data.chop! if data[-1,1] == ">"
18
20
  data = data[1,data.size] if data[0,1] == "<"
21
+
22
+ return "" if data.nil?
23
+
19
24
  data.gsub!(/[^A-Fa-f0-9]/,"")
20
25
  data << "0" if data.size % 2 == 1
21
- data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
26
+ data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
22
27
  rescue Exception => e
23
28
  # Oops, there was a problem decoding the stream
24
29
  raise MalformedPDFError,