pdf-reader 2.2.0 → 2.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +90 -0
- data/README.md +18 -3
- data/Rakefile +1 -1
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_text +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +91 -47
- data/lib/pdf/reader/cid_widths.rb +7 -4
- data/lib/pdf/reader/cmap.rb +83 -59
- data/lib/pdf/reader/encoding.rb +17 -14
- data/lib/pdf/reader/error.rb +15 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +12 -10
- data/lib/pdf/reader/filter/flate.rb +30 -16
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -1
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +11 -11
- data/lib/pdf/reader/font.rb +89 -26
- data/lib/pdf/reader/font_descriptor.rb +22 -18
- data/lib/pdf/reader/form_xobject.rb +18 -5
- data/lib/pdf/reader/glyph_hash.rb +28 -13
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/lzw.rb +28 -11
- data/lib/pdf/reader/no_text_filter.rb +14 -0
- data/lib/pdf/reader/null_security_handler.rb +1 -4
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +292 -63
- data/lib/pdf/reader/object_stream.rb +3 -2
- data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
- data/lib/pdf/reader/page.rb +143 -16
- data/lib/pdf/reader/page_layout.rb +43 -39
- data/lib/pdf/reader/page_state.rb +26 -17
- data/lib/pdf/reader/page_text_receiver.rb +74 -4
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +34 -14
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +3 -1
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
- data/lib/pdf/reader/stream.rb +3 -2
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +40 -5
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +8 -7
- data/lib/pdf/reader/type_check.rb +98 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
- data/lib/pdf/reader/width_calculator/composite.rb +6 -1
- data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
- data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +37 -11
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +49 -24
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +2048 -0
- metadata +39 -23
- data/lib/pdf/hash.rb +0 -20
- data/lib/pdf/reader/orientation_detector.rb +0 -34
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: ASCII-8BIT
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -48,6 +49,18 @@ class PDF::Reader
|
|
48
49
|
ID = "ID"
|
49
50
|
FWD_SLASH = "/"
|
50
51
|
NULL_BYTE = "\x00"
|
52
|
+
CR = "\r"
|
53
|
+
LF = "\n"
|
54
|
+
CRLF = "\r\n"
|
55
|
+
WHITE_SPACE = [LF, CR, ' ']
|
56
|
+
|
57
|
+
# Quite a few PDFs have trailing junk.
|
58
|
+
# This can be several kilobytes of NUL bytes in some cases
|
59
|
+
# Allow for this here
|
60
|
+
TRAILING_BYTECOUNT = 5000
|
61
|
+
|
62
|
+
# must match whole tokens
|
63
|
+
DIGITS_ONLY = %r{\A\d+\z}
|
51
64
|
|
52
65
|
attr_reader :pos
|
53
66
|
|
@@ -55,7 +68,7 @@ class PDF::Reader
|
|
55
68
|
#
|
56
69
|
# Params:
|
57
70
|
#
|
58
|
-
# io - an IO stream
|
71
|
+
# io - an IO stream (usually a StringIO) with the raw data to tokenise
|
59
72
|
#
|
60
73
|
# options:
|
61
74
|
#
|
@@ -86,9 +99,12 @@ class PDF::Reader
|
|
86
99
|
#
|
87
100
|
# options:
|
88
101
|
#
|
89
|
-
# :skip_eol - if true, the IO stream is advanced past a CRLF or LF
|
90
|
-
# is sitting under the io cursor.
|
91
|
-
#
|
102
|
+
# :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
|
103
|
+
# that is sitting under the io cursor.
|
104
|
+
# Note:
|
105
|
+
# Skipping a bare CR is not spec-compliant.
|
106
|
+
# This is because the data may start with LF.
|
107
|
+
# However we check for CRLF first, so the ambiguity is avoided.
|
92
108
|
def read(bytes, opts = {})
|
93
109
|
reset_pos
|
94
110
|
|
@@ -97,9 +113,9 @@ class PDF::Reader
|
|
97
113
|
str = @io.read(2)
|
98
114
|
if str.nil?
|
99
115
|
return nil
|
100
|
-
elsif str ==
|
116
|
+
elsif str == CRLF # This MUST be done before checking for CR alone
|
101
117
|
# do nothing
|
102
|
-
elsif str[0,1] ==
|
118
|
+
elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
|
103
119
|
@io.seek(-1, IO::SEEK_CUR)
|
104
120
|
else
|
105
121
|
@io.seek(-2, IO::SEEK_CUR)
|
@@ -127,8 +143,10 @@ class PDF::Reader
|
|
127
143
|
#
|
128
144
|
def find_first_xref_offset
|
129
145
|
check_size_is_non_zero
|
130
|
-
@io.seek(-
|
131
|
-
data = @io.read(
|
146
|
+
@io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
|
147
|
+
data = @io.read(TRAILING_BYTECOUNT)
|
148
|
+
|
149
|
+
raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?
|
132
150
|
|
133
151
|
# the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
|
134
152
|
lines = data.split(/[\n\r]+/).reverse
|
@@ -136,7 +154,12 @@ class PDF::Reader
|
|
136
154
|
|
137
155
|
raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
|
138
156
|
raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
|
139
|
-
lines[eof_index+1].to_i
|
157
|
+
offset = lines[eof_index+1].to_i
|
158
|
+
|
159
|
+
# a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
|
160
|
+
# corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
|
161
|
+
raise MalformedPDFError, "invalid xref offset" if offset < 0
|
162
|
+
offset
|
140
163
|
end
|
141
164
|
|
142
165
|
private
|
@@ -217,45 +240,73 @@ class PDF::Reader
|
|
217
240
|
return if @tokens.size < 3
|
218
241
|
return if @tokens[2] != "R"
|
219
242
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
@tokens[
|
224
|
-
@tokens.
|
243
|
+
token_one = @tokens[0]
|
244
|
+
token_two = @tokens[1]
|
245
|
+
if token_one.is_a?(String) && token_two.is_a?(String) && token_one.match(DIGITS_ONLY) && token_two.match(DIGITS_ONLY)
|
246
|
+
@tokens[0] = PDF::Reader::Reference.new(token_one.to_i, token_two.to_i)
|
247
|
+
@tokens.delete_at(2)
|
248
|
+
@tokens.delete_at(1)
|
225
249
|
end
|
226
250
|
end
|
227
251
|
|
252
|
+
# Extract data between ID and EI
|
253
|
+
# If the EI follows white-space the space is dropped from the data
|
254
|
+
# The EI must be followed by white-space or end of buffer
|
255
|
+
# This is to reduce the chance of accidentally matching an embedded EI
|
228
256
|
def prepare_inline_token
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
257
|
+
idstart = @io.pos
|
258
|
+
prevchr = ''
|
259
|
+
eisize = 0 # how many chars in the end marker
|
260
|
+
seeking = 'E' # what are we looking for now?
|
261
|
+
loop do
|
234
262
|
chr = @io.read(1)
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
263
|
+
break if chr.nil?
|
264
|
+
case seeking
|
265
|
+
when 'E'
|
266
|
+
if chr == 'E'
|
267
|
+
seeking = 'I'
|
268
|
+
if WHITE_SPACE.include? prevchr
|
269
|
+
eisize = 3 # include whitespace in delimiter, i.e. drop from data
|
270
|
+
else # assume the EI immediately follows the data
|
271
|
+
eisize = 2 # leave prevchr in data
|
272
|
+
end
|
273
|
+
end
|
274
|
+
when 'I'
|
275
|
+
if chr == 'I'
|
276
|
+
seeking = ''
|
277
|
+
else
|
278
|
+
seeking = 'E'
|
279
|
+
end
|
280
|
+
when ''
|
281
|
+
if WHITE_SPACE.include? chr
|
282
|
+
eisize += 1 # Drop trailer
|
283
|
+
break
|
284
|
+
else
|
285
|
+
seeking = 'E'
|
286
|
+
end
|
239
287
|
end
|
288
|
+
prevchr = chr.is_a?(String) ? chr : ''
|
240
289
|
end
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
@io.seek(
|
290
|
+
unless seeking == ''
|
291
|
+
raise MalformedPDFError, "EI terminator not found"
|
292
|
+
end
|
293
|
+
eiend = @io.pos
|
294
|
+
@io.seek(idstart, IO::SEEK_SET)
|
295
|
+
str = @io.read(eiend - eisize - idstart) # get the ID content
|
296
|
+
@tokens << str.freeze if str
|
246
297
|
end
|
247
298
|
|
248
299
|
# if we're currently inside a hex string, read hex nibbles until
|
249
300
|
# we find a closing >
|
250
301
|
#
|
251
302
|
def prepare_hex_token
|
303
|
+
finished = :false
|
252
304
|
str = "".dup
|
253
|
-
finished = false
|
254
305
|
|
255
|
-
|
306
|
+
until finished == :true
|
256
307
|
byte = @io.getbyte
|
257
308
|
if byte.nil?
|
258
|
-
finished = true # unbalanced params
|
309
|
+
finished = :true # unbalanced params
|
259
310
|
elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
|
260
311
|
str << byte
|
261
312
|
elsif byte <= 32
|
@@ -264,7 +315,7 @@ class PDF::Reader
|
|
264
315
|
@tokens << str if str.size > 0
|
265
316
|
@tokens << ">" if byte != 0x3E # '>'
|
266
317
|
@tokens << byte.chr
|
267
|
-
finished = true
|
318
|
+
finished = :true
|
268
319
|
end
|
269
320
|
end
|
270
321
|
end
|
@@ -311,14 +362,17 @@ class PDF::Reader
|
|
311
362
|
def prepare_regular_token
|
312
363
|
tok = "".dup
|
313
364
|
|
314
|
-
|
365
|
+
loop do
|
366
|
+
byte = @io.getbyte
|
367
|
+
|
315
368
|
case byte
|
369
|
+
when nil
|
370
|
+
break
|
316
371
|
when 0x25
|
317
372
|
# comment, ignore everything until the next EOL char
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
done = true if byte.nil? || byte == 0x0A || byte == 0x0D
|
373
|
+
loop do
|
374
|
+
commentbyte = @io.getbyte
|
375
|
+
break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
|
322
376
|
end
|
323
377
|
when *TOKEN_WHITESPACE
|
324
378
|
# white space, token finished
|
@@ -388,15 +442,5 @@ class PDF::Reader
|
|
388
442
|
byte
|
389
443
|
end
|
390
444
|
|
391
|
-
# for a handful of tokens we want to tell the parser how to convert them
|
392
|
-
# into higher level tokens. This method adds a to_token() method
|
393
|
-
# to tokens that should remain as strings.
|
394
|
-
#
|
395
|
-
def string_token(token)
|
396
|
-
def token.to_token
|
397
|
-
to_s
|
398
|
-
end
|
399
|
-
token
|
400
|
-
end
|
401
445
|
end
|
402
446
|
end
|
@@ -1,8 +1,7 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
|
-
#
|
5
|
-
|
6
5
|
require 'forwardable'
|
7
6
|
|
8
7
|
class PDF::Reader
|
@@ -32,10 +31,10 @@ class PDF::Reader
|
|
32
31
|
params << array.shift
|
33
32
|
|
34
33
|
if params.size == 2 && params.last.is_a?(Array)
|
35
|
-
widths.merge! parse_first_form(params.first, params.last)
|
34
|
+
widths.merge! parse_first_form(params.first.to_i, Array(params.last))
|
36
35
|
params = []
|
37
36
|
elsif params.size == 3
|
38
|
-
widths.merge! parse_second_form(params[0], params[1], params[2])
|
37
|
+
widths.merge! parse_second_form(params[0].to_i, params[1].to_i, params[2].to_i)
|
39
38
|
params = []
|
40
39
|
end
|
41
40
|
end
|
@@ -53,6 +52,10 @@ class PDF::Reader
|
|
53
52
|
|
54
53
|
# this is the form 10 20 123 where all index between 10 and 20 have width 123
|
55
54
|
def parse_second_form(first, final, width)
|
55
|
+
if first > final
|
56
|
+
raise MalformedPDFError, "CidWidths: #{first} must be less than #{final}"
|
57
|
+
end
|
58
|
+
|
56
59
|
(first..final).inject({}) { |accum, index|
|
57
60
|
accum[index] = width
|
58
61
|
accum
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -32,16 +33,17 @@ class PDF::Reader
|
|
32
33
|
# extracting various useful information.
|
33
34
|
#
|
34
35
|
class CMap # :nodoc:
|
36
|
+
|
35
37
|
CMAP_KEYWORDS = {
|
36
|
-
"begincodespacerange" =>
|
37
|
-
"endcodespacerange" =>
|
38
|
-
"beginbfchar" =>
|
39
|
-
"endbfchar" =>
|
40
|
-
"beginbfrange" =>
|
41
|
-
"endbfrange" =>
|
42
|
-
"begin" =>
|
43
|
-
"begincmap" =>
|
44
|
-
"def" =>
|
38
|
+
"begincodespacerange" => :noop,
|
39
|
+
"endcodespacerange" => :noop,
|
40
|
+
"beginbfchar" => :noop,
|
41
|
+
"endbfchar" => :noop,
|
42
|
+
"beginbfrange" => :noop,
|
43
|
+
"endbfrange" => :noop,
|
44
|
+
"begin" => :noop,
|
45
|
+
"begincmap" => :noop,
|
46
|
+
"def" => :noop
|
45
47
|
}
|
46
48
|
|
47
49
|
attr_reader :map
|
@@ -51,30 +53,6 @@ class PDF::Reader
|
|
51
53
|
process_data(data)
|
52
54
|
end
|
53
55
|
|
54
|
-
def process_data(data)
|
55
|
-
parser = build_parser(data)
|
56
|
-
mode = nil
|
57
|
-
instructions = []
|
58
|
-
|
59
|
-
while token = parser.parse_token(CMAP_KEYWORDS)
|
60
|
-
if token == "beginbfchar"
|
61
|
-
mode = :char
|
62
|
-
elsif token == "endbfchar"
|
63
|
-
process_bfchar_instructions(instructions)
|
64
|
-
instructions = []
|
65
|
-
mode = nil
|
66
|
-
elsif token == "beginbfrange"
|
67
|
-
mode = :range
|
68
|
-
elsif token == "endbfrange"
|
69
|
-
process_bfrange_instructions(instructions)
|
70
|
-
instructions = []
|
71
|
-
mode = nil
|
72
|
-
elsif mode == :char || mode == :range
|
73
|
-
instructions << token
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
56
|
def size
|
79
57
|
@map.size
|
80
58
|
end
|
@@ -84,44 +62,84 @@ class PDF::Reader
|
|
84
62
|
# Returns an array of Integers.
|
85
63
|
#
|
86
64
|
def decode(c)
|
87
|
-
|
88
|
-
return c unless Integer === c
|
89
|
-
@map[c]
|
65
|
+
@map.fetch(c, [])
|
90
66
|
end
|
91
67
|
|
92
68
|
private
|
93
69
|
|
70
|
+
def process_data(data, initial_mode = :none)
|
71
|
+
parser = build_parser(data)
|
72
|
+
mode = initial_mode
|
73
|
+
instructions = []
|
74
|
+
|
75
|
+
while token = parser.parse_token(CMAP_KEYWORDS)
|
76
|
+
if token.is_a?(String) || token.is_a?(Array)
|
77
|
+
if token == "beginbfchar"
|
78
|
+
mode = :char
|
79
|
+
elsif token == "endbfchar"
|
80
|
+
process_bfchar_instructions(instructions)
|
81
|
+
instructions = []
|
82
|
+
mode = :none
|
83
|
+
elsif token == "beginbfrange"
|
84
|
+
mode = :range
|
85
|
+
elsif token == "endbfrange"
|
86
|
+
process_bfrange_instructions(instructions)
|
87
|
+
instructions = []
|
88
|
+
mode = :none
|
89
|
+
elsif mode == :char
|
90
|
+
instructions << token.to_s
|
91
|
+
elsif mode == :range
|
92
|
+
instructions << token
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
|
94
99
|
def build_parser(instructions)
|
95
100
|
buffer = Buffer.new(StringIO.new(instructions))
|
96
101
|
Parser.new(buffer)
|
97
102
|
end
|
98
103
|
|
104
|
+
# The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
|
105
|
+
# theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
|
106
|
+
#
|
107
|
+
# str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
|
108
|
+
#
|
109
|
+
# However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
|
110
|
+
# exception when we try converting broken UTF-16 to UTF-8
|
111
|
+
#
|
99
112
|
def str_to_int(str)
|
100
|
-
|
101
|
-
unpacked_string = if str.size == 1 # UTF-8
|
113
|
+
unpacked_string = if str.bytesize == 1 # UTF-8
|
102
114
|
str.unpack("C*")
|
103
115
|
else # UTF-16
|
104
116
|
str.unpack("n*")
|
105
117
|
end
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
118
|
+
result = []
|
119
|
+
while unpacked_string.any? do
|
120
|
+
if unpacked_string.size >= 2 &&
|
121
|
+
unpacked_string.first.to_i > 0xD800 &&
|
122
|
+
unpacked_string.first.to_i < 0xDBFF
|
123
|
+
# this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
|
124
|
+
# let's convert to UTF-32. (the high surrogate is between 0xD800-0xDBFF, the
|
125
|
+
# low surrogate is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
|
126
|
+
point_one = unpacked_string.shift.to_i
|
127
|
+
point_two = unpacked_string.shift.to_i
|
128
|
+
result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
|
129
|
+
else
|
130
|
+
result << unpacked_string.shift
|
131
|
+
end
|
117
132
|
end
|
133
|
+
result
|
118
134
|
end
|
119
135
|
|
120
136
|
def process_bfchar_instructions(instructions)
|
121
137
|
instructions.each_slice(2) do |one, two|
|
122
|
-
find = str_to_int(one)
|
123
|
-
replace = str_to_int(two)
|
124
|
-
|
138
|
+
find = str_to_int(one.to_s)
|
139
|
+
replace = str_to_int(two.to_s)
|
140
|
+
if find.any? && replace.any?
|
141
|
+
@map[find.first.to_i] = replace
|
142
|
+
end
|
125
143
|
end
|
126
144
|
end
|
127
145
|
|
@@ -132,30 +150,36 @@ class PDF::Reader
|
|
132
150
|
elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
|
133
151
|
bfrange_type_two(start, finish, to)
|
134
152
|
else
|
135
|
-
raise "invalid bfrange section"
|
153
|
+
raise MalformedPDFError, "invalid bfrange section"
|
136
154
|
end
|
137
155
|
end
|
138
156
|
end
|
139
157
|
|
140
158
|
def bfrange_type_one(start_code, end_code, dst)
|
141
|
-
start_code = str_to_int(start_code)
|
142
|
-
end_code = str_to_int(end_code)
|
159
|
+
start_code = str_to_int(start_code).first
|
160
|
+
end_code = str_to_int(end_code).first
|
143
161
|
dst = str_to_int(dst)
|
144
162
|
|
163
|
+
return if start_code.nil? || end_code.nil?
|
164
|
+
|
145
165
|
# add all values in the range to our mapping
|
146
166
|
(start_code..end_code).each_with_index do |val, idx|
|
147
|
-
@map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
|
167
|
+
@map[val] = dst.length == 1 ? [dst[0].to_i + idx] : [dst[0].to_i, dst[1].to_i + 1]
|
148
168
|
end
|
149
169
|
end
|
150
170
|
|
151
171
|
def bfrange_type_two(start_code, end_code, dst)
|
152
|
-
start_code = str_to_int(start_code)
|
153
|
-
end_code = str_to_int(end_code)
|
172
|
+
start_code = str_to_int(start_code).first
|
173
|
+
end_code = str_to_int(end_code).first
|
174
|
+
|
175
|
+
return if start_code.nil? || end_code.nil?
|
176
|
+
|
154
177
|
from_range = (start_code..end_code)
|
155
178
|
|
156
179
|
# add all values in the range to our mapping
|
157
180
|
from_range.each_with_index do |val, idx|
|
158
|
-
|
181
|
+
dst_char = dst[idx]
|
182
|
+
@map[val.to_i] = str_to_int(dst_char) if dst_char
|
159
183
|
end
|
160
184
|
end
|
161
185
|
end
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -40,20 +41,22 @@ class PDF::Reader
|
|
40
41
|
@mapping = default_mapping # maps from character codes to Unicode codepoints
|
41
42
|
@string_cache = {} # maps from character codes to UTF-8 strings.
|
42
43
|
|
43
|
-
if enc.kind_of?(Hash)
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
enc = enc.to_sym
|
44
|
+
@enc_name = if enc.kind_of?(Hash)
|
45
|
+
enc[:Encoding] || enc[:BaseEncoding]
|
46
|
+
elsif enc && enc.respond_to?(:to_sym)
|
47
|
+
enc.to_sym
|
48
48
|
else
|
49
|
-
|
49
|
+
:StandardEncoding
|
50
50
|
end
|
51
51
|
|
52
|
-
@
|
53
|
-
@
|
54
|
-
@map_file = get_mapping_file(enc)
|
52
|
+
@unpack = get_unpack(@enc_name)
|
53
|
+
@map_file = get_mapping_file(@enc_name)
|
55
54
|
|
56
55
|
load_mapping(@map_file) if @map_file
|
56
|
+
|
57
|
+
if enc.is_a?(Hash) && enc[:Differences]
|
58
|
+
self.differences = enc[:Differences]
|
59
|
+
end
|
57
60
|
end
|
58
61
|
|
59
62
|
# set the differences table for this encoding. should be an array in the following format:
|
@@ -66,16 +69,16 @@ class PDF::Reader
|
|
66
69
|
#
|
67
70
|
# [25, :A, :B]
|
68
71
|
def differences=(diff)
|
69
|
-
|
72
|
+
PDF::Reader::Error.validate_type(diff, "diff", Array)
|
70
73
|
|
71
74
|
@differences = {}
|
72
75
|
byte = 0
|
73
76
|
diff.each do |val|
|
74
77
|
if val.kind_of?(Numeric)
|
75
78
|
byte = val.to_i
|
76
|
-
|
79
|
+
elsif codepoint = glyphlist.name_to_unicode(val)
|
77
80
|
@differences[byte] = val
|
78
|
-
@mapping[byte] =
|
81
|
+
@mapping[byte] = codepoint
|
79
82
|
byte += 1
|
80
83
|
end
|
81
84
|
end
|
@@ -164,7 +167,7 @@ class PDF::Reader
|
|
164
167
|
end
|
165
168
|
|
166
169
|
def convert_to_utf8(str)
|
167
|
-
ret = str.unpack(unpack).map! { |c| @mapping[c] || c }.pack("U*")
|
170
|
+
ret = str.unpack(unpack).map! { |c| @mapping[c.to_i] || c }.pack("U*")
|
168
171
|
ret.force_encoding("UTF-8")
|
169
172
|
ret
|
170
173
|
end
|
@@ -206,7 +209,7 @@ class PDF::Reader
|
|
206
209
|
def load_mapping(file)
|
207
210
|
File.open(file, "r:BINARY") do |f|
|
208
211
|
f.each do |l|
|
209
|
-
_m, single_byte, unicode = *l.match(
|
212
|
+
_m, single_byte, unicode = *l.match(/\A([0-9A-Za-z]+);([0-9A-F]{4})/)
|
210
213
|
@mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
|
211
214
|
end
|
212
215
|
end
|
data/lib/pdf/reader/error.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -33,19 +34,30 @@ class PDF::Reader
|
|
33
34
|
def self.str_assert(lvalue, rvalue, chars=nil)
|
34
35
|
raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
|
35
36
|
lvalue = lvalue[0,chars] if chars
|
36
|
-
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue != rvalue
|
37
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
37
38
|
end
|
38
39
|
################################################################################
|
39
40
|
def self.str_assert_not(lvalue, rvalue, chars=nil)
|
40
41
|
raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
|
41
42
|
lvalue = lvalue[0,chars] if chars
|
42
|
-
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue == rvalue
|
43
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue == rvalue
|
43
44
|
end
|
44
45
|
################################################################################
|
45
46
|
def self.assert_equal(lvalue, rvalue)
|
46
|
-
raise MalformedPDFError, "PDF malformed, expected #{rvalue} but found #{lvalue} instead" if lvalue != rvalue
|
47
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
47
48
|
end
|
48
49
|
################################################################################
|
50
|
+
def self.validate_type(object, name, klass)
|
51
|
+
raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
|
52
|
+
end
|
53
|
+
################################################################################
|
54
|
+
def self.validate_type_as_malformed(object, name, klass)
|
55
|
+
raise MalformedPDFError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
|
56
|
+
end
|
57
|
+
################################################################################
|
58
|
+
def self.validate_not_nil(object, name)
|
59
|
+
raise ArgumentError, "#{object} must not be nil" if object.nil?
|
60
|
+
end
|
49
61
|
end
|
50
62
|
|
51
63
|
################################################################################
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'ascii85'
|
@@ -7,6 +8,7 @@ class PDF::Reader
|
|
7
8
|
module Filter # :nodoc:
|
8
9
|
# implementation of the Ascii85 filter
|
9
10
|
class Ascii85
|
11
|
+
|
10
12
|
def initialize(options = {})
|
11
13
|
@options = options
|
12
14
|
end
|
@@ -17,7 +19,11 @@ class PDF::Reader
|
|
17
19
|
#
|
18
20
|
def filter(data)
|
19
21
|
data = "<~#{data}" unless data.to_s[0,2] == "<~"
|
20
|
-
::
|
22
|
+
if defined?(::Ascii85Native)
|
23
|
+
::Ascii85Native::decode(data)
|
24
|
+
else
|
25
|
+
::Ascii85::decode(data)
|
26
|
+
end
|
21
27
|
rescue Exception => e
|
22
28
|
# Oops, there was a problem decoding the stream
|
23
29
|
raise MalformedPDFError,
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
#
|
@@ -6,6 +7,7 @@ class PDF::Reader
|
|
6
7
|
module Filter # :nodoc:
|
7
8
|
# implementation of the AsciiHex stream filter
|
8
9
|
class AsciiHex
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options
|
11
13
|
end
|
@@ -16,9 +18,12 @@ class PDF::Reader
|
|
16
18
|
def filter(data)
|
17
19
|
data.chop! if data[-1,1] == ">"
|
18
20
|
data = data[1,data.size] if data[0,1] == "<"
|
21
|
+
|
22
|
+
return "" if data.nil?
|
23
|
+
|
19
24
|
data.gsub!(/[^A-Fa-f0-9]/,"")
|
20
25
|
data << "0" if data.size % 2 == 1
|
21
|
-
data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
|
26
|
+
data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
|
22
27
|
rescue Exception => e
|
23
28
|
# Oops, there was a problem decoding the stream
|
24
29
|
raise MalformedPDFError,
|