pdf-reader 2.2.0 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +90 -0
- data/README.md +18 -3
- data/Rakefile +1 -1
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_text +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +91 -47
- data/lib/pdf/reader/cid_widths.rb +7 -4
- data/lib/pdf/reader/cmap.rb +83 -59
- data/lib/pdf/reader/encoding.rb +17 -14
- data/lib/pdf/reader/error.rb +15 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +12 -10
- data/lib/pdf/reader/filter/flate.rb +30 -16
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -1
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +11 -11
- data/lib/pdf/reader/font.rb +89 -26
- data/lib/pdf/reader/font_descriptor.rb +22 -18
- data/lib/pdf/reader/form_xobject.rb +18 -5
- data/lib/pdf/reader/glyph_hash.rb +28 -13
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/lzw.rb +28 -11
- data/lib/pdf/reader/no_text_filter.rb +14 -0
- data/lib/pdf/reader/null_security_handler.rb +1 -4
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +292 -63
- data/lib/pdf/reader/object_stream.rb +3 -2
- data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
- data/lib/pdf/reader/page.rb +143 -16
- data/lib/pdf/reader/page_layout.rb +43 -39
- data/lib/pdf/reader/page_state.rb +26 -17
- data/lib/pdf/reader/page_text_receiver.rb +74 -4
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +34 -14
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +3 -1
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
- data/lib/pdf/reader/stream.rb +3 -2
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +40 -5
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +8 -7
- data/lib/pdf/reader/type_check.rb +98 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
- data/lib/pdf/reader/width_calculator/composite.rb +6 -1
- data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
- data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +37 -11
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +49 -24
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +2048 -0
- metadata +39 -23
- data/lib/pdf/hash.rb +0 -20
- data/lib/pdf/reader/orientation_detector.rb +0 -34
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: ASCII-8BIT
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -48,6 +49,18 @@ class PDF::Reader
|
|
48
49
|
ID = "ID"
|
49
50
|
FWD_SLASH = "/"
|
50
51
|
NULL_BYTE = "\x00"
|
52
|
+
CR = "\r"
|
53
|
+
LF = "\n"
|
54
|
+
CRLF = "\r\n"
|
55
|
+
WHITE_SPACE = [LF, CR, ' ']
|
56
|
+
|
57
|
+
# Quite a few PDFs have trailing junk.
|
58
|
+
# This can be several k of nuls in some cases
|
59
|
+
# Allow for this here
|
60
|
+
TRAILING_BYTECOUNT = 5000
|
61
|
+
|
62
|
+
# must match whole tokens
|
63
|
+
DIGITS_ONLY = %r{\A\d+\z}
|
51
64
|
|
52
65
|
attr_reader :pos
|
53
66
|
|
@@ -55,7 +68,7 @@ class PDF::Reader
|
|
55
68
|
#
|
56
69
|
# Params:
|
57
70
|
#
|
58
|
-
# io - an IO stream
|
71
|
+
# io - an IO stream (usually a StringIO) with the raw data to tokenise
|
59
72
|
#
|
60
73
|
# options:
|
61
74
|
#
|
@@ -86,9 +99,12 @@ class PDF::Reader
|
|
86
99
|
#
|
87
100
|
# options:
|
88
101
|
#
|
89
|
-
# :skip_eol - if true, the IO stream is advanced past a CRLF or LF
|
90
|
-
# is sitting under the io cursor.
|
91
|
-
#
|
102
|
+
# :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
|
103
|
+
# that is sitting under the io cursor.
|
104
|
+
# Note:
|
105
|
+
# Skipping a bare CR is not spec-compliant.
|
106
|
+
# This is because the data may start with LF.
|
107
|
+
# However we check for CRLF first, so the ambiguity is avoided.
|
92
108
|
def read(bytes, opts = {})
|
93
109
|
reset_pos
|
94
110
|
|
@@ -97,9 +113,9 @@ class PDF::Reader
|
|
97
113
|
str = @io.read(2)
|
98
114
|
if str.nil?
|
99
115
|
return nil
|
100
|
-
elsif str ==
|
116
|
+
elsif str == CRLF # This MUST be done before checking for CR alone
|
101
117
|
# do nothing
|
102
|
-
elsif str[0,1] ==
|
118
|
+
elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
|
103
119
|
@io.seek(-1, IO::SEEK_CUR)
|
104
120
|
else
|
105
121
|
@io.seek(-2, IO::SEEK_CUR)
|
@@ -127,8 +143,10 @@ class PDF::Reader
|
|
127
143
|
#
|
128
144
|
def find_first_xref_offset
|
129
145
|
check_size_is_non_zero
|
130
|
-
@io.seek(-
|
131
|
-
data = @io.read(
|
146
|
+
@io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
|
147
|
+
data = @io.read(TRAILING_BYTECOUNT)
|
148
|
+
|
149
|
+
raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?
|
132
150
|
|
133
151
|
# the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
|
134
152
|
lines = data.split(/[\n\r]+/).reverse
|
@@ -136,7 +154,12 @@ class PDF::Reader
|
|
136
154
|
|
137
155
|
raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
|
138
156
|
raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
|
139
|
-
lines[eof_index+1].to_i
|
157
|
+
offset = lines[eof_index+1].to_i
|
158
|
+
|
159
|
+
# a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
|
160
|
+
# corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
|
161
|
+
raise MalformedPDFError, "invalid xref offset" if offset < 0
|
162
|
+
offset
|
140
163
|
end
|
141
164
|
|
142
165
|
private
|
@@ -217,45 +240,73 @@ class PDF::Reader
|
|
217
240
|
return if @tokens.size < 3
|
218
241
|
return if @tokens[2] != "R"
|
219
242
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
@tokens[
|
224
|
-
@tokens.
|
243
|
+
token_one = @tokens[0]
|
244
|
+
token_two = @tokens[1]
|
245
|
+
if token_one.is_a?(String) && token_two.is_a?(String) && token_one.match(DIGITS_ONLY) && token_two.match(DIGITS_ONLY)
|
246
|
+
@tokens[0] = PDF::Reader::Reference.new(token_one.to_i, token_two.to_i)
|
247
|
+
@tokens.delete_at(2)
|
248
|
+
@tokens.delete_at(1)
|
225
249
|
end
|
226
250
|
end
|
227
251
|
|
252
|
+
# Extract data between ID and EI
|
253
|
+
# If the EI follows white-space the space is dropped from the data
|
254
|
+
# The EI must followed by white-space or end of buffer
|
255
|
+
# This is to reduce the chance of accidentally matching an embedded EI
|
228
256
|
def prepare_inline_token
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
257
|
+
idstart = @io.pos
|
258
|
+
prevchr = ''
|
259
|
+
eisize = 0 # how many chars in the end marker
|
260
|
+
seeking = 'E' # what are we looking for now?
|
261
|
+
loop do
|
234
262
|
chr = @io.read(1)
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
263
|
+
break if chr.nil?
|
264
|
+
case seeking
|
265
|
+
when 'E'
|
266
|
+
if chr == 'E'
|
267
|
+
seeking = 'I'
|
268
|
+
if WHITE_SPACE.include? prevchr
|
269
|
+
eisize = 3 # include whitespace in delimiter, i.e. drop from data
|
270
|
+
else # assume the EI immediately follows the data
|
271
|
+
eisize = 2 # leave prevchr in data
|
272
|
+
end
|
273
|
+
end
|
274
|
+
when 'I'
|
275
|
+
if chr == 'I'
|
276
|
+
seeking = ''
|
277
|
+
else
|
278
|
+
seeking = 'E'
|
279
|
+
end
|
280
|
+
when ''
|
281
|
+
if WHITE_SPACE.include? chr
|
282
|
+
eisize += 1 # Drop trailer
|
283
|
+
break
|
284
|
+
else
|
285
|
+
seeking = 'E'
|
286
|
+
end
|
239
287
|
end
|
288
|
+
prevchr = chr.is_a?(String) ? chr : ''
|
240
289
|
end
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
@io.seek(
|
290
|
+
unless seeking == ''
|
291
|
+
raise MalformedPDFError, "EI terminator not found"
|
292
|
+
end
|
293
|
+
eiend = @io.pos
|
294
|
+
@io.seek(idstart, IO::SEEK_SET)
|
295
|
+
str = @io.read(eiend - eisize - idstart) # get the ID content
|
296
|
+
@tokens << str.freeze if str
|
246
297
|
end
|
247
298
|
|
248
299
|
# if we're currently inside a hex string, read hex nibbles until
|
249
300
|
# we find a closing >
|
250
301
|
#
|
251
302
|
def prepare_hex_token
|
303
|
+
finished = :false
|
252
304
|
str = "".dup
|
253
|
-
finished = false
|
254
305
|
|
255
|
-
|
306
|
+
until finished == :true
|
256
307
|
byte = @io.getbyte
|
257
308
|
if byte.nil?
|
258
|
-
finished = true # unbalanced params
|
309
|
+
finished = :true # unbalanced params
|
259
310
|
elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
|
260
311
|
str << byte
|
261
312
|
elsif byte <= 32
|
@@ -264,7 +315,7 @@ class PDF::Reader
|
|
264
315
|
@tokens << str if str.size > 0
|
265
316
|
@tokens << ">" if byte != 0x3E # '>'
|
266
317
|
@tokens << byte.chr
|
267
|
-
finished = true
|
318
|
+
finished = :true
|
268
319
|
end
|
269
320
|
end
|
270
321
|
end
|
@@ -311,14 +362,17 @@ class PDF::Reader
|
|
311
362
|
def prepare_regular_token
|
312
363
|
tok = "".dup
|
313
364
|
|
314
|
-
|
365
|
+
loop do
|
366
|
+
byte = @io.getbyte
|
367
|
+
|
315
368
|
case byte
|
369
|
+
when nil
|
370
|
+
break
|
316
371
|
when 0x25
|
317
372
|
# comment, ignore everything until the next EOL char
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
done = true if byte.nil? || byte == 0x0A || byte == 0x0D
|
373
|
+
loop do
|
374
|
+
commentbyte = @io.getbyte
|
375
|
+
break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
|
322
376
|
end
|
323
377
|
when *TOKEN_WHITESPACE
|
324
378
|
# white space, token finished
|
@@ -388,15 +442,5 @@ class PDF::Reader
|
|
388
442
|
byte
|
389
443
|
end
|
390
444
|
|
391
|
-
# for a handful of tokens we want to tell the parser how to convert them
|
392
|
-
# into higher level tokens. This methods adds a to_token() method
|
393
|
-
# to tokens that should remain as strings.
|
394
|
-
#
|
395
|
-
def string_token(token)
|
396
|
-
def token.to_token
|
397
|
-
to_s
|
398
|
-
end
|
399
|
-
token
|
400
|
-
end
|
401
445
|
end
|
402
446
|
end
|
@@ -1,8 +1,7 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
|
-
#
|
5
|
-
|
6
5
|
require 'forwardable'
|
7
6
|
|
8
7
|
class PDF::Reader
|
@@ -32,10 +31,10 @@ class PDF::Reader
|
|
32
31
|
params << array.shift
|
33
32
|
|
34
33
|
if params.size == 2 && params.last.is_a?(Array)
|
35
|
-
widths.merge! parse_first_form(params.first, params.last)
|
34
|
+
widths.merge! parse_first_form(params.first.to_i, Array(params.last))
|
36
35
|
params = []
|
37
36
|
elsif params.size == 3
|
38
|
-
widths.merge! parse_second_form(params[0], params[1], params[2])
|
37
|
+
widths.merge! parse_second_form(params[0].to_i, params[1].to_i, params[2].to_i)
|
39
38
|
params = []
|
40
39
|
end
|
41
40
|
end
|
@@ -53,6 +52,10 @@ class PDF::Reader
|
|
53
52
|
|
54
53
|
# this is the form 10 20 123 where all index between 10 and 20 have width 123
|
55
54
|
def parse_second_form(first, final, width)
|
55
|
+
if first > final
|
56
|
+
raise MalformedPDFError, "CidWidths: #{first} must be less than #{final}"
|
57
|
+
end
|
58
|
+
|
56
59
|
(first..final).inject({}) { |accum, index|
|
57
60
|
accum[index] = width
|
58
61
|
accum
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -32,16 +33,17 @@ class PDF::Reader
|
|
32
33
|
# extracting various useful information.
|
33
34
|
#
|
34
35
|
class CMap # :nodoc:
|
36
|
+
|
35
37
|
CMAP_KEYWORDS = {
|
36
|
-
"begincodespacerange" =>
|
37
|
-
"endcodespacerange" =>
|
38
|
-
"beginbfchar" =>
|
39
|
-
"endbfchar" =>
|
40
|
-
"beginbfrange" =>
|
41
|
-
"endbfrange" =>
|
42
|
-
"begin" =>
|
43
|
-
"begincmap" =>
|
44
|
-
"def" =>
|
38
|
+
"begincodespacerange" => :noop,
|
39
|
+
"endcodespacerange" => :noop,
|
40
|
+
"beginbfchar" => :noop,
|
41
|
+
"endbfchar" => :noop,
|
42
|
+
"beginbfrange" => :noop,
|
43
|
+
"endbfrange" => :noop,
|
44
|
+
"begin" => :noop,
|
45
|
+
"begincmap" => :noop,
|
46
|
+
"def" => :noop
|
45
47
|
}
|
46
48
|
|
47
49
|
attr_reader :map
|
@@ -51,30 +53,6 @@ class PDF::Reader
|
|
51
53
|
process_data(data)
|
52
54
|
end
|
53
55
|
|
54
|
-
def process_data(data)
|
55
|
-
parser = build_parser(data)
|
56
|
-
mode = nil
|
57
|
-
instructions = []
|
58
|
-
|
59
|
-
while token = parser.parse_token(CMAP_KEYWORDS)
|
60
|
-
if token == "beginbfchar"
|
61
|
-
mode = :char
|
62
|
-
elsif token == "endbfchar"
|
63
|
-
process_bfchar_instructions(instructions)
|
64
|
-
instructions = []
|
65
|
-
mode = nil
|
66
|
-
elsif token == "beginbfrange"
|
67
|
-
mode = :range
|
68
|
-
elsif token == "endbfrange"
|
69
|
-
process_bfrange_instructions(instructions)
|
70
|
-
instructions = []
|
71
|
-
mode = nil
|
72
|
-
elsif mode == :char || mode == :range
|
73
|
-
instructions << token
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
56
|
def size
|
79
57
|
@map.size
|
80
58
|
end
|
@@ -84,44 +62,84 @@ class PDF::Reader
|
|
84
62
|
# Returns an array of Integers.
|
85
63
|
#
|
86
64
|
def decode(c)
|
87
|
-
|
88
|
-
return c unless Integer === c
|
89
|
-
@map[c]
|
65
|
+
@map.fetch(c, [])
|
90
66
|
end
|
91
67
|
|
92
68
|
private
|
93
69
|
|
70
|
+
def process_data(data, initial_mode = :none)
|
71
|
+
parser = build_parser(data)
|
72
|
+
mode = initial_mode
|
73
|
+
instructions = []
|
74
|
+
|
75
|
+
while token = parser.parse_token(CMAP_KEYWORDS)
|
76
|
+
if token.is_a?(String) || token.is_a?(Array)
|
77
|
+
if token == "beginbfchar"
|
78
|
+
mode = :char
|
79
|
+
elsif token == "endbfchar"
|
80
|
+
process_bfchar_instructions(instructions)
|
81
|
+
instructions = []
|
82
|
+
mode = :none
|
83
|
+
elsif token == "beginbfrange"
|
84
|
+
mode = :range
|
85
|
+
elsif token == "endbfrange"
|
86
|
+
process_bfrange_instructions(instructions)
|
87
|
+
instructions = []
|
88
|
+
mode = :none
|
89
|
+
elsif mode == :char
|
90
|
+
instructions << token.to_s
|
91
|
+
elsif mode == :range
|
92
|
+
instructions << token
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
|
94
99
|
def build_parser(instructions)
|
95
100
|
buffer = Buffer.new(StringIO.new(instructions))
|
96
101
|
Parser.new(buffer)
|
97
102
|
end
|
98
103
|
|
104
|
+
# The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
|
105
|
+
# theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
|
106
|
+
#
|
107
|
+
# str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
|
108
|
+
#
|
109
|
+
# However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
|
110
|
+
# exception when we try converting broken UTF-16 to UTF-8
|
111
|
+
#
|
99
112
|
def str_to_int(str)
|
100
|
-
|
101
|
-
unpacked_string = if str.size == 1 # UTF-8
|
113
|
+
unpacked_string = if str.bytesize == 1 # UTF-8
|
102
114
|
str.unpack("C*")
|
103
115
|
else # UTF-16
|
104
116
|
str.unpack("n*")
|
105
117
|
end
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
118
|
+
result = []
|
119
|
+
while unpacked_string.any? do
|
120
|
+
if unpacked_string.size >= 2 &&
|
121
|
+
unpacked_string.first.to_i > 0xD800 &&
|
122
|
+
unpacked_string.first.to_i < 0xDBFF
|
123
|
+
# this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
|
124
|
+
# lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
|
125
|
+
# low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
|
126
|
+
point_one = unpacked_string.shift.to_i
|
127
|
+
point_two = unpacked_string.shift.to_i
|
128
|
+
result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
|
129
|
+
else
|
130
|
+
result << unpacked_string.shift
|
131
|
+
end
|
117
132
|
end
|
133
|
+
result
|
118
134
|
end
|
119
135
|
|
120
136
|
def process_bfchar_instructions(instructions)
|
121
137
|
instructions.each_slice(2) do |one, two|
|
122
|
-
find = str_to_int(one)
|
123
|
-
replace = str_to_int(two)
|
124
|
-
|
138
|
+
find = str_to_int(one.to_s)
|
139
|
+
replace = str_to_int(two.to_s)
|
140
|
+
if find.any? && replace.any?
|
141
|
+
@map[find.first.to_i] = replace
|
142
|
+
end
|
125
143
|
end
|
126
144
|
end
|
127
145
|
|
@@ -132,30 +150,36 @@ class PDF::Reader
|
|
132
150
|
elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
|
133
151
|
bfrange_type_two(start, finish, to)
|
134
152
|
else
|
135
|
-
raise "invalid bfrange section"
|
153
|
+
raise MalformedPDFError, "invalid bfrange section"
|
136
154
|
end
|
137
155
|
end
|
138
156
|
end
|
139
157
|
|
140
158
|
def bfrange_type_one(start_code, end_code, dst)
|
141
|
-
start_code = str_to_int(start_code)
|
142
|
-
end_code = str_to_int(end_code)
|
159
|
+
start_code = str_to_int(start_code).first
|
160
|
+
end_code = str_to_int(end_code).first
|
143
161
|
dst = str_to_int(dst)
|
144
162
|
|
163
|
+
return if start_code.nil? || end_code.nil?
|
164
|
+
|
145
165
|
# add all values in the range to our mapping
|
146
166
|
(start_code..end_code).each_with_index do |val, idx|
|
147
|
-
@map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
|
167
|
+
@map[val] = dst.length == 1 ? [dst[0].to_i + idx] : [dst[0].to_i, dst[1].to_i + 1]
|
148
168
|
end
|
149
169
|
end
|
150
170
|
|
151
171
|
def bfrange_type_two(start_code, end_code, dst)
|
152
|
-
start_code = str_to_int(start_code)
|
153
|
-
end_code = str_to_int(end_code)
|
172
|
+
start_code = str_to_int(start_code).first
|
173
|
+
end_code = str_to_int(end_code).first
|
174
|
+
|
175
|
+
return if start_code.nil? || end_code.nil?
|
176
|
+
|
154
177
|
from_range = (start_code..end_code)
|
155
178
|
|
156
179
|
# add all values in the range to our mapping
|
157
180
|
from_range.each_with_index do |val, idx|
|
158
|
-
|
181
|
+
dst_char = dst[idx]
|
182
|
+
@map[val.to_i] = str_to_int(dst_char) if dst_char
|
159
183
|
end
|
160
184
|
end
|
161
185
|
end
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -40,20 +41,22 @@ class PDF::Reader
|
|
40
41
|
@mapping = default_mapping # maps from character codes to Unicode codepoints
|
41
42
|
@string_cache = {} # maps from character codes to UTF-8 strings.
|
42
43
|
|
43
|
-
if enc.kind_of?(Hash)
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
enc = enc.to_sym
|
44
|
+
@enc_name = if enc.kind_of?(Hash)
|
45
|
+
enc[:Encoding] || enc[:BaseEncoding]
|
46
|
+
elsif enc && enc.respond_to?(:to_sym)
|
47
|
+
enc.to_sym
|
48
48
|
else
|
49
|
-
|
49
|
+
:StandardEncoding
|
50
50
|
end
|
51
51
|
|
52
|
-
@
|
53
|
-
@
|
54
|
-
@map_file = get_mapping_file(enc)
|
52
|
+
@unpack = get_unpack(@enc_name)
|
53
|
+
@map_file = get_mapping_file(@enc_name)
|
55
54
|
|
56
55
|
load_mapping(@map_file) if @map_file
|
56
|
+
|
57
|
+
if enc.is_a?(Hash) && enc[:Differences]
|
58
|
+
self.differences = enc[:Differences]
|
59
|
+
end
|
57
60
|
end
|
58
61
|
|
59
62
|
# set the differences table for this encoding. should be an array in the following format:
|
@@ -66,16 +69,16 @@ class PDF::Reader
|
|
66
69
|
#
|
67
70
|
# [25, :A, :B]
|
68
71
|
def differences=(diff)
|
69
|
-
|
72
|
+
PDF::Reader::Error.validate_type(diff, "diff", Array)
|
70
73
|
|
71
74
|
@differences = {}
|
72
75
|
byte = 0
|
73
76
|
diff.each do |val|
|
74
77
|
if val.kind_of?(Numeric)
|
75
78
|
byte = val.to_i
|
76
|
-
|
79
|
+
elsif codepoint = glyphlist.name_to_unicode(val)
|
77
80
|
@differences[byte] = val
|
78
|
-
@mapping[byte] =
|
81
|
+
@mapping[byte] = codepoint
|
79
82
|
byte += 1
|
80
83
|
end
|
81
84
|
end
|
@@ -164,7 +167,7 @@ class PDF::Reader
|
|
164
167
|
end
|
165
168
|
|
166
169
|
def convert_to_utf8(str)
|
167
|
-
ret = str.unpack(unpack).map! { |c| @mapping[c] || c }.pack("U*")
|
170
|
+
ret = str.unpack(unpack).map! { |c| @mapping[c.to_i] || c }.pack("U*")
|
168
171
|
ret.force_encoding("UTF-8")
|
169
172
|
ret
|
170
173
|
end
|
@@ -206,7 +209,7 @@ class PDF::Reader
|
|
206
209
|
def load_mapping(file)
|
207
210
|
File.open(file, "r:BINARY") do |f|
|
208
211
|
f.each do |l|
|
209
|
-
_m, single_byte, unicode = *l.match(
|
212
|
+
_m, single_byte, unicode = *l.match(/\A([0-9A-Za-z]+);([0-9A-F]{4})/)
|
210
213
|
@mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
|
211
214
|
end
|
212
215
|
end
|
data/lib/pdf/reader/error.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -33,19 +34,30 @@ class PDF::Reader
|
|
33
34
|
def self.str_assert(lvalue, rvalue, chars=nil)
|
34
35
|
raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
|
35
36
|
lvalue = lvalue[0,chars] if chars
|
36
|
-
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue != rvalue
|
37
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
37
38
|
end
|
38
39
|
################################################################################
|
39
40
|
def self.str_assert_not(lvalue, rvalue, chars=nil)
|
40
41
|
raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
|
41
42
|
lvalue = lvalue[0,chars] if chars
|
42
|
-
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue == rvalue
|
43
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue == rvalue
|
43
44
|
end
|
44
45
|
################################################################################
|
45
46
|
def self.assert_equal(lvalue, rvalue)
|
46
|
-
raise MalformedPDFError, "PDF malformed, expected #{rvalue} but found #{lvalue} instead" if lvalue != rvalue
|
47
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
47
48
|
end
|
48
49
|
################################################################################
|
50
|
+
def self.validate_type(object, name, klass)
|
51
|
+
raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
|
52
|
+
end
|
53
|
+
################################################################################
|
54
|
+
def self.validate_type_as_malformed(object, name, klass)
|
55
|
+
raise MalformedPDFError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
|
56
|
+
end
|
57
|
+
################################################################################
|
58
|
+
def self.validate_not_nil(object, name)
|
59
|
+
raise ArgumentError, "#{object} must not be nil" if object.nil?
|
60
|
+
end
|
49
61
|
end
|
50
62
|
|
51
63
|
################################################################################
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'ascii85'
|
@@ -7,6 +8,7 @@ class PDF::Reader
|
|
7
8
|
module Filter # :nodoc:
|
8
9
|
# implementation of the Ascii85 filter
|
9
10
|
class Ascii85
|
11
|
+
|
10
12
|
def initialize(options = {})
|
11
13
|
@options = options
|
12
14
|
end
|
@@ -17,7 +19,11 @@ class PDF::Reader
|
|
17
19
|
#
|
18
20
|
def filter(data)
|
19
21
|
data = "<~#{data}" unless data.to_s[0,2] == "<~"
|
20
|
-
::
|
22
|
+
if defined?(::Ascii85Native)
|
23
|
+
::Ascii85Native::decode(data)
|
24
|
+
else
|
25
|
+
::Ascii85::decode(data)
|
26
|
+
end
|
21
27
|
rescue Exception => e
|
22
28
|
# Oops, there was a problem decoding the stream
|
23
29
|
raise MalformedPDFError,
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
#
|
@@ -6,6 +7,7 @@ class PDF::Reader
|
|
6
7
|
module Filter # :nodoc:
|
7
8
|
# implementation of the AsciiHex stream filter
|
8
9
|
class AsciiHex
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options
|
11
13
|
end
|
@@ -16,9 +18,12 @@ class PDF::Reader
|
|
16
18
|
def filter(data)
|
17
19
|
data.chop! if data[-1,1] == ">"
|
18
20
|
data = data[1,data.size] if data[0,1] == "<"
|
21
|
+
|
22
|
+
return "" if data.nil?
|
23
|
+
|
19
24
|
data.gsub!(/[^A-Fa-f0-9]/,"")
|
20
25
|
data << "0" if data.size % 2 == 1
|
21
|
-
data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
|
26
|
+
data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
|
22
27
|
rescue Exception => e
|
23
28
|
# Oops, there was a problem decoding the stream
|
24
29
|
raise MalformedPDFError,
|