pdf-reader 2.6.0 → 2.9.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +30 -1
- data/Rakefile +1 -1
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +36 -33
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +65 -50
- data/lib/pdf/reader/encoding.rb +2 -1
- data/lib/pdf/reader/error.rb +16 -0
- data/lib/pdf/reader/filter/ascii85.rb +2 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +8 -6
- data/lib/pdf/reader/filter/flate.rb +4 -2
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -1
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +11 -11
- data/lib/pdf/reader/font.rb +72 -16
- data/lib/pdf/reader/font_descriptor.rb +19 -17
- data/lib/pdf/reader/form_xobject.rb +15 -5
- data/lib/pdf/reader/glyph_hash.rb +1 -0
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -4
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +252 -44
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +99 -19
- data/lib/pdf/reader/page_layout.rb +28 -32
- data/lib/pdf/reader/page_state.rb +12 -11
- data/lib/pdf/reader/page_text_receiver.rb +57 -10
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +26 -8
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +16 -9
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
- data/lib/pdf/reader/stream.rb +2 -1
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +14 -6
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/type_check.rb +52 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -1
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +21 -3
- data/lib/pdf/reader/zero_width_runs_filter.rb +2 -0
- data/lib/pdf/reader.rb +46 -15
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1978 -0
- metadata +22 -13
- data/lib/pdf/reader/orientation_detector.rb +0 -34
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 07c734cf3cfc0abf1102f813976d4936d33b57815f114ce92224bbd605fe16a2
|
4
|
+
data.tar.gz: f52b1751f83717a7bc96c56e8d830559d387fb430cfa6fa2a78604d98c7476f4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 72fda8f6b32c20782adca6cca44d291c7cbe4ac9d858da5ed1c815af2a7d6680e3906cac47a8414923c8db639fd51365d9da8612c1c7f79a674b22448bb35cae
|
7
|
+
data.tar.gz: fa79a29d80a36d37e1188769bf7991d5108bbe08b11711a7c9bb1741cedd3682b77afe219a24ae7844fdbf10b23ca3eb5434f4b9418d7002f07fb8edf9dd6e26
|
data/CHANGELOG
CHANGED
@@ -1,6 +1,35 @@
|
|
1
|
+
v2.9.1 (4th February 2022)
|
2
|
+
- Fix exception in Page#walk introduced in 2.9.0 (http://github.com/yob/pdf-reader/pull/442)
|
3
|
+
- Other small bug fixes
|
4
|
+
|
5
|
+
v2.9.0 (24th January 2022)
|
6
|
+
- Support additional encryption standards (http://github.com/yob/pdf-reader/pull/419)
|
7
|
+
- Return CropBox correctly from Page#rectangles (https://github.com/yob/pdf-reader/pull/420)
|
8
|
+
- For sorbet users, additional type annotations are included in the gem
|
9
|
+
|
10
|
+
v2.8.0 (28th December 2021)
|
11
|
+
- Add PDF::Reader::Page#runs for extracting text from a page with positioning metadata (http://github.com/yob/pdf-reader/pull/411)
|
12
|
+
- Add options to PDF::Reader::Page#text to make some behaviour configurable (http://github.com/yob/pdf-reader/pull/411)
|
13
|
+
- including extracting the text for only part of the page
|
14
|
+
- Improve text positioning and extraction for Type3 fonts (http://github.com/yob/pdf-reader/pull/412)
|
15
|
+
- Skip extracting text that is positioned outside the page (http://github.com/yob/pdf-reader/pull/413)
|
16
|
+
- Fix occasional crash when reading some streams (http://github.com/yob/pdf-reader/pull/405)
|
17
|
+
|
18
|
+
v2.7.0 (13th December 2021)
|
19
|
+
- Include RBI type files in the gem
|
20
|
+
- Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
|
21
|
+
now be type checked by sorbet
|
22
|
+
- Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
|
23
|
+
- Improved text extraction on some rotated pages, and rotated text on normal pages
|
24
|
+
- Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
|
25
|
+
- Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
|
26
|
+
- Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
|
27
|
+
- Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
|
28
|
+
- Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
|
29
|
+
|
1
30
|
v2.6.0 (12th November 2021)
|
2
31
|
- Text extraction improvements
|
3
|
-
- Improved text layout on pages with a
|
32
|
+
- Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
|
4
33
|
- Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
|
5
34
|
- Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
|
6
35
|
- Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
|
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
|
|
14
14
|
Cane::RakeTask.new(:quality) do |cane|
|
15
15
|
cane.abc_max = 20
|
16
16
|
cane.style_measure = 100
|
17
|
-
cane.max_violations =
|
17
|
+
cane.max_violations = 28
|
18
18
|
|
19
19
|
cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
|
20
20
|
end
|
data/examples/rspec.rb
CHANGED
@@ -0,0 +1,41 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
require 'digest/md5'
|
6
|
+
|
7
|
+
class PDF::Reader
|
8
|
+
|
9
|
+
# Decrypts data using the AESV2 algorithm defined in the PDF spec. Requires
|
10
|
+
# a decryption key, which is usually generated by PDF::Reader::StandardKeyBuilder
|
11
|
+
#
|
12
|
+
class AesV2SecurityHandler
|
13
|
+
|
14
|
+
def initialize(key)
|
15
|
+
@encrypt_key = key
|
16
|
+
end
|
17
|
+
|
18
|
+
##7.6.2 General Encryption Algorithm
|
19
|
+
#
|
20
|
+
# Algorithm 1: Encryption of data using the AES-128-CBC algorithm
|
21
|
+
#
|
22
|
+
# version == 4 and CFM == AESV2
|
23
|
+
#
|
24
|
+
# buf - a string to decrypt
|
25
|
+
# ref - a PDF::Reader::Reference for the object to decrypt
|
26
|
+
#
|
27
|
+
def decrypt( buf, ref )
|
28
|
+
objKey = @encrypt_key.dup
|
29
|
+
(0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
|
30
|
+
(0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
|
31
|
+
objKey << 'sAlT' # Algorithm 1, b)
|
32
|
+
length = objKey.length < 16 ? objKey.length : 16
|
33
|
+
cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
|
34
|
+
cipher.decrypt
|
35
|
+
cipher.key = Digest::MD5.digest(objKey)[0,length]
|
36
|
+
cipher.iv = buf[0..15]
|
37
|
+
cipher.update(buf[16..-1]) + cipher.final
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
require 'digest'
|
6
|
+
require 'openssl'
|
7
|
+
|
8
|
+
class PDF::Reader
|
9
|
+
|
10
|
+
# Decrypts data using the AESV3 algorithm defined in the PDF 1.7, Extension Level 3 spec.
|
11
|
+
# Requires a decryption key, which is usually generated by PDF::Reader::KeyBuilderV5
|
12
|
+
#
|
13
|
+
class AesV3SecurityHandler
|
14
|
+
|
15
|
+
def initialize(key)
|
16
|
+
@encrypt_key = key
|
17
|
+
@cipher = "AES-256-CBC"
|
18
|
+
end
|
19
|
+
|
20
|
+
##7.6.2 General Encryption Algorithm
|
21
|
+
#
|
22
|
+
# Algorithm 1: Encryption of data using the RC4 or AES algorithms
|
23
|
+
#
|
24
|
+
# used to decrypt RC4/AES encrypted PDF streams (buf)
|
25
|
+
#
|
26
|
+
# buf - a string to decrypt
|
27
|
+
# ref - a PDF::Reader::Reference for the object to decrypt
|
28
|
+
#
|
29
|
+
def decrypt( buf, ref )
|
30
|
+
cipher = OpenSSL::Cipher.new(@cipher)
|
31
|
+
cipher.decrypt
|
32
|
+
cipher.key = @encrypt_key.dup
|
33
|
+
cipher.iv = buf[0..15]
|
34
|
+
cipher.update(buf[16..-1]) + cipher.final
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
|
7
|
+
# Filter out text/characters that are positioned outside a rectangle. Usually the page
|
8
|
+
# MediaBox or CropBox, but could be a user specified rectangle too
|
9
|
+
class BoundingRectangleRunsFilter
|
10
|
+
|
11
|
+
def self.runs_within_rect(runs, rect)
|
12
|
+
runs.select { |run| rect.contains?(run.origin) }
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: ASCII-8BIT
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -58,6 +59,9 @@ class PDF::Reader
|
|
58
59
|
# Allow for this here
|
59
60
|
TRAILING_BYTECOUNT = 5000
|
60
61
|
|
62
|
+
# must match whole tokens
|
63
|
+
DIGITS_ONLY = %r{\A\d+\z}
|
64
|
+
|
61
65
|
attr_reader :pos
|
62
66
|
|
63
67
|
# Creates a new buffer.
|
@@ -142,13 +146,20 @@ class PDF::Reader
|
|
142
146
|
@io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
|
143
147
|
data = @io.read(TRAILING_BYTECOUNT)
|
144
148
|
|
149
|
+
raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?
|
150
|
+
|
145
151
|
# the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
|
146
152
|
lines = data.split(/[\n\r]+/).reverse
|
147
153
|
eof_index = lines.index { |l| l.strip[/^%%EOF/] }
|
148
154
|
|
149
155
|
raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
|
150
156
|
raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
|
151
|
-
lines[eof_index+1].to_i
|
157
|
+
offset = lines[eof_index+1].to_i
|
158
|
+
|
159
|
+
# a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
|
160
|
+
# corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
|
161
|
+
raise MalformedPDFError, "invalid xref offset" if offset < 0
|
162
|
+
offset
|
152
163
|
end
|
153
164
|
|
154
165
|
private
|
@@ -229,13 +240,12 @@ class PDF::Reader
|
|
229
240
|
return if @tokens.size < 3
|
230
241
|
return if @tokens[2] != "R"
|
231
242
|
|
232
|
-
|
233
|
-
|
234
|
-
if
|
235
|
-
@tokens[0] = PDF::Reader::Reference.new(
|
236
|
-
@tokens
|
237
|
-
@tokens
|
238
|
-
@tokens.compact!
|
243
|
+
token_one = @tokens[0]
|
244
|
+
token_two = @tokens[1]
|
245
|
+
if token_one.is_a?(String) && token_two.is_a?(String) && token_one.match(DIGITS_ONLY) && token_two.match(DIGITS_ONLY)
|
246
|
+
@tokens[0] = PDF::Reader::Reference.new(token_one.to_i, token_two.to_i)
|
247
|
+
@tokens.delete_at(2)
|
248
|
+
@tokens.delete_at(1)
|
239
249
|
end
|
240
250
|
end
|
241
251
|
|
@@ -245,7 +255,7 @@ class PDF::Reader
|
|
245
255
|
# This is to reduce the chance of accidentally matching an embedded EI
|
246
256
|
def prepare_inline_token
|
247
257
|
idstart = @io.pos
|
248
|
-
|
258
|
+
prevchr = ''
|
249
259
|
eisize = 0 # how many chars in the end marker
|
250
260
|
seeking = 'E' # what are we looking for now?
|
251
261
|
loop do
|
@@ -263,11 +273,11 @@ class PDF::Reader
|
|
263
273
|
end
|
264
274
|
when 'I'
|
265
275
|
if chr == 'I'
|
266
|
-
seeking =
|
276
|
+
seeking = ''
|
267
277
|
else
|
268
278
|
seeking = 'E'
|
269
279
|
end
|
270
|
-
when
|
280
|
+
when ''
|
271
281
|
if WHITE_SPACE.include? chr
|
272
282
|
eisize += 1 # Drop trailer
|
273
283
|
break
|
@@ -275,28 +285,28 @@ class PDF::Reader
|
|
275
285
|
seeking = 'E'
|
276
286
|
end
|
277
287
|
end
|
278
|
-
prevchr = chr
|
288
|
+
prevchr = chr.is_a?(String) ? chr : ''
|
279
289
|
end
|
280
|
-
unless seeking ==
|
290
|
+
unless seeking == ''
|
281
291
|
raise MalformedPDFError, "EI terminator not found"
|
282
292
|
end
|
283
293
|
eiend = @io.pos
|
284
294
|
@io.seek(idstart, IO::SEEK_SET)
|
285
295
|
str = @io.read(eiend - eisize - idstart) # get the ID content
|
286
|
-
@tokens <<
|
296
|
+
@tokens << str.freeze if str
|
287
297
|
end
|
288
298
|
|
289
299
|
# if we're currently inside a hex string, read hex nibbles until
|
290
300
|
# we find a closing >
|
291
301
|
#
|
292
302
|
def prepare_hex_token
|
303
|
+
finished = :false
|
293
304
|
str = "".dup
|
294
|
-
finished = false
|
295
305
|
|
296
|
-
|
306
|
+
until finished == :true
|
297
307
|
byte = @io.getbyte
|
298
308
|
if byte.nil?
|
299
|
-
finished = true # unbalanced params
|
309
|
+
finished = :true # unbalanced params
|
300
310
|
elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
|
301
311
|
str << byte
|
302
312
|
elsif byte <= 32
|
@@ -305,7 +315,7 @@ class PDF::Reader
|
|
305
315
|
@tokens << str if str.size > 0
|
306
316
|
@tokens << ">" if byte != 0x3E # '>'
|
307
317
|
@tokens << byte.chr
|
308
|
-
finished = true
|
318
|
+
finished = :true
|
309
319
|
end
|
310
320
|
end
|
311
321
|
end
|
@@ -352,14 +362,17 @@ class PDF::Reader
|
|
352
362
|
def prepare_regular_token
|
353
363
|
tok = "".dup
|
354
364
|
|
355
|
-
|
365
|
+
loop do
|
366
|
+
byte = @io.getbyte
|
367
|
+
|
356
368
|
case byte
|
369
|
+
when nil
|
370
|
+
break
|
357
371
|
when 0x25
|
358
372
|
# comment, ignore everything until the next EOL char
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
done = true if byte.nil? || byte == 0x0A || byte == 0x0D
|
373
|
+
loop do
|
374
|
+
commentbyte = @io.getbyte
|
375
|
+
break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
|
363
376
|
end
|
364
377
|
when *TOKEN_WHITESPACE
|
365
378
|
# white space, token finished
|
@@ -429,15 +442,5 @@ class PDF::Reader
|
|
429
442
|
byte
|
430
443
|
end
|
431
444
|
|
432
|
-
# for a handful of tokens we want to tell the parser how to convert them
|
433
|
-
# into higher level tokens. This methods adds a to_token() method
|
434
|
-
# to tokens that should remain as strings.
|
435
|
-
#
|
436
|
-
def string_token(token)
|
437
|
-
def token.to_token
|
438
|
-
to_s
|
439
|
-
end
|
440
|
-
token
|
441
|
-
end
|
442
445
|
end
|
443
446
|
end
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -32,16 +33,17 @@ class PDF::Reader
|
|
32
33
|
# extracting various useful information.
|
33
34
|
#
|
34
35
|
class CMap # :nodoc:
|
36
|
+
|
35
37
|
CMAP_KEYWORDS = {
|
36
|
-
"begincodespacerange" =>
|
37
|
-
"endcodespacerange" =>
|
38
|
-
"beginbfchar" =>
|
39
|
-
"endbfchar" =>
|
40
|
-
"beginbfrange" =>
|
41
|
-
"endbfrange" =>
|
42
|
-
"begin" =>
|
43
|
-
"begincmap" =>
|
44
|
-
"def" =>
|
38
|
+
"begincodespacerange" => :noop,
|
39
|
+
"endcodespacerange" => :noop,
|
40
|
+
"beginbfchar" => :noop,
|
41
|
+
"endbfchar" => :noop,
|
42
|
+
"beginbfrange" => :noop,
|
43
|
+
"endbfrange" => :noop,
|
44
|
+
"begin" => :noop,
|
45
|
+
"begincmap" => :noop,
|
46
|
+
"def" => :noop
|
45
47
|
}
|
46
48
|
|
47
49
|
attr_reader :map
|
@@ -51,30 +53,6 @@ class PDF::Reader
|
|
51
53
|
process_data(data)
|
52
54
|
end
|
53
55
|
|
54
|
-
def process_data(data)
|
55
|
-
parser = build_parser(data)
|
56
|
-
mode = nil
|
57
|
-
instructions = []
|
58
|
-
|
59
|
-
while token = parser.parse_token(CMAP_KEYWORDS)
|
60
|
-
if token == "beginbfchar"
|
61
|
-
mode = :char
|
62
|
-
elsif token == "endbfchar"
|
63
|
-
process_bfchar_instructions(instructions)
|
64
|
-
instructions = []
|
65
|
-
mode = nil
|
66
|
-
elsif token == "beginbfrange"
|
67
|
-
mode = :range
|
68
|
-
elsif token == "endbfrange"
|
69
|
-
process_bfrange_instructions(instructions)
|
70
|
-
instructions = []
|
71
|
-
mode = nil
|
72
|
-
elsif mode == :char || mode == :range
|
73
|
-
instructions << token
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
56
|
def size
|
79
57
|
@map.size
|
80
58
|
end
|
@@ -84,13 +62,40 @@ class PDF::Reader
|
|
84
62
|
# Returns an array of Integers.
|
85
63
|
#
|
86
64
|
def decode(c)
|
87
|
-
|
88
|
-
return c unless Integer === c
|
89
|
-
@map[c]
|
65
|
+
@map.fetch(c, [])
|
90
66
|
end
|
91
67
|
|
92
68
|
private
|
93
69
|
|
70
|
+
def process_data(data, initial_mode = :none)
|
71
|
+
parser = build_parser(data)
|
72
|
+
mode = initial_mode
|
73
|
+
instructions = []
|
74
|
+
|
75
|
+
while token = parser.parse_token(CMAP_KEYWORDS)
|
76
|
+
if token.is_a?(String) || token.is_a?(Array)
|
77
|
+
if token == "beginbfchar"
|
78
|
+
mode = :char
|
79
|
+
elsif token == "endbfchar"
|
80
|
+
process_bfchar_instructions(instructions)
|
81
|
+
instructions = []
|
82
|
+
mode = :none
|
83
|
+
elsif token == "beginbfrange"
|
84
|
+
mode = :range
|
85
|
+
elsif token == "endbfrange"
|
86
|
+
process_bfrange_instructions(instructions)
|
87
|
+
instructions = []
|
88
|
+
mode = :none
|
89
|
+
elsif mode == :char
|
90
|
+
instructions << token.to_s
|
91
|
+
elsif mode == :range
|
92
|
+
instructions << token
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
|
94
99
|
def build_parser(instructions)
|
95
100
|
buffer = Buffer.new(StringIO.new(instructions))
|
96
101
|
Parser.new(buffer)
|
@@ -105,7 +110,6 @@ class PDF::Reader
|
|
105
110
|
# exception when we try converting broken UTF-16 to UTF-8
|
106
111
|
#
|
107
112
|
def str_to_int(str)
|
108
|
-
return nil if str.nil? || str.size == 0
|
109
113
|
unpacked_string = if str.bytesize == 1 # UTF-8
|
110
114
|
str.unpack("C*")
|
111
115
|
else # UTF-16
|
@@ -113,12 +117,15 @@ class PDF::Reader
|
|
113
117
|
end
|
114
118
|
result = []
|
115
119
|
while unpacked_string.any? do
|
116
|
-
if unpacked_string.size >= 2 &&
|
120
|
+
if unpacked_string.size >= 2 &&
|
121
|
+
unpacked_string.first.to_i > 0xD800 &&
|
122
|
+
unpacked_string.first.to_i < 0xDBFF
|
117
123
|
# this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
|
118
124
|
# lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
|
119
125
|
# low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
|
120
|
-
|
121
|
-
|
126
|
+
point_one = unpacked_string.shift.to_i
|
127
|
+
point_two = unpacked_string.shift.to_i
|
128
|
+
result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
|
122
129
|
else
|
123
130
|
result << unpacked_string.shift
|
124
131
|
end
|
@@ -128,9 +135,11 @@ class PDF::Reader
|
|
128
135
|
|
129
136
|
def process_bfchar_instructions(instructions)
|
130
137
|
instructions.each_slice(2) do |one, two|
|
131
|
-
find = str_to_int(one)
|
132
|
-
replace = str_to_int(two)
|
133
|
-
|
138
|
+
find = str_to_int(one.to_s)
|
139
|
+
replace = str_to_int(two.to_s)
|
140
|
+
if find.any? && replace.any?
|
141
|
+
@map[find.first.to_i] = replace
|
142
|
+
end
|
134
143
|
end
|
135
144
|
end
|
136
145
|
|
@@ -141,30 +150,36 @@ class PDF::Reader
|
|
141
150
|
elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
|
142
151
|
bfrange_type_two(start, finish, to)
|
143
152
|
else
|
144
|
-
raise "invalid bfrange section"
|
153
|
+
raise MalformedPDFError, "invalid bfrange section"
|
145
154
|
end
|
146
155
|
end
|
147
156
|
end
|
148
157
|
|
149
158
|
def bfrange_type_one(start_code, end_code, dst)
|
150
|
-
start_code = str_to_int(start_code)
|
151
|
-
end_code = str_to_int(end_code)
|
159
|
+
start_code = str_to_int(start_code).first
|
160
|
+
end_code = str_to_int(end_code).first
|
152
161
|
dst = str_to_int(dst)
|
153
162
|
|
163
|
+
return if start_code.nil? || end_code.nil?
|
164
|
+
|
154
165
|
# add all values in the range to our mapping
|
155
166
|
(start_code..end_code).each_with_index do |val, idx|
|
156
|
-
@map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
|
167
|
+
@map[val] = dst.length == 1 ? [dst[0].to_i + idx] : [dst[0].to_i, dst[1].to_i + 1]
|
157
168
|
end
|
158
169
|
end
|
159
170
|
|
160
171
|
def bfrange_type_two(start_code, end_code, dst)
|
161
|
-
start_code = str_to_int(start_code)
|
162
|
-
end_code = str_to_int(end_code)
|
172
|
+
start_code = str_to_int(start_code).first
|
173
|
+
end_code = str_to_int(end_code).first
|
174
|
+
|
175
|
+
return if start_code.nil? || end_code.nil?
|
176
|
+
|
163
177
|
from_range = (start_code..end_code)
|
164
178
|
|
165
179
|
# add all values in the range to our mapping
|
166
180
|
from_range.each_with_index do |val, idx|
|
167
|
-
|
181
|
+
dst_char = dst[idx]
|
182
|
+
@map[val.to_i] = str_to_int(dst_char) if dst_char
|
168
183
|
end
|
169
184
|
end
|
170
185
|
end
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -68,7 +69,7 @@ class PDF::Reader
|
|
68
69
|
#
|
69
70
|
# [25, :A, :B]
|
70
71
|
def differences=(diff)
|
71
|
-
|
72
|
+
PDF::Reader::Error.validate_type(diff, "diff", Array)
|
72
73
|
|
73
74
|
@differences = {}
|
74
75
|
byte = 0
|
data/lib/pdf/reader/error.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -46,6 +47,21 @@ class PDF::Reader
|
|
46
47
|
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
47
48
|
end
|
48
49
|
################################################################################
|
50
|
+
def self.validate_type(object, name, klass)
|
51
|
+
raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
|
52
|
+
end
|
53
|
+
################################################################################
|
54
|
+
def self.validate_type_as_malformed(object, name, klass)
|
55
|
+
raise MalformedPDFError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
|
56
|
+
end
|
57
|
+
################################################################################
|
58
|
+
def self.validate_not_nil(object, name)
|
59
|
+
raise ArgumentError, "#{object} must not be nil" if object.nil?
|
60
|
+
end
|
61
|
+
################################################################################
|
62
|
+
def self.validate_not_nil_as_malformed(object, name)
|
63
|
+
raise MalformedPDFError, "#{object} must not be nil" if object.nil?
|
64
|
+
end
|
49
65
|
end
|
50
66
|
|
51
67
|
################################################################################
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'ascii85'
|
@@ -7,6 +8,7 @@ class PDF::Reader
|
|
7
8
|
module Filter # :nodoc:
|
8
9
|
# implementation of the Ascii85 filter
|
9
10
|
class Ascii85
|
11
|
+
|
10
12
|
def initialize(options = {})
|
11
13
|
@options = options
|
12
14
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
#
|
@@ -6,6 +7,7 @@ class PDF::Reader
|
|
6
7
|
module Filter # :nodoc:
|
7
8
|
# implementation of the AsciiHex stream filter
|
8
9
|
class AsciiHex
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options
|
11
13
|
end
|
@@ -16,9 +18,12 @@ class PDF::Reader
|
|
16
18
|
def filter(data)
|
17
19
|
data.chop! if data[-1,1] == ">"
|
18
20
|
data = data[1,data.size] if data[0,1] == "<"
|
21
|
+
|
22
|
+
return "" if data.nil?
|
23
|
+
|
19
24
|
data.gsub!(/[^A-Fa-f0-9]/,"")
|
20
25
|
data << "0" if data.size % 2 == 1
|
21
|
-
data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
|
26
|
+
data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
|
22
27
|
rescue Exception => e
|
23
28
|
# Oops, there was a problem decoding the stream
|
24
29
|
raise MalformedPDFError,
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
class PDF::Reader
|
@@ -6,8 +7,9 @@ class PDF::Reader
|
|
6
7
|
# some filter implementations support preprocessing of the data to
|
7
8
|
# improve compression
|
8
9
|
class Depredict
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
|
-
@options = options
|
12
|
+
@options = options
|
11
13
|
end
|
12
14
|
|
13
15
|
################################################################################
|
@@ -67,7 +69,7 @@ class PDF::Reader
|
|
67
69
|
scanline_length = (pixel_bytes * @options[:Columns]) + 1
|
68
70
|
row = 0
|
69
71
|
pixels = []
|
70
|
-
paeth, pa, pb, pc =
|
72
|
+
paeth, pa, pb, pc = 0, 0, 0, 0
|
71
73
|
until data.empty? do
|
72
74
|
row_data = data.slice! 0, scanline_length
|
73
75
|
filter = row_data.shift
|
@@ -94,17 +96,17 @@ class PDF::Reader
|
|
94
96
|
row_data[index] = (byte + ((left + upper)/2).floor) % 256
|
95
97
|
end
|
96
98
|
when 4 # Paeth
|
97
|
-
left = upper = upper_left =
|
99
|
+
left = upper = upper_left = 0
|
98
100
|
row_data.each_with_index do |byte, index|
|
99
101
|
col = index / pixel_bytes
|
100
102
|
|
101
|
-
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
103
|
+
left = index < pixel_bytes ? 0 : Integer(row_data[index - pixel_bytes])
|
102
104
|
if row.zero?
|
103
105
|
upper = upper_left = 0
|
104
106
|
else
|
105
|
-
upper = pixels[row-1][col][index % pixel_bytes]
|
107
|
+
upper = Integer(pixels[row-1][col][index % pixel_bytes])
|
106
108
|
upper_left = col.zero? ? 0 :
|
107
|
-
pixels[row-1][col-1][index % pixel_bytes]
|
109
|
+
Integer(pixels[row-1][col-1][index % pixel_bytes])
|
108
110
|
end
|
109
111
|
|
110
112
|
p = left + upper - upper_left
|