pdf-reader 2.14.0 → 2.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +22 -0
- data/lib/pdf/reader/advanced_text_run_filter.rb +17 -2
- data/lib/pdf/reader/aes_v2_security_handler.rb +30 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +35 -3
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +1 -0
- data/lib/pdf/reader/buffer.rb +35 -17
- data/lib/pdf/reader/cid_widths.rb +7 -1
- data/lib/pdf/reader/cmap.rb +14 -3
- data/lib/pdf/reader/encoding.rb +37 -12
- data/lib/pdf/reader/error.rb +6 -0
- data/lib/pdf/reader/filter/ascii85.rb +2 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +4 -0
- data/lib/pdf/reader/filter/flate.rb +5 -2
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +2 -0
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +90 -22
- data/lib/pdf/reader/font_descriptor.rb +76 -23
- data/lib/pdf/reader/form_xobject.rb +11 -0
- data/lib/pdf/reader/glyph_hash.rb +34 -9
- data/lib/pdf/reader/key_builder_v5.rb +17 -9
- data/lib/pdf/reader/lzw.rb +17 -6
- data/lib/pdf/reader/no_text_filter.rb +1 -0
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +7 -2
- data/lib/pdf/reader/object_hash.rb +116 -9
- data/lib/pdf/reader/object_stream.rb +19 -2
- data/lib/pdf/reader/overlapping_runs_filter.rb +7 -1
- data/lib/pdf/reader/page.rb +41 -7
- data/lib/pdf/reader/page_layout.rb +25 -8
- data/lib/pdf/reader/page_state.rb +5 -2
- data/lib/pdf/reader/page_text_receiver.rb +6 -2
- data/lib/pdf/reader/pages_strategy.rb +1 -1
- data/lib/pdf/reader/parser.rb +51 -10
- data/lib/pdf/reader/point.rb +9 -2
- data/lib/pdf/reader/print_receiver.rb +2 -6
- data/lib/pdf/reader/rc4_security_handler.rb +2 -0
- data/lib/pdf/reader/rectangle.rb +24 -1
- data/lib/pdf/reader/reference.rb +10 -1
- data/lib/pdf/reader/register_receiver.rb +15 -2
- data/lib/pdf/reader/resources.rb +9 -0
- data/lib/pdf/reader/security_handler_factory.rb +13 -0
- data/lib/pdf/reader/standard_key_builder.rb +37 -23
- data/lib/pdf/reader/stream.rb +9 -3
- data/lib/pdf/reader/synchronized_cache.rb +5 -2
- data/lib/pdf/reader/text_run.rb +28 -1
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +33 -2
- data/lib/pdf/reader/type_check.rb +10 -3
- data/lib/pdf/reader/unimplemented_security_handler.rb +2 -0
- data/lib/pdf/reader/validating_receiver.rb +29 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +10 -3
- data/lib/pdf/reader/width_calculator/composite.rb +5 -1
- data/lib/pdf/reader/width_calculator/true_type.rb +5 -1
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +3 -1
- data/lib/pdf/reader/width_calculator/type_zero.rb +2 -0
- data/lib/pdf/reader/xref.rb +28 -7
- data/lib/pdf/reader/zero_width_runs_filter.rb +1 -0
- data/lib/pdf/reader.rb +18 -2
- data/rbi/pdf-reader.rbi +1502 -1594
- metadata +17 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1be615eb6abc5557e61ba53958c7211fac0f1528e75dc54eff27ffb5554d7c80
|
4
|
+
data.tar.gz: 875221f31dc119cd0f7ae3cc0246b3bbb70f6127c0047ec924c8030e9186b55b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4b4501ca72d06b5a569fdcc77f384131fbd85342f8da7a084a02210ec7a3821e8b9f1cad88685262d0cc4e993f7b0031bed5d510c353c7d8fb5fe28f97a2ea83
|
7
|
+
data.tar.gz: a4fe329f2d8ae7cc295cb17d573963ddab6c0cde52d6524ad182f4651dab8ba90215bcb1ecf60c7fcf248135aed152b50a1d34afa03b270b93c5a172ac4048b3
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,25 @@
|
|
1
|
+
v2.15.0 (13th August 2025)
|
2
|
+
|
3
|
+
- Overhaul sorbet types, moving from an external RBI file to inline comments in RBS syntax
|
4
|
+
- multiple PRs, but mainly https://github.com/yob/pdf-reader/pull/562
|
5
|
+
- See https://railsatscale.com/2025-04-23-rbs-support-for-sorbet/
|
6
|
+
- No impact expected for most users, but projects that use sorbet may find subtle changes in
|
7
|
+
the RBI file that is shipped with the gem
|
8
|
+
- Relax version requirements for dependency `afm`, allow 1.x (https://github.com/yob/pdf-reader/pull/557)
|
9
|
+
- Improve text positioning logic in some PDFs (https://github.com/yob/pdf-reader/pull/554)
|
10
|
+
- Multiple fixes for encrypted files
|
11
|
+
- Some files with passwords > 32 bytes long (https://github.com/yob/pdf-reader/pull/555)
|
12
|
+
- Some files that contain cipher text with a 16 byte IV and no further blocks (https://github.com/yob/pdf-reader/pull/561)
|
13
|
+
- Some files that contain data encrypted with no padding (https://github.com/yob/pdf-reader/pull/564)
|
14
|
+
- Add jruby 10 to CI matrix (https://github.com/yob/pdf-reader/pull/552)
|
15
|
+
|
16
|
+
v2.14.1 (4th February 2025)
|
17
|
+
- Fix issue in RBI signatures, introduced in v2.14.0 (https://github.com/yob/pdf-reader/pull/550)
|
18
|
+
|
19
|
+
v2.14.0 (29th January 2025)
|
20
|
+
- Raise minimum supported ruby to 2.1 (https://github.com/yob/pdf-reader/pull/543)
|
21
|
+
- Add support for filtering to Page#text (https://github.com/yob/pdf-reader/pull/545)
|
22
|
+
|
1
23
|
v2.13.0 (2nd November 2024)
|
2
24
|
- Permit Ascii85 v1.0 and v2.0 (https://github.com/yob/pdf-reader/pull/539)
|
3
25
|
- Allow StringIO type for PDF::Reader input (https://github.com/yob/pdf-reader/pull/535)
|
@@ -46,28 +46,37 @@ class PDF::Reader
|
|
46
46
|
less_than_or_equal
|
47
47
|
include
|
48
48
|
exclude
|
49
|
-
]
|
49
|
+
] #: Array[Symbol]
|
50
50
|
|
51
|
+
#: (Array[PDF::Reader::TextRun], Hash[Symbol, untyped]) -> Array[PDF::Reader::TextRun]
|
51
52
|
def self.only(text_runs, filter_hash)
|
52
53
|
new(text_runs, filter_hash).only
|
53
54
|
end
|
54
55
|
|
56
|
+
#: (Array[PDF::Reader::TextRun], Hash[Symbol, untyped]) -> Array[PDF::Reader::TextRun]
|
55
57
|
def self.exclude(text_runs, filter_hash)
|
56
58
|
new(text_runs, filter_hash).exclude
|
57
59
|
end
|
58
60
|
|
59
|
-
|
61
|
+
#: Array[PDF::Reader::TextRun]
|
62
|
+
attr_reader :text_runs
|
60
63
|
|
64
|
+
#: Hash[Symbol, untyped]
|
65
|
+
attr_reader :filter_hash
|
66
|
+
|
67
|
+
#: (Array[PDF::Reader::TextRun], Hash[Symbol, untyped]) -> void
|
61
68
|
def initialize(text_runs, filter_hash)
|
62
69
|
@text_runs = text_runs
|
63
70
|
@filter_hash = filter_hash
|
64
71
|
end
|
65
72
|
|
73
|
+
#: () -> Array[PDF::Reader::TextRun]
|
66
74
|
def only
|
67
75
|
return text_runs if filter_hash.empty?
|
68
76
|
text_runs.select { |text_run| evaluate_filter(text_run) }
|
69
77
|
end
|
70
78
|
|
79
|
+
#: () -> Array[PDF::Reader::TextRun]
|
71
80
|
def exclude
|
72
81
|
return text_runs if filter_hash.empty?
|
73
82
|
text_runs.reject { |text_run| evaluate_filter(text_run) }
|
@@ -75,6 +84,7 @@ class PDF::Reader
|
|
75
84
|
|
76
85
|
private
|
77
86
|
|
87
|
+
#: (PDF::Reader::TextRun) -> bool
|
78
88
|
def evaluate_filter(text_run)
|
79
89
|
if filter_hash[:or]
|
80
90
|
evaluate_or_filters(text_run, filter_hash[:or])
|
@@ -85,24 +95,28 @@ class PDF::Reader
|
|
85
95
|
end
|
86
96
|
end
|
87
97
|
|
98
|
+
#: (PDF::Reader::TextRun, Array[Hash[Symbol, untyped]]) -> bool
|
88
99
|
def evaluate_or_filters(text_run, conditions)
|
89
100
|
conditions.any? do |condition|
|
90
101
|
evaluate_filters(text_run, condition)
|
91
102
|
end
|
92
103
|
end
|
93
104
|
|
105
|
+
#: (PDF::Reader::TextRun, Array[Hash[Symbol, untyped]]) -> bool
|
94
106
|
def evaluate_and_filters(text_run, conditions)
|
95
107
|
conditions.all? do |condition|
|
96
108
|
evaluate_filters(text_run, condition)
|
97
109
|
end
|
98
110
|
end
|
99
111
|
|
112
|
+
#: (PDF::Reader::TextRun, Hash[Symbol, untyped]) -> bool
|
100
113
|
def evaluate_filters(text_run, filter_hash)
|
101
114
|
filter_hash.all? do |attribute, conditions|
|
102
115
|
evaluate_attribute_conditions(text_run, attribute, conditions)
|
103
116
|
end
|
104
117
|
end
|
105
118
|
|
119
|
+
#: (PDF::Reader::TextRun, Symbol, Hash[Symbol, untyped]) -> bool
|
106
120
|
def evaluate_attribute_conditions(text_run, attribute, conditions)
|
107
121
|
conditions.all? do |operator, value|
|
108
122
|
unless VALID_OPERATORS.include?(operator)
|
@@ -113,6 +127,7 @@ class PDF::Reader
|
|
113
127
|
end
|
114
128
|
end
|
115
129
|
|
130
|
+
#: (untyped, Symbol, untyped) -> bool
|
116
131
|
def apply_operator(attribute_value, operator, filter_value)
|
117
132
|
case operator
|
118
133
|
when :equal
|
@@ -11,6 +11,7 @@ class PDF::Reader
|
|
11
11
|
#
|
12
12
|
class AesV2SecurityHandler
|
13
13
|
|
14
|
+
#: (String) -> void
|
14
15
|
def initialize(key)
|
15
16
|
@encrypt_key = key
|
16
17
|
end
|
@@ -21,10 +22,38 @@ class PDF::Reader
|
|
21
22
|
#
|
22
23
|
# version == 4 and CFM == AESV2
|
23
24
|
#
|
25
|
+
# used to decrypt PDF streams (buf). Input data should be in bytesizes of
|
26
|
+
# a multiple of 16, anything else is an error. The first 16 bytes are the initialization
|
27
|
+
# vector, so any input of exactly 16 bytes decrypts to an empty string
|
28
|
+
#
|
24
29
|
# buf - a string to decrypt
|
25
30
|
# ref - a PDF::Reader::Reference for the object to decrypt
|
26
31
|
#
|
32
|
+
#: (String, PDF::Reader::Reference) -> String
|
27
33
|
def decrypt( buf, ref )
|
34
|
+
if buf.bytesize % 16 > 0
|
35
|
+
raise PDF::Reader::MalformedPDFError.new("Ciphertext not a multiple of 16")
|
36
|
+
elsif buf.bytesize == 16
|
37
|
+
return ""
|
38
|
+
else
|
39
|
+
begin
|
40
|
+
internal_decrypt(buf, ref)
|
41
|
+
rescue OpenSSL::Cipher::CipherError
|
42
|
+
# If we failed to decrypt it might be a padding error, so try again
|
43
|
+
# and assume no padding in the ciphertext. This will "succeed" but might
|
44
|
+
# return garbage if the key is incorrect but that's OK - well before this
|
45
|
+
# class is used we have confirmed the user provided key is correct so if
|
46
|
+
# this works without error we can be confident the returned plaintext is
|
47
|
+
# correct
|
48
|
+
internal_decrypt(buf, ref, false)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
#: (String, PDF::Reader::Reference, ?bool) -> String
|
56
|
+
def internal_decrypt(buf, ref, padding = true)
|
28
57
|
objKey = @encrypt_key.dup
|
29
58
|
(0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
|
30
59
|
(0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
|
@@ -32,6 +61,7 @@ class PDF::Reader
|
|
32
61
|
length = objKey.length < 16 ? objKey.length : 16
|
33
62
|
cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
|
34
63
|
cipher.decrypt
|
64
|
+
cipher.padding = 0 unless padding
|
35
65
|
cipher.key = Digest::MD5.digest(objKey)[0,length]
|
36
66
|
cipher.iv = buf[0..15]
|
37
67
|
cipher.update(buf[16..-1]) + cipher.final
|
@@ -12,27 +12,59 @@ class PDF::Reader
|
|
12
12
|
#
|
13
13
|
class AesV3SecurityHandler
|
14
14
|
|
15
|
+
#: (String) -> void
|
15
16
|
def initialize(key)
|
17
|
+
if key.bytesize != 32
|
18
|
+
raise PDF::Reader::MalformedPDFError.new(
|
19
|
+
"AES-256 key must be exactly 32 bytes, got #{key.bytesize}"
|
20
|
+
)
|
21
|
+
end
|
16
22
|
@encrypt_key = key
|
17
|
-
@cipher = "AES-256-CBC"
|
23
|
+
@cipher = "AES-256-CBC" #: String
|
18
24
|
end
|
19
25
|
|
20
26
|
##7.6.2 General Encryption Algorithm
|
21
27
|
#
|
22
28
|
# Algorithm 1: Encryption of data using the RC4 or AES algorithms
|
23
29
|
#
|
24
|
-
# used to decrypt RC4/AES encrypted PDF streams (buf)
|
30
|
+
# used to decrypt RC4/AES encrypted PDF streams (buf). Input data should be in bytesizes of
|
31
|
+
# a multiple of 16, anything else is an error. The first 16 bytes are the initialization
|
32
|
+
# vector, so any input of exactly 16 bytes decrypts to an empty string
|
25
33
|
#
|
26
34
|
# buf - a string to decrypt
|
27
35
|
# ref - a PDF::Reader::Reference for the object to decrypt
|
28
36
|
#
|
37
|
+
#: (String, PDF::Reader::Reference) -> String
|
29
38
|
def decrypt( buf, ref )
|
39
|
+
if buf.bytesize % 16 > 0
|
40
|
+
raise PDF::Reader::MalformedPDFError.new("Ciphertext not a multiple of 16")
|
41
|
+
elsif buf.bytesize == 16
|
42
|
+
return ""
|
43
|
+
else
|
44
|
+
begin
|
45
|
+
internal_decrypt(buf, ref)
|
46
|
+
rescue OpenSSL::Cipher::CipherError
|
47
|
+
# If we failed to decrypt it might be a padding error, so try again
|
48
|
+
# and assume no padding in the ciphertext. This will "succeed" but might
|
49
|
+
# return garbage if the key is incorrect but that's OK - well before this
|
50
|
+
# class is used we have confirmed the user provided key is correct so if
|
51
|
+
# this works without error we can be confident the returned plaintext is
|
52
|
+
# correct
|
53
|
+
internal_decrypt(buf, ref, false)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
private
|
59
|
+
|
60
|
+
#: (String, PDF::Reader::Reference, ?bool) -> String
|
61
|
+
def internal_decrypt(buf, ref, padding = true)
|
30
62
|
cipher = OpenSSL::Cipher.new(@cipher)
|
31
63
|
cipher.decrypt
|
64
|
+
cipher.padding = 0 unless padding
|
32
65
|
cipher.key = @encrypt_key.dup
|
33
66
|
cipher.iv = buf[0..15]
|
34
67
|
cipher.update(buf[16..-1]) + cipher.final
|
35
68
|
end
|
36
|
-
|
37
69
|
end
|
38
70
|
end
|
@@ -8,6 +8,7 @@ class PDF::Reader
|
|
8
8
|
# MediaBox or CropBox, but could be a user specified rectangle too
|
9
9
|
class BoundingRectangleRunsFilter
|
10
10
|
|
11
|
+
#: (Array[PDF::Reader::TextRun], PDF::Reader::Rectangle) -> Array[PDF::Reader::TextRun]
|
11
12
|
def self.runs_within_rect(runs, rect)
|
12
13
|
runs.select { |run| rect.contains?(run.origin) }
|
13
14
|
end
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -38,30 +38,31 @@ class PDF::Reader
|
|
38
38
|
# the raw tokens into objects we can work with (strings, ints, arrays, etc)
|
39
39
|
#
|
40
40
|
class Buffer
|
41
|
-
TOKEN_WHITESPACE=[0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20]
|
42
|
-
TOKEN_DELIMITER=[0x25, 0x3C, 0x3E, 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D, 0x2F]
|
41
|
+
TOKEN_WHITESPACE=[0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20] #: Array[Integer]
|
42
|
+
TOKEN_DELIMITER=[0x25, 0x3C, 0x3E, 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D, 0x2F] #: Array[Integer]
|
43
43
|
|
44
44
|
# some strings for comparisons. Declaring them here avoids creating new
|
45
45
|
# strings that need GC over and over
|
46
|
-
LEFT_PAREN = "("
|
47
|
-
LESS_THAN = "<"
|
48
|
-
STREAM = "stream"
|
49
|
-
ID = "ID"
|
50
|
-
FWD_SLASH = "/"
|
51
|
-
NULL_BYTE = "\x00"
|
52
|
-
CR = "\r"
|
53
|
-
LF = "\n"
|
54
|
-
CRLF = "\r\n"
|
55
|
-
WHITE_SPACE = ["\n", "\r", ' ']
|
46
|
+
LEFT_PAREN = "(" #: String
|
47
|
+
LESS_THAN = "<" #: String
|
48
|
+
STREAM = "stream" #: String
|
49
|
+
ID = "ID" #: String
|
50
|
+
FWD_SLASH = "/" #: String
|
51
|
+
NULL_BYTE = "\x00" #: String
|
52
|
+
CR = "\r" #: String
|
53
|
+
LF = "\n" #: String
|
54
|
+
CRLF = "\r\n" #: String
|
55
|
+
WHITE_SPACE = ["\n", "\r", ' '] #: Array[String]
|
56
56
|
|
57
57
|
# Quite a few PDFs have trailing junk.
|
58
58
|
# This can be several k of nuls in some cases
|
59
59
|
# Allow for this here
|
60
|
-
TRAILING_BYTECOUNT = 5000
|
60
|
+
TRAILING_BYTECOUNT = 5000 #: Integer
|
61
61
|
|
62
62
|
# must match whole tokens
|
63
|
-
DIGITS_ONLY = %r{\A\d+\z}
|
63
|
+
DIGITS_ONLY = %r{\A\d+\z} #: Regexp
|
64
64
|
|
65
|
+
#: Integer
|
65
66
|
attr_reader :pos
|
66
67
|
|
67
68
|
# Creates a new buffer.
|
@@ -76,17 +77,19 @@ class PDF::Reader
|
|
76
77
|
# :content_stream - set to true if buffer will be tokenising a
|
77
78
|
# content stream. Defaults to false
|
78
79
|
#
|
80
|
+
#: ((StringIO | Tempfile | IO), ?Hash[Symbol, untyped]) -> void
|
79
81
|
def initialize(io, opts = {})
|
80
82
|
@io = io
|
81
|
-
@tokens = []
|
82
|
-
@in_content_stream = opts[:content_stream]
|
83
|
+
@tokens = [] #: Array[String | PDF::Reader::Reference]
|
84
|
+
@in_content_stream = opts[:content_stream] #: bool
|
83
85
|
|
84
86
|
@io.seek(opts[:seek]) if opts[:seek]
|
85
|
-
@pos = @io.pos
|
87
|
+
@pos = @io.pos #: Integer
|
86
88
|
end
|
87
89
|
|
88
90
|
# return true if there are no more tokens left
|
89
91
|
#
|
92
|
+
#: () -> bool
|
90
93
|
def empty?
|
91
94
|
prepare_tokens if @tokens.size < 3
|
92
95
|
|
@@ -105,6 +108,7 @@ class PDF::Reader
|
|
105
108
|
# Skipping a bare CR is not spec-compliant.
|
106
109
|
# This is because the data may start with LF.
|
107
110
|
# However we check for CRLF first, so the ambiguity is avoided.
|
111
|
+
#: (Integer, ?Hash[Symbol, untyped]) -> String?
|
108
112
|
def read(bytes, opts = {})
|
109
113
|
reset_pos
|
110
114
|
|
@@ -130,6 +134,7 @@ class PDF::Reader
|
|
130
134
|
# return the next token from the source. Returns a string if a token
|
131
135
|
# is found, nil if there are no tokens left.
|
132
136
|
#
|
137
|
+
#: () -> (nil | String | PDF::Reader::Reference)
|
133
138
|
def token
|
134
139
|
reset_pos
|
135
140
|
prepare_tokens if @tokens.size < 3
|
@@ -141,6 +146,7 @@ class PDF::Reader
|
|
141
146
|
|
142
147
|
# return the byte offset where the first XRef table in the source can be found.
|
143
148
|
#
|
149
|
+
#: () -> Integer
|
144
150
|
def find_first_xref_offset
|
145
151
|
check_size_is_non_zero
|
146
152
|
@io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
|
@@ -164,6 +170,7 @@ class PDF::Reader
|
|
164
170
|
|
165
171
|
private
|
166
172
|
|
173
|
+
#: () -> void
|
167
174
|
def check_size_is_non_zero
|
168
175
|
@io.seek(-1, IO::SEEK_END)
|
169
176
|
@io.seek(0)
|
@@ -173,12 +180,14 @@ class PDF::Reader
|
|
173
180
|
|
174
181
|
# Returns true if this buffer is parsing a content stream
|
175
182
|
#
|
183
|
+
#: () -> bool
|
176
184
|
def in_content_stream?
|
177
185
|
@in_content_stream ? true : false
|
178
186
|
end
|
179
187
|
|
180
188
|
# Some bastard moved our IO stream cursor. Restore it.
|
181
189
|
#
|
190
|
+
#: () -> void
|
182
191
|
def reset_pos
|
183
192
|
@io.seek(@pos) if @io.pos != @pos
|
184
193
|
end
|
@@ -186,12 +195,14 @@ class PDF::Reader
|
|
186
195
|
# save the current position of the source IO stream. If someone else (like another buffer)
|
187
196
|
# moves the cursor, we can then restore it.
|
188
197
|
#
|
198
|
+
#: () -> void
|
189
199
|
def save_pos
|
190
200
|
@pos = @io.pos
|
191
201
|
end
|
192
202
|
|
193
203
|
# attempt to prime the buffer with the next few tokens.
|
194
204
|
#
|
205
|
+
#: () -> void
|
195
206
|
def prepare_tokens
|
196
207
|
10.times do
|
197
208
|
case state
|
@@ -208,6 +219,7 @@ class PDF::Reader
|
|
208
219
|
# tokenising behaves slightly differently based on the current context.
|
209
220
|
# Determine the current context/state by examining the last token we found
|
210
221
|
#
|
222
|
+
#: () -> Symbol
|
211
223
|
def state
|
212
224
|
case @tokens.last
|
213
225
|
when LEFT_PAREN then :literal_string
|
@@ -236,6 +248,7 @@ class PDF::Reader
|
|
236
248
|
# indirect reference, so test for that case first and avoid the relatively
|
237
249
|
# expensive regexp checks if possible.
|
238
250
|
#
|
251
|
+
#: () -> void
|
239
252
|
def merge_indirect_reference
|
240
253
|
return if @tokens.size < 3
|
241
254
|
return if @tokens[2] != "R"
|
@@ -253,6 +266,7 @@ class PDF::Reader
|
|
253
266
|
# If the EI follows white-space the space is dropped from the data
|
254
267
|
# The EI must be followed by white-space or end of buffer
|
255
268
|
# This is to reduce the chance of accidentally matching an embedded EI
|
269
|
+
#: () -> void
|
256
270
|
def prepare_inline_token
|
257
271
|
idstart = @io.pos
|
258
272
|
prevchr = ''
|
@@ -299,6 +313,7 @@ class PDF::Reader
|
|
299
313
|
# if we're currently inside a hex string, read hex nibbles until
|
300
314
|
# we find a closing >
|
301
315
|
#
|
316
|
+
#: () -> void
|
302
317
|
def prepare_hex_token
|
303
318
|
str = "".dup
|
304
319
|
|
@@ -328,6 +343,7 @@ class PDF::Reader
|
|
328
343
|
# processing to fix things like escaped new lines, but that's someone else's
|
329
344
|
# problem.
|
330
345
|
#
|
346
|
+
#: () -> void
|
331
347
|
def prepare_literal_token
|
332
348
|
str = "".dup
|
333
349
|
count = 1
|
@@ -358,6 +374,7 @@ class PDF::Reader
|
|
358
374
|
# What each byte means is complex, check out section "3.1.1 Character Set" of the 1.7 spec
|
359
375
|
# to read up on it.
|
360
376
|
#
|
377
|
+
#: () -> void
|
361
378
|
def prepare_regular_token
|
362
379
|
tok = "".dup
|
363
380
|
|
@@ -435,6 +452,7 @@ class PDF::Reader
|
|
435
452
|
# peek at the next character in the io stream, leaving the stream position
|
436
453
|
# untouched
|
437
454
|
#
|
455
|
+
#: () -> (Integer | nil)
|
438
456
|
def peek_byte
|
439
457
|
byte = @io.getbyte
|
440
458
|
@io.seek(-1, IO::SEEK_CUR) if byte
|
@@ -18,12 +18,14 @@ class PDF::Reader
|
|
18
18
|
# Graphics State Operators
|
19
19
|
def_delegators :@widths, :[], :fetch
|
20
20
|
|
21
|
+
#: (Numeric, Array[Numeric]) -> void
|
21
22
|
def initialize(default, array)
|
22
|
-
@widths = parse_array(default, array.dup)
|
23
|
+
@widths = parse_array(default, array.dup) #: Hash[Numeric, Numeric]
|
23
24
|
end
|
24
25
|
|
25
26
|
private
|
26
27
|
|
28
|
+
#: (Numeric, Array[Numeric]) -> Hash[Numeric, Numeric]
|
27
29
|
def parse_array(default, array)
|
28
30
|
widths = Hash.new(default)
|
29
31
|
params = []
|
@@ -43,6 +45,8 @@ class PDF::Reader
|
|
43
45
|
|
44
46
|
# this is the form 10 [234 63 234 346 47 234] where width of index 10 is
|
45
47
|
# 234, index 11 is 63, etc
|
48
|
+
#
|
49
|
+
#: (Integer, Array[Numeric]) -> Hash[Numeric, Numeric]
|
46
50
|
def parse_first_form(first, widths)
|
47
51
|
widths.inject({}) { |accum, glyph_width|
|
48
52
|
accum[first + accum.size] = glyph_width
|
@@ -51,6 +55,8 @@ class PDF::Reader
|
|
51
55
|
end
|
52
56
|
|
53
57
|
# this is the form 10 20 123 where all index between 10 and 20 have width 123
|
58
|
+
#
|
59
|
+
#: (Integer, Integer, Numeric) -> Hash[Numeric, Numeric]
|
54
60
|
def parse_second_form(first, final, width)
|
55
61
|
if first > final
|
56
62
|
raise MalformedPDFError, "CidWidths: #{first} must be less than #{final}"
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
-
# typed:
|
2
|
+
# typed: strict
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -44,15 +44,18 @@ class PDF::Reader
|
|
44
44
|
"begin" => :noop,
|
45
45
|
"begincmap" => :noop,
|
46
46
|
"def" => :noop
|
47
|
-
}
|
47
|
+
} #: Hash[String, Symbol]
|
48
48
|
|
49
|
+
#: Hash[Integer, Array[Integer]]
|
49
50
|
attr_reader :map
|
50
51
|
|
52
|
+
#: (String) -> void
|
51
53
|
def initialize(data)
|
52
|
-
@map = {}
|
54
|
+
@map = {} #: Hash[Integer, Array[Integer]]
|
53
55
|
process_data(data)
|
54
56
|
end
|
55
57
|
|
58
|
+
#: () -> Integer
|
56
59
|
def size
|
57
60
|
@map.size
|
58
61
|
end
|
@@ -61,12 +64,14 @@ class PDF::Reader
|
|
61
64
|
#
|
62
65
|
# Returns an array of Integers.
|
63
66
|
#
|
67
|
+
#: (Integer) -> Array[Integer]
|
64
68
|
def decode(c)
|
65
69
|
@map.fetch(c, [])
|
66
70
|
end
|
67
71
|
|
68
72
|
private
|
69
73
|
|
74
|
+
#: (String, ?Symbol) -> void
|
70
75
|
def process_data(data, initial_mode = :none)
|
71
76
|
parser = build_parser(data)
|
72
77
|
mode = initial_mode
|
@@ -96,6 +101,7 @@ class PDF::Reader
|
|
96
101
|
end
|
97
102
|
|
98
103
|
|
104
|
+
#: (String) -> PDF::Reader::Parser
|
99
105
|
def build_parser(instructions)
|
100
106
|
buffer = Buffer.new(StringIO.new(instructions))
|
101
107
|
Parser.new(buffer)
|
@@ -109,6 +115,7 @@ class PDF::Reader
|
|
109
115
|
# However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
|
110
116
|
# exception when we try converting broken UTF-16 to UTF-8
|
111
117
|
#
|
118
|
+
#: (String) -> Array[Integer]
|
112
119
|
def str_to_int(str)
|
113
120
|
unpacked_string = if str.bytesize == 1 # UTF-8
|
114
121
|
str.unpack("C*")
|
@@ -133,6 +140,7 @@ class PDF::Reader
|
|
133
140
|
result
|
134
141
|
end
|
135
142
|
|
143
|
+
#: (Array[String]) -> void
|
136
144
|
def process_bfchar_instructions(instructions)
|
137
145
|
instructions.each_slice(2) do |one, two|
|
138
146
|
find = str_to_int(one.to_s)
|
@@ -143,6 +151,7 @@ class PDF::Reader
|
|
143
151
|
end
|
144
152
|
end
|
145
153
|
|
154
|
+
#: (Array[Array[String] | String]) -> void
|
146
155
|
def process_bfrange_instructions(instructions)
|
147
156
|
instructions.each_slice(3) do |start, finish, to|
|
148
157
|
if start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(String)
|
@@ -155,6 +164,7 @@ class PDF::Reader
|
|
155
164
|
end
|
156
165
|
end
|
157
166
|
|
167
|
+
#: (String, String, String) -> void
|
158
168
|
def bfrange_type_one(start_code, end_code, dst)
|
159
169
|
start_code = str_to_int(start_code).first
|
160
170
|
end_code = str_to_int(end_code).first
|
@@ -168,6 +178,7 @@ class PDF::Reader
|
|
168
178
|
end
|
169
179
|
end
|
170
180
|
|
181
|
+
#: (String, String, Array[String]) -> void
|
171
182
|
def bfrange_type_two(start_code, end_code, dst)
|
172
183
|
start_code = str_to_int(start_code).first
|
173
184
|
end_code = str_to_int(end_code).first
|