unicode-display_width 3.1.0 → 3.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -1
- data/README.md +2 -1
- data/data/display_width.marshal.gz +0 -0
- data/lib/unicode/display_width/constants.rb +1 -1
- data/lib/unicode/display_width.rb +108 -149
- metadata +8 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a85ca57ca5e291c17993e526d222dda44b884286484b3831bb8173ce92aafb1a
|
4
|
+
data.tar.gz: d1036dfc6464459de04a713e273d09dea767a3b9a9629d9e491052c2ffe97c23
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d669e8a2866b56a78bafb3fff6d2d6430fab6bb1ca2633aeaac68e0634ca14374ac0b325bc7159ef90afe0bdffd9c154700cae1fc3183b1d74281ff4b5024e1b
|
7
|
+
data.tar.gz: 5f319484d27dad70b3851398e11cd3cb93b5c4f41a6c3a76c958d505d8357f9e303b661fd7a0339262d1458b82cb8619e6682ee2dbf8c583d33fbde4fd1a8680
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,13 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
## 3.1.2
|
4
|
+
|
5
|
+
- Performance improvements
|
6
|
+
|
7
|
+
## 3.1.1
|
8
|
+
|
9
|
+
- Performance improvements
|
10
|
+
|
3
11
|
## 3.1.0
|
4
12
|
|
5
13
|
**Improve Emoji support:**
|
@@ -7,7 +15,7 @@
|
|
7
15
|
- Emoji modes: Differentiate between well-formed Emoji (`:possible`) and any
|
8
16
|
ZWJ/modifier sequence (`:all`). The latter is more common and more efficient
|
9
17
|
to implement.
|
10
|
-
- Unify
|
18
|
+
- Unify `:rgi_{fqe,mqe,uqe}` options to just `:rgi` to keep things simpler (corresponds to
|
11
19
|
the former `:rgi_uqe` option). Most terminals that want to support the RGI set
|
12
20
|
will probably want to catch Emoji sequences with missing VS16s.
|
13
21
|
- Add new `:all_no_vs16` and `:rgi_at` modes to be able to support some terminals
|
data/README.md
CHANGED
@@ -114,10 +114,11 @@ The `emoji:` option can be used to configure which type of Emoji should be consi
|
|
114
114
|
`:all_no_vs16` | EAW (1 or 2) | 2 for all ZWJ/modifier/keycap sequences, even if they are not well-formed Emoji sequences | WezTerm
|
115
115
|
`:possible`| 2 | 2 for all possible/well-formed Emoji sequences | ?
|
116
116
|
`:rgi` | 2 | 2 for all [RGI Emoji](https://www.unicode.org/reports/tr51/#def_rgi_set) sequences | ?
|
117
|
-
`:rgi_at` | EAW (1 or 2) | 1 or 2: Like `:rgi`, but Emoji sequences starting with a default-text Emoji have
|
117
|
+
`:rgi_at` | EAW (1 or 2) | 1 or 2: Like `:rgi`, but Emoji sequences starting with a default-text Emoji have EAW | Apple Terminal
|
118
118
|
`:vs16` | 2 | 2 * number of partial Emoji (sequences never considered to represent a combined Emoji) | kitty?
|
119
119
|
`false` or `:none` | EAW (1 or 2) | No Emoji adjustments | gnome-terminal, many older terminals
|
120
120
|
|
121
|
+
- *EAW:* East Asian Width
|
121
122
|
- *RGI Emoji:* Emoji Recommended for General Interchange
|
122
123
|
- *ZWJ:* Zero-width Joiner: Codepoint `U+200D`, used in many Emoji sequences
|
123
124
|
|
Binary file
|
@@ -10,8 +10,8 @@ module Unicode
|
|
10
10
|
class DisplayWidth
|
11
11
|
DEFAULT_AMBIGUOUS = 1
|
12
12
|
INITIAL_DEPTH = 0x10000
|
13
|
-
ASCII_NON_ZERO_REGEX = /[\0\x05\a\b\n
|
14
|
-
ASCII_NON_ZERO_STRING = "\0\x05\a\b\n
|
13
|
+
ASCII_NON_ZERO_REGEX = /[\0\x05\a\b\n-\x0F]/
|
14
|
+
ASCII_NON_ZERO_STRING = "\0\x05\a\b\n-\x0F"
|
15
15
|
ASCII_BACKSPACE = "\b"
|
16
16
|
AMBIGUOUS_MAP = {
|
17
17
|
1 => :WIDTH_ONE,
|
@@ -21,6 +21,10 @@ module Unicode
|
|
21
21
|
WIDTH_ONE: 768,
|
22
22
|
WIDTH_TWO: 161,
|
23
23
|
}
|
24
|
+
NOT_COMMON_NARROW_REGEX = {
|
25
|
+
WIDTH_ONE: /[^\u{10}-\u{2FF}]/m,
|
26
|
+
WIDTH_TWO: /[^\u{10}-\u{A1}]/m,
|
27
|
+
}
|
24
28
|
FIRST_4096 = {
|
25
29
|
WIDTH_ONE: decompress_index(INDEX[:WIDTH_ONE][0][0], 1),
|
26
30
|
WIDTH_TWO: decompress_index(INDEX[:WIDTH_TWO][0][0], 1),
|
@@ -30,126 +34,61 @@ module Unicode
|
|
30
34
|
rgi_at: :REGEX_INCLUDE_MQE_UQE,
|
31
35
|
possible: :REGEX_WELL_FORMED,
|
32
36
|
}
|
33
|
-
|
37
|
+
REGEX_EMOJI_VS16 = Regexp.union(
|
38
|
+
Regexp.compile(
|
39
|
+
Unicode::Emoji::REGEX_TEXT_PRESENTATION.source +
|
40
|
+
"(?<![#*0-9])" +
|
41
|
+
"\u{FE0F}"
|
42
|
+
),
|
43
|
+
Unicode::Emoji::REGEX_EMOJI_KEYCAP
|
44
|
+
)
|
34
45
|
REGEX_EMOJI_ALL_SEQUENCES = Regexp.union(/.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?(\u{200D}.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?)+/, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
|
35
|
-
|
46
|
+
REGEX_EMOJI_ALL_SEQUENCES_AND_VS16 = Regexp.union(REGEX_EMOJI_ALL_SEQUENCES, REGEX_EMOJI_VS16)
|
36
47
|
|
37
48
|
# Returns monospace display width of string
|
38
49
|
def self.of(string, ambiguous = nil, overwrite = nil, old_options = {}, **options)
|
39
|
-
unless
|
40
|
-
|
41
|
-
options.merge! old_options
|
42
|
-
end
|
43
|
-
|
44
|
-
options[:ambiguous] = ambiguous if ambiguous
|
45
|
-
options[:ambiguous] ||= DEFAULT_AMBIGUOUS
|
46
|
-
|
47
|
-
if options[:ambiguous] != 1 && options[:ambiguous] != 2
|
48
|
-
raise ArgumentError, "Unicode::DisplayWidth: Ambiguous width must be 1 or 2"
|
49
|
-
end
|
50
|
-
|
51
|
-
if overwrite && !overwrite.empty?
|
52
|
-
warn "Unicode::DisplayWidth: Please migrate to keyword arguments - overwrite: #{overwrite.inspect}"
|
53
|
-
options[:overwrite] = overwrite
|
54
|
-
end
|
55
|
-
options[:overwrite] ||= {}
|
56
|
-
|
57
|
-
if [nil, true, :auto].include?(options[:emoji])
|
58
|
-
options[:emoji] = EmojiSupport.recommended
|
59
|
-
end
|
50
|
+
string = string.encode(Encoding::UTF_8) unless string.encoding == Encoding::UTF_8
|
51
|
+
options = normalize_options(string, ambiguous, overwrite, old_options, **options)
|
60
52
|
|
61
|
-
|
53
|
+
width = 0
|
62
54
|
|
63
|
-
|
64
|
-
|
65
|
-
width_all_features(string, index_full, index_low, first_ambiguous, options[:overwrite])
|
66
|
-
end
|
55
|
+
unless options[:overwrite].empty?
|
56
|
+
width, string = width_custom(string, options[:overwrite])
|
67
57
|
end
|
68
58
|
|
69
|
-
if
|
70
|
-
return
|
71
|
-
width_no_overwrite(string, index_full, index_low, first_ambiguous)
|
72
|
-
end
|
59
|
+
if string.ascii_only?
|
60
|
+
return width + width_ascii(string)
|
73
61
|
end
|
74
62
|
|
75
|
-
|
76
|
-
end
|
63
|
+
ambiguous_index_name = AMBIGUOUS_MAP[options[:ambiguous]]
|
77
64
|
|
78
|
-
|
79
|
-
|
80
|
-
if string.match?(ASCII_NON_ZERO_REGEX)
|
81
|
-
res = string.delete(ASCII_NON_ZERO_STRING).size - string.count(ASCII_BACKSPACE)
|
82
|
-
return res < 0 ? 0 : res
|
65
|
+
unless string.match?(NOT_COMMON_NARROW_REGEX[ambiguous_index_name])
|
66
|
+
return width + string.size
|
83
67
|
end
|
84
68
|
|
85
|
-
# Pure ASCII
|
86
|
-
string.size
|
87
|
-
end
|
88
|
-
|
89
|
-
def self.width_frame(string, options)
|
90
69
|
# Retrieve Emoji width
|
91
|
-
if options[:emoji]
|
92
|
-
|
93
|
-
else
|
94
|
-
res, string = emoji_width(
|
70
|
+
if options[:emoji] != :none
|
71
|
+
e_width, string = emoji_width(
|
95
72
|
string,
|
96
73
|
options[:emoji],
|
97
74
|
options[:ambiguous],
|
98
75
|
)
|
99
|
-
|
100
|
-
|
101
|
-
# Prepare indexes
|
102
|
-
ambiguous_index_name = AMBIGUOUS_MAP[options[:ambiguous]]
|
103
|
-
|
104
|
-
# Get general width
|
105
|
-
res += yield(string, INDEX[ambiguous_index_name], FIRST_4096[ambiguous_index_name], FIRST_AMBIGUOUS[ambiguous_index_name])
|
106
|
-
|
107
|
-
# Return result + prevent negative lengths
|
108
|
-
res < 0 ? 0 : res
|
109
|
-
end
|
110
|
-
|
111
|
-
def self.width_no_overwrite(string, index_full, index_low, first_ambiguous, _ = {})
|
112
|
-
res = 0
|
113
|
-
|
114
|
-
# Make sure we have UTF-8
|
115
|
-
string = string.encode(Encoding::UTF_8) unless string.encoding.name == "utf-8"
|
76
|
+
width += e_width
|
116
77
|
|
117
|
-
|
118
|
-
|
119
|
-
res += batch.size
|
120
|
-
else
|
121
|
-
batch.each_codepoint{ |codepoint|
|
122
|
-
if codepoint > 15 && codepoint < first_ambiguous
|
123
|
-
res += 1
|
124
|
-
elsif codepoint < 0x1001
|
125
|
-
res += index_low[codepoint] || 1
|
126
|
-
else
|
127
|
-
d = INITIAL_DEPTH
|
128
|
-
w = index_full[codepoint / d]
|
129
|
-
while w.instance_of? Array
|
130
|
-
w = w[(codepoint %= d) / (d /= 16)]
|
131
|
-
end
|
132
|
-
|
133
|
-
res += w || 1
|
134
|
-
end
|
135
|
-
}
|
78
|
+
unless string.match?(NOT_COMMON_NARROW_REGEX[ambiguous_index_name])
|
79
|
+
return width + string.size
|
136
80
|
end
|
137
|
-
|
138
|
-
|
139
|
-
res
|
140
|
-
end
|
81
|
+
end
|
141
82
|
|
142
|
-
|
143
|
-
|
144
|
-
|
83
|
+
index_full = INDEX[ambiguous_index_name]
|
84
|
+
index_low = FIRST_4096[ambiguous_index_name]
|
85
|
+
first_ambiguous = FIRST_AMBIGUOUS[ambiguous_index_name]
|
145
86
|
|
146
87
|
string.each_codepoint{ |codepoint|
|
147
|
-
if
|
148
|
-
|
149
|
-
elsif codepoint > 15 && codepoint < first_ambiguous
|
150
|
-
res += 1
|
88
|
+
if codepoint > 15 && codepoint < first_ambiguous
|
89
|
+
width += 1
|
151
90
|
elsif codepoint < 0x1001
|
152
|
-
|
91
|
+
width += index_low[codepoint] || 1
|
153
92
|
else
|
154
93
|
d = INITIAL_DEPTH
|
155
94
|
w = index_full[codepoint / d]
|
@@ -157,19 +96,44 @@ module Unicode
|
|
157
96
|
w = w[(codepoint %= d) / (d /= 16)]
|
158
97
|
end
|
159
98
|
|
160
|
-
|
99
|
+
width += w || 1
|
161
100
|
end
|
162
101
|
}
|
163
102
|
|
164
|
-
|
103
|
+
# Return result + prevent negative lengths
|
104
|
+
width < 0 ? 0 : width
|
105
|
+
end
|
106
|
+
|
107
|
+
# Returns width of custom overwrites and remaining string
|
108
|
+
def self.width_custom(string, overwrite)
|
109
|
+
width = 0
|
110
|
+
|
111
|
+
string = string.each_codepoint.select{ |codepoint|
|
112
|
+
if overwrite[codepoint]
|
113
|
+
width += overwrite[codepoint]
|
114
|
+
nil
|
115
|
+
else
|
116
|
+
codepoint
|
117
|
+
end
|
118
|
+
}.pack("U*")
|
119
|
+
|
120
|
+
[width, string]
|
165
121
|
end
|
166
122
|
|
123
|
+
# Returns width for ASCII-only strings. Will consider zero-width control symbols.
|
124
|
+
def self.width_ascii(string)
|
125
|
+
if string.match?(ASCII_NON_ZERO_REGEX)
|
126
|
+
res = string.delete(ASCII_NON_ZERO_STRING).bytesize - string.count(ASCII_BACKSPACE)
|
127
|
+
return res < 0 ? 0 : res
|
128
|
+
end
|
129
|
+
|
130
|
+
string.bytesize
|
131
|
+
end
|
167
132
|
|
133
|
+
# Returns width of all considered Emoji and remaining string
|
168
134
|
def self.emoji_width(string, mode = :all, ambiguous = DEFAULT_AMBIGUOUS)
|
169
135
|
res = 0
|
170
136
|
|
171
|
-
string = string.encode(Encoding::UTF_8) unless string.encoding.name == "utf-8"
|
172
|
-
|
173
137
|
if emoji_set_regex = EMOJI_SEQUENCES_REGEX_MAPPING[mode]
|
174
138
|
emoji_width_via_possible(
|
175
139
|
string,
|
@@ -177,45 +141,23 @@ module Unicode
|
|
177
141
|
mode == :rgi_at,
|
178
142
|
ambiguous,
|
179
143
|
)
|
144
|
+
|
180
145
|
elsif mode == :all_no_vs16
|
181
|
-
|
146
|
+
no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES){ res += 2; "" }
|
147
|
+
[res, no_emoji_string]
|
148
|
+
|
182
149
|
elsif mode == :vs16
|
183
|
-
|
150
|
+
no_emoji_string = string.gsub(REGEX_EMOJI_VS16){ res += 2; "" }
|
151
|
+
[res, no_emoji_string]
|
152
|
+
|
184
153
|
elsif mode == :all
|
185
|
-
|
186
|
-
|
187
|
-
|
154
|
+
no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES_AND_VS16){ res += 2; "" }
|
155
|
+
[res, no_emoji_string]
|
156
|
+
|
188
157
|
else
|
189
158
|
[0, string]
|
190
|
-
end
|
191
|
-
end
|
192
|
-
|
193
|
-
# Ensure all explicit VS16 sequences have width 2
|
194
|
-
def self.emoji_width_basic(string)
|
195
|
-
res = 0
|
196
|
-
|
197
|
-
no_emoji_string = string.gsub(REGEX_EMOJI_BASIC_OR_KEYCAP){ |basic_emoji|
|
198
|
-
if basic_emoji.size >= 2 # VS16 present
|
199
|
-
res += 2
|
200
|
-
""
|
201
|
-
else
|
202
|
-
basic_emoji
|
203
|
-
end
|
204
|
-
}
|
205
|
-
|
206
|
-
[res, no_emoji_string]
|
207
|
-
end
|
208
|
-
|
209
|
-
# Use simplistic ZWJ/modifier/keycap sequence matching
|
210
|
-
def self.emoji_width_all(string)
|
211
|
-
res = 0
|
212
|
-
|
213
|
-
no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES){
|
214
|
-
res += 2
|
215
|
-
""
|
216
|
-
}
|
217
159
|
|
218
|
-
|
160
|
+
end
|
219
161
|
end
|
220
162
|
|
221
163
|
# Match possible Emoji first, then refine
|
@@ -223,13 +165,9 @@ module Unicode
|
|
223
165
|
res = 0
|
224
166
|
|
225
167
|
# For each string possibly an emoji
|
226
|
-
no_emoji_string = string.gsub(
|
227
|
-
# Skip notorious false positives
|
228
|
-
if REGEX_EMOJI_NOT_POSSIBLE.match?(emoji_candidate)
|
229
|
-
emoji_candidate
|
230
|
-
|
168
|
+
no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES_AND_VS16){ |emoji_candidate|
|
231
169
|
# Check if we have a combined Emoji with width 2 (or EAW on Apple Terminal)
|
232
|
-
|
170
|
+
if emoji_candidate == emoji_candidate[emoji_set_regex]
|
233
171
|
if strict_eaw
|
234
172
|
res += self.of(emoji_candidate[0], ambiguous, emoji: false)
|
235
173
|
else
|
@@ -241,14 +179,7 @@ module Unicode
|
|
241
179
|
else
|
242
180
|
if !strict_eaw
|
243
181
|
# Ensure all explicit VS16 sequences have width 2
|
244
|
-
emoji_candidate.gsub!(
|
245
|
-
if basic_emoji.size == 2 # VS16 present
|
246
|
-
res += 2
|
247
|
-
""
|
248
|
-
else
|
249
|
-
basic_emoji
|
250
|
-
end
|
251
|
-
}
|
182
|
+
emoji_candidate.gsub!(REGEX_EMOJI_VS16){ res += 2; "" }
|
252
183
|
end
|
253
184
|
|
254
185
|
emoji_candidate
|
@@ -258,6 +189,34 @@ module Unicode
|
|
258
189
|
[res, no_emoji_string]
|
259
190
|
end
|
260
191
|
|
192
|
+
def self.normalize_options(string, ambiguous = nil, overwrite = nil, old_options = {}, **options)
|
193
|
+
unless old_options.empty?
|
194
|
+
warn "Unicode::DisplayWidth: Please migrate to keyword arguments - #{old_options.inspect}"
|
195
|
+
options.merge! old_options
|
196
|
+
end
|
197
|
+
|
198
|
+
options[:ambiguous] = ambiguous if ambiguous
|
199
|
+
options[:ambiguous] ||= DEFAULT_AMBIGUOUS
|
200
|
+
|
201
|
+
if options[:ambiguous] != 1 && options[:ambiguous] != 2
|
202
|
+
raise ArgumentError, "Unicode::DisplayWidth: Ambiguous width must be 1 or 2"
|
203
|
+
end
|
204
|
+
|
205
|
+
if overwrite && !overwrite.empty?
|
206
|
+
warn "Unicode::DisplayWidth: Please migrate to keyword arguments - overwrite: #{overwrite.inspect}"
|
207
|
+
options[:overwrite] = overwrite
|
208
|
+
end
|
209
|
+
options[:overwrite] ||= {}
|
210
|
+
|
211
|
+
if [nil, true, :auto].include?(options[:emoji])
|
212
|
+
options[:emoji] = EmojiSupport.recommended
|
213
|
+
elsif options[:emoji] == false
|
214
|
+
options[:emoji] = :none
|
215
|
+
end
|
216
|
+
|
217
|
+
options
|
218
|
+
end
|
219
|
+
|
261
220
|
def initialize(ambiguous: DEFAULT_AMBIGUOUS, overwrite: {}, emoji: true)
|
262
221
|
@ambiguous = ambiguous
|
263
222
|
@overwrite = overwrite
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicode-display_width
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.
|
4
|
+
version: 3.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Lelis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-11-
|
11
|
+
date: 2024-11-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode-emoji
|
@@ -17,6 +17,9 @@ dependencies:
|
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '4.0'
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 4.0.4
|
20
23
|
type: :runtime
|
21
24
|
prerelease: false
|
22
25
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -24,6 +27,9 @@ dependencies:
|
|
24
27
|
- - "~>"
|
25
28
|
- !ruby/object:Gem::Version
|
26
29
|
version: '4.0'
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 4.0.4
|
27
33
|
- !ruby/object:Gem::Dependency
|
28
34
|
name: rspec
|
29
35
|
requirement: !ruby/object:Gem::Requirement
|