fast_unicode-display_width 0.1.1-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +137 -0
- data/ext/fast_unicode/display_width/Cargo.toml +25 -0
- data/ext/fast_unicode/display_width/build.rs +206 -0
- data/ext/fast_unicode/display_width/data/EastAsianWidth.txt +2721 -0
- data/ext/fast_unicode/display_width/data/emoji-test.txt +5518 -0
- data/ext/fast_unicode/display_width/data/upstream_width_table.txt +1292 -0
- data/ext/fast_unicode/display_width/extconf.rb +4 -0
- data/ext/fast_unicode/display_width/src/lib.rs +520 -0
- data/lib/fast_unicode/display_width/3.4/display_width.bundle +0 -0
- data/lib/fast_unicode/display_width/4.0/display_width.bundle +0 -0
- data/lib/fast_unicode/display_width/emoji_support.rb +36 -0
- data/lib/fast_unicode/display_width/string_ext.rb +20 -0
- data/lib/fast_unicode/display_width/version.rb +7 -0
- data/lib/fast_unicode/display_width.rb +95 -0
- metadata +173 -0
|
@@ -0,0 +1,520 @@
|
|
|
1
|
+
use magnus::{
|
|
2
|
+
function,
|
|
3
|
+
prelude::*,
|
|
4
|
+
r_hash::ForEach,
|
|
5
|
+
value::ReprValue,
|
|
6
|
+
Error, RHash, RString, Ruby, Value,
|
|
7
|
+
};
|
|
8
|
+
use std::collections::HashMap;
|
|
9
|
+
use unicode_segmentation::UnicodeSegmentation;
|
|
10
|
+
use unicode_width::UnicodeWidthChar;
|
|
11
|
+
|
|
12
|
+
include!(concat!(env!("OUT_DIR"), "/rgi_set.rs"));
|
|
13
|
+
include!(concat!(env!("OUT_DIR"), "/width_delta.rs"));
|
|
14
|
+
|
|
15
|
+
/// Look up the per-codepoint width-correction delta vs `unicode-width` 0.2.
|
|
16
|
+
///
|
|
17
|
+
/// Returns the (delta_amb1, delta_amb2) the runtime must add on top of the
|
|
18
|
+
/// `unicode-width` value so we land on upstream's table. `0` is returned for
|
|
19
|
+
/// codepoints that match upstream verbatim — the common case — and the early
|
|
20
|
+
/// MIN/MAX screen lets bulk paths skip the binary search entirely on inputs
|
|
21
|
+
/// that touch no diverging codepoints.
|
|
22
|
+
#[inline]
|
|
23
|
+
fn width_delta(cp: u32) -> (i8, i8) {
|
|
24
|
+
// Latin-1 codepoints get a direct array lookup — they're dense in the
|
|
25
|
+
// delta table (most Latin-1 letters carry a d2 delta because upstream
|
|
26
|
+
// treats them as ambiguous=wide), so the binary search dominated the
|
|
27
|
+
// cost on accented-Latin and middle-dot inputs.
|
|
28
|
+
if cp < 0x100 {
|
|
29
|
+
return LATIN1_DELTA[cp as usize];
|
|
30
|
+
}
|
|
31
|
+
if cp < WIDTH_DELTA_MIN || cp > WIDTH_DELTA_MAX {
|
|
32
|
+
return (0, 0);
|
|
33
|
+
}
|
|
34
|
+
match WIDTH_DELTA.binary_search_by(|&(start, end, _, _)| {
|
|
35
|
+
if cp < start {
|
|
36
|
+
std::cmp::Ordering::Greater
|
|
37
|
+
} else if cp > end {
|
|
38
|
+
std::cmp::Ordering::Less
|
|
39
|
+
} else {
|
|
40
|
+
std::cmp::Ordering::Equal
|
|
41
|
+
}
|
|
42
|
+
}) {
|
|
43
|
+
Ok(idx) => {
|
|
44
|
+
let (_, _, d1, d2) = WIDTH_DELTA[idx];
|
|
45
|
+
(d1, d2)
|
|
46
|
+
}
|
|
47
|
+
Err(_) => (0, 0),
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
#[inline]
|
|
52
|
+
fn runtime_error<E: ToString>(ruby: &Ruby, e: E) -> Error {
|
|
53
|
+
Error::new(ruby.exception_runtime_error(), e.to_string())
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
#[inline]
|
|
57
|
+
fn type_error(ruby: &Ruby, msg: &'static str) -> Error {
|
|
58
|
+
Error::new(ruby.exception_type_error(), msg)
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
#[inline]
|
|
62
|
+
fn argument_error<E: ToString>(ruby: &Ruby, e: E) -> Error {
|
|
63
|
+
Error::new(ruby.exception_arg_error(), e.to_string())
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
/// How emoji sequences influence the computed width.
///
/// The variant order is not significant by itself; integer codes are assigned
/// by position in `EMOJI_MODES`.
#[derive(Clone, Copy, PartialEq, Eq)]
enum EmojiMode {
    /// No emoji handling: every codepoint contributes its own width.
    None,
    /// Multi-codepoint clusters containing ZWJ, VS16, a skin-tone modifier or
    /// the enclosing keycap count as width 2.
    All,
    /// Like `All`, but VS16 alone does not make a cluster an emoji sequence.
    AllNoVs16,
    /// Only `base + FE0F` and keycap sequences are promoted to width 2.
    Vs16,
    /// Clusters found in the generated RGI set count as width 2.
    Rgi,
    /// Clusters found in the RGI set take the width of their first codepoint.
    RgiAt,
    /// Handled exactly like `Rgi` by this implementation.
    Possible,
}
|
|
76
|
+
|
|
77
|
+
/// Canonical symbol↔integer mapping for emoji modes. The integer code is the
|
|
78
|
+
/// index into this slice, so `from_code` and the Ruby-facing `EMOJI_MODE_CODES`
|
|
79
|
+
/// hash (built in `init`) stay in sync by construction. To add a mode, append
|
|
80
|
+
/// here and extend `EmojiMode`.
|
|
81
|
+
const EMOJI_MODES: &[(&str, EmojiMode)] = &[
|
|
82
|
+
("none", EmojiMode::None),
|
|
83
|
+
("all", EmojiMode::All),
|
|
84
|
+
("all_no_vs16", EmojiMode::AllNoVs16),
|
|
85
|
+
("vs16", EmojiMode::Vs16),
|
|
86
|
+
("rgi", EmojiMode::Rgi),
|
|
87
|
+
("rgi_at", EmojiMode::RgiAt),
|
|
88
|
+
("possible", EmojiMode::Possible),
|
|
89
|
+
];
|
|
90
|
+
|
|
91
|
+
impl EmojiMode {
|
|
92
|
+
fn from_code(code: i64) -> Option<Self> {
|
|
93
|
+
let idx = usize::try_from(code).ok()?;
|
|
94
|
+
EMOJI_MODES.get(idx).map(|&(_, m)| m)
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
/// Compute the monospace display width of `data`, treating it as UTF-8.
|
|
99
|
+
///
|
|
100
|
+
/// - `ambiguous`: 1 → narrow (`width()`), 2 → wide (`width_cjk()`).
|
|
101
|
+
/// - `overwrite`: `nil` or `Hash{Integer => Integer}` codepoint→width.
|
|
102
|
+
/// - `emoji_mode`: integer code, see `EmojiMode::from_code`.
|
|
103
|
+
fn width_native(
|
|
104
|
+
ruby: &Ruby,
|
|
105
|
+
data: RString,
|
|
106
|
+
ambiguous: i64,
|
|
107
|
+
overwrite: Value,
|
|
108
|
+
emoji_mode: i64,
|
|
109
|
+
) -> Result<i64, Error> {
|
|
110
|
+
let cjk = match ambiguous {
|
|
111
|
+
1 => false,
|
|
112
|
+
2 => true,
|
|
113
|
+
_ => return Err(argument_error(ruby, "ambiguous must be 1 or 2")),
|
|
114
|
+
};
|
|
115
|
+
let mode = EmojiMode::from_code(emoji_mode)
|
|
116
|
+
.ok_or_else(|| argument_error(ruby, "invalid emoji mode code"))?;
|
|
117
|
+
|
|
118
|
+
let overwrites = build_overwrite_map(ruby, overwrite)?;
|
|
119
|
+
let bytes = unsafe { data.as_slice() };
|
|
120
|
+
|
|
121
|
+
// Pure-ASCII inputs can't contain emoji sequences, so the emoji branches
|
|
122
|
+
// are irrelevant. Skip both `from_utf8` and grapheme iteration.
|
|
123
|
+
if overwrites.is_none() && bytes.is_ascii() {
|
|
124
|
+
return Ok(width_ascii(bytes) as i64);
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
let s = std::str::from_utf8(bytes).map_err(|e| runtime_error(ruby, e))?;
|
|
128
|
+
|
|
129
|
+
if mode == EmojiMode::None {
|
|
130
|
+
return Ok(width_none(s, cjk, overwrites.as_ref()) as i64);
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
// vs16 is a 2-/3-codepoint lookahead pattern (`base + FE0F [+ 20E3]`),
|
|
134
|
+
// not a true grapheme construct. Walking codepoints directly avoids the
|
|
135
|
+
// per-cluster cost of `unicode-segmentation`, which dominated the
|
|
136
|
+
// measured runtime on every vs16 input.
|
|
137
|
+
if mode == EmojiMode::Vs16 {
|
|
138
|
+
return Ok(width_vs16(s, cjk, overwrites.as_ref()));
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
Ok(width_with_emoji(s, cjk, mode, overwrites.as_ref()))
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
#[inline]
|
|
145
|
+
fn width_none(s: &str, cjk: bool, ow: Option<&HashMap<u32, i64>>) -> usize {
|
|
146
|
+
if let Some(map) = ow {
|
|
147
|
+
return sum_with_overwrites(s, cjk, map);
|
|
148
|
+
}
|
|
149
|
+
if let Some(n) = common_narrow_shortcut(s, cjk) {
|
|
150
|
+
return n;
|
|
151
|
+
}
|
|
152
|
+
// Can't use `s.width()` / `s.width_cjk()`: unicode-width 0.2 applies
|
|
153
|
+
// its own emoji-cluster logic at the string level (VS16 promotes the
|
|
154
|
+
// preceding text-presentation char to width 2, skin-tone sequences
|
|
155
|
+
// collapse, …). Upstream's `emoji: :none` walks codepoints one at a
|
|
156
|
+
// time, so we do too. `codepoint_width` already folds in the delta
|
|
157
|
+
// table per-cp; ASCII goes through `ascii_codepoint_contribution`.
|
|
158
|
+
let mut total: i64 = 0;
|
|
159
|
+
for c in s.chars() {
|
|
160
|
+
let cp = c as u32;
|
|
161
|
+
if cp < 0x80 {
|
|
162
|
+
total += ascii_codepoint_contribution(cp as u8);
|
|
163
|
+
} else {
|
|
164
|
+
total += codepoint_width(c, cjk);
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
total.max(0) as usize
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
/// "Common narrow" early exit, mirroring upstream's `NOT_COMMON_NARROW_REGEX`
/// in `display_width.rb`.
///
/// If every codepoint of `s` lies in the common-narrow band — `0x10..=0x2FF`
/// in narrow mode, `0x10..=0xA1` when `cjk` is set — the width is simply the
/// number of codepoints and `Some(count)` is returned (so `Some(0)` for an
/// empty string). Any codepoint outside the band yields `None` and the caller
/// must take the table-driven path.
///
/// The shortcut intentionally reports 1 for the few ambiguous codepoints
/// inside the band (e.g. U+00A1 under `cjk`), because upstream never consults
/// its index for such inputs.
#[inline]
fn common_narrow_shortcut(s: &str, cjk: bool) -> Option<usize> {
    let upper: u32 = if cjk { 0xA1 } else { 0x2FF };
    let band = 0x10..=upper;
    s.chars()
        .try_fold(0usize, |seen, c| band.contains(&(c as u32)).then_some(seen + 1))
}
|
|
190
|
+
|
|
191
|
+
/// vs16-specific fast path: scan codepoints linearly, handling the two
|
|
192
|
+
/// upstream patterns by lookahead instead of grapheme segmentation.
|
|
193
|
+
///
|
|
194
|
+
/// Patterns matched (each → width 2, consumes the whole run):
|
|
195
|
+
/// - `BASE + FE0F` where BASE is not `#`, `*`, or `0..=9` (text-presentation
|
|
196
|
+
/// promotion).
|
|
197
|
+
/// - `BASE + FE0F + 20E3` where BASE is `#`, `*`, or `0..=9` (keycap).
|
|
198
|
+
///
|
|
199
|
+
/// All other codepoints contribute their per-codepoint width — combining
|
|
200
|
+
/// marks and ZWJ are width 0 in the upstream table, so codepoint-sum agrees
|
|
201
|
+
/// with cluster-sum for the non-emoji portion.
|
|
202
|
+
fn width_vs16(s: &str, cjk: bool, ow: Option<&HashMap<u32, i64>>) -> i64 {
|
|
203
|
+
let mut total: i64 = 0;
|
|
204
|
+
let mut chars = s.chars().peekable();
|
|
205
|
+
while let Some(c) = chars.next() {
|
|
206
|
+
if chars.peek().copied() == Some('\u{FE0F}') {
|
|
207
|
+
let is_keycap_base = matches!(c, '#' | '*' | '0'..='9');
|
|
208
|
+
if !is_keycap_base {
|
|
209
|
+
chars.next(); // consume FE0F
|
|
210
|
+
total += 2;
|
|
211
|
+
continue;
|
|
212
|
+
}
|
|
213
|
+
// Keycap candidate: peek past FE0F for U+20E3 without consuming
|
|
214
|
+
// unless the full sequence matches. `Peekable<Chars>` is Clone
|
|
215
|
+
// (just a slice cursor), so this is cheap.
|
|
216
|
+
let mut probe = chars.clone();
|
|
217
|
+
probe.next(); // skip FE0F in the probe
|
|
218
|
+
if probe.peek().copied() == Some('\u{20E3}') {
|
|
219
|
+
chars.next(); // FE0F
|
|
220
|
+
chars.next(); // 20E3
|
|
221
|
+
total += 2;
|
|
222
|
+
continue;
|
|
223
|
+
}
|
|
224
|
+
// Digit + FE0F without the 20E3 follow-up: fall through to the
|
|
225
|
+
// per-codepoint sum so digit contributes 1 and FE0F contributes 0.
|
|
226
|
+
}
|
|
227
|
+
let cp = c as u32;
|
|
228
|
+
if let Some(map) = ow {
|
|
229
|
+
if let Some(&w) = map.get(&cp) {
|
|
230
|
+
total += w;
|
|
231
|
+
continue;
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
if cp < 0x80 {
|
|
235
|
+
total += ascii_codepoint_contribution(cp as u8);
|
|
236
|
+
} else {
|
|
237
|
+
total += codepoint_width(c, cjk);
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
total.max(0)
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
fn width_with_emoji(s: &str, cjk: bool, mode: EmojiMode, ow: Option<&HashMap<u32, i64>>) -> i64 {
|
|
244
|
+
// Byte-level screen: when no codepoint that *could* trigger a multi-
|
|
245
|
+
// codepoint emoji cluster (ZWJ, FE0F, skin tone, keycap, regional
|
|
246
|
+
// indicator) appears anywhere in the input, every grapheme cluster is
|
|
247
|
+
// necessarily a single codepoint. For single-codepoint clusters the
|
|
248
|
+
// emoji-mode branches all collapse to the per-codepoint width
|
|
249
|
+
// (`emoji_cluster_width` returns `None` for the multi-codepoint patterns
|
|
250
|
+
// and RGI single-codepoint matches always equal the codepoint width
|
|
251
|
+
// because upstream's table agrees with `unicode-width` on those points).
|
|
252
|
+
// Falling back to `width_none` avoids the grapheme iterator entirely.
|
|
253
|
+
if !could_emit_emoji_cluster(s.as_bytes(), mode) {
|
|
254
|
+
return width_none(s, cjk, ow) as i64;
|
|
255
|
+
}
|
|
256
|
+
let mut total: i64 = 0;
|
|
257
|
+
for cluster in s.graphemes(true) {
|
|
258
|
+
if let Some(w) = emoji_cluster_width(cluster, mode, cjk) {
|
|
259
|
+
total += w;
|
|
260
|
+
} else {
|
|
261
|
+
total += cluster_width_via_tables(cluster, cjk, ow);
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
total.max(0)
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
/// Byte-level prescan for codepoints that begin or participate in a
|
|
268
|
+
/// multi-codepoint emoji cluster. Returns `true` if such a codepoint *might*
|
|
269
|
+
/// be present (the screen is conservative — false positives just route the
|
|
270
|
+
/// caller through the grapheme path, which is always correct).
|
|
271
|
+
///
|
|
272
|
+
/// Triggers checked:
|
|
273
|
+
/// - 0xE2 (3-byte UTF-8 lead) → covers ZWJ (E2 80 8D) and U+20E3 keycap.
|
|
274
|
+
/// - 0xEF → covers FE0F (EF B8 8F).
|
|
275
|
+
/// - 0xF0 0x9F 0x8F → skin tones (1F3FB..1F3FF, 4-byte lead 0xF0).
|
|
276
|
+
/// - 0xF0 0x9F 0x87 → regional indicators (1F1E6..1F1FF, flag clusters).
|
|
277
|
+
///
|
|
278
|
+
/// For modes that don't react to FE0F (e.g. `AllNoVs16`) the 0xEF check is
|
|
279
|
+
/// still safe — it just routes a few more inputs through the slow path. The
|
|
280
|
+
/// RGI/Possible modes need every multi-codepoint trigger because RGI clusters
|
|
281
|
+
/// can join via ZWJ or RI pairs.
|
|
282
|
+
#[inline]
|
|
283
|
+
fn could_emit_emoji_cluster(bytes: &[u8], _mode: EmojiMode) -> bool {
|
|
284
|
+
let n = bytes.len();
|
|
285
|
+
let mut i = 0;
|
|
286
|
+
while i < n {
|
|
287
|
+
let b = bytes[i];
|
|
288
|
+
if b == 0xE2 || b == 0xEF {
|
|
289
|
+
return true;
|
|
290
|
+
}
|
|
291
|
+
if b == 0xF0 && i + 2 < n && bytes[i + 1] == 0x9F {
|
|
292
|
+
let third = bytes[i + 2];
|
|
293
|
+
if third == 0x8F || third == 0x87 {
|
|
294
|
+
return true;
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
i += 1;
|
|
298
|
+
}
|
|
299
|
+
false
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
/// Returns `Some(width)` if the given grapheme cluster is consumed by the
|
|
303
|
+
/// emoji mode, else `None` (caller falls back to per-codepoint widths).
|
|
304
|
+
///
|
|
305
|
+
/// `EmojiMode::None` and `EmojiMode::Vs16` are routed to dedicated paths in
|
|
306
|
+
/// `width_native` before reaching here; they're rejected by the caller, not
|
|
307
|
+
/// matched here.
|
|
308
|
+
fn emoji_cluster_width(cluster: &str, mode: EmojiMode, cjk: bool) -> Option<i64> {
|
|
309
|
+
match mode {
|
|
310
|
+
EmojiMode::All => is_emoji_sequence_or_vs16(cluster).then_some(2),
|
|
311
|
+
EmojiMode::AllNoVs16 => is_emoji_sequence_no_vs16(cluster).then_some(2),
|
|
312
|
+
EmojiMode::Rgi | EmojiMode::Possible => {
|
|
313
|
+
RGI_SEQUENCES.contains(cluster).then_some(2)
|
|
314
|
+
}
|
|
315
|
+
EmojiMode::RgiAt => RGI_SEQUENCES.contains(cluster).then(|| {
|
|
316
|
+
let first = cluster.chars().next().unwrap();
|
|
317
|
+
if (first as u32) < 0x80 {
|
|
318
|
+
ascii_codepoint_contribution(first as u8)
|
|
319
|
+
} else {
|
|
320
|
+
codepoint_width(first, cjk)
|
|
321
|
+
}
|
|
322
|
+
}),
|
|
323
|
+
EmojiMode::None | EmojiMode::Vs16 => unreachable!(
|
|
324
|
+
"EmojiMode::None and EmojiMode::Vs16 are dispatched before width_with_emoji"
|
|
325
|
+
),
|
|
326
|
+
}
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
/// True when `cluster` has more than one codepoint and at least one codepoint
/// *after the first* is ZWJ (U+200D), VS16 (U+FE0F), a skin-tone modifier
/// (U+1F3FB..=U+1F3FF) or COMBINING ENCLOSING KEYCAP (U+20E3).
///
/// Intended to match upstream's REGEX_EMOJI_ALL_SEQUENCES
/// + REGEX_EMOJI_KEYCAP + REGEX_TEXT_PRESENTATION+VS16 union.
fn is_emoji_sequence_or_vs16(cluster: &str) -> bool {
    // Skipping the leading codepoint already implies "more than one".
    cluster.chars().skip(1).any(|follower| {
        matches!(
            follower,
            '\u{200D}' | '\u{FE0F}' | '\u{20E3}' | '\u{1F3FB}'..='\u{1F3FF}'
        )
    })
}
|
|
350
|
+
|
|
351
|
+
/// True when `cluster` has more than one codepoint and at least one codepoint
/// *after the first* is ZWJ (U+200D), a skin-tone modifier
/// (U+1F3FB..=U+1F3FF) or COMBINING ENCLOSING KEYCAP (U+20E3).
///
/// Unlike `is_emoji_sequence_or_vs16`, VS16 (U+FE0F) is not a marker here, so
/// a pure text-presentation + VS16 cluster is rejected.
fn is_emoji_sequence_no_vs16(cluster: &str) -> bool {
    // Skipping the leading codepoint already implies "more than one".
    cluster.chars().skip(1).any(|follower| {
        matches!(
            follower,
            '\u{200D}' | '\u{20E3}' | '\u{1F3FB}'..='\u{1F3FF}'
        )
    })
}
|
|
370
|
+
|
|
371
|
+
#[inline]
|
|
372
|
+
fn cluster_width_via_tables(cluster: &str, cjk: bool, ow: Option<&HashMap<u32, i64>>) -> i64 {
|
|
373
|
+
let mut total: i64 = 0;
|
|
374
|
+
for c in cluster.chars() {
|
|
375
|
+
let cp = c as u32;
|
|
376
|
+
if let Some(map) = ow {
|
|
377
|
+
if let Some(&w) = map.get(&cp) {
|
|
378
|
+
total += w;
|
|
379
|
+
continue;
|
|
380
|
+
}
|
|
381
|
+
}
|
|
382
|
+
if cp < 0x80 {
|
|
383
|
+
total += ascii_codepoint_contribution(cp as u8);
|
|
384
|
+
} else {
|
|
385
|
+
total += codepoint_width(c, cjk);
|
|
386
|
+
}
|
|
387
|
+
}
|
|
388
|
+
total
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
/// Per-codepoint width matching upstream's table — `unicode-width` lookup
|
|
392
|
+
/// (with the same None→1 fallback `UnicodeWidthStr` uses) plus the delta
|
|
393
|
+
/// correction. ASCII is handled separately via `ascii_codepoint_contribution`.
|
|
394
|
+
#[inline]
|
|
395
|
+
fn codepoint_width(c: char, cjk: bool) -> i64 {
|
|
396
|
+
let base = if cjk {
|
|
397
|
+
UnicodeWidthChar::width_cjk(c)
|
|
398
|
+
} else {
|
|
399
|
+
UnicodeWidthChar::width(c)
|
|
400
|
+
}
|
|
401
|
+
.unwrap_or(1) as i64;
|
|
402
|
+
let (d1, d2) = width_delta(c as u32);
|
|
403
|
+
let delta = if cjk { d2 } else { d1 } as i64;
|
|
404
|
+
base + delta
|
|
405
|
+
}
|
|
406
|
+
|
|
407
|
+
#[inline]
|
|
408
|
+
fn ascii_codepoint_contribution(b: u8) -> i64 {
|
|
409
|
+
// Upstream's ASCII rule: zero-list bytes contribute 0, backspace
|
|
410
|
+
// contributes -1, everything else (tab, DEL, other C0, printable) 1.
|
|
411
|
+
if b == 0x08 {
|
|
412
|
+
-1
|
|
413
|
+
} else if is_zero_width_ascii(b) {
|
|
414
|
+
0
|
|
415
|
+
} else {
|
|
416
|
+
1
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
/// Membership test for the zero-width byte set decoded from upstream's
/// `ASCII_NON_ZERO_STRING`: 0x00, 0x05, BEL (0x07), BS (0x08), LF (0x0A) and
/// VT/FF/CR/SO/SI (0x0B..=0x0F).
///
/// BS is a member of the set; callers that give it a width of -1 handle it
/// separately.
#[inline]
fn is_zero_width_ascii(b: u8) -> bool {
    matches!(b, 0x00 | 0x05 | 0x07 | 0x08 | 0x0A..=0x0F)
}
|
|
426
|
+
|
|
427
|
+
#[inline]
|
|
428
|
+
fn width_ascii(bytes: &[u8]) -> usize {
|
|
429
|
+
// Two-pass: a SIMD-friendly screen first to short-circuit the very common
|
|
430
|
+
// all-printable case (`bytes.len()` directly), and only walk a second time
|
|
431
|
+
// when there's actually a C0 or DEL byte to subtract. A single-pass loop
|
|
432
|
+
// counting `zero`/`bs` unconditionally regressed long-ASCII throughput by
|
|
433
|
+
// ~3x because the all-printable case is by far the dominant input.
|
|
434
|
+
if count_c0_and_del(bytes) == 0 {
|
|
435
|
+
return bytes.len();
|
|
436
|
+
}
|
|
437
|
+
let (zero, bs) = count_zero_and_bs(bytes);
|
|
438
|
+
bytes.len().saturating_sub(zero).saturating_sub(bs)
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
/// Number of bytes that are C0 controls (`< 0x20`) or DEL (`0x7F`).
#[inline]
fn count_c0_and_del(bytes: &[u8]) -> usize {
    // Branch-free accumulation: the predicate is added as 0 or 1.
    bytes
        .iter()
        .fold(0usize, |hits, &b| hits + usize::from(b < 0x20 || b == 0x7F))
}
|
|
448
|
+
|
|
449
|
+
#[inline]
|
|
450
|
+
fn count_zero_and_bs(bytes: &[u8]) -> (usize, usize) {
|
|
451
|
+
let mut zero: usize = 0;
|
|
452
|
+
let mut bs: usize = 0;
|
|
453
|
+
for &b in bytes {
|
|
454
|
+
zero += is_zero_width_ascii(b) as usize;
|
|
455
|
+
bs += (b == 0x08) as usize;
|
|
456
|
+
}
|
|
457
|
+
(zero, bs)
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
fn build_overwrite_map(
|
|
461
|
+
ruby: &Ruby,
|
|
462
|
+
overwrite: Value,
|
|
463
|
+
) -> Result<Option<HashMap<u32, i64>>, Error> {
|
|
464
|
+
if overwrite.is_nil() {
|
|
465
|
+
return Ok(None);
|
|
466
|
+
}
|
|
467
|
+
let hash = RHash::from_value(overwrite)
|
|
468
|
+
.ok_or_else(|| type_error(ruby, "overwrite must be a Hash or nil"))?;
|
|
469
|
+
let mut map: HashMap<u32, i64> = HashMap::with_capacity(hash.len());
|
|
470
|
+
hash.foreach(|cp: i64, w: i64| {
|
|
471
|
+
if !(0..=0x10_FFFF).contains(&cp) {
|
|
472
|
+
return Err(argument_error(
|
|
473
|
+
ruby,
|
|
474
|
+
format!("overwrite codepoint {cp} is out of range (0..=0x10FFFF)"),
|
|
475
|
+
));
|
|
476
|
+
}
|
|
477
|
+
map.insert(cp as u32, w);
|
|
478
|
+
Ok(ForEach::Continue)
|
|
479
|
+
})?;
|
|
480
|
+
Ok(Some(map))
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
#[inline]
|
|
484
|
+
fn sum_with_overwrites(s: &str, cjk: bool, map: &HashMap<u32, i64>) -> usize {
|
|
485
|
+
let mut total: i64 = 0;
|
|
486
|
+
for c in s.chars() {
|
|
487
|
+
let cp = c as u32;
|
|
488
|
+
if let Some(&w) = map.get(&cp) {
|
|
489
|
+
total += w;
|
|
490
|
+
} else if cp < 0x80 {
|
|
491
|
+
total += ascii_codepoint_contribution(cp as u8);
|
|
492
|
+
} else {
|
|
493
|
+
total += codepoint_width(c, cjk);
|
|
494
|
+
}
|
|
495
|
+
}
|
|
496
|
+
total.max(0) as usize
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
#[magnus::init(name = "display_width")]
|
|
500
|
+
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
501
|
+
let module = ruby.define_module("FastUnicode")?;
|
|
502
|
+
let klass = module.define_class("DisplayWidth", ruby.class_object())?;
|
|
503
|
+
|
|
504
|
+
// FFI binding lives under `DisplayWidth::Native` so the public surface
|
|
505
|
+
// doesn't advertise `width_native` (which bypasses encoding normalization
|
|
506
|
+
// and can crash the extension on invalid UTF-8). The Ruby side marks
|
|
507
|
+
// `Native` as `private_constant`.
|
|
508
|
+
let native = klass.define_module("Native")?;
|
|
509
|
+
native.define_singleton_method("width_native", function!(width_native, 4))?;
|
|
510
|
+
|
|
511
|
+
// Single source of truth for the symbol↔code mapping: derived from
|
|
512
|
+
// `EMOJI_MODES` so Ruby and Rust can't drift apart.
|
|
513
|
+
let codes = ruby.hash_new();
|
|
514
|
+
for (idx, (name, _)) in EMOJI_MODES.iter().enumerate() {
|
|
515
|
+
codes.aset(ruby.to_symbol(name), idx as i64)?;
|
|
516
|
+
}
|
|
517
|
+
codes.freeze();
|
|
518
|
+
klass.const_set("EMOJI_MODE_CODES", codes)?;
|
|
519
|
+
Ok(())
|
|
520
|
+
}
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module FastUnicode
  class DisplayWidth
    # Port of upstream's `Unicode::DisplayWidth::EmojiSupport`: chooses a
    # default emoji mode from the terminal emulator the process appears to run
    # under, so this gem's default matches upstream's recommendation for the
    # same environment.
    module EmojiSupport
      # Returns the recommended emoji mode symbol for the current environment.
      #
      # Checked in order: `CI`, `TERM_PROGRAM`, `TERM`, `WT_SESSION`; `:none`
      # when nothing matches. The result is recomputed from ENV on every call
      # (no memoization), which keeps it testable; callers that need a value
      # frozen at load time cache it themselves.
      def self.recommended
        # Upstream returns `:rqi` here (a typo for `:rgi`). The typo is kept
        # on purpose so the default matches upstream's exactly.
        return :rqi if ENV["CI"]

        by_program =
          case ENV["TERM_PROGRAM"]
          when "iTerm.app" then :all
          when "Apple_Terminal" then :rgi_at
          when "WezTerm" then :all_no_vs16
          end
        return by_program if by_program

        term = ENV["TERM"]
        return :all if "contour" === term || "foot" === term
        return :vs16 if /kitty/ === term
        return :vs16 if ENV["WT_SESSION"]

        :none
      end
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../display_width'
|
|
4
|
+
|
|
5
|
+
module FastUnicode
  class DisplayWidth
    # Refinement that adds `String#display_width`, shaped like
    # `Unicode::DisplayWidth::StringExt` so call sites can be swapped 1:1.
    #
    #   using FastUnicode::DisplayWidth::StringExt
    #   "一二三".display_width # => 6
    module StringExt
      refine ::String do
        # Forwards the receiver and every argument unchanged to
        # `FastUnicode::DisplayWidth.of`.
        def display_width(*positional, **options)
          ::FastUnicode::DisplayWidth.of(self, *positional, **options)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'display_width/display_width'
|
|
4
|
+
require_relative 'display_width/version'
|
|
5
|
+
require_relative 'display_width/emoji_support'
|
|
6
|
+
|
|
7
|
+
module FastUnicode
  # Drop-in API shape for `Unicode::DisplayWidth` (v3.x), placed in the
  # `FastUnicode` namespace so both gems can be loaded side by side.
  #
  # Supported options:
  #   - `ambiguous` (1 or 2) — East Asian Width ambiguous behavior.
  #   - `overwrite` ({Integer => Integer}) — fixed widths for codepoints.
  #   - `emoji:` (:none, :all, :all_no_vs16, :vs16, :rgi, :rgi_at, :possible,
  #     true, :auto, false). `nil`/`true`/`:auto` resolve to the detected
  #     default; `false` is equivalent to `:none`.
  #
  # Returns an Integer (column count, never negative).
  class DisplayWidth
    # `Native` and `EMOJI_MODE_CODES` are defined by the Rust extension during
    # its `init` (see `ext/fast_unicode/display_width/src/lib.rs`). `Native`
    # is the raw binding (no encoding normalization, integer mode codes) and
    # is hidden on purpose — use `DisplayWidth.of` instead.
    private_constant :Native

    # Mode code used when the caller does not choose one. Unknown symbols from
    # `EmojiSupport.recommended` fall back to code 0.
    DEFAULT_EMOJI_CODE = EMOJI_MODE_CODES.fetch(EmojiSupport.recommended, 0)

    # Sentinel meaning "`emoji:` was not passed at all". It is distinct from
    # every user-supplied value, including `nil`, `true` and `:auto`.
    OMITTED = Object.new.freeze
    private_constant :OMITTED

    # Encoding normalization and emoji-mode resolution. Every public method
    # goes through `Internal.compute`, the only caller of
    # `Native.width_native`.
    module Internal
      module_function

      # Normalizes `string` and hands everything to the native binding.
      def compute(string, ambiguous, overwrite, emoji_code)
        utf8 = normalize_encoding(string)
        Native.width_native(utf8, ambiguous, overwrite, emoji_code)
      end

      # Returns a UTF-8 view of `string`:
      #   - already tagged UTF-8: returned unchanged (validity is not checked
      #     here);
      #   - BINARY whose bytes are valid UTF-8: a re-tagged copy;
      #   - anything else (including BINARY with invalid bytes): transcoded
      #     with invalid/undefined sequences replaced.
      def normalize_encoding(string)
        encoding = string.encoding
        return string if encoding == Encoding::UTF_8

        if encoding == Encoding::BINARY
          reinterpreted = string.dup.force_encoding(Encoding::UTF_8)
          return reinterpreted if reinterpreted.valid_encoding?
        end

        string.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
      end

      # Maps the user-facing `emoji:` value to the integer code passed to
      # Rust. `nil`/`true`/`:auto` select the detected default and `false`
      # selects `:none`. Raises ArgumentError for an unknown mode. The
      # `OMITTED` sentinel is handled by the call sites and never arrives here.
      def resolve_emoji_code(value)
        if [nil, true, :auto].include?(value)
          DEFAULT_EMOJI_CODE
        elsif false.equal?(value)
          EMOJI_MODE_CODES[:none]
        else
          EMOJI_MODE_CODES.fetch(value) do
            raise ArgumentError, "unknown emoji mode: #{value.inspect}"
          end
        end
      end
    end
    private_constant :Internal

    # One-shot width computation. Keyword options take precedence over the
    # positional `ambiguous` argument.
    def self.of(string, ambiguous = 1, **opts)
      # Fast path: no options means all defaults.
      return Internal.compute(string, ambiguous, nil, DEFAULT_EMOJI_CODE) if opts.empty?

      emoji_code =
        if opts.key?(:emoji)
          Internal.resolve_emoji_code(opts[:emoji])
        else
          DEFAULT_EMOJI_CODE
        end
      Internal.compute(string, opts.fetch(:ambiguous, ambiguous), opts[:overwrite], emoji_code)
    end

    # Captures a configuration once; the emoji mode is resolved here, so an
    # unknown mode raises ArgumentError at construction time.
    def initialize(ambiguous: 1, overwrite: nil, emoji: OMITTED)
      @ambiguous = ambiguous
      @overwrite = overwrite
      @emoji_code =
        if emoji.equal?(OMITTED)
          DEFAULT_EMOJI_CODE
        else
          Internal.resolve_emoji_code(emoji)
        end
    end

    # Width of `string` under the configuration captured by `initialize`.
    def of(string)
      Internal.compute(string, @ambiguous, @overwrite, @emoji_code)
    end
  end
end
|