fast_unicode-display_width 0.1.1-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ require "mkmf"
2
+ require "rb_sys/mkmf"
3
+
4
+ create_rust_makefile("fast_unicode/display_width/display_width")
@@ -0,0 +1,520 @@
1
+ use magnus::{
2
+ function,
3
+ prelude::*,
4
+ r_hash::ForEach,
5
+ value::ReprValue,
6
+ Error, RHash, RString, Ruby, Value,
7
+ };
8
+ use std::collections::HashMap;
9
+ use unicode_segmentation::UnicodeSegmentation;
10
+ use unicode_width::UnicodeWidthChar;
11
+
12
+ include!(concat!(env!("OUT_DIR"), "/rgi_set.rs"));
13
+ include!(concat!(env!("OUT_DIR"), "/width_delta.rs"));
14
+
15
+ /// Look up the per-codepoint width-correction delta vs `unicode-width` 0.2.
16
+ ///
17
+ /// Returns the (delta_amb1, delta_amb2) the runtime must add on top of the
18
+ /// `unicode-width` value so we land on upstream's table. `0` is returned for
19
+ /// codepoints that match upstream verbatim — the common case — and the early
20
+ /// MIN/MAX screen lets bulk paths skip the binary search entirely on inputs
21
+ /// that touch no diverging codepoints.
22
+ #[inline]
23
+ fn width_delta(cp: u32) -> (i8, i8) {
24
+ // Latin-1 codepoints get a direct array lookup — they're dense in the
25
+ // delta table (most Latin-1 letters carry a d2 delta because upstream
26
+ // treats them as ambiguous=wide), so the binary search dominated the
27
+ // cost on accented-Latin and middle-dot inputs.
28
+ if cp < 0x100 {
29
+ return LATIN1_DELTA[cp as usize];
30
+ }
31
+ if cp < WIDTH_DELTA_MIN || cp > WIDTH_DELTA_MAX {
32
+ return (0, 0);
33
+ }
34
+ match WIDTH_DELTA.binary_search_by(|&(start, end, _, _)| {
35
+ if cp < start {
36
+ std::cmp::Ordering::Greater
37
+ } else if cp > end {
38
+ std::cmp::Ordering::Less
39
+ } else {
40
+ std::cmp::Ordering::Equal
41
+ }
42
+ }) {
43
+ Ok(idx) => {
44
+ let (_, _, d1, d2) = WIDTH_DELTA[idx];
45
+ (d1, d2)
46
+ }
47
+ Err(_) => (0, 0),
48
+ }
49
+ }
50
+
51
+ #[inline]
52
+ fn runtime_error<E: ToString>(ruby: &Ruby, e: E) -> Error {
53
+ Error::new(ruby.exception_runtime_error(), e.to_string())
54
+ }
55
+
56
+ #[inline]
57
+ fn type_error(ruby: &Ruby, msg: &'static str) -> Error {
58
+ Error::new(ruby.exception_type_error(), msg)
59
+ }
60
+
61
+ #[inline]
62
+ fn argument_error<E: ToString>(ruby: &Ruby, e: E) -> Error {
63
+ Error::new(ruby.exception_arg_error(), e.to_string())
64
+ }
65
+
66
/// Emoji handling strategy selected by the caller. The variants correspond
/// one-to-one to the names listed in `EMOJI_MODES`.
#[derive(Copy, Clone, Eq, PartialEq)]
enum EmojiMode {
    None,
    All,
    AllNoVs16,
    Vs16,
    Rgi,
    RgiAt,
    Possible,
}

/// Name/variant table for the emoji modes.
///
/// The position of an entry *is* its integer code: `EmojiMode::from_code`
/// indexes into this slice, and `init` publishes the same positions to Ruby
/// as `EMOJI_MODE_CODES`. Because both are derived from this one table they
/// cannot disagree. New modes must be appended (never inserted), and need a
/// matching `EmojiMode` variant.
const EMOJI_MODES: &[(&str, EmojiMode)] = &[
    ("none", EmojiMode::None),
    ("all", EmojiMode::All),
    ("all_no_vs16", EmojiMode::AllNoVs16),
    ("vs16", EmojiMode::Vs16),
    ("rgi", EmojiMode::Rgi),
    ("rgi_at", EmojiMode::RgiAt),
    ("possible", EmojiMode::Possible),
];

impl EmojiMode {
    /// Maps an integer code back to its mode.
    ///
    /// Returns `None` when `code` is negative or is not a valid index into
    /// `EMOJI_MODES`.
    fn from_code(code: i64) -> Option<Self> {
        let slot = usize::try_from(code).ok()?;
        let &(_, mode) = EMOJI_MODES.get(slot)?;
        Some(mode)
    }
}
97
+
98
+ /// Compute the monospace display width of `data`, treating it as UTF-8.
99
+ ///
100
+ /// - `ambiguous`: 1 → narrow (`width()`), 2 → wide (`width_cjk()`).
101
+ /// - `overwrite`: `nil` or `Hash{Integer => Integer}` codepoint→width.
102
+ /// - `emoji_mode`: integer code, see `EmojiMode::from_code`.
103
+ fn width_native(
104
+ ruby: &Ruby,
105
+ data: RString,
106
+ ambiguous: i64,
107
+ overwrite: Value,
108
+ emoji_mode: i64,
109
+ ) -> Result<i64, Error> {
110
+ let cjk = match ambiguous {
111
+ 1 => false,
112
+ 2 => true,
113
+ _ => return Err(argument_error(ruby, "ambiguous must be 1 or 2")),
114
+ };
115
+ let mode = EmojiMode::from_code(emoji_mode)
116
+ .ok_or_else(|| argument_error(ruby, "invalid emoji mode code"))?;
117
+
118
+ let overwrites = build_overwrite_map(ruby, overwrite)?;
119
+ let bytes = unsafe { data.as_slice() };
120
+
121
+ // Pure-ASCII inputs can't contain emoji sequences, so the emoji branches
122
+ // are irrelevant. Skip both `from_utf8` and grapheme iteration.
123
+ if overwrites.is_none() && bytes.is_ascii() {
124
+ return Ok(width_ascii(bytes) as i64);
125
+ }
126
+
127
+ let s = std::str::from_utf8(bytes).map_err(|e| runtime_error(ruby, e))?;
128
+
129
+ if mode == EmojiMode::None {
130
+ return Ok(width_none(s, cjk, overwrites.as_ref()) as i64);
131
+ }
132
+
133
+ // vs16 is a 2-/3-codepoint lookahead pattern (`base + FE0F [+ 20E3]`),
134
+ // not a true grapheme construct. Walking codepoints directly avoids the
135
+ // per-cluster cost of `unicode-segmentation`, which dominated the
136
+ // measured runtime on every vs16 input.
137
+ if mode == EmojiMode::Vs16 {
138
+ return Ok(width_vs16(s, cjk, overwrites.as_ref()));
139
+ }
140
+
141
+ Ok(width_with_emoji(s, cjk, mode, overwrites.as_ref()))
142
+ }
143
+
144
+ #[inline]
145
+ fn width_none(s: &str, cjk: bool, ow: Option<&HashMap<u32, i64>>) -> usize {
146
+ if let Some(map) = ow {
147
+ return sum_with_overwrites(s, cjk, map);
148
+ }
149
+ if let Some(n) = common_narrow_shortcut(s, cjk) {
150
+ return n;
151
+ }
152
+ // Can't use `s.width()` / `s.width_cjk()`: unicode-width 0.2 applies
153
+ // its own emoji-cluster logic at the string level (VS16 promotes the
154
+ // preceding text-presentation char to width 2, skin-tone sequences
155
+ // collapse, …). Upstream's `emoji: :none` walks codepoints one at a
156
+ // time, so we do too. `codepoint_width` already folds in the delta
157
+ // table per-cp; ASCII goes through `ascii_codepoint_contribution`.
158
+ let mut total: i64 = 0;
159
+ for c in s.chars() {
160
+ let cp = c as u32;
161
+ if cp < 0x80 {
162
+ total += ascii_codepoint_contribution(cp as u8);
163
+ } else {
164
+ total += codepoint_width(c, cjk);
165
+ }
166
+ }
167
+ total.max(0) as usize
168
+ }
169
+
170
/// Mirror of upstream's "common narrow" early exit (`NOT_COMMON_NARROW_REGEX`
/// in `display_width.rb`).
///
/// If every codepoint of `s` lies in `0x10..=0x2FF` (narrow mode) or
/// `0x10..=0xA1` (`cjk` mode), the width is simply the number of codepoints
/// and `Some(count)` is returned. As soon as one codepoint falls outside the
/// band the result is `None` and the caller must use the table-driven path.
///
/// The shortcut is intentionally reproduced as-is: it reports width 1 for a
/// few ambiguous codepoints (e.g. U+00A1 in `cjk` mode) that the full table
/// would report as 2. An empty string yields `Some(0)`.
#[inline]
fn common_narrow_shortcut(s: &str, cjk: bool) -> Option<usize> {
    let ceiling: u32 = if cjk { 0xA1 } else { 0x2FF };
    s.chars().try_fold(0usize, |seen, ch| {
        (0x10..=ceiling)
            .contains(&(ch as u32))
            .then_some(seen + 1)
    })
}
190
+
191
+ /// vs16-specific fast path: scan codepoints linearly, handling the two
192
+ /// upstream patterns by lookahead instead of grapheme segmentation.
193
+ ///
194
+ /// Patterns matched (each → width 2, consumes the whole run):
195
+ /// - `BASE + FE0F` where BASE is not `#`, `*`, or `0..=9` (text-presentation
196
+ /// promotion).
197
+ /// - `BASE + FE0F + 20E3` where BASE is `#`, `*`, or `0..=9` (keycap).
198
+ ///
199
+ /// All other codepoints contribute their per-codepoint width — combining
200
+ /// marks and ZWJ are width 0 in the upstream table, so codepoint-sum agrees
201
+ /// with cluster-sum for the non-emoji portion.
202
+ fn width_vs16(s: &str, cjk: bool, ow: Option<&HashMap<u32, i64>>) -> i64 {
203
+ let mut total: i64 = 0;
204
+ let mut chars = s.chars().peekable();
205
+ while let Some(c) = chars.next() {
206
+ if chars.peek().copied() == Some('\u{FE0F}') {
207
+ let is_keycap_base = matches!(c, '#' | '*' | '0'..='9');
208
+ if !is_keycap_base {
209
+ chars.next(); // consume FE0F
210
+ total += 2;
211
+ continue;
212
+ }
213
+ // Keycap candidate: peek past FE0F for U+20E3 without consuming
214
+ // unless the full sequence matches. `Peekable<Chars>` is Clone
215
+ // (just a slice cursor), so this is cheap.
216
+ let mut probe = chars.clone();
217
+ probe.next(); // skip FE0F in the probe
218
+ if probe.peek().copied() == Some('\u{20E3}') {
219
+ chars.next(); // FE0F
220
+ chars.next(); // 20E3
221
+ total += 2;
222
+ continue;
223
+ }
224
+ // Digit + FE0F without the 20E3 follow-up: fall through to the
225
+ // per-codepoint sum so digit contributes 1 and FE0F contributes 0.
226
+ }
227
+ let cp = c as u32;
228
+ if let Some(map) = ow {
229
+ if let Some(&w) = map.get(&cp) {
230
+ total += w;
231
+ continue;
232
+ }
233
+ }
234
+ if cp < 0x80 {
235
+ total += ascii_codepoint_contribution(cp as u8);
236
+ } else {
237
+ total += codepoint_width(c, cjk);
238
+ }
239
+ }
240
+ total.max(0)
241
+ }
242
+
243
+ fn width_with_emoji(s: &str, cjk: bool, mode: EmojiMode, ow: Option<&HashMap<u32, i64>>) -> i64 {
244
+ // Byte-level screen: when no codepoint that *could* trigger a multi-
245
+ // codepoint emoji cluster (ZWJ, FE0F, skin tone, keycap, regional
246
+ // indicator) appears anywhere in the input, every grapheme cluster is
247
+ // necessarily a single codepoint. For single-codepoint clusters the
248
+ // emoji-mode branches all collapse to the per-codepoint width
249
+ // (`emoji_cluster_width` returns `None` for the multi-codepoint patterns
250
+ // and RGI single-codepoint matches always equal the codepoint width
251
+ // because upstream's table agrees with `unicode-width` on those points).
252
+ // Falling back to `width_none` avoids the grapheme iterator entirely.
253
+ if !could_emit_emoji_cluster(s.as_bytes(), mode) {
254
+ return width_none(s, cjk, ow) as i64;
255
+ }
256
+ let mut total: i64 = 0;
257
+ for cluster in s.graphemes(true) {
258
+ if let Some(w) = emoji_cluster_width(cluster, mode, cjk) {
259
+ total += w;
260
+ } else {
261
+ total += cluster_width_via_tables(cluster, cjk, ow);
262
+ }
263
+ }
264
+ total.max(0)
265
+ }
266
+
267
+ /// Byte-level prescan for codepoints that begin or participate in a
268
+ /// multi-codepoint emoji cluster. Returns `true` if such a codepoint *might*
269
+ /// be present (the screen is conservative — false positives just route the
270
+ /// caller through the grapheme path, which is always correct).
271
+ ///
272
+ /// Triggers checked:
273
+ /// - 0xE2 (3-byte UTF-8 lead) → covers ZWJ (E2 80 8D) and U+20E3 keycap.
274
+ /// - 0xEF → covers FE0F (EF B8 8F).
275
+ /// - 0xF0 0x9F 0x8F → skin tones (1F3FB..1F3FF, 4-byte lead 0xF0).
276
+ /// - 0xF0 0x9F 0x87 → regional indicators (1F1E6..1F1FF, flag clusters).
277
+ ///
278
+ /// For modes that don't react to FE0F (e.g. `AllNoVs16`) the 0xEF check is
279
+ /// still safe — it just routes a few more inputs through the slow path. The
280
+ /// RGI/Possible modes need every multi-codepoint trigger because RGI clusters
281
+ /// can join via ZWJ or RI pairs.
282
+ #[inline]
283
+ fn could_emit_emoji_cluster(bytes: &[u8], _mode: EmojiMode) -> bool {
284
+ let n = bytes.len();
285
+ let mut i = 0;
286
+ while i < n {
287
+ let b = bytes[i];
288
+ if b == 0xE2 || b == 0xEF {
289
+ return true;
290
+ }
291
+ if b == 0xF0 && i + 2 < n && bytes[i + 1] == 0x9F {
292
+ let third = bytes[i + 2];
293
+ if third == 0x8F || third == 0x87 {
294
+ return true;
295
+ }
296
+ }
297
+ i += 1;
298
+ }
299
+ false
300
+ }
301
+
302
+ /// Returns `Some(width)` if the given grapheme cluster is consumed by the
303
+ /// emoji mode, else `None` (caller falls back to per-codepoint widths).
304
+ ///
305
+ /// `EmojiMode::None` and `EmojiMode::Vs16` are routed to dedicated paths in
306
+ /// `width_native` before reaching here; they're rejected by the caller, not
307
+ /// matched here.
308
+ fn emoji_cluster_width(cluster: &str, mode: EmojiMode, cjk: bool) -> Option<i64> {
309
+ match mode {
310
+ EmojiMode::All => is_emoji_sequence_or_vs16(cluster).then_some(2),
311
+ EmojiMode::AllNoVs16 => is_emoji_sequence_no_vs16(cluster).then_some(2),
312
+ EmojiMode::Rgi | EmojiMode::Possible => {
313
+ RGI_SEQUENCES.contains(cluster).then_some(2)
314
+ }
315
+ EmojiMode::RgiAt => RGI_SEQUENCES.contains(cluster).then(|| {
316
+ let first = cluster.chars().next().unwrap();
317
+ if (first as u32) < 0x80 {
318
+ ascii_codepoint_contribution(first as u8)
319
+ } else {
320
+ codepoint_width(first, cjk)
321
+ }
322
+ }),
323
+ EmojiMode::None | EmojiMode::Vs16 => unreachable!(
324
+ "EmojiMode::None and EmojiMode::Vs16 are dispatched before width_with_emoji"
325
+ ),
326
+ }
327
+ }
328
+
329
/// `true` when `cluster` has at least two codepoints and some codepoint
/// *after the first* is ZWJ (U+200D), VS16 (U+FE0F), a skin-tone modifier
/// (U+1F3FB..=U+1F3FF) or COMBINING ENCLOSING KEYCAP (U+20E3).
///
/// This corresponds to the union of upstream's REGEX_EMOJI_ALL_SEQUENCES,
/// REGEX_EMOJI_KEYCAP and REGEX_TEXT_PRESENTATION+VS16. The first codepoint
/// is never inspected, so a lone marker or an empty string is rejected.
fn is_emoji_sequence_or_vs16(cluster: &str) -> bool {
    cluster.chars().skip(1).any(|follower| {
        matches!(
            follower,
            '\u{200D}' | '\u{FE0F}' | '\u{20E3}' | '\u{1F3FB}'..='\u{1F3FF}'
        )
    })
}
350
+
351
/// `true` when `cluster` has at least two codepoints and some codepoint
/// *after the first* is ZWJ (U+200D), a skin-tone modifier
/// (U+1F3FB..=U+1F3FF) or COMBINING ENCLOSING KEYCAP (U+20E3).
///
/// Unlike the VS16-aware variant, U+FE0F is not a marker here, so a cluster
/// whose only extra codepoint is VS16 is rejected.
fn is_emoji_sequence_no_vs16(cluster: &str) -> bool {
    cluster.chars().skip(1).any(|follower| {
        matches!(
            follower,
            '\u{200D}' | '\u{20E3}' | '\u{1F3FB}'..='\u{1F3FF}'
        )
    })
}
370
+
371
+ #[inline]
372
+ fn cluster_width_via_tables(cluster: &str, cjk: bool, ow: Option<&HashMap<u32, i64>>) -> i64 {
373
+ let mut total: i64 = 0;
374
+ for c in cluster.chars() {
375
+ let cp = c as u32;
376
+ if let Some(map) = ow {
377
+ if let Some(&w) = map.get(&cp) {
378
+ total += w;
379
+ continue;
380
+ }
381
+ }
382
+ if cp < 0x80 {
383
+ total += ascii_codepoint_contribution(cp as u8);
384
+ } else {
385
+ total += codepoint_width(c, cjk);
386
+ }
387
+ }
388
+ total
389
+ }
390
+
391
+ /// Per-codepoint width matching upstream's table — `unicode-width` lookup
392
+ /// (with the same None→1 fallback `UnicodeWidthStr` uses) plus the delta
393
+ /// correction. ASCII is handled separately via `ascii_codepoint_contribution`.
394
+ #[inline]
395
+ fn codepoint_width(c: char, cjk: bool) -> i64 {
396
+ let base = if cjk {
397
+ UnicodeWidthChar::width_cjk(c)
398
+ } else {
399
+ UnicodeWidthChar::width(c)
400
+ }
401
+ .unwrap_or(1) as i64;
402
+ let (d1, d2) = width_delta(c as u32);
403
+ let delta = if cjk { d2 } else { d1 } as i64;
404
+ base + delta
405
+ }
406
+
407
+ #[inline]
408
+ fn ascii_codepoint_contribution(b: u8) -> i64 {
409
+ // Upstream's ASCII rule: zero-list bytes contribute 0, backspace
410
+ // contributes -1, everything else (tab, DEL, other C0, printable) 1.
411
+ if b == 0x08 {
412
+ -1
413
+ } else if is_zero_width_ascii(b) {
414
+ 0
415
+ } else {
416
+ 1
417
+ }
418
+ }
419
+
420
/// Membership test for upstream's `ASCII_NON_ZERO_STRING`: the bytes that do
/// not add a column.
///
/// The set is 0x00, 0x05, 0x07 (BEL), 0x08 (BS) and 0x0A..=0x0F
/// (LF, VT, FF, CR, SO, SI). Backspace is a member; its extra -1 is applied
/// separately by the callers.
#[inline]
fn is_zero_width_ascii(b: u8) -> bool {
    matches!(b, 0x00 | 0x05 | 0x07 | 0x08 | 0x0A..=0x0F)
}
426
+
427
+ #[inline]
428
+ fn width_ascii(bytes: &[u8]) -> usize {
429
+ // Two-pass: a SIMD-friendly screen first to short-circuit the very common
430
+ // all-printable case (`bytes.len()` directly), and only walk a second time
431
+ // when there's actually a C0 or DEL byte to subtract. A single-pass loop
432
+ // counting `zero`/`bs` unconditionally regressed long-ASCII throughput by
433
+ // ~3x because the all-printable case is by far the dominant input.
434
+ if count_c0_and_del(bytes) == 0 {
435
+ return bytes.len();
436
+ }
437
+ let (zero, bs) = count_zero_and_bs(bytes);
438
+ bytes.len().saturating_sub(zero).saturating_sub(bs)
439
+ }
440
+
441
/// Number of bytes in `bytes` that are C0 controls (below 0x20) or DEL (0x7F).
///
/// Written as a branch-free accumulation: each byte adds 0 or 1.
#[inline]
fn count_c0_and_del(bytes: &[u8]) -> usize {
    bytes
        .iter()
        .fold(0usize, |hits, &b| hits + usize::from(b < 0x20 || b == 0x7F))
}
448
+
449
+ #[inline]
450
+ fn count_zero_and_bs(bytes: &[u8]) -> (usize, usize) {
451
+ let mut zero: usize = 0;
452
+ let mut bs: usize = 0;
453
+ for &b in bytes {
454
+ zero += is_zero_width_ascii(b) as usize;
455
+ bs += (b == 0x08) as usize;
456
+ }
457
+ (zero, bs)
458
+ }
459
+
460
+ fn build_overwrite_map(
461
+ ruby: &Ruby,
462
+ overwrite: Value,
463
+ ) -> Result<Option<HashMap<u32, i64>>, Error> {
464
+ if overwrite.is_nil() {
465
+ return Ok(None);
466
+ }
467
+ let hash = RHash::from_value(overwrite)
468
+ .ok_or_else(|| type_error(ruby, "overwrite must be a Hash or nil"))?;
469
+ let mut map: HashMap<u32, i64> = HashMap::with_capacity(hash.len());
470
+ hash.foreach(|cp: i64, w: i64| {
471
+ if !(0..=0x10_FFFF).contains(&cp) {
472
+ return Err(argument_error(
473
+ ruby,
474
+ format!("overwrite codepoint {cp} is out of range (0..=0x10FFFF)"),
475
+ ));
476
+ }
477
+ map.insert(cp as u32, w);
478
+ Ok(ForEach::Continue)
479
+ })?;
480
+ Ok(Some(map))
481
+ }
482
+
483
+ #[inline]
484
+ fn sum_with_overwrites(s: &str, cjk: bool, map: &HashMap<u32, i64>) -> usize {
485
+ let mut total: i64 = 0;
486
+ for c in s.chars() {
487
+ let cp = c as u32;
488
+ if let Some(&w) = map.get(&cp) {
489
+ total += w;
490
+ } else if cp < 0x80 {
491
+ total += ascii_codepoint_contribution(cp as u8);
492
+ } else {
493
+ total += codepoint_width(c, cjk);
494
+ }
495
+ }
496
+ total.max(0) as usize
497
+ }
498
+
499
+ #[magnus::init(name = "display_width")]
500
+ fn init(ruby: &Ruby) -> Result<(), Error> {
501
+ let module = ruby.define_module("FastUnicode")?;
502
+ let klass = module.define_class("DisplayWidth", ruby.class_object())?;
503
+
504
+ // FFI binding lives under `DisplayWidth::Native` so the public surface
505
+ // doesn't advertise `width_native` (which bypasses encoding normalization
506
+ // and can crash the extension on invalid UTF-8). The Ruby side marks
507
+ // `Native` as `private_constant`.
508
+ let native = klass.define_module("Native")?;
509
+ native.define_singleton_method("width_native", function!(width_native, 4))?;
510
+
511
+ // Single source of truth for the symbol↔code mapping: derived from
512
+ // `EMOJI_MODES` so Ruby and Rust can't drift apart.
513
+ let codes = ruby.hash_new();
514
+ for (idx, (name, _)) in EMOJI_MODES.iter().enumerate() {
515
+ codes.aset(ruby.to_symbol(name), idx as i64)?;
516
+ }
517
+ codes.freeze();
518
+ klass.const_set("EMOJI_MODE_CODES", codes)?;
519
+ Ok(())
520
+ }
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module FastUnicode
  class DisplayWidth
    # Terminal detection ported from upstream's
    # `Unicode::DisplayWidth::EmojiSupport`: chooses the default emoji mode
    # from environment variables so that the default matches what upstream
    # recommends for the same terminal.
    module EmojiSupport
      # Returns the recommended emoji mode Symbol for the current environment.
      #
      # Checks, in order: `CI`, `TERM_PROGRAM`, `TERM`, `WT_SESSION`; the
      # first match wins and `:none` is the fallback.
      #
      # The result is computed from ENV on every call (nothing is memoized),
      # which keeps the method testable. Callers that want a value fixed at
      # load time, such as `DEFAULT_EMOJI_CODE`, store it themselves.
      def self.recommended
        # `:rqi` is not a misprint on our side: upstream returns this
        # (misspelled) symbol under CI, where it ends up behaving like
        # `:none`. It is kept verbatim so the defaults stay identical.
        return :rqi if ENV["CI"]

        by_program = {
          "iTerm.app" => :all,
          "Apple_Terminal" => :rgi_at,
          "WezTerm" => :all_no_vs16
        }[ENV["TERM_PROGRAM"]]
        return by_program if by_program

        term = ENV["TERM"]
        return :all if ["contour", "foot"].include?(term)
        return :vs16 if /kitty/.match?(term)

        ENV["WT_SESSION"] ? :vs16 : :none
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../display_width'
4
+
5
module FastUnicode
  class DisplayWidth
    # Opt-in refinement that gives String a `display_width` method. It has
    # the same shape as `Unicode::DisplayWidth::StringExt`, so existing call
    # sites can switch by changing only the `using` line.
    #
    #   using FastUnicode::DisplayWidth::StringExt
    #   "一二三".display_width # => 6
    module StringExt
      refine String do
        # Forwards the receiver plus every positional and keyword argument,
        # unchanged, to `FastUnicode::DisplayWidth.of`.
        def display_width(*positional, **keywords)
          FastUnicode::DisplayWidth.of(self, *positional, **keywords)
        end
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ module FastUnicode
4
+ class DisplayWidth
5
+ VERSION = '0.1.1' # Version string of this gem release.
6
+ end
7
+ end
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'display_width/display_width'
4
+ require_relative 'display_width/version'
5
+ require_relative 'display_width/emoji_support'
6
+
7
module FastUnicode
  # API modelled on `Unicode::DisplayWidth` (v3.x), placed in the
  # `FastUnicode` namespace so that both gems can be loaded side by side.
  #
  # Options understood:
  # - `ambiguous` (1 or 2): width used for East Asian ambiguous characters.
  # - `overwrite` ({Integer => Integer}): fixed widths per codepoint.
  # - `emoji:` one of :none, :all, :all_no_vs16, :vs16, :rgi, :rgi_at,
  #   :possible, or true / :auto / nil (use `EmojiSupport.recommended`), or
  #   false (same as :none).
  #
  # The result is an Integer column count and is never negative.
  class DisplayWidth
    # Both `Native` and `EMOJI_MODE_CODES` come from the Rust extension's
    # `init` (see `ext/fast_unicode/display_width/src/lib.rs`). `Native` is
    # the raw binding: it takes integer mode codes and does no encoding
    # normalization, so it is hidden. Use `DisplayWidth.of` instead.
    private_constant :Native

    # Integer code of the environment-detected default mode, evaluated once
    # when this file is loaded. Falls back to 0 if the detected symbol has no
    # code.
    DEFAULT_EMOJI_CODE = EMOJI_MODE_CODES.fetch(EmojiSupport.recommended, 0)

    # Marker meaning "the `emoji:` keyword was not given at all". It cannot
    # collide with anything a caller passes, including `nil`, `true` and
    # `:auto`, which upstream treats as explicit requests for the default.
    OMITTED = Object.new.freeze
    private_constant :OMITTED

    # Helpers shared by the public methods. Every width computation goes
    # through `Internal.compute`, which is the only caller of
    # `Native.width_native`, so encoding normalization always happens.
    module Internal
      module_function

      # Normalizes `string` to UTF-8 and hands everything to the extension.
      def compute(string, ambiguous, overwrite, emoji_code)
        utf8 = normalize_encoding(string)
        Native.width_native(utf8, ambiguous, overwrite, emoji_code)
      end

      # Follows the upstream gem's encoding contract:
      # - a UTF-8 string is returned as is;
      # - a BINARY string is relabelled as UTF-8 (on a copy) when its bytes
      #   are valid UTF-8;
      # - anything else is transcoded to UTF-8, with invalid and undefined
      #   sequences replaced.
      def normalize_encoding(string)
        source_encoding = string.encoding
        return string if source_encoding == Encoding::UTF_8

        if source_encoding == Encoding::BINARY
          relabelled = string.dup.force_encoding(Encoding::UTF_8)
          return relabelled if relabelled.valid_encoding?
        end

        string.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
      end

      # Translates a user-supplied `emoji:` value into the integer code the
      # extension expects. `nil`, `true` and `:auto` mean the detected
      # default, `false` means `:none`, and any other value must be a key of
      # `EMOJI_MODE_CODES` or ArgumentError is raised. `OMITTED` is handled
      # by the callers and is never passed in.
      def resolve_emoji_code(value)
        return DEFAULT_EMOJI_CODE if [nil, true, :auto].include?(value)
        return EMOJI_MODE_CODES[:none] if false.equal?(value)

        EMOJI_MODE_CODES.fetch(value) do
          raise ArgumentError, "unknown emoji mode: #{value.inspect}"
        end
      end
    end
    private_constant :Internal

    # One-shot width computation.
    #
    # `ambiguous` may be given positionally or as a keyword; the keyword wins
    # when both are present. `overwrite:` and `emoji:` are keyword-only.
    # Without any keyword the default emoji mode and no overwrites are used.
    def self.of(string, ambiguous = 1, **opts)
      return Internal.compute(string, ambiguous, nil, DEFAULT_EMOJI_CODE) if opts.empty?

      chosen_ambiguous = opts.fetch(:ambiguous, ambiguous)
      emoji_code =
        if opts.key?(:emoji)
          Internal.resolve_emoji_code(opts[:emoji])
        else
          DEFAULT_EMOJI_CODE
        end
      Internal.compute(string, chosen_ambiguous, opts[:overwrite], emoji_code)
    end

    # Stores a reusable configuration. The emoji option is resolved to its
    # integer code here, once, rather than on every `#of` call.
    def initialize(ambiguous: 1, overwrite: nil, emoji: OMITTED)
      @ambiguous = ambiguous
      @overwrite = overwrite
      @emoji_code =
        if emoji.equal?(OMITTED)
          DEFAULT_EMOJI_CODE
        else
          Internal.resolve_emoji_code(emoji)
        end
    end

    # Width of `string` under the configuration given to `new`.
    def of(string)
      Internal.compute(string, @ambiguous, @overwrite, @emoji_code)
    end
  end
end