fast_unicode-display_width 0.1.1-x86_64-linux-musl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 94a537f3603995914d03b2fcaf2fc42ec1b6550845e01f151acf8b582f9c4448
4
+ data.tar.gz: 27d162bdfb70aa7ac4738dad8d64c1b42b32d1118bd147f9a5c0e32854428577
5
+ SHA512:
6
+ metadata.gz: 4f60cb5ff9b45eacc24672712c6f1181e3da3512842380490b9457e250ef722d511865014536afeadc82f18bff37a1947148b40af9ed301300554e958c835652
7
+ data.tar.gz: d66351985a364db6d8d39712cd7e68e9cefc7a3c577e1dc5749ea240f2f2346d6c1484f64eea59ebeae0b713655703266e8253ea09bab94127adc9232cb241af
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2026 Ville Lautanala
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,137 @@
1
+ # fast_unicode-display_width
2
+
3
+ A fast, Rust-backed implementation of the
4
+ [`unicode-display_width`](https://github.com/janlelis/unicode-display_width) gem.
5
+ Computes the monospace display width of a string by delegating to the
6
+ [`unicode-width`](https://crates.io/crates/unicode-width) Rust crate through a
7
+ [`magnus`](https://github.com/matsadler/magnus) native extension.
8
+
9
+ > Built as a demo for [Helsinki Ruby Brigade](https://www.meetup.com/helsinki-ruby-brigade/), May 2026.
10
+
11
+ ## Installation
12
+
13
+ Requires a stable Rust toolchain and Ruby `>= 3.0`. With those in place:
14
+
15
+ ```
16
+ gem install fast_unicode-display_width
17
+ ```
18
+
19
+ A precompiled native gem is also published for common platforms (linux x86_64 /
20
+ aarch64, darwin x86_64 / arm64, mingw); installing on one of those platforms
21
+ skips the Rust build.
22
+
23
+ ## Usage
24
+
25
+ The API mirrors `Unicode::DisplayWidth` (v3.x) under the `FastUnicode`
26
+ namespace — swap your `require` and rename the class to migrate.
27
+
28
+ ```ruby
29
+ require 'fast_unicode/display_width'
30
+
31
+ FastUnicode::DisplayWidth.of("hello") # => 5
32
+ FastUnicode::DisplayWidth.of("一二三") # => 6
33
+ FastUnicode::DisplayWidth.of("·") # => 1
34
+ FastUnicode::DisplayWidth.of("·", 2) # => 2 (ambiguous → wide)
35
+ FastUnicode::DisplayWidth.of("·", ambiguous: 2)
36
+ FastUnicode::DisplayWidth.of("\t", overwrite: { 0x09 => 4 }) # => 4
37
+
38
+ # Reusable config (accepts ambiguous:, overwrite:, emoji:):
39
+ dw = FastUnicode::DisplayWidth.new(ambiguous: 2, overwrite: { 0x09 => 4 })
40
+ dw.of("·\t") # => 6
41
+
42
+ # Optional String refinement:
43
+ require 'fast_unicode/display_width/string_ext'
44
+ using FastUnicode::DisplayWidth::StringExt
45
+ "一二三".display_width # => 6
46
+ ```
47
+
48
+ ### Supported options
49
+
50
+ | Option | Status | Notes |
51
+ | ------------ | ------ | ------------------------------------------------------------------------------------------- |
52
+ | `ambiguous` | ✅ | `1` (narrow, default) or `2` (wide / CJK context). |
53
+ | `overwrite` | ✅ | `{ Integer => Integer }` codepoint → fixed width. Procs are not supported. |
54
+ | `emoji` | ✅ | `:none`, `:all`, `:all_no_vs16`, `:vs16`, `:rgi`, `:rgi_at`, `:possible`, `true`/`:auto`, `false`. |
55
+
56
+ ### Emoji handling
57
+
58
+ Multi-codepoint emoji sequences (ZWJ joins, regional indicators, modifier
59
+ sequences, keycap, VS16) are recognised in Rust via grapheme-cluster
60
+ iteration ([`unicode-segmentation`](https://crates.io/crates/unicode-segmentation))
61
+ plus a perfect-hash set of RGI sequences generated at build time from
62
+ Unicode's vendored `emoji-test.txt` (Unicode 17.0). No Ruby
63
+ `unicode-emoji` dependency; the whole pass happens in a single FFI call.
64
+
65
+ The default emoji mode is auto-detected from `TERM_PROGRAM` / `TERM`
66
+ etc. via `FastUnicode::DisplayWidth::EmojiSupport.recommended`
67
+ (mirroring upstream); pass `emoji: false` to disable. On terminals
68
+ whose recommended mode is `:none` (most non-emoji setups), the emoji
69
+ path is skipped entirely and the gem stays on the pure-Rust fast path.
70
+
71
+ ### Upstream parity
72
+
73
+ The differential suite (`spec/fast_unicode/compat_spec.rb`) asserts
74
+ codepoint-by-codepoint equality with `Unicode::DisplayWidth.of` across
75
+ the entire BMP plus targeted SMP sweeps, for both `ambiguous` values
76
+ and all emoji modes. Parity is achieved by a build-time
77
+ width-correction table generated from upstream's INDEX
78
+ (`data/upstream_width_table.txt`, regenerated by
79
+ `bin/regenerate-width-data`), plus two runtime quirks the table can't
80
+ model on its own: upstream's "common narrow" early-exit shortcut, and
81
+ the per-codepoint walk used for `emoji: :none` (which sidesteps
82
+ `unicode-width`'s own VS16/skin-tone clustering in `UnicodeWidthStr`).
83
+
84
+ ASCII control chars (tab, DEL, backspace mid-string), the BINARY
85
+ encoding fallback, and the `overwrite:` hash all match upstream
86
+ exactly.
87
+
88
+ ## Development
89
+
90
+ ```sh
91
+ bin/setup # bundle install
92
+ bundle exec rake compile # build the Rust extension into lib/
93
+ bundle exec rake spec # compile + run specs
94
+ bundle exec rake # default: compile + spec
95
+ ```
96
+
97
+ Run the differential corpus against the upstream gem:
98
+
99
+ ```sh
100
+ bundle exec rspec spec/fast_unicode/compat_spec.rb
101
+ ```
102
+
103
+ Benchmarks (uses `benchmark-ips`, comparing this gem against upstream):
104
+
105
+ ```sh
106
+ BENCH_TIME=2 bin/benchmark
107
+ BENCH_TIME=5 bin/benchmark --json > bench.json
108
+ ```
109
+
110
+ Indicative results on an Apple M-series laptop, Ruby 4.0.5 +YJIT
111
+ (higher is better; ratio is `FastUnicode / Unicode` ips):
112
+
113
+ | input | ambiguous=1 | ambiguous=2 |
114
+ | ---------------------- | ----------- | ----------- |
115
+ | short ascii (11 B) | 4.5× | 4.6× |
116
+ | long ascii (1.4 KB) | 5.1× | 5.1× |
117
+ | CJK paragraph (528 B) | 5.4× | 5.4× |
118
+ | mixed ASCII+CJK (528 B)| 11.0× | 11.0× |
119
+ | combining marks (704 B)| 11.5× | 14.2× |
120
+ | ambiguous-heavy (512 B)| 8.5× | 7.5× |
121
+
122
+ Emoji-mode comparison (FastUnicode / Unicode ips):
123
+
124
+ | input | `:none` | `:rgi` | `:all` | `:vs16` |
125
+ | ------------------------------ | ------- | ------- | ------ | ------- |
126
+ | emoji-heavy RGI seqs (632 B) | 10.2× | 163.5× | 4.5× | 12.2× |
127
+ | mostly ASCII, sparse emoji | 13.6× | 30.7× | 30.6× | 12.2× |
128
+
129
+ The wins concentrate on real Unicode workloads where upstream's
130
+ per-codepoint Ruby loop is the bottleneck. The huge `:rgi` win comes
131
+ from upstream resolving RGI sequences in Ruby against the
132
+ `unicode-emoji` tables; the FFI overhead dominates only for very short
133
+ inputs (and even there, the Rust path still leads).
134
+
135
+ ## License
136
+
137
+ MIT. See `LICENSE.txt`.
@@ -0,0 +1,25 @@
1
+ [package]
2
+ name = "fast_unicode_display_width"
3
+ version = "0.1.0"
4
+ authors = ["Ville Lautanala <lautis@gmail.com>"]
5
+ edition = "2021"
6
+ build = "build.rs"
7
+
8
+ [lib]
9
+ name = "display_width"
10
+ crate-type = ["cdylib"]
11
+
12
+ [dependencies]
13
+ magnus = { version = "0.8", features = ["rb-sys"] }
14
+ unicode-width = "0.2"
15
+ unicode-segmentation = "1.13"
16
+ phf = "0.11"
17
+
18
+ [build-dependencies]
19
+ phf_codegen = "0.11"
20
+ unicode-width = "0.2"
21
+
22
+ [profile.release]
23
+ opt-level = 3
24
+ lto = "fat"
25
+ codegen-units = 1
@@ -0,0 +1,206 @@
1
+ // Build-time codegen for two artifacts consumed at runtime:
2
+ //
3
+ // 1. `rgi_set.rs` — a `phf::Set` of RGI emoji sequences parsed from the
4
+ // vendored `data/emoji-test.txt`. Covers fully- and minimally-qualified
5
+ // entries (the union upstream's `REGEX_INCLUDE_MQE_UQE` matches).
6
+ //
7
+ // 2. `width_delta.rs` — a sorted slice of width-correction ranges. Each
8
+ // range carries the *delta* between upstream's per-codepoint width and
9
+ // `unicode-width` 0.2's output, for both `width()` and `width_cjk()`.
10
+ // The runtime applies these on top of `unicode-width`'s value so the
11
+ // bulk `UnicodeWidthStr::width` path stays in play for inputs that
12
+ // contain no diverging codepoints.
13
+ //
14
+ // Source: `data/upstream_width_table.txt`, regenerated by
15
+ // `bin/regenerate-width-data` from the installed upstream gem. The
16
+ // vendored UCD `EastAsianWidth.txt` is kept alongside for provenance
17
+ // and to make future rule reimplementations possible offline.
18
+
19
+ use std::env;
20
+ use std::fs::File;
21
+ use std::io::{BufRead, BufReader, BufWriter, Write};
22
+ use std::path::PathBuf;
23
+ use unicode_width::UnicodeWidthChar;
24
+
25
+ fn main() {
26
+ let manifest_dir = PathBuf::from(env::var_os("CARGO_MANIFEST_DIR").unwrap());
27
+ let out_dir = PathBuf::from(env::var_os("OUT_DIR").unwrap());
28
+ println!("cargo:rerun-if-changed=build.rs");
29
+
30
+ emit_rgi_set(&manifest_dir, &out_dir);
31
+ emit_width_delta(&manifest_dir, &out_dir);
32
+ }
33
+
34
+ fn emit_rgi_set(manifest_dir: &std::path::Path, out_dir: &std::path::Path) {
35
+ let data_path = manifest_dir.join("data").join("emoji-test.txt");
36
+ println!("cargo:rerun-if-changed={}", data_path.display());
37
+
38
+ let file = File::open(&data_path)
39
+ .unwrap_or_else(|e| panic!("failed to open {}: {}", data_path.display(), e));
40
+ let reader = BufReader::new(file);
41
+
42
+ let mut set = phf_codegen::Set::<String>::new();
43
+ let mut seen = std::collections::HashSet::<String>::new();
44
+
45
+ for line in reader.lines() {
46
+ let line = line.expect("emoji-test.txt read");
47
+ if line.is_empty() || line.starts_with('#') {
48
+ continue;
49
+ }
50
+ let semi = match line.find(';') {
51
+ Some(i) => i,
52
+ None => continue,
53
+ };
54
+ let cps = &line[..semi];
55
+ let rest = &line[semi + 1..];
56
+ let status_end = rest.find('#').unwrap_or(rest.len());
57
+ let status = rest[..status_end].trim();
58
+ if status != "fully-qualified" && status != "minimally-qualified" {
59
+ continue;
60
+ }
61
+ let s: String = cps
62
+ .split_whitespace()
63
+ .filter_map(|hex| u32::from_str_radix(hex, 16).ok())
64
+ .filter_map(char::from_u32)
65
+ .collect();
66
+ if s.is_empty() {
67
+ continue;
68
+ }
69
+ if seen.insert(s.clone()) {
70
+ set.entry(s);
71
+ }
72
+ }
73
+
74
+ let out_path = out_dir.join("rgi_set.rs");
75
+ let mut out = BufWriter::new(File::create(&out_path).unwrap());
76
+ writeln!(
77
+ out,
78
+ "/// Auto-generated by build.rs from data/emoji-test.txt (Unicode 17.0)."
79
+ )
80
+ .unwrap();
81
+ writeln!(
82
+ out,
83
+ "pub static RGI_SEQUENCES: phf::Set<&'static str> = {};",
84
+ set.build()
85
+ )
86
+ .unwrap();
87
+ }
88
+
89
+ fn emit_width_delta(manifest_dir: &std::path::Path, out_dir: &std::path::Path) {
90
+ let data_path = manifest_dir
91
+ .join("data")
92
+ .join("upstream_width_table.txt");
93
+ println!("cargo:rerun-if-changed={}", data_path.display());
94
+
95
+ let file = File::open(&data_path)
96
+ .unwrap_or_else(|e| panic!("failed to open {}: {}", data_path.display(), e));
97
+ let reader = BufReader::new(file);
98
+
99
+ // Per-codepoint delta = upstream_width - unicode_width_value.
100
+ // Encode None from unicode-width as 0 (matches our runtime treatment).
101
+ let mut points: Vec<(u32, i8, i8)> = Vec::new();
102
+
103
+ for line in reader.lines() {
104
+ let line = line.expect("upstream_width_table.txt read");
105
+ let line = line.trim();
106
+ if line.is_empty() || line.starts_with('#') {
107
+ continue;
108
+ }
109
+ let mut parts = line.split_whitespace();
110
+ let start = u32::from_str_radix(parts.next().unwrap(), 16).unwrap();
111
+ let end = u32::from_str_radix(parts.next().unwrap(), 16).unwrap();
112
+ let up_w1: i32 = parts.next().unwrap().parse().unwrap();
113
+ let up_w2: i32 = parts.next().unwrap().parse().unwrap();
114
+ for cp in start..=end {
115
+ if (0xD800..=0xDFFF).contains(&cp) {
116
+ continue;
117
+ }
118
+ let ch = match char::from_u32(cp) {
119
+ Some(c) => c,
120
+ None => continue,
121
+ };
122
+ // `unicode-width`'s string-level `width()` falls back to 1 for
123
+ // codepoints whose per-char `width()` returns None (controls,
124
+ // unassigned, …). The runtime mirrors that — keep the delta on
125
+ // the same baseline so the bulk path and per-char path agree.
126
+ let uw_w1 = UnicodeWidthChar::width(ch).unwrap_or(1) as i32;
127
+ let uw_w2 = UnicodeWidthChar::width_cjk(ch).unwrap_or(1) as i32;
128
+ let d1 = up_w1 - uw_w1;
129
+ let d2 = up_w2 - uw_w2;
130
+ if d1 != 0 || d2 != 0 {
131
+ points.push((cp, d1 as i8, d2 as i8));
132
+ }
133
+ }
134
+ }
135
+
136
+ // Direct Latin-1 lookup, indexed by codepoint. Latin-1 letters under
137
+ // `width_cjk` carry deltas on nearly every codepoint, so the per-char
138
+ // binary search was the hot path on accented-Latin and ambiguous inputs;
139
+ // a 256-entry array collapses it to a constant-time index.
140
+ let mut latin1: [(i8, i8); 256] = [(0, 0); 256];
141
+ for (cp, d1, d2) in &points {
142
+ if *cp < 0x100 {
143
+ latin1[*cp as usize] = (*d1, *d2);
144
+ }
145
+ }
146
+
147
+ // RLE-compress consecutive codepoints sharing the same (d1, d2).
148
+ // Latin-1 codepoints are excluded — they're served by `LATIN1_DELTA`
149
+ // directly, so leaving them in the run table just bloats the binary
150
+ // search.
151
+ let mut runs: Vec<(u32, u32, i8, i8)> = Vec::new();
152
+ for (cp, d1, d2) in points {
153
+ if cp < 0x100 {
154
+ continue;
155
+ }
156
+ if let Some(last) = runs.last_mut() {
157
+ if last.1 + 1 == cp && last.2 == d1 && last.3 == d2 {
158
+ last.1 = cp;
159
+ continue;
160
+ }
161
+ }
162
+ runs.push((cp, cp, d1, d2));
163
+ }
164
+
165
+ let out_path = out_dir.join("width_delta.rs");
166
+ let mut out = BufWriter::new(File::create(&out_path).unwrap());
167
+ writeln!(
168
+ out,
169
+ "/// Auto-generated by build.rs from data/upstream_width_table.txt."
170
+ )
171
+ .unwrap();
172
+ writeln!(
173
+ out,
174
+ "/// Each tuple is (start, end_inclusive, delta_amb1, delta_amb2)."
175
+ )
176
+ .unwrap();
177
+ writeln!(
178
+ out,
179
+ "/// Delta = upstream_width - unicode_width_value, applied on top of `unicode-width` 0.2."
180
+ )
181
+ .unwrap();
182
+
183
+ writeln!(out, "pub static LATIN1_DELTA: [(i8, i8); 256] = [").unwrap();
184
+ for (cp, (d1, d2)) in latin1.iter().enumerate() {
185
+ writeln!(out, " ({}, {}), // 0x{:02X}", d1, d2, cp).unwrap();
186
+ }
187
+ writeln!(out, "];").unwrap();
188
+
189
+ writeln!(
190
+ out,
191
+ "pub static WIDTH_DELTA: &[(u32, u32, i8, i8)] = &["
192
+ )
193
+ .unwrap();
194
+ for (start, end, d1, d2) in &runs {
195
+ writeln!(out, " (0x{:X}, 0x{:X}, {}, {}),", start, end, d1, d2).unwrap();
196
+ }
197
+ writeln!(out, "];").unwrap();
198
+
199
+ // Min/max codepoints in the run table (excluding Latin-1, which has its
200
+ // own direct lookup). Runtime uses these for early-exit screens.
201
+ let (min_cp, max_cp) = runs
202
+ .iter()
203
+ .fold((u32::MAX, 0u32), |(lo, hi), (s, e, _, _)| (lo.min(*s), hi.max(*e)));
204
+ writeln!(out, "pub const WIDTH_DELTA_MIN: u32 = 0x{:X};", min_cp).unwrap();
205
+ writeln!(out, "pub const WIDTH_DELTA_MAX: u32 = 0x{:X};", max_cp).unwrap();
206
+ }