RubyGems - fast_unicode-display_width - Versions diffs - 0.1.1-arm64-darwin - Mend

fast_unicode-display_width 0.1.1-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

checksums.yaml +7 -0
data/LICENSE.txt +21 -0
data/README.md +137 -0
data/ext/fast_unicode/display_width/Cargo.toml +25 -0
data/ext/fast_unicode/display_width/build.rs +206 -0
data/ext/fast_unicode/display_width/data/EastAsianWidth.txt +2721 -0
data/ext/fast_unicode/display_width/data/emoji-test.txt +5518 -0
data/ext/fast_unicode/display_width/data/upstream_width_table.txt +1292 -0
data/ext/fast_unicode/display_width/extconf.rb +4 -0
data/ext/fast_unicode/display_width/src/lib.rs +520 -0
data/lib/fast_unicode/display_width/3.4/display_width.bundle +0 -0
data/lib/fast_unicode/display_width/4.0/display_width.bundle +0 -0
data/lib/fast_unicode/display_width/emoji_support.rb +36 -0
data/lib/fast_unicode/display_width/string_ext.rb +20 -0
data/lib/fast_unicode/display_width/version.rb +7 -0
data/lib/fast_unicode/display_width.rb +95 -0
metadata +173 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: deb2f515fbf4acc96e04c89da09a8102c32312b311427ed995e133c70e0e9105
+  data.tar.gz: 7a8f1869a2c476e87fca5a4050f45cbf5b3eaa8f6a8432c6d41cb6885d36d2fc
+SHA512:
+  metadata.gz: a63c2bb636c2d3ee2a1657187e45b9e0a92619d5972892026586fa528a507aa8db40141c2f2a129afb8f3e618c76be5ebde6b31d69e4792845e26b629c75016d
+  data.tar.gz: a3449133e1d84482d6419b3be826cf729fe7ea8f124bb8b708735c65eb77ffb2e6f90ce0d08752bbaca2b4bce2280c82cbfac4767e2cd7aabc23557bddbefaa5

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+Copyright (c) 2026 Ville Lautanala
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,137 @@
+# fast_unicode-display_width
+A fast, Rust-backed implementation of the
+[`unicode-display_width`](https://github.com/janlelis/unicode-display_width) gem.
+Computes the monospace display width of a string by delegating to the
+[`unicode-width`](https://crates.io/crates/unicode-width) Rust crate through a
+[`magnus`](https://github.com/matsadler/magnus) native extension.
+> Built as a demo for [Helsinki Ruby Brigade](https://www.meetup.com/helsinki-ruby-brigade/), May 2026.
+## Installation
+Requires a stable Rust toolchain and Ruby `>= 3.0`. With those in place:
+```
+gem install fast_unicode-display_width
+```
+A precompiled native gem is also published for common platforms (linux x86_64 /
+aarch64, darwin x86_64 / arm64, mingw); installs from those skip the Rust
+build.
+## Usage
+The API mirrors `Unicode::DisplayWidth` (v3.x) under the `FastUnicode`
+namespace — swap your `require` and rename the class to migrate.
+```ruby
+require 'fast_unicode/display_width'
+FastUnicode::DisplayWidth.of("hello")        # => 5
+FastUnicode::DisplayWidth.of("一二三")       # => 6
+FastUnicode::DisplayWidth.of("·")            # => 1
+FastUnicode::DisplayWidth.of("·", 2)         # => 2  (ambiguous → wide)
+FastUnicode::DisplayWidth.of("·", ambiguous: 2)
+FastUnicode::DisplayWidth.of("\t", overwrite: { 0x09 => 4 })  # => 4
+# Reusable config (accepts ambiguous:, overwrite:, emoji:):
+dw = FastUnicode::DisplayWidth.new(ambiguous: 2, overwrite: { 0x09 => 4 })
+dw.of("·\t")  # => 6
+# Optional String refinement:
+require 'fast_unicode/display_width/string_ext'
+using FastUnicode::DisplayWidth::StringExt
+"一二三".display_width  # => 6
+```
+### Supported options
+| Option       | Status | Notes                                                                                       |
+| ------------ | ------ | ------------------------------------------------------------------------------------------- |
+| `ambiguous`  | ✅     | `1` (narrow, default) or `2` (wide / CJK context).                                          |
+| `overwrite`  | ✅     | `{ Integer => Integer }` codepoint → fixed width. Proc N/A.                                 |
+| `emoji:`     | ✅     | `:none`, `:all`, `:all_no_vs16`, `:vs16`, `:rgi`, `:rgi_at`, `:possible`, `true`/`:auto`, `false`. |
+### Emoji handling
+Multi-codepoint emoji sequences (ZWJ joins, regional indicators, modifier
+sequences, keycap, VS16) are recognised in Rust via grapheme-cluster
+iteration ([`unicode-segmentation`](https://crates.io/crates/unicode-segmentation))
+plus a perfect-hash set of RGI sequences generated at build time from
+Unicode's vendored `emoji-test.txt` (Unicode 17.0). No Ruby
+`unicode-emoji` dependency; the whole pass happens in a single FFI call.
+The default emoji mode is auto-detected from `TERM_PROGRAM` / `TERM`
+etc. via `FastUnicode::DisplayWidth::EmojiSupport.recommended`
+(mirroring upstream); pass `emoji: false` to disable. On terminals
+whose recommended mode is `:none` (most non-emoji setups), the emoji
+path is skipped entirely and the gem stays on the pure-Rust fast path.
+### Upstream parity
+The differential suite (`spec/fast_unicode/compat_spec.rb`) asserts
+codepoint-by-codepoint equality with `Unicode::DisplayWidth.of` across
+the entire BMP plus targeted SMP sweeps, for both `ambiguous` values
+and all emoji modes. Parity is achieved by a build-time
+width-correction table generated from upstream's INDEX
+(`data/upstream_width_table.txt`, regenerated by
+`bin/regenerate-width-data`), plus two runtime quirks the table can't
+model on its own: upstream's "common narrow" early-exit shortcut, and
+the per-codepoint walk used for `emoji: :none` (which sidesteps
+`unicode-width`'s own VS16/skin-tone clustering in `UnicodeWidthStr`).
+ASCII control chars (tab, DEL, backspace mid-string), the BINARY
+encoding fallback, and the `overwrite:` hash all match upstream
+exactly.
+## Development
+```sh
+bin/setup                  # bundle install
+bundle exec rake compile   # build the Rust extension into lib/
+bundle exec rake spec      # compile + run specs
+bundle exec rake           # default: compile + spec
+```
+Run the differential corpus against the upstream gem:
+```sh
+bundle exec rspec spec/fast_unicode/compat_spec.rb
+```
+Benchmarks (uses `benchmark-ips`, comparing this gem against upstream):
+```sh
+BENCH_TIME=2 bin/benchmark
+BENCH_TIME=5 bin/benchmark --json > bench.json
+```
+Indicative results on an Apple M-series laptop, Ruby 4.0.5 +YJIT
+(higher is better; ratio is `FastUnicode / Unicode` ips):
+| input                  | ambiguous=1 | ambiguous=2 |
+| ---------------------- | ----------- | ----------- |
+| short ascii (11 B)     | 4.5×        | 4.6×        |
+| long ascii (1.4 KB)    | 5.1×        | 5.1×        |
+| CJK paragraph (528 B)  | 5.4×        | 5.4×        |
+| mixed ASCII+CJK (528 B)| 11.0×       | 11.0×       |
+| combining marks (704 B)| 11.5×       | 14.2×       |
+| ambiguous-heavy (512 B)| 8.5×        | 7.5×        |
+Emoji-mode comparison (FastUnicode / Unicode ips):
+| input                          | `:none` | `:rgi`  | `:all` | `:vs16` |
+| ------------------------------ | ------- | ------- | ------ | ------- |
+| emoji-heavy RGI seqs (632 B)   | 10.2×   | 163.5×  | 4.5×   | 12.2×   |
+| mostly ASCII, sparse emoji     | 13.6×   | 30.7×   | 30.6×  | 12.2×   |
+The wins concentrate on real Unicode workloads where upstream's
+per-codepoint Ruby loop is the bottleneck. The huge `:rgi` win comes
+from upstream resolving RGI sequences in Ruby against the
+`unicode-emoji` tables; the FFI overhead dominates only for very short
+inputs (and even there, the Rust path still leads).
+## License
+MIT. See `LICENSE.txt`.

data/ext/fast_unicode/display_width/Cargo.toml ADDED Viewed

@@ -0,0 +1,25 @@
+[package]
+name = "fast_unicode_display_width"
+version = "0.1.0"
+authors = ["Ville Lautanala <lautis@gmail.com>"]
+edition = "2021"
+build = "build.rs"
+[lib]
+name = "display_width"
+crate-type = ["cdylib"]
+[dependencies]
+magnus = { version = "0.8", features = ["rb-sys"] }
+unicode-width = "0.2"
+unicode-segmentation = "1.13"
+phf = "0.11"
+[build-dependencies]
+phf_codegen = "0.11"
+unicode-width = "0.2"
+[profile.release]
+opt-level = 3
+lto = "fat"
+codegen-units = 1

data/ext/fast_unicode/display_width/build.rs ADDED Viewed

@@ -0,0 +1,206 @@
+// Build-time codegen for two artifacts consumed at runtime:
+//
+//   1. `rgi_set.rs` — a `phf::Set` of RGI emoji sequences parsed from the
+//      vendored `data/emoji-test.txt`. Covers fully- and minimally-qualified
+//      entries (the union upstream's `REGEX_INCLUDE_MQE_UQE` matches).
+//
+//   2. `width_delta.rs` — a sorted slice of width-correction ranges. Each
+//      range carries the *delta* between upstream's per-codepoint width and
+//      `unicode-width` 0.2's output, for both `width()` and `width_cjk()`.
+//      The runtime applies these on top of `unicode-width`'s value so the
+//      bulk `UnicodeWidthStr::width` path stays in play for inputs that
+//      contain no diverging codepoints.
+//
+//      Source: `data/upstream_width_table.txt`, regenerated by
+//      `bin/regenerate-width-data` from the installed upstream gem. The
+//      vendored UCD `EastAsianWidth.txt` is kept alongside for provenance
+//      and to make future rule reimplementations possible offline.
+use std::env;
+use std::fs::File;
+use std::io::{BufRead, BufReader, BufWriter, Write};
+use std::path::PathBuf;
+use unicode_width::UnicodeWidthChar;
+/// Build-script entry point.
+///
+/// Reads the `CARGO_MANIFEST_DIR` and `OUT_DIR` environment variables
+/// (panicking if either is unset), tells Cargo to rerun the script when
+/// `build.rs` changes, and then writes both generated source files.
+fn main() {
+    let manifest_dir = env::var_os("CARGO_MANIFEST_DIR")
+        .map(PathBuf::from)
+        .unwrap();
+    let out_dir = env::var_os("OUT_DIR").map(PathBuf::from).unwrap();
+
+    println!("cargo:rerun-if-changed=build.rs");
+
+    emit_rgi_set(manifest_dir.as_path(), out_dir.as_path());
+    emit_width_delta(manifest_dir.as_path(), out_dir.as_path());
+}
+/// Generates `rgi_set.rs` inside `out_dir`.
+///
+/// Reads `data/emoji-test.txt` below `manifest_dir` and collects every
+/// sequence whose status column is `fully-qualified` or
+/// `minimally-qualified` into a `phf::Set` exposed as `RGI_SEQUENCES`.
+///
+/// Line format handled here: `<hex codepoints> ; <status> # <comment>`.
+/// Blank lines, `#` comment lines and lines without a `;` are skipped.
+///
+/// # Panics
+///
+/// Panics if the data file cannot be opened or read, or if the generated
+/// file cannot be created, written or flushed.
+fn emit_rgi_set(manifest_dir: &std::path::Path, out_dir: &std::path::Path) {
+    let data_path = manifest_dir.join("data").join("emoji-test.txt");
+    println!("cargo:rerun-if-changed={}", data_path.display());
+    let file = File::open(&data_path)
+        .unwrap_or_else(|e| panic!("failed to open {}: {}", data_path.display(), e));
+    let reader = BufReader::new(file);
+    let mut set = phf_codegen::Set::<String>::new();
+    // Guards against handing the same sequence to the set builder twice.
+    let mut seen = std::collections::HashSet::<String>::new();
+    for line in reader.lines() {
+        let line = line.expect("emoji-test.txt read");
+        if line.is_empty() || line.starts_with('#') {
+            continue;
+        }
+        // Everything before the first ';' is the codepoint list.
+        let (cps, rest) = match line.split_once(';') {
+            Some(halves) => halves,
+            None => continue,
+        };
+        // The status ends where the trailing `# ...` comment begins.
+        let status_end = rest.find('#').unwrap_or(rest.len());
+        let status = rest[..status_end].trim();
+        if status != "fully-qualified" && status != "minimally-qualified" {
+            continue;
+        }
+        // Tokens that are not valid hex scalar values are dropped, not fatal.
+        let s: String = cps
+            .split_whitespace()
+            .filter_map(|hex| u32::from_str_radix(hex, 16).ok())
+            .filter_map(char::from_u32)
+            .collect();
+        if s.is_empty() {
+            continue;
+        }
+        if seen.insert(s.clone()) {
+            set.entry(s);
+        }
+    }
+    let out_path = out_dir.join("rgi_set.rs");
+    let out_file = File::create(&out_path)
+        .unwrap_or_else(|e| panic!("failed to create {}: {}", out_path.display(), e));
+    let mut out = BufWriter::new(out_file);
+    writeln!(
+        out,
+        "/// Auto-generated by build.rs from data/emoji-test.txt (Unicode 17.0)."
+    )
+    .unwrap();
+    writeln!(
+        out,
+        "pub static RGI_SEQUENCES: phf::Set<&'static str> = {};",
+        set.build()
+    )
+    .unwrap();
+    // `BufWriter`'s drop ignores flush errors; flush explicitly so a failed
+    // write aborts the build instead of leaving a truncated file behind.
+    out.flush()
+        .unwrap_or_else(|e| panic!("failed to write {}: {}", out_path.display(), e));
+}
+/// Generates `width_delta.rs` inside `out_dir`.
+///
+/// Reads `data/upstream_width_table.txt` below `manifest_dir`. Every
+/// non-blank, non-`#` line is `start end width_amb1 width_amb2`, where
+/// `start`/`end` are hexadecimal codepoints (inclusive range) and the two
+/// widths are decimal integers. For each codepoint in the range the delta
+/// against `unicode-width` (`width()` and `width_cjk()`) is computed, and
+/// the non-zero deltas are emitted as:
+///
+///   * `LATIN1_DELTA` — a 256-entry table indexed by codepoint;
+///   * `WIDTH_DELTA` — run-length-compressed ranges for codepoints >= 0x100;
+///   * `WIDTH_DELTA_MIN` / `WIDTH_DELTA_MAX` — bounds of `WIDTH_DELTA`.
+///
+/// The run compression only merges a codepoint into the immediately
+/// preceding run, so the emitted ranges come out sorted only if the input
+/// table lists its ranges in ascending order.
+///
+/// # Panics
+///
+/// Panics if the table cannot be opened or read, if a line is malformed,
+/// if a delta does not fit in `i8`, or if the generated file cannot be
+/// created, written or flushed.
+fn emit_width_delta(manifest_dir: &std::path::Path, out_dir: &std::path::Path) {
+    let data_path = manifest_dir
+        .join("data")
+        .join("upstream_width_table.txt");
+    println!("cargo:rerun-if-changed={}", data_path.display());
+    let file = File::open(&data_path)
+        .unwrap_or_else(|e| panic!("failed to open {}: {}", data_path.display(), e));
+    let reader = BufReader::new(file);
+    // Per-codepoint delta = upstream_width - unicode_width_value.
+    // A `None` from `unicode-width` is treated as width 1 (see below).
+    let mut points: Vec<(u32, i8, i8)> = Vec::new();
+    for (idx, line) in reader.lines().enumerate() {
+        let line = line.expect("upstream_width_table.txt read");
+        let line = line.trim();
+        if line.is_empty() || line.starts_with('#') {
+            continue;
+        }
+        // Shared diagnostic so a bad table row is easy to locate.
+        let malformed = || {
+            format!(
+                "malformed line {} in {}: {:?}",
+                idx + 1,
+                data_path.display(),
+                line
+            )
+        };
+        let mut parts = line.split_whitespace();
+        let start = parts
+            .next()
+            .and_then(|s| u32::from_str_radix(s, 16).ok())
+            .unwrap_or_else(|| panic!("{}", malformed()));
+        let end = parts
+            .next()
+            .and_then(|s| u32::from_str_radix(s, 16).ok())
+            .unwrap_or_else(|| panic!("{}", malformed()));
+        let up_w1: i32 = parts
+            .next()
+            .and_then(|s| s.parse::<i32>().ok())
+            .unwrap_or_else(|| panic!("{}", malformed()));
+        let up_w2: i32 = parts
+            .next()
+            .and_then(|s| s.parse::<i32>().ok())
+            .unwrap_or_else(|| panic!("{}", malformed()));
+        for cp in start..=end {
+            // `char::from_u32` is `None` for surrogates (0xD800..=0xDFFF)
+            // and for values above 0x10FFFF, so those are skipped here.
+            let ch = match char::from_u32(cp) {
+                Some(c) => c,
+                None => continue,
+            };
+            // `unicode-width`'s string-level `width()` falls back to 1 for
+            // codepoints whose per-char `width()` returns None (controls,
+            // unassigned, …). The runtime mirrors that — keep the delta on
+            // the same baseline so the bulk path and per-char path agree.
+            let uw_w1 = UnicodeWidthChar::width(ch).unwrap_or(1) as i32;
+            let uw_w2 = UnicodeWidthChar::width_cjk(ch).unwrap_or(1) as i32;
+            let d1 = up_w1 - uw_w1;
+            let d2 = up_w2 - uw_w2;
+            if d1 != 0 || d2 != 0 {
+                // Checked narrowing: a silent wrap would corrupt the table.
+                let d1 = i8::try_from(d1).unwrap_or_else(|_| panic!("{}", malformed()));
+                let d2 = i8::try_from(d2).unwrap_or_else(|_| panic!("{}", malformed()));
+                points.push((cp, d1, d2));
+            }
+        }
+    }
+    // Direct Latin-1 lookup, indexed by codepoint. Latin-1 letters under
+    // `width_cjk` carry deltas on nearly every codepoint, so the per-char
+    // binary search was the hot path on accented-Latin and ambiguous inputs;
+    // a 256-entry array collapses it to a constant-time index.
+    let mut latin1: [(i8, i8); 256] = [(0, 0); 256];
+    for (cp, d1, d2) in &points {
+        if *cp < 0x100 {
+            latin1[*cp as usize] = (*d1, *d2);
+        }
+    }
+    // RLE-compress consecutive codepoints sharing the same (d1, d2).
+    // Latin-1 codepoints are excluded — they're served by `LATIN1_DELTA`
+    // directly, so leaving them in the run table just bloats the binary
+    // search.
+    let mut runs: Vec<(u32, u32, i8, i8)> = Vec::new();
+    for (cp, d1, d2) in points {
+        if cp < 0x100 {
+            continue;
+        }
+        if let Some(last) = runs.last_mut() {
+            if last.1 + 1 == cp && last.2 == d1 && last.3 == d2 {
+                last.1 = cp;
+                continue;
+            }
+        }
+        runs.push((cp, cp, d1, d2));
+    }
+    let out_path = out_dir.join("width_delta.rs");
+    let out_file = File::create(&out_path)
+        .unwrap_or_else(|e| panic!("failed to create {}: {}", out_path.display(), e));
+    let mut out = BufWriter::new(out_file);
+    writeln!(
+        out,
+        "/// Auto-generated by build.rs from data/upstream_width_table.txt."
+    )
+    .unwrap();
+    writeln!(
+        out,
+        "/// Each tuple is (start, end_inclusive, delta_amb1, delta_amb2)."
+    )
+    .unwrap();
+    writeln!(
+        out,
+        "/// Delta = upstream_width - unicode_width_value, applied on top of `unicode-width` 0.2."
+    )
+    .unwrap();
+    writeln!(out, "pub static LATIN1_DELTA: [(i8, i8); 256] = [").unwrap();
+    for (cp, (d1, d2)) in latin1.iter().enumerate() {
+        writeln!(out, "    ({}, {}), // 0x{:02X}", d1, d2, cp).unwrap();
+    }
+    writeln!(out, "];").unwrap();
+    writeln!(
+        out,
+        "pub static WIDTH_DELTA: &[(u32, u32, i8, i8)] = &["
+    )
+    .unwrap();
+    for (start, end, d1, d2) in &runs {
+        writeln!(out, "    (0x{:X}, 0x{:X}, {}, {}),", start, end, d1, d2).unwrap();
+    }
+    writeln!(out, "];").unwrap();
+    // Min/max codepoints in the run table (excluding Latin-1, which has its
+    // own direct lookup). Runtime uses these for early-exit screens. With an
+    // empty run table this yields MIN = u32::MAX and MAX = 0.
+    let (min_cp, max_cp) = runs
+        .iter()
+        .fold((u32::MAX, 0u32), |(lo, hi), (s, e, _, _)| (lo.min(*s), hi.max(*e)));
+    writeln!(out, "pub const WIDTH_DELTA_MIN: u32 = 0x{:X};", min_cp).unwrap();
+    writeln!(out, "pub const WIDTH_DELTA_MAX: u32 = 0x{:X};", max_cp).unwrap();
+    // `BufWriter`'s drop ignores flush errors; flush explicitly so a failed
+    // write aborts the build instead of leaving a truncated file behind.
+    out.flush()
+        .unwrap_or_else(|e| panic!("failed to write {}: {}", out_path.display(), e));
+}