npm - @cto.af/unicode-trie-runtime - Versions diffs - 3.0.0 - Mend

@cto.af/unicode-trie-runtime 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.md ADDED Viewed

@@ -0,0 +1,57 @@
+# @cto.af/unicode-trie-runtime
+A data structure for fast Unicode character metadata lookup, ported from ICU.
+This version was copied from https://github.com/foliojs/unicode-trie and
+modernized slightly.
+## Background
+When implementing many Unicode algorithms such as text segmentation,
+normalization, bidi processing, etc., fast access to character metadata
+is crucial to good performance.  There are over a million code points in the
+Unicode standard, many of which produce the same result when looked up,
+so an array or hash table is not appropriate - those data structures are
+fast but would require a lot of memory.  The data is generally
+grouped in ranges, so you could do a binary search, but that is not
+fast enough for some applications.
+The [International Components for Unicode](http://site.icu-project.org) (ICU) project
+came up with a data structure based on a [Trie](http://en.wikipedia.org/wiki/Trie) that provides fast access
+to Unicode metadata.  The range data is precompiled to a serialized
+and flattened trie, which is then used at runtime to look up the necessary
+data.  According to my own tests, this is generally at least 50% faster
+than binary search, with not too much additional memory required.
+## Installation
+    npm install @cto.af/unicode-trie-runtime
+## Building a Trie
+Use the `@cto.af/unicode-trie` package to build a trie module.
+## Using a precompiled Trie
+Once you've built a precompiled trie, you can load it into the
+`UnicodeTrie` class, which is a readonly representation of the
+trie.  From there, you can look up values.
+```js
+import {UnicodeTrie} from '@cto.af/unicode-trie-runtime';
+import fs from 'node:fs'
+// load serialized trie from binary file
+const data = fs.readFileSync('data.trie');
+const trie = new UnicodeTrie(data);
+// lookup a value
+trie.get(0x4567); // => 99 or 'FOO' (if a string was stored)
+```
+## License
+MIT
+---
+[![Tests](https://github.com/cto-af/unicode-trie/actions/workflows/node.js.yml/badge.svg)](https://github.com/cto-af/unicode-trie/actions/workflows/node.js.yml)
+[![codecov](https://codecov.io/gh/cto-af/unicode-trie/branch/main/graph/badge.svg?token=JVBOYR3GWY)](https://codecov.io/gh/cto-af/unicode-trie)

package/constants.js ADDED Viewed

@@ -0,0 +1,71 @@
+// Shift size for getting the index-1 table offset. 11=6+5
+export const SHIFT_1 = 6 + 5;
+// Shift size for getting the index-2 table offset.
+export const SHIFT_2 = 5;
+// Difference between the two shift sizes,
+// for getting an index-1 offset from an index-2 offset. 6=11-5
+export const SHIFT_1_2 = SHIFT_1 - SHIFT_2;
+// Number of index-1 entries for the BMP. 32=0x20
+// This part of the index-1 table is omitted from the serialized form.
+export const OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> SHIFT_1;
+// Number of entries in an index-2 block. 64=0x40
+export const INDEX_2_BLOCK_LENGTH = 1 << SHIFT_1_2;
+// Mask for getting the lower bits for the in-index-2-block offset. 63=0x3f
+export const INDEX_2_MASK = INDEX_2_BLOCK_LENGTH - 1;
+// Shift size for shifting left the index array values.
+// Increases possible data size with 16-bit index values at the cost
+// of compactability.
+// This requires data blocks to be aligned by DATA_GRANULARITY.
+export const INDEX_SHIFT = 2;
+// Number of entries in a data block. 32=0x20
+export const DATA_BLOCK_LENGTH = 1 << SHIFT_2;
+// Mask for getting the lower bits for the in-data-block offset. 31=0x1f
+export const DATA_MASK = DATA_BLOCK_LENGTH - 1;
+// The part of the index-2 table for U+D800..U+DBFF stores values for lead
+// surrogate code _units_ not code _points_. Values for lead surrogate code
+// _points_ are indexed with this portion of the table, at offset 2048=0x800.
+// Length=32=0x20=0x400>>SHIFT_2. (There are 1024=0x400 lead surrogates.)
+export const LSCP_INDEX_2_OFFSET = 0x10000 >> SHIFT_2;
+export const LSCP_INDEX_2_LENGTH = 0x400 >> SHIFT_2;
+// Count the lengths of both BMP pieces. 2080=0x820
+export const INDEX_2_BMP_LENGTH = LSCP_INDEX_2_OFFSET + LSCP_INDEX_2_LENGTH;
+// The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820.
+// Length 32=0x20 for lead bytes C0..DF, regardless of SHIFT_2.
+export const UTF8_2B_INDEX_2_OFFSET = INDEX_2_BMP_LENGTH;
+// U+0800 is the first code point after 2-byte UTF-8. 32=0x800>>6
+export const UTF8_2B_INDEX_2_LENGTH = 0x800 >> 6;
+// The index-1 table, only used for supplementary code points, at offset
+// 2112=0x840. Variable length, for code points up to highStart, where the
+// last single-value range starts. Maximum length 512=0x200=0x100000>>SHIFT_1.
+// (For 0x100000 supplementary code points U+10000..U+10ffff.)
+//
+// The part of the index-2 table for supplementary code points starts after
+// this index-1 table.
+//
+// Both the index-1 table and the following part of the index-2 table are
+// omitted completely if there is only BMP data.
+export const INDEX_1_OFFSET = UTF8_2B_INDEX_2_OFFSET + UTF8_2B_INDEX_2_LENGTH;
+export const MAX_INDEX_1_LENGTH = 0x100000 >> SHIFT_1;
+// The alignment size of a data block. Also the granularity for compaction. 4
+export const DATA_GRANULARITY = 1 << INDEX_SHIFT;
+// This goes in the third u32 of the input to show that we are decoding with
+// the same format the input was encoded with.  Could theoretically decrement
+// this for every major format change.
+export const CURRENT_VERSION = 0xFFFFFFFF;
+// Number of header bytes before compressed data starts in CURRENT_VERSION.
+export const PREFIX_LENGTH = 16;

package/index.js ADDED Viewed

@@ -0,0 +1,154 @@
+import {
+  CURRENT_VERSION,
+  DATA_GRANULARITY,
+  DATA_MASK,
+  INDEX_1_OFFSET,
+  INDEX_2_MASK,
+  INDEX_SHIFT,
+  LSCP_INDEX_2_OFFSET,
+  OMITTED_BMP_INDEX_1_LENGTH,
+  PREFIX_LENGTH,
+  SHIFT_1,
+  SHIFT_2,
+} from './constants.js';
+import {gunzipSync} from 'fflate';
+import {swap32LE} from './swap.js';
+const DECODER = new TextDecoder();
+/**
+ * @typedef {object} TrieValues
+ * @prop {Int32Array} data
+ * @prop {number} highStart
+ * @prop {number} errorValue
+ * @prop {string[]} [values]
+ */
+export class UnicodeTrie {
+  /**
+   * Creates a trie, either from compressed data or pre-parsed values.
+   *
+   * @param {Uint8Array|TrieValues} data
+   */
+  constructor(data) {
+    if (data instanceof Uint8Array) {
+      // Read binary format
+      let uncompressedLength = 0;
+      const view = new DataView(data.buffer);
+      this.highStart = view.getUint32(0, true);
+      this.errorValue = view.getUint32(4, true);
+      uncompressedLength = view.getUint32(8, true);
+      if (uncompressedLength !== CURRENT_VERSION) {
+        throw new Error('Trie created with old version of @cto.af/unicode-trie.');
+      }
+      uncompressedLength = view.getUint32(12, true);
+      if (PREFIX_LENGTH + uncompressedLength > data.byteLength) {
+        throw new RangeError('Invalid input length');
+      }
+      // Don't swap UTF8-encoded text.
+      const values = data.subarray(PREFIX_LENGTH + uncompressedLength);
+      /**
+       * @type{string[]}
+       */
+      this.values = values.length ?
+        JSON.parse(DECODER.decode(gunzipSync(values))) :
+        [];
+      // Inflate the actual trie data
+      data = gunzipSync(data.subarray(
+        PREFIX_LENGTH,
+        PREFIX_LENGTH + uncompressedLength
+      ));
+      // Swap bytes from little-endian
+      swap32LE(data);
+      /**
+       * @type {Int32Array}
+       */
+      this.data = new Int32Array(data.buffer);
+    } else {
+      // Pre-parsed data
+      ({
+        data: this.data,
+        highStart: this.highStart,
+        errorValue: this.errorValue,
+        values: this.values = [],
+      } = data);
+    }
+  }
+  /**
+   * Creates a trie from a base64-encoded string.
+   * @param {string} base64 The base64-encoded trie to initialize.
+   * @returns {UnicodeTrie} The decoded Unicode trie.
+   */
+  static fromBase64(base64) {
+    // This use of Buffer is ok unless we're using Parcel or some other
+    // packer that polyfills automatically.
+    if (typeof Buffer === 'function') {
+      return new UnicodeTrie(new Uint8Array(Buffer.from(base64, 'base64')));
+    }
+    return new UnicodeTrie(new Uint8Array(atob(base64)
+      .split('')
+      .map(c => c.charCodeAt(0))));
+  }
+  /**
+   * Get the value associated with a codepoint, or the default value, or the
+   * error value if codePoint is out of range.
+   *
+   * @param {number} codePoint
+   * @returns {number}
+   */
+  get(codePoint) {
+    let val = this.errorValue;
+    if ((codePoint < 0) || (codePoint > 0x10ffff)) {
+      val = this.errorValue;
+    } else if (
+      (codePoint < 0xd800) || ((codePoint > 0xdbff) && (codePoint <= 0xffff))
+    ) {
+      // Ordinary BMP code point, excluding leading surrogates.
+      // BMP uses a single level lookup.  BMP index starts at offset 0 in the
+      // index. data is stored in the index array itself.
+      const index = (this.data[codePoint >> SHIFT_2] << INDEX_SHIFT) +
+        (codePoint & DATA_MASK);
+      val = this.data[index];
+    } else if (codePoint <= 0xffff) {
+      // Lead Surrogate Code Point.  A Separate index section is stored for
+      // lead surrogate code units and code points.
+      //   The main index has the code unit data.
+      //   For this function, we need the code point data.
+      const index = (
+        this.data[LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> SHIFT_2)] <<
+          INDEX_SHIFT
+      ) + (codePoint & DATA_MASK);
+      val = this.data[index];
+    } else if (codePoint < this.highStart) {
+      // Supplemental code point, use two-level lookup.
+      let index = this.data[
+        (INDEX_1_OFFSET - OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> SHIFT_1)
+      ];
+      index = this.data[index + ((codePoint >> SHIFT_2) & INDEX_2_MASK)];
+      index = (index << INDEX_SHIFT) + (codePoint & DATA_MASK);
+      val = this.data[index];
+    } else {
+      val = this.data[this.data.length - DATA_GRANULARITY];
+    }
+    return val;
+  }
+  /**
+   * Get the value associated with the codePoint, stringified if possible.
+   *
+   * @param {number} codePoint
+   * @returns {number|string}
+   */
+  getString(codePoint) {
+    const val = this.get(codePoint);
+    return this.values[val] ?? val;
+  }
+}

package/package.json ADDED Viewed

@@ -0,0 +1,46 @@
+{
+  "name": "@cto.af/unicode-trie-runtime",
+  "version": "3.0.0",
+  "description": "Runtime code for unicode-trie files.",
+  "type": "module",
+  "main": "index.js",
+  "scripts": {
+    "clean": "rm -rf coverage docs types",
+    "docs": "typedoc",
+    "test": "mocha",
+    "types": "tsc",
+    "build": "npm run examples && npm run lint && npm run types && npm run test"
+  },
+  "keywords": [
+    "unicode",
+    "properties",
+    "icu",
+    "trie",
+    "compressed",
+    "brotli"
+  ],
+  "author": "Devon Govett <devongovett@gmail.com>",
+  "contributors": [
+    "Joe Hildebrand <joe-github@cursive.net>",
+    "valadaptive <valadaptive@protonmail.com>"
+  ],
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/cto-af/unicode-trie.git"
+  },
+  "license": "MIT",
+  "dependencies": {
+    "fflate": "^0.8.2"
+  },
+  "devDependencies": {
+    "@types/node": "22.15.19",
+    "c8": "10.1.3",
+    "mocha": "11.4.0",
+    "typedoc": "0.28.4",
+    "typescript": "5.8.3"
+  },
+  "packageManager": "pnpm@10.11.0",
+  "engines": {
+    "node": ">=20"
+  }
+}

package/swap.js ADDED Viewed

@@ -0,0 +1,27 @@
+const isBigEndian =
+  (new Uint8Array(new Uint32Array([0x12345678]).buffer)[0] === 0x12);
+/**
+ * Exported for testing
+ * @param {Uint8Array} array
+ * @private
+ */
+export function swap32(array) {
+  const len = array.length;
+  for (let i = 0; i < len; i += 4) {
+    [array[i], array[i + 1], array[i + 2], array[i + 3]] =
+      [array[i + 3], array[i + 2], array[i + 1], array[i]];
+  }
+}
+/**
+ * No-op.
+ *
+ * @param {Uint8Array} _array Ingored
+ * @private
+ */
+function noOp(_array) {
+  // Intentionally empty
+}
+export const swap32LE = isBigEndian ? swap32 : noOp;

package/types/constants.d.ts ADDED Viewed

@@ -0,0 +1,19 @@
+export const SHIFT_1: number; // 11: shift for the index-1 table offset
+export const SHIFT_2: 5;
+export const SHIFT_1_2: number; // 6: SHIFT_1 - SHIFT_2
+export const OMITTED_BMP_INDEX_1_LENGTH: number; // 32
+export const INDEX_2_BLOCK_LENGTH: number; // 64
+export const INDEX_2_MASK: number; // 63
+export const INDEX_SHIFT: 2;
+export const DATA_BLOCK_LENGTH: number; // 32
+export const DATA_MASK: number; // 31
+export const LSCP_INDEX_2_OFFSET: number; // 2048
+export const LSCP_INDEX_2_LENGTH: number; // 32
+export const INDEX_2_BMP_LENGTH: number; // 2080
+export const UTF8_2B_INDEX_2_OFFSET: number; // 2080
+export const UTF8_2B_INDEX_2_LENGTH: number; // 32
+export const INDEX_1_OFFSET: number; // 2112
+export const MAX_INDEX_1_LENGTH: number; // 512
+export const DATA_GRANULARITY: number; // 4
+export const CURRENT_VERSION: 4294967295;
+export const PREFIX_LENGTH: 16;

package/types/index.d.ts ADDED Viewed

@@ -0,0 +1,52 @@
+/**
+ * Pre-parsed trie contents accepted by the UnicodeTrie constructor.
+ *
+ * @typedef {object} TrieValues
+ * @prop {Int32Array} data
+ * @prop {number} highStart
+ * @prop {number} errorValue
+ * @prop {string[]} [values]
+ */
+export class UnicodeTrie {
+    /**
+     * Creates a trie from a base64-encoded string.
+     * @param {string} base64 The base64-encoded trie to initialize.
+     * @returns {UnicodeTrie} The decoded Unicode trie.
+     */
+    static fromBase64(base64: string): UnicodeTrie;
+    /**
+     * Creates a trie, either from compressed data or pre-parsed values.
+     *
+     * @param {Uint8Array|TrieValues} data
+     */
+    constructor(data: Uint8Array | TrieValues);
+    highStart: number; // get() returns one shared value for code points from here up
+    errorValue: number; // returned by get() for out-of-range code points
+    /**
+     * String values that getString() looks up by the numeric result of
+     * get(); empty when none were supplied.
+     *
+     * @type {string[]}
+     */
+    values: string[];
+    /**
+     * The trie's index and data entries; get() reads both from this array.
+     *
+     * @type {Int32Array}
+     */
+    data: Int32Array;
+    /**
+     * Get the value associated with a codepoint, or the default value, or the
+     * error value if codePoint is out of range.
+     *
+     * @param {number} codePoint
+     * @returns {number}
+     */
+    get(codePoint: number): number;
+    /**
+     * Get the value associated with the codePoint, stringified if possible.
+     *
+     * @param {number} codePoint
+     * @returns {number|string}
+     */
+    getString(codePoint: number): number | string;
+}
+export type TrieValues = {
+    data: Int32Array;
+    highStart: number;
+    errorValue: number;
+    values?: string[] | undefined;
+};

package/types/swap.d.ts ADDED Viewed

@@ -0,0 +1,12 @@
+/**
+ * Reverses the byte order of each 4-byte group of the array in place.
+ * Exported for testing
+ * @param {Uint8Array} array
+ * @private
+ */
+export function swap32(array: Uint8Array): void;
+/**
+ * Either swap32 (on big-endian hosts) or a no-op (on little-endian hosts),
+ * chosen once when the module loads.
+ * @param {Uint8Array} array
+ * @private
+ */
+export function swap32LE(array: Uint8Array): void;