@cto.af/unicode-trie-runtime 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,57 @@
1
+ # @cto.af/unicode-trie-runtime
2
+
3
+ A data structure for fast Unicode character metadata lookup, ported from ICU.
4
+ This version was copied from https://github.com/foliojs/unicode-trie and
5
+ modernized slightly.
6
+
7
+ ## Background
8
+
9
+ When implementing many Unicode algorithms such as text segmentation,
10
+ normalization, bidi processing, etc., fast access to character metadata
11
+ is crucial to good performance. There are over a million code points in the
12
+ Unicode standard, many of which produce the same result when looked up,
13
+ so an array or hash table is not appropriate - those data structures are
14
+ fast but would require a lot of memory. The data is generally
15
+ grouped in ranges, so you could do a binary search, but that is not
16
+ fast enough for some applications.
17
+
18
+ The [International Components for Unicode](http://site.icu-project.org) (ICU) project
19
+ came up with a data structure based on a [Trie](http://en.wikipedia.org/wiki/Trie) that provides fast access
20
+ to Unicode metadata. The range data is precompiled to a serialized
21
+ and flattened trie, which is then used at runtime to look up the necessary
22
+ data. According to my own tests, this is generally at least 50% faster
23
+ than binary search, with not too much additional memory required.
24
+
25
+ ## Installation
26
+
27
+ npm install @cto.af/unicode-trie-runtime
28
+
29
+ ## Building a Trie
30
+
31
+ Use the `@cto.af/unicode-trie` package to build a trie module.
32
+
33
+ ## Using a precompiled Trie
34
+
35
+ Once you've built a precompiled trie, you can load it into the
36
+ `UnicodeTrie` class, which is a readonly representation of the
37
+ trie. From there, you can look up values.
38
+
39
+ ```js
40
+ import {UnicodeTrie} from '@cto.af/unicode-trie-runtime';
41
+ import fs from 'node:fs'
42
+
43
+ // load serialized trie from binary file
44
+ const data = fs.readFileSync('data.trie');
45
+ const trie = new UnicodeTrie(data);
46
+
47
+ // look up a value
48
+ trie.get(0x4567); // => 99 (trie.getString(0x4567) returns the stored string, e.g. 'FOO', if one was stored)
49
+ ```
50
+
51
+ ## License
52
+
53
+ MIT
54
+
55
+ ---
56
+ [![Tests](https://github.com/cto-af/unicode-trie/actions/workflows/node.js.yml/badge.svg)](https://github.com/cto-af/unicode-trie/actions/workflows/node.js.yml)
57
+ [![codecov](https://codecov.io/gh/cto-af/unicode-trie/branch/main/graph/badge.svg?token=JVBOYR3GWY)](https://codecov.io/gh/cto-af/unicode-trie)
package/constants.js ADDED
@@ -0,0 +1,71 @@
1
+ // Shift size for getting the index-1 table offset.
2
+ export const SHIFT_1 = 6 + 5; // 11
3
+
4
+ // Shift size for getting the index-2 table offset.
5
+ export const SHIFT_2 = 5;
6
+
7
+ // Difference between the two shift sizes,
8
+ // for getting an index-1 offset from an index-2 offset. 6=11-5
9
+ export const SHIFT_1_2 = SHIFT_1 - SHIFT_2;
10
+
11
+ // Number of index-1 entries for the BMP. 32=0x20
12
+ // This part of the index-1 table is omitted from the serialized form.
13
+ export const OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> SHIFT_1;
14
+
15
+ // Number of entries in an index-2 block. 64=0x40
16
+ export const INDEX_2_BLOCK_LENGTH = 1 << SHIFT_1_2;
17
+
18
+ // Mask for getting the lower bits for the in-index-2-block offset. 63=0x3f
19
+ export const INDEX_2_MASK = INDEX_2_BLOCK_LENGTH - 1;
20
+
21
+ // Shift size for shifting left the index array values.
22
+ // Increases possible data size with 16-bit index values at the cost
23
+ // of compactability.
24
+ // This requires data blocks to be aligned by DATA_GRANULARITY.
25
+ export const INDEX_SHIFT = 2;
26
+
27
+ // Number of entries in a data block. 32=0x20
28
+ export const DATA_BLOCK_LENGTH = 1 << SHIFT_2;
29
+
30
+ // Mask for getting the lower bits for the in-data-block offset. 31=0x1f
31
+ export const DATA_MASK = DATA_BLOCK_LENGTH - 1;
32
+
33
+ // The part of the index-2 table for U+D800..U+DBFF stores values for lead
34
+ // surrogate code _units_ not code _points_. Values for lead surrogate code
35
+ // _points_ are indexed with this portion of the table.
36
+ // Length=32=0x20=0x400>>SHIFT_2. (There are 1024=0x400 lead surrogates.)
37
+ export const LSCP_INDEX_2_OFFSET = 0x10000 >> SHIFT_2; // 2048=0x800
38
+ export const LSCP_INDEX_2_LENGTH = 0x400 >> SHIFT_2; // 32=0x20
39
+
40
+ // Count the lengths of both BMP pieces. 2080=0x820
41
+ export const INDEX_2_BMP_LENGTH = LSCP_INDEX_2_OFFSET + LSCP_INDEX_2_LENGTH;
42
+
43
+ // The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820.
44
+ // Length 32=0x20 for lead bytes C0..DF, regardless of SHIFT_2.
45
+ export const UTF8_2B_INDEX_2_OFFSET = INDEX_2_BMP_LENGTH;
46
+ // U+0800 is the first code point after 2-byte UTF-8, so 0x800 >> 6 = 32 entries.
47
+ export const UTF8_2B_INDEX_2_LENGTH = 0x800 >> 6;
48
+
49
+ // The index-1 table, only used for supplementary code points, at offset
50
+ // 2112=0x840. Variable length, for code points up to highStart, where the
51
+ // last single-value range starts. Maximum length 512=0x200=0x100000>>SHIFT_1.
52
+ // (For 0x100000 supplementary code points U+10000..U+10ffff.)
53
+ //
54
+ // The part of the index-2 table for supplementary code points starts after
55
+ // this index-1 table.
56
+ //
57
+ // Both the index-1 table and the following part of the index-2 table are
58
+ // omitted completely if there is only BMP data.
59
+ export const INDEX_1_OFFSET = UTF8_2B_INDEX_2_OFFSET + UTF8_2B_INDEX_2_LENGTH;
60
+ export const MAX_INDEX_1_LENGTH = 0x100000 >> SHIFT_1;
61
+
62
+ // The alignment size of a data block. Also the granularity for compaction. 4
63
+ export const DATA_GRANULARITY = 1 << INDEX_SHIFT;
64
+
65
+ // This goes in the third u32 of the input to show that we are decoding with
66
+ // the same format the input was encoded with. Could theoretically decrement
67
+ // this for every major format change.
68
+ export const CURRENT_VERSION = 0xFFFFFFFF;
69
+
70
+ // Number of bytes before compressed data starts in this CURRENT_VERSION
71
+ // (four u32 header fields).
+ export const PREFIX_LENGTH = 16;
package/index.js ADDED
@@ -0,0 +1,154 @@
1
+ import {
2
+ CURRENT_VERSION,
3
+ DATA_GRANULARITY,
4
+ DATA_MASK,
5
+ INDEX_1_OFFSET,
6
+ INDEX_2_MASK,
7
+ INDEX_SHIFT,
8
+ LSCP_INDEX_2_OFFSET,
9
+ OMITTED_BMP_INDEX_1_LENGTH,
10
+ PREFIX_LENGTH,
11
+ SHIFT_1,
12
+ SHIFT_2,
13
+ } from './constants.js';
14
+ import {gunzipSync} from 'fflate';
15
+ import {swap32LE} from './swap.js';
16
+
17
+ const DECODER = new TextDecoder();
18
+
19
+ /**
20
+ * @typedef {object} TrieValues
21
+ * @prop {Int32Array} data
22
+ * @prop {number} highStart
23
+ * @prop {number} errorValue
24
+ * @prop {string[]} [values]
25
+ */
26
+
27
export class UnicodeTrie {
  /**
   * Creates a trie, either from compressed data or pre-parsed values.
   *
   * The binary form starts with four little-endian u32 header fields:
   * `highStart`, `errorValue`, the format version (must equal
   * CURRENT_VERSION), and the byte length of the gzipped trie data. The
   * gzipped trie data follows, and any remaining bytes are a gzipped,
   * UTF-8 encoded JSON array of strings.
   *
   * @param {Uint8Array|TrieValues} data Serialized trie bytes, or
   *   already-parsed trie values.
   * @throws {Error} If the version field does not equal CURRENT_VERSION.
   * @throws {RangeError} If the input is shorter than its header claims, or
   *   the inflated trie is not a whole number of 32-bit words.
   */
  constructor(data) {
    if (data instanceof Uint8Array) {
      // Read binary format. Honor byteOffset/byteLength: `data` may be a
      // window into a larger ArrayBuffer (for example a pooled Node.js
      // Buffer or the result of subarray()), in which case offset 0 of the
      // underlying buffer is not the start of the trie.
      const view = new DataView(data.buffer, data.byteOffset, data.byteLength);
      this.highStart = view.getUint32(0, true);
      this.errorValue = view.getUint32(4, true);

      const version = view.getUint32(8, true);
      if (version !== CURRENT_VERSION) {
        throw new Error('Trie created with old version of @cto.af/unicode-trie.');
      }

      // Length of the gzipped trie data that follows the header.
      const compressedLength = view.getUint32(12, true);
      const trieEnd = PREFIX_LENGTH + compressedLength;
      if (trieEnd > data.byteLength) {
        throw new RangeError('Invalid input length');
      }

      // Don't swap UTF8-encoded text.
      const values = data.subarray(trieEnd);

      /**
       * @type {string[]}
       */
      this.values = values.length ?
        JSON.parse(DECODER.decode(gunzipSync(values))) :
        [];

      // Inflate the actual trie data
      const inflated = gunzipSync(data.subarray(PREFIX_LENGTH, trieEnd));
      if (inflated.byteLength % 4 !== 0) {
        throw new RangeError('Invalid trie data length');
      }

      // Swap bytes from little-endian
      swap32LE(inflated);

      /**
       * @type {Int32Array}
       */
      this.data = new Int32Array(
        inflated.buffer,
        inflated.byteOffset,
        inflated.byteLength / 4
      );
    } else {
      // Pre-parsed data
      ({
        data: this.data,
        highStart: this.highStart,
        errorValue: this.errorValue,
        values: this.values = [],
      } = data);
    }
  }

  /**
   * Creates a trie from a base64-encoded string.
   * @param {string} base64 The base64-encoded trie to initialize.
   * @returns {UnicodeTrie} The decoded Unicode trie.
   */
  static fromBase64(base64) {
    // This use of Buffer is ok unless we're using Parcel or some other
    // packer that polyfills automatically.
    if (typeof Buffer === 'function') {
      return new UnicodeTrie(new Uint8Array(Buffer.from(base64, 'base64')));
    }
    // atob() yields one character per byte, so each char code is a byte.
    return new UnicodeTrie(
      Uint8Array.from(atob(base64), c => c.charCodeAt(0))
    );
  }

  /**
   * Get the value associated with a codepoint, or the default value, or the
   * error value if codePoint is out of range.
   *
   * @param {number} codePoint
   * @returns {number}
   */
  get(codePoint) {
    // Phrased as a negated range test so that NaN (which fails every
    // comparison) is treated as out of range instead of falling through.
    if (!((codePoint >= 0) && (codePoint <= 0x10ffff))) {
      return this.errorValue;
    }

    if (
      (codePoint < 0xd800) || ((codePoint > 0xdbff) && (codePoint <= 0xffff))
    ) {
      // Ordinary BMP code point, excluding leading surrogates.
      // BMP uses a single level lookup. BMP index starts at offset 0 in the
      // index. data is stored in the index array itself.
      const index = (this.data[codePoint >> SHIFT_2] << INDEX_SHIFT) +
        (codePoint & DATA_MASK);
      return this.data[index];
    }

    if (codePoint <= 0xffff) {
      // Lead Surrogate Code Point. A Separate index section is stored for
      // lead surrogate code units and code points.
      // The main index has the code unit data.
      // For this function, we need the code point data.
      const index = (
        this.data[LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> SHIFT_2)] <<
        INDEX_SHIFT
      ) + (codePoint & DATA_MASK);
      return this.data[index];
    }

    if (codePoint < this.highStart) {
      // Supplemental code point, use two-level lookup.
      let index = this.data[
        (INDEX_1_OFFSET - OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> SHIFT_1)
      ];
      index = this.data[index + ((codePoint >> SHIFT_2) & INDEX_2_MASK)];
      index = (index << INDEX_SHIFT) + (codePoint & DATA_MASK);
      return this.data[index];
    }

    // At or above highStart: every such code point shares the single value
    // stored DATA_GRANULARITY entries before the end of the array.
    return this.data[this.data.length - DATA_GRANULARITY];
  }

  /**
   * Get the value associated with the codePoint, stringified if possible.
   *
   * @param {number} codePoint
   * @returns {number|string} The string stored at index `get(codePoint)` in
   *   `values`, or the numeric value when there is no such string.
   */
  getString(codePoint) {
    const val = this.get(codePoint);
    return this.values[val] ?? val;
  }
}
package/package.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "name": "@cto.af/unicode-trie-runtime",
3
+ "version": "3.0.0",
4
+ "description": "Runtime code for unicode-trie files.",
5
+ "type": "module",
6
+ "main": "index.js",
7
+ "scripts": {
8
+ "clean": "rm -rf coverage docs types",
9
+ "docs": "typedoc",
10
+ "test": "mocha",
11
+ "types": "tsc",
12
+ "build": "npm run examples && npm run lint && npm run types && npm run test"
13
+ },
14
+ "keywords": [
15
+ "unicode",
16
+ "properties",
17
+ "icu",
18
+ "trie",
19
+ "compressed",
20
+ "brotli"
21
+ ],
22
+ "author": "Devon Govett <devongovett@gmail.com>",
23
+ "contributors": [
24
+ "Joe Hildebrand <joe-github@cursive.net>",
25
+ "valadaptive <valadaptive@protonmail.com>"
26
+ ],
27
+ "repository": {
28
+ "type": "git",
29
+ "url": "git+https://github.com/cto-af/unicode-trie.git"
30
+ },
31
+ "license": "MIT",
32
+ "dependencies": {
33
+ "fflate": "^0.8.2"
34
+ },
35
+ "devDependencies": {
36
+ "@types/node": "22.15.19",
37
+ "c8": "10.1.3",
38
+ "mocha": "11.4.0",
39
+ "typedoc": "0.28.4",
40
+ "typescript": "5.8.3"
41
+ },
42
+ "packageManager": "pnpm@10.11.0",
43
+ "engines": {
44
+ "node": ">=20"
45
+ }
46
+ }
package/swap.js ADDED
@@ -0,0 +1,27 @@
1
// True when the host stores the most significant byte of a 32-bit integer
// first. Detected by writing a known value and inspecting its first byte.
const isBigEndian =
  (new Uint8Array(new Uint32Array([0x12345678]).buffer)[0] === 0x12);

/**
 * Reverses the byte order of every complete 4-byte group of `array`, in
 * place. Trailing bytes that do not fill a whole group are left untouched.
 *
 * Exported for testing
 * @param {Uint8Array} array Bytes to swap in place.
 * @private
 */
export function swap32(array) {
  // Stop at the last whole word: reading past the end of a typed array
  // yields undefined, which would be written back as 0 and corrupt the tail.
  const end = array.length - (array.length % 4);
  for (let i = 0; i < end; i += 4) {
    const first = array[i];
    const second = array[i + 1];
    array[i] = array[i + 3];
    array[i + 1] = array[i + 2];
    array[i + 2] = second;
    array[i + 3] = first;
  }
}

/**
 * No-op.
 *
 * @param {Uint8Array} _array Ignored
 * @private
 */
function noOp(_array) {
  // Intentionally empty
}

// Bound to swap32 on big-endian hosts and to the no-op on little-endian
// hosts, so the caller can invoke it unconditionally.
export const swap32LE = isBigEndian ? swap32 : noOp;
@@ -0,0 +1,19 @@
1
+ export const SHIFT_1: number; // 11: shift for the index-1 table offset
2
+ export const SHIFT_2: 5; // shift for the index-2 table offset
3
+ export const SHIFT_1_2: number; // 6 = SHIFT_1 - SHIFT_2
4
+ export const OMITTED_BMP_INDEX_1_LENGTH: number; // 32 = 0x10000 >> SHIFT_1
5
+ export const INDEX_2_BLOCK_LENGTH: number; // 64 = 1 << SHIFT_1_2
6
+ export const INDEX_2_MASK: number; // 63 = INDEX_2_BLOCK_LENGTH - 1
7
+ export const INDEX_SHIFT: 2; // left shift applied to index array values
8
+ export const DATA_BLOCK_LENGTH: number; // 32 = 1 << SHIFT_2
9
+ export const DATA_MASK: number; // 31 = DATA_BLOCK_LENGTH - 1
10
+ export const LSCP_INDEX_2_OFFSET: number; // 2048 = 0x10000 >> SHIFT_2
11
+ export const LSCP_INDEX_2_LENGTH: number; // 32 = 0x400 >> SHIFT_2
12
+ export const INDEX_2_BMP_LENGTH: number; // 2080 = LSCP_INDEX_2_OFFSET + LSCP_INDEX_2_LENGTH
13
+ export const UTF8_2B_INDEX_2_OFFSET: number; // 2080 = INDEX_2_BMP_LENGTH
14
+ export const UTF8_2B_INDEX_2_LENGTH: number; // 32 = 0x800 >> 6
15
+ export const INDEX_1_OFFSET: number; // 2112 = UTF8_2B_INDEX_2_OFFSET + UTF8_2B_INDEX_2_LENGTH
16
+ export const MAX_INDEX_1_LENGTH: number; // 512 = 0x100000 >> SHIFT_1
17
+ export const DATA_GRANULARITY: number; // 4 = 1 << INDEX_SHIFT
18
+ export const CURRENT_VERSION: 4294967295; // 0xFFFFFFFF, expected in the third u32 of the input
19
+ export const PREFIX_LENGTH: 16; // bytes before the compressed data starts
@@ -0,0 +1,52 @@
1
+ /**
2
+ * @typedef {object} TrieValues
3
+ * @prop {Int32Array} data
4
+ * @prop {number} highStart
5
+ * @prop {number} errorValue
6
+ * @prop {string[]} [values]
7
+ */
8
+ export class UnicodeTrie {
9
+ /**
10
+ * Creates a trie from a base64-encoded string.
11
+ * @param {string} base64 The base64-encoded trie to initialize.
12
+ * @returns {UnicodeTrie} The decoded Unicode trie.
13
+ */
14
+ static fromBase64(base64: string): UnicodeTrie;
15
+ /**
16
+ * Creates a trie, either from compressed data or pre-parsed values.
17
+ *
18
+ * @param {Uint8Array|TrieValues} data Serialized trie bytes, or already-parsed trie values.
19
+ */
20
+ constructor(data: Uint8Array | TrieValues);
21
+ highStart: number; // code points at or above this all share one value in get()
22
+ errorValue: number; // returned by get() when codePoint is outside 0..0x10ffff
23
+ /**
24
+ * @type {string[]} Strings that getString() looks up by numeric trie value; empty when none were supplied.
25
+ */
26
+ values: string[];
27
+ /**
28
+ * @type {Int32Array} The array get() indexes for both index entries and values.
29
+ */
30
+ data: Int32Array;
31
+ /**
32
+ * Get the value associated with a codepoint, or the default value, or the
33
+ * error value if codePoint is out of range.
34
+ *
35
+ * @param {number} codePoint
36
+ * @returns {number}
37
+ */
38
+ get(codePoint: number): number;
39
+ /**
40
+ * Get the value associated with the codePoint, stringified if possible.
41
+ *
42
+ * @param {number} codePoint
43
+ * @returns {number|string} The entry of `values` at the numeric value, or the numeric value itself when there is none.
44
+ */
45
+ getString(codePoint: number): number | string;
46
+ }
47
+ export type TrieValues = {
48
+ data: Int32Array;
49
+ highStart: number;
50
+ errorValue: number;
51
+ values?: string[] | undefined;
52
+ };
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Reverses the byte order of each 4-byte group of `array` in place. Exported for testing
3
+ * @param {Uint8Array} array
4
+ * @private
5
+ */
6
+ export function swap32(array: Uint8Array): void;
7
+ /**
8
+ * Bound to swap32 on big-endian hosts and to a no-op on little-endian hosts.
9
+ * @param {Uint8Array} array
10
+ * @private
11
+ */
12
+ export function swap32LE(array: Uint8Array): void;