cspell-trie-lib 6.6.1 → 6.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -14,7 +14,117 @@ The resulting trie can then be compressed into a
14
14
  npm install -S cspell-trie-lib
15
15
  ```
16
16
 
17
- ## File Format
17
+ ## File Format V3
18
+
19
+ ### Header
20
+
21
+ ```
22
+ TrieXv3
23
+ base=10
24
+ # Comments
25
+ __DATA__
26
+ ```
27
+
28
+ The header has two parts.
29
+
30
+ - `TrieXv3` -- the format identifier.
31
+ - `base` -- the radix used to store node references; 10, 16, and 32 are common.
31
+ The higher the base, the smaller the file. The maximum is 36.
33
+
34
+ ### Data
35
+
36
+ The data is a stream of characters and operators. Each character represents a node in the Trie. The operators adjust the position in the Trie.
37
+
38
+ ### Conceptual Format
39
+
40
+ Given a sorted list of words:
41
+
42
+ ```text
43
+ joust
44
+ jouster
45
+ jousting
46
+ joy
47
+ joyful
48
+ joyfuller
49
+ joyfullest
50
+ ```
51
+
52
+ It is possible to think of the same list stored as a series of operations.
53
+
54
+ | op | Meaning |
55
+ | ----- | ------------------- |
56
+ | `<` | remove 1 character |
57
+ | `<<` | remove 2 characters |
58
+ | `<<<` | remove 3 characters |
59
+ | `<2` | remove 2 characters |
60
+ | `<3` | remove 3 characters |
61
+ | `$` | end of word |
62
+ | `_` | visual placeholder |
63
+
64
+ ```text
65
+ joust$
66
+ _____er$
67
+ _____<<
68
+ _____ing$
69
+ __<<<<<<
70
+ __y$
71
+ ___ful$
72
+ ______ler$
73
+ ________<
74
+ ________st$
75
+ ```
76
+
77
+ Becomes:
78
+
79
+ ```text
80
+ joust$er$<2ing$<6y$ful$ler$<st$
81
+ ```
82
+
83
+ Trie:
84
+
85
+ ```text
86
+ j─o┬u─s─t┬$
87
+ │ ├e─r─$
88
+ │ └i─n─g─$
89
+ └y┬$
90
+ └f─u─l┬$
91
+ └l─e┬r─$
92
+ └s─t─$
93
+ ```
94
+
95
+ ### Data Format
96
+
97
+ | op | Meaning |
98
+ | ----- | --------------------------------------------------------------------------------------------------------- |
99
+ | `<` | remove 1 character |
100
+ | `<n` | remove n characters where `n` is `[2-9]`; to remove 12 characters use `<9<3` |
101
+ | `$` | end of word |
102
+ | `\` | escape next character. All characters can be escaped. <br/> `\\` -> `\` <br/>`\#` -> `#` <br/>`\a` -> `a` |
103
+ | `#n;` | reference to an already imported trie node where `n` is the node number |
104
+
105
+ **Sample Data**
106
+
107
+ <!--- cspell:disable --->
108
+
109
+ ```text
110
+ Big Apple$8races\: \{\}\[\]\(\)$9<5
111
+ New York$7umbers \0\1\2\3\4\5\6\7\8\9$9<9
112
+ ap#6;<rrow \<$7
113
+ big a#5;<4urned$r$2ing$3s$$4
114
+ chalk#56;<3u#54;<3
115
+ eol \\n$3w \$$4scape \\\$8
116
+ fun journey$7wal#27;<7
117
+ journalism$tic$2$3s$$2eyer$2man$2e#103;<2$4ste#101;<i#58;<$3vialit#85;<2$4wly$$2yfuller$st$4ness$4$3lessn#120;<$4ou#125;<2ridde#103;<2er$$i#58;<3od#8;<3
118
+ stic#27;<4$3
119
+ lift#56;<3ong w#86;<6
120
+ ref \#$5
121
+ t#61;<
122
+ wa#62;<2
123
+ ```
124
+
125
+ <!--- cspell:enable --->
126
+
127
+ ## File Format V1
18
128
 
19
129
  ### Header
20
130
 
@@ -4,6 +4,7 @@ export interface ExportOptions {
4
4
  base?: number;
5
5
  comment?: string;
6
6
  version?: number;
7
+ addLineBreaksToImproveDiffs?: boolean;
7
8
  }
8
9
  /**
9
10
  * Serialize a TrieNode.
@@ -24,12 +24,21 @@ var __importStar = (this && this.__importStar) || function (mod) {
24
24
  };
25
25
  Object.defineProperty(exports, "__esModule", { value: true });
26
26
  exports.importTrie = exports.serializeTrie = void 0;
27
+ const cspell_pipe_1 = require("@cspell/cspell-pipe");
27
28
  const gensequence_1 = require("gensequence");
28
29
  const iv1 = __importStar(require("./importExportV1"));
29
30
  const iv2 = __importStar(require("./importExportV2"));
30
31
  const iv3 = __importStar(require("./importExportV3"));
31
- const serializers = [iv1.serializeTrie, iv1.serializeTrie, iv2.serializeTrie, iv3.serializeTrie];
32
- const deserializers = [iv1.importTrie, iv1.importTrie, iv2.importTrie, iv3.importTrie];
32
+ const iv4 = __importStar(require("./importExportV4"));
33
+ const serializers = [
34
+ iv1.serializeTrie,
35
+ iv1.serializeTrie,
36
+ iv2.serializeTrie,
37
+ iv3.serializeTrie,
38
+ iv4.serializeTrie,
39
+ ];
40
+ const deserializers = [iv1.importTrie, iv1.importTrie, iv2.importTrie, iv3.importTrie, iv4.importTrie];
41
+ const DEFAULT_VERSION = 3;
33
42
  /**
34
43
  * Serialize a TrieNode.
35
44
  * Note: This is destructive. The node will no longer be usable.
@@ -37,7 +46,7 @@ const deserializers = [iv1.importTrie, iv1.importTrie, iv2.importTrie, iv3.impor
37
46
  * Considering this is the last step before exporting, it was decided to let this be destructive.
38
47
  */
39
48
  function serializeTrie(root, options = 16) {
40
- const version = typeof options !== 'number' && options.version ? options.version : 0;
49
+ const version = typeof options !== 'number' && options.version ? options.version : DEFAULT_VERSION;
41
50
  const method = serializers[version];
42
51
  if (!method) {
43
52
  throw new Error(`Unknown version: ${version}`);
@@ -46,14 +55,9 @@ function serializeTrie(root, options = 16) {
46
55
  }
47
56
  exports.serializeTrie = serializeTrie;
48
57
  function importTrie(lines) {
49
- const comment = /^\s*#/;
50
- function* arrayToIterableIterator(i) {
51
- yield* i;
52
- }
53
58
  function parseHeaderRows(headerRows) {
54
59
  const header = headerRows.join('\n');
55
- const headerReg = /\bTrieXv(\d+)/;
56
- /* istanbul ignore if */
60
+ const headerReg = /^\s*TrieXv(\d+)/m;
57
61
  const match = header.match(headerReg);
58
62
  if (!match)
59
63
  throw new Error('Unknown file format');
@@ -61,16 +65,8 @@ function importTrie(lines) {
61
65
  }
62
66
  function readHeader(iter) {
63
67
  const headerRows = [];
64
- // eslint-disable-next-line no-constant-condition
65
- while (true) {
66
- const next = iter.next();
67
- if (next.done) {
68
- break;
69
- }
70
- const line = next.value.trim();
71
- if (!line || comment.test(line)) {
72
- continue;
73
- }
68
+ for (const entry of iter) {
69
+ const line = entry.trim();
74
70
  headerRows.push(line);
75
71
  if (line === iv1.DATA || line === iv2.DATA) {
76
72
  break;
@@ -78,7 +74,7 @@ function importTrie(lines) {
78
74
  }
79
75
  return headerRows;
80
76
  }
81
- const input = arrayToIterableIterator(lines);
77
+ const input = (0, cspell_pipe_1.toDistributableIterable)(lines);
82
78
  const headerLines = readHeader(input);
83
79
  const version = parseHeaderRows(headerLines);
84
80
  const stream = (0, gensequence_1.genSequence)(headerLines).concat(input);
@@ -4,10 +4,20 @@ export declare const DATA = "__DATA__";
4
4
  export interface ExportOptions {
5
5
  base?: number;
6
6
  comment?: string;
7
+ /**
8
+ * This will reduce the size of the `.trie` file by removing references to short suffixes.
9
+ * But it does increase the size of the trie when loaded into memory.
10
+ */
11
+ optimizeSimpleReferences?: boolean;
12
+ /**
13
+ * To improve diffs, an EOL is added before each double letter prefix.
14
+ * @default true
15
+ */
16
+ addLineBreaksToImproveDiffs?: boolean;
7
17
  }
8
18
  /**
9
19
  * Serialize a TrieRoot.
10
20
  */
11
21
  export declare function serializeTrie(root: TrieRoot, options?: ExportOptions | number): Sequence<string>;
12
- export declare function importTrie(linesX: Iterable<string>): TrieRoot;
22
+ export declare function importTrie(linesX: Iterable<string> | string): TrieRoot;
13
23
  //# sourceMappingURL=importExportV3.d.ts.map
@@ -5,12 +5,12 @@ const TrieNode_1 = require("../TrieNode");
5
5
  const gensequence_1 = require("gensequence");
6
6
  const bufferLines_1 = require("../utils/bufferLines");
7
7
  const trie_util_1 = require("../trie-util");
8
- const EOW = '$';
9
- const BACK = '<';
10
- const EOL = '\n';
11
- const LF = '\r';
12
- const REF = '#';
13
- const EOR = ';';
8
+ const EOW = '$'; // End of word
9
+ const BACK = '<'; // Move up the tree
10
+ const EOL = '\n'; // End of Line (ignored)
11
+ const LF = '\r'; // Line Feed (ignored)
12
+ const REF = '#'; // Start of Reference
13
+ const EOR = ';'; // End of Reference
14
14
  const ESCAPE = '\\';
15
15
  const specialCharacters = new Set([EOW, BACK, EOL, REF, EOR, ESCAPE, LF]
16
16
  .concat('0123456789'.split(''))
@@ -21,6 +21,8 @@ const specialCharacterMap = new Map([
21
21
  ['\\', '\\\\'],
22
22
  ]);
23
23
  const characterMap = new Map([...specialCharacterMap].map((a) => [a[1], a[0]]));
24
+ const specialPrefix = stringToCharSet('~!');
25
+ const WORDS_PER_LINE = 20;
24
26
  exports.DATA = '__DATA__';
25
27
  function generateHeader(base, comment) {
26
28
  const header = ['#!/usr/bin/env cspell-trie reader', 'TrieXv3', 'base=' + base]
@@ -32,12 +34,15 @@ function generateHeader(base, comment) {
32
34
  * Serialize a TrieRoot.
33
35
  */
34
36
  function serializeTrie(root, options = 16) {
35
- options = typeof options === 'number' ? { base: options } : options;
36
- const { base = 16, comment = '' } = options;
37
+ options = typeof options === 'number' ? { base: options, addLineBreaksToImproveDiffs: false } : options;
38
+ const { base = 16, comment = '', addLineBreaksToImproveDiffs: addBreaks = true } = options;
37
39
  const radix = base > 36 ? 36 : base < 10 ? 10 : base;
38
40
  const cache = new Map();
41
+ const cacheShouldRef = new Map();
39
42
  let count = 0;
40
- const backBuffer = { last: '', count: 0 };
43
+ const backBuffer = { last: '', count: 0, words: 0, eol: false };
44
+ const optimizeSimpleReferences = options.optimizeSimpleReferences ?? false;
45
+ const wordChars = [];
41
46
  function ref(n) {
42
47
  return '#' + n.toString(radix) + ';';
43
48
  }
@@ -51,6 +56,11 @@ function serializeTrie(root, options = 16) {
51
56
  backBuffer.last = BACK;
52
57
  backBuffer.count -= n;
53
58
  }
59
+ if (backBuffer.eol) {
60
+ yield EOL;
61
+ backBuffer.eol = false;
62
+ backBuffer.words = 0;
63
+ }
54
64
  }
55
65
  function* emit(s) {
56
66
  switch (s) {
@@ -58,25 +68,39 @@ function serializeTrie(root, options = 16) {
58
68
  yield* flush();
59
69
  backBuffer.last = EOW;
60
70
  backBuffer.count = 0;
71
+ backBuffer.words++;
61
72
  break;
62
73
  case BACK:
63
74
  backBuffer.count++;
64
75
  break;
76
+ case EOL:
77
+ backBuffer.eol = true;
78
+ break;
65
79
  default:
80
+ if (backBuffer.words >= WORDS_PER_LINE) {
81
+ backBuffer.eol = true;
82
+ }
66
83
  yield* flush();
84
+ if (s.startsWith(REF)) {
85
+ backBuffer.words++;
86
+ }
67
87
  yield s;
68
88
  }
69
89
  }
70
90
  function* walk(node, depth) {
71
91
  const r = cache.get(node);
72
- if (r !== undefined) {
92
+ if (r !== undefined && (!optimizeSimpleReferences || !shouldSimpleRef(node))) {
73
93
  yield* emit(ref(r));
74
94
  return;
75
95
  }
76
96
  if (node.c) {
97
+ if (addBreaks && depth > 0 && depth <= 2) {
98
+ yield* emit(EOL);
99
+ }
77
100
  cache.set(node, count++);
78
101
  const c = [...node.c].sort((a, b) => (a[0] < b[0] ? -1 : 1));
79
102
  for (const [s, n] of c) {
103
+ wordChars[depth] = s;
80
104
  yield* emit(escape(s));
81
105
  yield* walk(n, depth + 1);
82
106
  yield* emit(BACK);
@@ -88,18 +112,36 @@ function serializeTrie(root, options = 16) {
88
112
  if (node.f) {
89
113
  yield* emit(EOW);
90
114
  }
115
+ if (addBreaks && (depth === 2 || (depth === 3 && wordChars[0] in specialPrefix))) {
116
+ yield* emit(EOL);
117
+ }
91
118
  }
92
119
  function* serialize(node) {
93
120
  yield* walk(node, 0);
94
121
  yield* flush();
95
122
  }
96
- return generateHeader(radix, comment).concat((0, bufferLines_1.bufferLines)((0, bufferLines_1.bufferLines)(serialize(root), 120, '\n'), 10, ''));
123
+ function _calcShouldSimpleRef(node) {
124
+ if (node.c?.size !== 1)
125
+ return false;
126
+ const [n] = [...node.c.values()];
127
+ return !!n.f && (n.c === undefined || n.c.size === 0);
128
+ }
129
+ function shouldSimpleRef(node) {
130
+ const r = cacheShouldRef.get(node);
131
+ if (r !== undefined)
132
+ return r;
133
+ const rr = _calcShouldSimpleRef(node);
134
+ cacheShouldRef.set(node, rr);
135
+ return rr;
136
+ }
137
+ return generateHeader(radix, comment).concat((0, bufferLines_1.bufferLines)(serialize(root), 1200, ''));
97
138
  }
98
139
  exports.serializeTrie = serializeTrie;
99
140
  function* toIterableIterator(iter) {
100
141
  yield* iter;
101
142
  }
102
143
  function importTrie(linesX) {
144
+ linesX = typeof linesX === 'string' ? linesX.split(/(?<=\n)/) : linesX;
103
145
  const root = (0, trie_util_1.trieNodeToRoot)({}, {});
104
146
  let radix = 16;
105
147
  const comment = /^\s*#/;
@@ -234,4 +276,12 @@ function parseStream(radix) {
234
276
  }
235
277
  return parserMain;
236
278
  }
279
+ function stringToCharSet(values) {
280
+ const set = Object.create(null);
281
+ const len = values.length;
282
+ for (let i = 0; i < len; ++i) {
283
+ set[values[i]] = true;
284
+ }
285
+ return set;
286
+ }
237
287
  //# sourceMappingURL=importExportV3.js.map
@@ -0,0 +1,52 @@
1
+ /**
2
+ * Trie file format v4
3
+ *
4
+ * Trie format v4 is very similar to v3. The v4 reader can even read v3 files.
5
+ * The motivation behind v4 is to reduce the cost of storing `.trie` files in git.
6
+ * When a word is added in v3, nearly the entire file is changed due to the absolute
7
+ * references. V4 adds an index sorted by the most frequently used reference to the least.
8
+ * Because git diff is line based, it is important to add line breaks at logical points.
9
+ * V3 added line breaks just to make sure the lines were not too long, V4 takes a different
10
+ * approach. Line breaks are added at two distinct points. First, at the start of each two
11
+ * letter prefix and second after approximately 20 words have been emitted.
12
+ *
13
+ * To improve readability and git diff, at the beginning of each two letter prefix,
14
+ * a comment is emitted.
15
+ *
16
+ * Example:
17
+ *
18
+ * ```
19
+ * /* ab *​/
20
+ * ```
21
+ */
22
+ import { Sequence } from 'gensequence';
23
+ import { TrieNode, TrieRoot } from '../TrieNode';
24
+ export declare const DATA = "__DATA__";
25
+ export interface ExportOptions {
26
+ base?: number;
27
+ comment?: string;
28
+ /**
29
+ * This will reduce the size of the `.trie` file by removing references to short suffixes.
30
+ * But it does increase the size of the trie when loaded into memory.
31
+ */
32
+ optimizeSimpleReferences?: boolean;
33
+ }
34
+ /**
35
+ * Serialize a TrieRoot.
36
+ */
37
+ export declare function serializeTrie(root: TrieRoot, options?: ExportOptions | number): Sequence<string>;
38
+ interface ReferenceMap {
39
+ /**
40
+ * An array of references to nodes.
41
+ * The most frequently referenced is first in the list.
42
+ * A node must be reference by other nodes to be included.
43
+ */
44
+ refCounts: (readonly [TrieNode, number])[];
45
+ }
46
+ declare function buildReferenceMap(root: TrieRoot, base: number): ReferenceMap;
47
+ export declare function importTrie(linesX: Iterable<string> | string): TrieRoot;
48
+ export declare const __testing__: {
49
+ buildReferenceMap: typeof buildReferenceMap;
50
+ };
51
+ export {};
52
+ //# sourceMappingURL=importExportV4.d.ts.map
@@ -0,0 +1,442 @@
1
+ "use strict";
2
+ /* eslint-disable no-irregular-whitespace */
3
+ /**
4
+ * Trie file format v4
5
+ *
6
+ * Trie format v4 is very similar to v3. The v4 reader can even read v3 files.
7
+ * The motivation behind v4 is to reduce the cost of storing `.trie` files in git.
8
+ * When a word is added in v3, nearly the entire file is changed due to the absolute
9
+ * references. V4 adds an index sorted by the most frequently used reference to the least.
10
+ * Because git diff is line based, it is important to add line breaks at logical points.
11
+ * V3 added line breaks just to make sure the lines were not too long, V4 takes a different
12
+ * approach. Line breaks are added at two distinct points. First, at the start of each two
13
+ * letter prefix and second after approximately 20 words have been emitted.
14
+ *
15
+ * To improve readability and git diff, at the beginning of each two letter prefix,
16
+ * a comment is emitted.
17
+ *
18
+ * Example:
19
+ *
20
+ * ```
21
+ * /* ab *​/
22
+ * ```
23
+ */
24
+ Object.defineProperty(exports, "__esModule", { value: true });
25
+ exports.__testing__ = exports.importTrie = exports.serializeTrie = exports.DATA = void 0;
26
+ const cspell_pipe_1 = require("@cspell/cspell-pipe");
27
+ const gensequence_1 = require("gensequence");
28
+ const trie_util_1 = require("../trie-util");
29
+ const TrieNode_1 = require("../TrieNode");
30
+ const bufferLines_1 = require("../utils/bufferLines");
31
+ const EOW = '$'; // End of word
32
+ const BACK = '<'; // Move up the tree
33
+ const EOL = '\n'; // End of Line (ignored)
34
+ const LF = '\r'; // Line Feed (ignored)
35
+ const REF = '#'; // Start absolute of Reference
36
+ const REF_REL = '@'; // Start indexed of Reference
37
+ const EOR = ';'; // End of Reference
38
+ const ESCAPE = '\\';
39
+ const REF_INDEX_BEGIN = '[';
40
+ const REF_INDEX_END = ']';
41
+ const INLINE_DATA_COMMENT_LINE = '/';
42
+ const specialCharacters = stringToCharSet([EOW, BACK, EOL, REF, REF_REL, EOR, ESCAPE, LF, REF_INDEX_BEGIN, REF_INDEX_END, INLINE_DATA_COMMENT_LINE]
43
+ .concat('0123456789'.split(''))
44
+ .concat('`~!@#$%^&*()_-+=[]{};:\'"<>,./?\\|'.split(''))
45
+ .join(''));
46
+ const SPECIAL_CHARACTERS_MAP = [
47
+ ['\n', '\\n'],
48
+ ['\r', '\\r'],
49
+ ['\\', '\\\\'],
50
+ ];
51
+ const specialCharacterMap = stringToCharMap(SPECIAL_CHARACTERS_MAP);
52
+ const characterMap = stringToCharMap(SPECIAL_CHARACTERS_MAP.map((a) => [a[1], a[0]]));
53
+ const specialPrefix = stringToCharSet('~!');
54
+ const WORDS_PER_LINE = 20;
55
+ exports.DATA = '__DATA__';
56
+ function generateHeader(base, comment) {
57
+ const comments = comment
58
+ .split('\n')
59
+ .map((a) => '# ' + a.trimEnd())
60
+ .join('\n');
61
+ return `\
62
+ #!/usr/bin/env cspell-trie reader
63
+ TrieXv4
64
+ base=${base}
65
+ ${comments}
66
+ # Data:
67
+ ${exports.DATA}
68
+ `;
69
+ }
70
+ /**
71
+ * Serialize a TrieRoot.
72
+ */
73
+ function serializeTrie(root, options = 16) {
74
+ options = typeof options === 'number' ? { base: options } : options;
75
+ const { base = 10, comment = '' } = options;
76
+ const radix = base > 36 ? 36 : base < 10 ? 10 : base;
77
+ const cache = new Map();
78
+ const refMap = buildReferenceMap(root, base);
79
+ const nodeToIndexMap = new Map(refMap.refCounts.map(([node], index) => [node, index]));
80
+ let count = 0;
81
+ const backBuffer = { last: '', count: 0, words: 0, eol: false };
82
+ const wordChars = [];
83
+ function ref(n, idx) {
84
+ const r = idx === undefined || n < idx ? REF + n.toString(radix) : REF_REL + idx.toString(radix);
85
+ return radix === 10 ? r : r + ';';
86
+ }
87
+ function escape(s) {
88
+ return s in specialCharacters ? ESCAPE + (specialCharacterMap[s] || s) : s;
89
+ }
90
+ function* flush() {
91
+ while (backBuffer.count) {
92
+ const n = Math.min(9, backBuffer.count);
93
+ yield n > 1 ? backBuffer.last + n : backBuffer.last;
94
+ backBuffer.last = BACK;
95
+ backBuffer.count -= n;
96
+ }
97
+ if (backBuffer.eol) {
98
+ yield EOL;
99
+ backBuffer.eol = false;
100
+ backBuffer.words = 0;
101
+ }
102
+ }
103
+ function* emit(s) {
104
+ switch (s) {
105
+ case EOW:
106
+ yield* flush();
107
+ backBuffer.last = EOW;
108
+ backBuffer.count = 0;
109
+ backBuffer.words++;
110
+ break;
111
+ case BACK:
112
+ backBuffer.count++;
113
+ break;
114
+ case EOL:
115
+ backBuffer.eol = true;
116
+ break;
117
+ default:
118
+ if (backBuffer.words >= WORDS_PER_LINE) {
119
+ backBuffer.eol = true;
120
+ }
121
+ yield* flush();
122
+ if (s.startsWith(REF) || s.startsWith(REF_REL)) {
123
+ backBuffer.words++;
124
+ }
125
+ yield s;
126
+ }
127
+ }
128
+ const comment_begin = `${EOL}${INLINE_DATA_COMMENT_LINE}* `;
129
+ const comment_end = ` *${INLINE_DATA_COMMENT_LINE}${EOL}`;
130
+ function* walk(node, depth) {
131
+ const nodeNumber = cache.get(node);
132
+ const refIndex = nodeToIndexMap.get(node);
133
+ if (nodeNumber !== undefined) {
134
+ yield* emit(ref(nodeNumber, refIndex));
135
+ return;
136
+ }
137
+ if (node.c) {
138
+ if (depth > 0 && depth <= 2) {
139
+ const chars = wordChars.slice(0, depth).map(escape).join('');
140
+ yield* emit(comment_begin + chars + comment_end);
141
+ }
142
+ cache.set(node, count++);
143
+ const c = [...node.c].sort((a, b) => (a[0] < b[0] ? -1 : 1));
144
+ for (const [s, n] of c) {
145
+ wordChars[depth] = s;
146
+ yield* emit(escape(s));
147
+ yield* walk(n, depth + 1);
148
+ yield* emit(BACK);
149
+ if (depth === 0)
150
+ yield* emit(EOL);
151
+ }
152
+ }
153
+ // Output EOW after children so it can be optimized on read
154
+ if (node.f) {
155
+ yield* emit(EOW);
156
+ }
157
+ if (depth === 2 || (depth === 3 && wordChars[0] in specialPrefix)) {
158
+ yield* emit(EOL);
159
+ }
160
+ }
161
+ function* serialize(node) {
162
+ yield* walk(node, 0);
163
+ yield* flush();
164
+ }
165
+ const lines = [...(0, bufferLines_1.bufferLines)(serialize(root), 1000, '')];
166
+ const resolvedReferences = refMap.refCounts.map(([node]) => cache.get(node) || 0);
167
+ // const r = refMap.refCounts.slice(0, 200).map(([node, c]) => ({ n: cache.get(node) || 0, c }));
168
+ // console.log('First 100: %o \n %o', r.slice(0, 100), r.slice(100, 200));
169
+ const reference = '[\n' +
170
+ resolvedReferences
171
+ .map((n) => n.toString(radix))
172
+ .join(',')
173
+ .replace(/.{110,130}[,]/g, '$&\n') +
174
+ '\n]\n';
175
+ return (0, gensequence_1.genSequence)([generateHeader(radix, comment), reference]).concat(lines);
176
+ }
177
+ exports.serializeTrie = serializeTrie;
178
+ function buildReferenceMap(root, base) {
179
+ const refCount = new Map();
180
+ let nodeCount = 0;
181
+ function walk(node) {
182
+ const ref = refCount.get(node);
183
+ if (ref) {
184
+ ref.c++;
185
+ return;
186
+ }
187
+ refCount.set(node, { c: 1, n: nodeCount++ });
188
+ if (!node.c)
189
+ return;
190
+ for (const child of node.c.values()) {
191
+ walk(child);
192
+ }
193
+ }
194
+ walk(root);
195
+ // sorted highest to lowest
196
+ const refCountAndNode = [
197
+ ...(0, cspell_pipe_1.pipeSync)(refCount, (0, cspell_pipe_1.opFilter)(([_, ref]) => ref.c >= 2)),
198
+ ].sort((a, b) => b[1].c - a[1].c || a[1].n - b[1].n);
199
+ let adj = 0;
200
+ const baseLogScale = 1 / Math.log(base);
201
+ const refs = refCountAndNode
202
+ .filter(([_, ref], idx) => {
203
+ const i = idx - adj;
204
+ const charsIdx = Math.ceil(Math.log(i) * baseLogScale);
205
+ const charsNode = Math.ceil(Math.log(ref.n) * baseLogScale);
206
+ const savings = ref.c * (charsNode - charsIdx) - charsIdx;
207
+ const keep = savings > 0;
208
+ adj += keep ? 0 : 1;
209
+ return keep;
210
+ })
211
+ .map(([n, ref]) => [n, ref.c]);
212
+ return { refCounts: refs };
213
+ }
214
+ function importTrie(linesX) {
215
+ linesX = typeof linesX === 'string' ? linesX.split(/(?<=\n)/) : linesX;
216
+ let radix = 10;
217
+ const comment = /^\s*#/;
218
+ const iter = tapIterable((0, cspell_pipe_1.pipeSync)(linesX, (0, cspell_pipe_1.opConcatMap)((a) => a.split(/(?<=\n)(?!$)/))));
219
+ function parseHeaderRows(headerRows) {
220
+ const header = headerRows.slice(0, 2).join('\n');
221
+ const headerReg = /^TrieXv[34]\nbase=(\d+)$/;
222
+ /* istanbul ignore if */
223
+ if (!headerReg.test(header))
224
+ throw new Error('Unknown file format');
225
+ radix = Number.parseInt(header.replace(headerReg, '$1'), 10);
226
+ }
227
+ function readHeader(iter) {
228
+ const headerRows = [];
229
+ for (const value of iter) {
230
+ const line = value.trim();
231
+ if (!line || comment.test(line))
232
+ continue;
233
+ if (line === exports.DATA)
234
+ break;
235
+ headerRows.push(line);
236
+ }
237
+ parseHeaderRows(headerRows);
238
+ }
239
+ readHeader(iter);
240
+ const root = parseStream(radix, iter);
241
+ return root;
242
+ }
243
+ exports.importTrie = importTrie;
244
+ const numbersSet = stringToCharSet('0123456789');
245
+ function parseStream(radix, iter) {
246
+ const eow = Object.freeze({ f: 1 });
247
+ let refIndex = [];
248
+ const root = (0, trie_util_1.trieNodeToRoot)({}, {});
249
+ function parseReference(acc, s) {
250
+ const isIndexRef = s === REF_REL;
251
+ let ref = '';
252
+ function parser(acc, s) {
253
+ if (s === EOR || (radix === 10 && !(s in numbersSet))) {
254
+ const { root, nodes, stack } = acc;
255
+ const r = parseInt(ref, radix);
256
+ const top = stack[stack.length - 1];
257
+ const p = stack[stack.length - 2].node;
258
+ const n = isIndexRef ? refIndex[r] : r;
259
+ p.c?.set(top.s, nodes[n]);
260
+ const rr = { root, nodes, stack, parser: undefined };
261
+ return s === EOR ? rr : parserMain(rr, s);
262
+ }
263
+ ref = ref + s;
264
+ return acc;
265
+ }
266
+ const { nodes } = acc;
267
+ nodes.pop();
268
+ return { ...acc, nodes, parser };
269
+ }
270
+ function parseEscapeCharacter(acc, _) {
271
+ let prev = '';
272
+ const parser = function (acc, s) {
273
+ if (prev) {
274
+ s = characterMap[prev + s] || s;
275
+ return parseCharacter({ ...acc, parser: undefined }, s);
276
+ }
277
+ if (s === ESCAPE) {
278
+ prev = s;
279
+ return acc;
280
+ }
281
+ return parseCharacter({ ...acc, parser: undefined }, s);
282
+ };
283
+ return { ...acc, parser };
284
+ }
285
+ function parseComment(acc, s) {
286
+ const endOfComment = s;
287
+ let isEscaped = false;
288
+ function parser(acc, s) {
289
+ if (isEscaped) {
290
+ isEscaped = false;
291
+ return acc;
292
+ }
293
+ if (s === ESCAPE) {
294
+ isEscaped = true;
295
+ return acc;
296
+ }
297
+ if (s === endOfComment) {
298
+ return { ...acc, parser: undefined };
299
+ }
300
+ return acc;
301
+ }
302
+ return { ...acc, parser };
303
+ }
304
+ function parseCharacter(acc, s) {
305
+ const parser = undefined;
306
+ const { root, nodes, stack } = acc;
307
+ const top = stack[stack.length - 1];
308
+ const node = top.node;
309
+ node.c = node.c ?? new Map();
310
+ const n = { f: undefined, c: undefined, n: nodes.length };
311
+ node.c.set(s, n);
312
+ stack.push({ node: n, s });
313
+ nodes.push(n);
314
+ return { root, nodes, stack, parser };
315
+ }
316
+ function parseEOW(acc, _) {
317
+ const parser = parseBack;
318
+ const { root, nodes, stack } = acc;
319
+ const top = stack[stack.length - 1];
320
+ const node = top.node;
321
+ node.f = TrieNode_1.FLAG_WORD;
322
+ if (!node.c) {
323
+ top.node = eow;
324
+ const p = stack[stack.length - 2].node;
325
+ p.c?.set(top.s, eow);
326
+ nodes.pop();
327
+ }
328
+ stack.pop();
329
+ return { root, nodes, stack, parser };
330
+ }
331
+ const charactersBack = stringToCharSet(BACK + '23456789');
332
+ function parseBack(acc, s) {
333
+ if (!(s in charactersBack)) {
334
+ return parserMain({ ...acc, parser: undefined }, s);
335
+ }
336
+ let n = s === BACK ? 1 : parseInt(s, 10) - 1;
337
+ const { stack } = acc;
338
+ while (n-- > 0) {
339
+ stack.pop();
340
+ }
341
+ return { ...acc, parser: parseBack };
342
+ }
343
+ function parseIgnore(acc, _) {
344
+ return acc;
345
+ }
346
+ const parsers = createStringLookupMap([
347
+ [EOW, parseEOW],
348
+ [BACK, parseBack],
349
+ [REF, parseReference],
350
+ [REF_REL, parseReference],
351
+ [ESCAPE, parseEscapeCharacter],
352
+ [EOL, parseIgnore],
353
+ [LF, parseIgnore],
354
+ [INLINE_DATA_COMMENT_LINE, parseComment],
355
+ ]);
356
+ function parserMain(acc, s) {
357
+ const parser = acc.parser ?? parsers[s] ?? parseCharacter;
358
+ return parser(acc, s);
359
+ }
360
+ const charsetSpaces = stringToCharSet(' \r\n\t');
361
+ function parseReferenceIndex(acc, s) {
362
+ let json = '';
363
+ function parserStart(acc, s) {
364
+ if (s === REF_INDEX_BEGIN) {
365
+ json = json + s;
366
+ return { ...acc, parser };
367
+ }
368
+ if (s in charsetSpaces) {
369
+ return acc;
370
+ }
371
+ // A Reference Index was not found.
372
+ return parserMain({ ...acc, parser: undefined }, s);
373
+ }
374
+ function parser(acc, s) {
375
+ json = json + s;
376
+ if (s === REF_INDEX_END) {
377
+ refIndex = JSON.parse(json);
378
+ return { ...acc, parser: undefined };
379
+ }
380
+ return acc;
381
+ }
382
+ return parserStart({ ...acc, parser: parserStart }, s);
383
+ }
384
+ (0, gensequence_1.genSequence)(iter)
385
+ .concatMap((a) => a.split(''))
386
+ .reduce(parserMain, {
387
+ nodes: [root],
388
+ root,
389
+ stack: [{ node: root, s: '' }],
390
+ parser: parseReferenceIndex,
391
+ });
392
+ return root;
393
+ }
394
+ function stringToCharSet(values) {
395
+ const set = Object.create(null);
396
+ const len = values.length;
397
+ for (let i = 0; i < len; ++i) {
398
+ set[values[i]] = true;
399
+ }
400
+ return set;
401
+ }
402
+ function stringToCharMap(values) {
403
+ return createStringLookupMap(values);
404
+ }
405
+ function createStringLookupMap(values) {
406
+ const map = Object.create(null);
407
+ const len = values.length;
408
+ for (let i = 0; i < len; ++i) {
409
+ map[values[i][0]] = values[i][1];
410
+ }
411
+ return map;
412
+ }
413
+ /**
414
+ * Allows an iterable to be shared by multiple consumers.
415
+ * Each consumer takes from the iterable.
416
+ * @param iterable - the iterable to share
417
+ */
418
+ function tapIterable(iterable) {
419
+ let lastValue;
420
+ let iter;
421
+ function getNext() {
422
+ if (lastValue && lastValue.done) {
423
+ return { ...lastValue };
424
+ }
425
+ iter = iter || iterable[Symbol.iterator]();
426
+ lastValue = iter.next();
427
+ return lastValue;
428
+ }
429
+ function* iterableFn() {
430
+ let next;
431
+ while (!(next = getNext()).done) {
432
+ yield next.value;
433
+ }
434
+ }
435
+ return {
436
+ [Symbol.iterator]: iterableFn,
437
+ };
438
+ }
439
+ exports.__testing__ = {
440
+ buildReferenceMap,
441
+ };
442
+ //# sourceMappingURL=importExportV4.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "cspell-trie-lib",
3
- "version": "6.6.1",
3
+ "version": "6.7.0",
4
4
  "description": "Trie Data Structure to support cspell.",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
@@ -37,7 +37,8 @@
37
37
  },
38
38
  "homepage": "https://github.com/streetsidesoftware/cspell#readme",
39
39
  "dependencies": {
40
- "@cspell/cspell-pipe": "^6.6.1",
40
+ "@cspell/cspell-pipe": "^6.7.0",
41
+ "@cspell/cspell-types": "^6.7.0",
41
42
  "fs-extra": "^10.1.0",
42
43
  "gensequence": "^3.1.1"
43
44
  },
@@ -45,13 +46,12 @@
45
46
  "node": ">=14"
46
47
  },
47
48
  "devDependencies": {
48
- "@cspell/cspell-types": "^6.6.1",
49
- "@cspell/dict-en_us": "^2.3.0",
49
+ "@cspell/dict-en_us": "^2.3.1",
50
50
  "@cspell/dict-es-es": "^2.2.0",
51
51
  "@types/fs-extra": "^9.0.13",
52
- "@types/node": "^18.6.5",
52
+ "@types/node": "^18.7.6",
53
53
  "jest": "^28.1.3",
54
54
  "rimraf": "^3.0.2"
55
55
  },
56
- "gitHead": "3c9c24d1cebd558ac3729d3fbf441e6ed751d8cf"
56
+ "gitHead": "3a7312a15d2df1507d9e01863ec5842f5a99e0cc"
57
57
  }