dawg-search 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +64 -0
- package/dist/index.d.mts +106 -0
- package/dist/index.mjs +323 -0
- package/package.json +55 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 UnluckyNinja
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# DAWG Search
|
|
2
|
+
[![npm version][npm-version-src]][npm-href]
|
|
3
|
+
[![npm downloads][npm-downloads-src]][npm-href]
|
|
4
|
+
[![Unit Test][unit-test-src]][unit-test-href]
|
|
5
|
+
[![License][license-src]][license-href]
|
|
6
|
+
|
|
7
|
+
## Description
|
|
8
|
+
- It aims at solving a string searching problem: how to find **many** words in **one** text.
|
|
9
|
+
- It's designed to work with non-English languages like Chinese and Japanese.
|
|
10
|
+
- It's based on two well-studied data structures: Trie Automaton and Suffix Automaton.
|
|
11
|
+
- If multiple matches overlap, the match that starts earlier takes precedence; among matches that start at the same position, the longer one wins.
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npm i dawg-search
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
```js
|
|
22
|
+
import { prepareSearch, refineMatches } from 'dawg-search'
|
|
23
|
+
|
|
24
|
+
const text = '举头望明月,低头思故乡'
|
|
25
|
+
const words = ['明月', '故乡', '月,低']
|
|
26
|
+
|
|
27
|
+
const { findWords } = prepareSearch(words)
|
|
28
|
+
const results = refineMatches(findWords(text))
|
|
29
|
+
|
|
30
|
+
// [{start: 3, end: 5}, {start: 9, end: 11}]
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## How it works
|
|
35
|
+
1. Trie Automaton
|
|
36
|
+
First you need to prepare a dictionary of words.
|
|
37
|
+
It will be processed into a trie that merges not only common prefixes but also common suffixes.
|
|
38
|
+
Historically it's called *Deterministic Acyclic Finite State Automaton* (DAFSA).
|
|
39
|
+
(A regular trie would also work, but since many words share common suffixes, merging them can save a lot of memory.)
|
|
40
|
+
|
|
41
|
+
2. Suffix Automaton
|
|
42
|
+
When searching text, the text is processed into a suffix automaton. The trie automaton can be reused.
|
|
43
|
+
|
|
44
|
+
3. Traversing Phase
|
|
45
|
+
Then, from root node of the suffix automaton, it will traverse every transition,
|
|
46
|
+
which implicitly will traverse all substrings.
|
|
47
|
+
It runs in two passes:
|
|
48
|
+
`findWords` returns the results as an unordered list,
|
|
49
|
+
`refineMatches` will sort and eliminate overlaps.
|
|
50
|
+
|
|
51
|
+
## Limitations
|
|
52
|
+
- This lib can only process concrete words, not regex.
|
|
53
|
+
- Neither of the two data structures compresses a long chain of single transitions into one transition, so they are less compact than they could be.
|
|
54
|
+
|
|
55
|
+
## License
|
|
56
|
+
[MIT](./LICENSE) License © 2026 [UnluckyNinja](https://github.com/UnluckyNinja)
|
|
57
|
+
|
|
58
|
+
[npm-version-src]: https://img.shields.io/npm/v/dawg-search?style=flat&colorA=080f12&colorB=1fa669
|
|
59
|
+
[npm-downloads-src]: https://img.shields.io/npm/dm/dawg-search?style=flat&colorA=080f12&colorB=1fa669
|
|
60
|
+
[npm-href]: https://npmjs.com/package/dawg-search
|
|
61
|
+
[unit-test-src]: https://github.com/UnluckyNinja/dawg-search/actions/workflows/unit-test.yml/badge.svg
|
|
62
|
+
[unit-test-href]: https://github.com/UnluckyNinja/dawg-search/actions/workflows/unit-test.yml
|
|
63
|
+
[license-src]: https://img.shields.io/github/license/UnluckyNinja/dawg-search.svg?style=flat&colorA=080f12&colorB=1fa669
|
|
64
|
+
[license-href]: https://github.com/UnluckyNinja/dawg-search/blob/main/LICENSE
|
package/dist/index.d.mts
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
//#region src/utils.d.ts
|
|
2
|
+
declare function binarySearch<T>(arr: T[], target: T, cmp: (a: T, b: T) => number, start?: number, end?: number): number;
|
|
3
|
+
interface DGraphNode {
|
|
4
|
+
id: number;
|
|
5
|
+
out: [string, this][];
|
|
6
|
+
}
|
|
7
|
+
declare function nodeGetNext<T extends DGraphNode>(node: T, char: string): T | undefined;
|
|
8
|
+
/**
|
|
9
|
+
* only works for single char link
|
|
10
|
+
* @param next if undefined, delete edge
|
|
11
|
+
*/
|
|
12
|
+
declare function nodeSetOutEdge<T extends DGraphNode>(node: T, char: string, next?: T): void;
|
|
13
|
+
declare function compareNodeOut(a: [string, unknown], b: [string, unknown]): number;
|
|
14
|
+
//#endregion
|
|
15
|
+
//#region src/trie.d.ts
|
|
16
|
+
interface TrieNode extends DGraphNode {
|
|
17
|
+
final: boolean;
|
|
18
|
+
inDegree: number;
|
|
19
|
+
}
|
|
20
|
+
declare class TrieAutomaton {
|
|
21
|
+
private _root;
|
|
22
|
+
get root(): TrieNode;
|
|
23
|
+
private _states;
|
|
24
|
+
get states(): TrieNode[];
|
|
25
|
+
private nodePool;
|
|
26
|
+
private register;
|
|
27
|
+
constructor(words?: string[]);
|
|
28
|
+
private init;
|
|
29
|
+
refill(words: string[]): void;
|
|
30
|
+
/**
|
|
31
|
+
* MARK: Internal funcs
|
|
32
|
+
*/
|
|
33
|
+
private addNode;
|
|
34
|
+
private cloneNode;
|
|
35
|
+
private removeFromRegister;
|
|
36
|
+
/**
|
|
37
|
+
* Will register child
|
|
38
|
+
*/
|
|
39
|
+
private replaceOrRegister;
|
|
40
|
+
/**
|
|
41
|
+
* @returns false if the node is not present in states, true otherwise.
|
|
42
|
+
*/
|
|
43
|
+
private deleteNode;
|
|
44
|
+
/**
|
|
45
|
+
* Giving a word, from root node follow transition of each character:
|
|
46
|
+
* - if there is a transition to a child node
|
|
47
|
+
* - if child and all parent nodes have only one in-edge
|
|
48
|
+
* - push it to an array and remove it from merge-able list, if it's in.
|
|
49
|
+
* (to avoid a chain of transitions merging with itself,
|
|
50
|
+
* where there might be a shorter word happening to be the prefix of it
|
|
51
|
+
* and ends with the same char)
|
|
52
|
+
* - if child node or any parent have more than one in-edge
|
|
53
|
+
* - clone child node (excluding all in-edges), separate current transition from old one
|
|
54
|
+
* and reroute to cloned node.
|
|
55
|
+
* - if there isn't a transition for that character
|
|
56
|
+
* - if it's removing, break and do nothing
|
|
57
|
+
* - if adding, create new node
|
|
58
|
+
* After done:
|
|
59
|
+
* - if removing, and didn't early break, mark last node as NOT final
|
|
60
|
+
* - if adding, mark last node as final
|
|
61
|
+
* Then try merging (and retracting for removing)
|
|
62
|
+
*/
|
|
63
|
+
private processWord;
|
|
64
|
+
/**
|
|
65
|
+
* MARK: API
|
|
66
|
+
*/
|
|
67
|
+
addWord(word: string): void;
|
|
68
|
+
removeWord(word: string): void;
|
|
69
|
+
}
|
|
70
|
+
//#endregion
|
|
71
|
+
//#region src/suffix.d.ts
|
|
72
|
+
interface SuffixNode extends DGraphNode {
|
|
73
|
+
id: number;
|
|
74
|
+
len: number;
|
|
75
|
+
link: this | null;
|
|
76
|
+
out: [string, this][];
|
|
77
|
+
}
|
|
78
|
+
/**
|
|
79
|
+
* Reference: https://oi-wiki.org/string/sam/
|
|
80
|
+
*/
|
|
81
|
+
declare class SuffixAutomaton {
|
|
82
|
+
private _root;
|
|
83
|
+
get root(): SuffixNode;
|
|
84
|
+
private _last;
|
|
85
|
+
get last(): SuffixNode;
|
|
86
|
+
private _states;
|
|
87
|
+
get states(): SuffixNode[];
|
|
88
|
+
constructor(text: string);
|
|
89
|
+
private addNode;
|
|
90
|
+
private cloneNode;
|
|
91
|
+
extend(char: string): void;
|
|
92
|
+
getFinals(): Set<number>;
|
|
93
|
+
}
|
|
94
|
+
//#endregion
|
|
95
|
+
//#region src/index.d.ts
|
|
96
|
+
interface Result {
|
|
97
|
+
start: number;
|
|
98
|
+
end: number;
|
|
99
|
+
}
|
|
100
|
+
declare function prepareSearch(words: string[]): {
|
|
101
|
+
findWords: (text: string) => Result[];
|
|
102
|
+
};
|
|
103
|
+
declare function findWords(text: string, trie: TrieAutomaton): Result[];
|
|
104
|
+
declare function refineMatches(result: Result[], inPlace?: boolean): Result[];
|
|
105
|
+
//#endregion
|
|
106
|
+
export { DGraphNode, SuffixAutomaton, SuffixNode, TrieAutomaton, TrieNode, binarySearch, compareNodeOut, findWords, nodeGetNext, nodeSetOutEdge, prepareSearch, refineMatches };
|
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
import SplayTree from "splaytree";
|
|
2
|
+
function binarySearch(arr, target, cmp, start = 0, end = arr.length - 1) {
|
|
3
|
+
while (start <= end) {
|
|
4
|
+
let mid = Math.floor((start + end) / 2);
|
|
5
|
+
let result = cmp(arr[mid], target);
|
|
6
|
+
if (result === 0) return mid;
|
|
7
|
+
else if (result > 0) end = mid - 1;
|
|
8
|
+
else start = mid + 1;
|
|
9
|
+
}
|
|
10
|
+
return start;
|
|
11
|
+
}
|
|
12
|
+
/**
 * Looks up the target of the out-edge of `node` labelled `char`.
 *
 * @param node graph node whose `out` list is kept sorted by `compareNodeOut`
 * @param char edge label to look for
 * @returns the node the edge points to, or `undefined` if there is no such edge
 */
function nodeGetNext(node, char) {
  const edges = node.out;
  const position = binarySearch(edges, [char, void 0], compareNodeOut);
  const edge = edges[position];
  // `position` may be an insertion point, so the label has to be re-checked.
  return edge?.[0] === char ? edge[1] : undefined;
}
|
|
17
|
+
/**
 * Sets, replaces or deletes the out-edge of `node` labelled `char`, keeping
 * `node.out` in the order defined by `compareNodeOut`.
 *
 * @param node graph node to modify
 * @param char edge label
 * @param next new target; when falsy an existing edge is deleted and a
 *             missing edge is left missing
 */
function nodeSetOutEdge(node, char, next) {
  const edges = node.out;
  const position = binarySearch(edges, [char, void 0], compareNodeOut);
  const exists = edges[position]?.[0] === char;
  if (!next) {
    // Deletion request: only meaningful when the edge is present.
    if (exists) {
      edges.splice(position, 1);
    }
    return;
  }
  // Overwrite the existing entry, or insert at the sorted position.
  edges.splice(position, exists ? 1 : 0, [char, next]);
}
|
|
26
|
+
/**
 * Orders two out-edge entries `[label, target]` by their label.
 *
 * Labels are compared by UTF-16 code unit rather than with `localeCompare`:
 * locale collation may report distinct strings (canonically equivalent or
 * ignorable characters) as equal, which would make the binary search over
 * `node.out` land on an entry for a different label. Code-unit comparison
 * returns 0 only for identical labels and is independent of the host locale.
 *
 * @param a first entry; only `a[0]` is read
 * @param b second entry; only `b[0]` is read
 * @returns -1, 0 or 1
 */
function compareNodeOut(a, b) {
  const left = a[0];
  const right = b[0];
  if (left === right) return 0;
  return left < right ? -1 : 1;
}
|
|
29
|
+
/**
 * Suffix automaton built incrementally, one character per `extend` call.
 *
 * Every state is a plain object `{ id, len, link, out }` where `out` is a
 * list of `[char, state]` entries maintained by `nodeSetOutEdge`.
 */
var SuffixAutomaton = class {
  _root;
  /** Initial state of the automaton. */
  get root() {
    return this._root;
  }
  _last;
  /** State created by the most recent `extend` call (the root if none). */
  get last() {
    return this._last;
  }
  _states = [];
  /** All states; a state's `id` is its index in this array. */
  get states() {
    return this._states;
  }
  /**
   * @param text text to index. It is fed to `extend` one UTF-16 code unit at a
   *             time, so that state lengths count the same units as
   *             `text.length` and as `String.prototype.charAt`. Iterating by
   *             code point instead would make lengths disagree with
   *             `text.length` for text containing surrogate pairs.
   */
  constructor(text) {
    this._root = this.addNode();
    this._last = this._root;
    for (let i = 0; i < text.length; ++i) this.extend(text.charAt(i));
  }
  /** Appends a fresh state with `len` 0, no link and no out-edges. */
  addNode() {
    const node = {
      id: this.states.length,
      len: 0,
      link: null,
      out: []
    };
    this.states.push(node);
    return node;
  }
  /** Appends a new state copying `len`, `link` and the edge list of `node`. */
  cloneNode(node) {
    const clone = this.addNode();
    clone.len = node.len;
    clone.link = node.link;
    // Shallow copy: edge entries are replaced, never mutated, by nodeSetOutEdge.
    clone.out = [...node.out];
    return clone;
  }
  /**
   * Extends the automaton by one character.
   * @param char edge label to append
   * @throws Error if a transition that was just observed cannot be read back
   */
  extend(char) {
    const cur = this.addNode();
    cur.len = this._last.len + 1;
    let p = this._last;
    // Walk the suffix links, adding the new transition wherever it is missing.
    while (p && nodeGetNext(p, char) === void 0) {
      nodeSetOutEdge(p, char, cur);
      p = p.link;
    }
    if (!p) {
      // Ran past the root: no state on the link chain had this transition.
      cur.link = this._root;
    } else {
      const q = nodeGetNext(p, char);
      if (q === void 0) throw new Error(`p:${p.id}->${char} is undefined}`);
      if (p.len + 1 === q.len) {
        cur.link = q;
      } else {
        // Split q: the clone takes over the transitions reached via p's chain.
        const clone = this.cloneNode(q);
        clone.len = p.len + 1;
        while (p && nodeGetNext(p, char) === q) {
          nodeSetOutEdge(p, char, clone);
          p = p.link;
        }
        q.link = cur.link = clone;
      }
    }
    this._last = cur;
  }
  /**
   * Collects the ids of the states on the suffix-link chain starting at
   * `last`, stopping before the state whose `link` is null (the root).
   * @returns set of state ids
   */
  getFinals() {
    const ids = /* @__PURE__ */ new Set();
    let node = this._last;
    while (node.link) {
      ids.add(node.id);
      node = node.link;
    }
    return ids;
  }
};
|
|
99
|
+
/**
 * Trie whose equivalent nodes are shared. Nodes are plain objects
 * `{ id, final, inDegree, out }`; `register` is a SplayTree ordered by
 * `compareNode` that is used to find an already registered equivalent of a
 * node.
 */
var TrieAutomaton = class {
  _root;
  /** Root node of the automaton. */
  get root() {
    return this._root;
  }
  _states = [];
  /** Live nodes; a node's `id` is its index in this array. */
  get states() {
    return this._states;
  }
  // Nodes released by deleteNode, reused by addNode.
  nodePool = [];
  register;
  /** @param words optional initial dictionary */
  constructor(words) {
    this.refill(words ?? []);
  }
  /** Drops all nodes and starts over with a fresh root and an empty register. */
  init() {
    this._states.length = 0;
    this.nodePool.length = 0;
    this._root = this.addNode();
    this.register = new SplayTree(compareNode);
  }
  /** Resets the automaton and adds every entry of `words` in order. */
  refill(words) {
    this.init();
    for (let index = 0; index < words.length; ++index) {
      this.addWord(words[index]);
    }
  }
  /** Creates (or recycles from the pool) a blank node and appends it to `states`. */
  addNode() {
    const id = this._states.length;
    let node;
    if (this.nodePool.length > 0) {
      node = this.nodePool.pop();
      node.id = id;
      node.final = false;
      node.inDegree = 0;
      node.out = [];
    } else {
      node = {
        id,
        final: false,
        inDegree: 0,
        out: []
      };
    }
    this._states.push(node);
    return node;
  }
  /**
   * Creates a node with the same `final` flag and out-edges as `node` and no
   * in-edges; every edge target gains one in-degree.
   */
  cloneNode(node) {
    const copy = this.addNode();
    copy.final = node.final;
    copy.inDegree = 0;
    copy.out = node.out.map((edge) => edge.slice());
    for (const edge of copy.out) {
      ++edge[1].inDegree;
    }
    return copy;
  }
  /** Removes `node` from the register. */
  removeFromRegister(node) {
    this.register.remove(node);
  }
  /**
   * Redirects the `char` transition of `state` to a registered equivalent of
   * its current child (deleting the child), or registers the child if no
   * equivalent exists.
   * @returns the node the transition points to afterwards
   * @throws Error if the transition is missing or the child has several in-edges
   */
  replaceOrRegister(state, char) {
    const child = nodeGetNext(state, char);
    if (!child) throw new Error(`State doesn't have a transition of "${char}"`);
    if (child.inDegree > 1) throw new Error(`There are more than one transition to the child by "${char}"`);
    const registered = this.register.find(child);
    if (!registered) {
      this.register.add(child);
      return child;
    }
    const equivalent = registered.key;
    setTrieTransition(state, char, equivalent);
    this.deleteNode(child, false);
    return equivalent;
  }
  /**
   * Releases `node` into the pool and fills its slot in `states` with the
   * last node, whose `id` is updated accordingly.
   * @param alsoRegister when truthy the node is also removed from the register
   * @returns false if `node` is not the node stored under its id, true otherwise
   */
  deleteNode(node, alsoRegister) {
    const id = node.id;
    const occupant = this._states[id];
    if (!occupant || occupant !== node) return false;
    for (const edge of node.out) {
      --edge[1].inDegree;
    }
    if (alsoRegister) {
      this.removeFromRegister(node);
    }
    this.nodePool.push(node);
    const lastIndex = this.states.length - 1;
    const moved = this._states[lastIndex];
    this._states[id] = moved;
    moved.id = id;
    this._states.length = lastIndex;
    return true;
  }
  /**
   * Walks `word` from the root, one `charAt` unit at a time.
   * Existing children are cloned once any node on the path has more than one
   * in-edge, otherwise they are taken out of the register; missing children
   * are created when adding and stop the walk when removing. Afterwards the
   * last node's `final` flag is updated and the path is re-registered from
   * its end back to the root.
   * @param word word to add or remove
   * @param remove true to remove the word, false to add it
   */
  processWord(word, remove) {
    let cursor = this._root;
    const path = [cursor];
    let mustClone = false;
    let consumed = 0;
    while (consumed < word.length) {
      const char = word.charAt(consumed);
      const child = nodeGetNext(cursor, char);
      if (!child) {
        if (remove) break;
        const created = this.addNode();
        setTrieTransition(cursor, char, created);
        cursor = created;
      } else {
        if (child.inDegree > 1) mustClone = true;
        if (mustClone) {
          const copy = this.cloneNode(child);
          setTrieTransition(cursor, char, copy);
          cursor = copy;
        } else {
          this.removeFromRegister(child);
          cursor = child;
        }
      }
      path.push(cursor);
      ++consumed;
    }
    if (!remove) {
      cursor.final = true;
    } else if (consumed === word.length) {
      // Only clear the flag when the whole word was found.
      cursor.final = false;
    }
    for (let depth = path.length - 2; depth >= 0; --depth) {
      this.replaceOrRegister(path[depth], word.charAt(depth));
    }
  }
  /** Adds `word` to the dictionary. */
  addWord(word) {
    this.processWord(word, false);
  }
  /** Removes `word` from the dictionary. */
  removeWord(word) {
    this.processWord(word, true);
  }
};
|
|
220
|
+
function compareNode(a, b) {
|
|
221
|
+
if (a === b) return 0;
|
|
222
|
+
let diff = (a.final ? 1 : 0) - (b.final ? 1 : 0);
|
|
223
|
+
if (diff !== 0) return diff;
|
|
224
|
+
diff = a.out.length - b.out.length;
|
|
225
|
+
if (diff !== 0) return diff;
|
|
226
|
+
for (let i = 0; i < a.out.length; ++i) {
|
|
227
|
+
const [charA, nodeA] = a.out[i];
|
|
228
|
+
const [charB, nodeB] = b.out[i];
|
|
229
|
+
diff = charA.localeCompare(charB);
|
|
230
|
+
if (diff !== 0) return diff;
|
|
231
|
+
return compareNode(nodeA, nodeB);
|
|
232
|
+
}
|
|
233
|
+
return 0;
|
|
234
|
+
}
|
|
235
|
+
/**
 * Sets, replaces or deletes the `char` transition of a trie node and keeps
 * the `inDegree` counters of the old and new targets in sync.
 *
 * @param node trie node to modify
 * @param char edge label
 * @param next new target; when falsy an existing transition is removed
 */
function setTrieTransition(node, char, next) {
  const edges = node.out;
  const position = binarySearch(edges, [char, void 0], compareNodeOut);
  const exists = edges[position]?.[0] === char;
  const previous = exists ? edges[position]?.[1] : void 0;
  // Nothing to do when the transition already points where it should.
  if (previous === next) return;
  if (exists) {
    if (next) {
      edges.splice(position, 1, [char, next]);
    } else {
      edges.splice(position, 1);
    }
  } else if (next) {
    edges.splice(position, 0, [char, next]);
  }
  if (previous) --previous.inDegree;
  if (next) ++next.inDegree;
}
|
|
246
|
+
/**
 * Builds a `TrieAutomaton` from `words` once and returns a search function
 * bound to it.
 *
 * @param words dictionary of words to search for
 * @returns an object whose `findWords(text)` calls the module-level
 *          `findWords` with the prepared trie
 */
function prepareSearch(words) {
  const trie = new TrieAutomaton(words);
  return {
    findWords: function _findWords(text) {
      return findWords(text, trie);
    }
  };
}
|
|
253
|
+
/**
 * Finds occurrences of the trie's words in `text`.
 *
 * A suffix automaton is built for `text` and walked from its root together
 * with the trie. Each pending step records how many characters have been
 * followed (`matchingLength`) and the length of the last trie match on that
 * path (`matchedLength`). Whenever a step sits on a state reported by
 * `getFinals()` and has a match, a result is emitted whose offsets are
 * computed relative to `text.length`.
 *
 * @param text text to search in
 * @param trie prepared `TrieAutomaton`
 * @returns list of `{ start, end }` ranges, in traversal order (not sorted)
 */
function findWords(text, trie) {
  const automaton = new SuffixAutomaton(text);
  const suffixStates = automaton.getFinals();
  const matches = [];
  // Used as a stack: the most recently pushed step is processed first.
  const pending = [{
    matchedLength: 0,
    matchingLength: 0,
    final: false,
    samNode: automaton.root,
    trieNode: trie.root
  }];
  const enqueue = (from, {
    matched = false,
    matching = true,
    final = false,
    nextSAM = null,
    nextTrie = null
  } = {}) => {
    const advanced = from.matchingLength + 1;
    pending.push({
      matchedLength: matched ? advanced : from.matchedLength,
      matchingLength: matching ? advanced : from.matchingLength,
      final,
      samNode: nextSAM,
      trieNode: nextTrie
    });
  };
  while (pending.length > 0) {
    const current = pending.pop();
    if (suffixStates.has(current.samNode.id) && current.matchedLength > 0) {
      const start = text.length - current.matchingLength;
      matches.push({
        start,
        end: start + current.matchedLength
      });
    }
    for (const [char, nextSAM] of current.samNode.out) {
      if (!current.trieNode) {
        // The trie walk already ended: just keep following the automaton.
        enqueue(current, { nextSAM });
        continue;
      }
      const nextTrie = nodeGetNext(current.trieNode, char);
      if (nextTrie) {
        if (nextTrie.final) {
          enqueue(current, { matched: true, nextTrie, nextSAM });
        } else {
          enqueue(current, { nextTrie, nextSAM });
        }
      } else if (current.matchedLength !== 0) {
        // Trie walk fails here, but an earlier match on this path is kept.
        enqueue(current, { nextTrie: null, nextSAM });
      }
    }
  }
  return matches;
}
|
|
309
|
+
/**
 * Sorts matches and removes overlapping ones.
 *
 * A match that starts earlier wins over any later match that begins before
 * it ends; among matches with the same start the longer one wins. Matches
 * that merely touch (`next.start === previous.end`) are both kept.
 *
 * Overlaps are dropped with a single linear compaction pass instead of
 * repeated `splice` calls, which were quadratic for long result lists.
 *
 * @param result list of `{ start, end }` ranges
 * @param inPlace when true (default) `result` itself is sorted, truncated and
 *                returned; when false a shallow copy is processed instead
 * @returns the refined list, ordered by ascending `start`
 */
function refineMatches(result, inPlace = true) {
  const matches = inPlace ? result : result.slice();
  if (matches.length <= 1) return matches;
  matches.sort((a, b) => {
    if (a.start !== b.start) return b.start - a.start;
    return a.end - b.end;
  });
  // After reversing: ascending start, and for equal starts the longest first.
  matches.reverse();
  let kept = 0;
  for (let i = 1; i < matches.length; ++i) {
    if (matches[i].start >= matches[kept].end) {
      matches[++kept] = matches[i];
    }
  }
  matches.length = kept + 1;
  return matches;
}
|
|
323
|
+
export { SuffixAutomaton, TrieAutomaton, binarySearch, compareNodeOut, findWords, nodeGetNext, nodeSetOutEdge, prepareSearch, refineMatches };
|
package/package.json
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "dawg-search",
|
|
3
|
+
"type": "module",
|
|
4
|
+
"version": "1.0.0",
|
|
5
|
+
"description": "search multiple word in text, by leveraging two types of automaton",
|
|
6
|
+
"author": "UnluckyNinja <unluckyninja4ever@gmail.com>",
|
|
7
|
+
"license": "MIT",
|
|
8
|
+
"homepage": "https://github.com/UnluckyNinja/dawg-search#readme",
|
|
9
|
+
"repository": {
|
|
10
|
+
"type": "git",
|
|
11
|
+
"url": "git+https://github.com/UnluckyNinja/dawg-search.git"
|
|
12
|
+
},
|
|
13
|
+
"bugs": {
|
|
14
|
+
"url": "https://github.com/UnluckyNinja/dawg-search/issues"
|
|
15
|
+
},
|
|
16
|
+
"exports": {
|
|
17
|
+
".": "./src/index.ts",
|
|
18
|
+
"./*": "./*"
|
|
19
|
+
},
|
|
20
|
+
"types": "./dist/index.d.mts",
|
|
21
|
+
"files": [
|
|
22
|
+
"dist"
|
|
23
|
+
],
|
|
24
|
+
"scripts": {
|
|
25
|
+
"build": "tsdown",
|
|
26
|
+
"dev": "tsdown --watch --env.DEV=true",
|
|
27
|
+
"dev:playground": "pnpm -r --include-workspace-root --parallel run dev",
|
|
28
|
+
"test": "vitest",
|
|
29
|
+
"typecheck": "tsc --noEmit",
|
|
30
|
+
"release": "bumpp",
|
|
31
|
+
"prepublishOnly": "pnpm run build"
|
|
32
|
+
},
|
|
33
|
+
"devDependencies": {
|
|
34
|
+
"@types/node": "^25.2.0",
|
|
35
|
+
"bumpp": "^10.4.1",
|
|
36
|
+
"tsdown": "^0.20.3",
|
|
37
|
+
"typescript": "^5.9.3",
|
|
38
|
+
"vitest": "^4.0.18"
|
|
39
|
+
},
|
|
40
|
+
"dependencies": {
|
|
41
|
+
"splaytree": "^3.2.3"
|
|
42
|
+
},
|
|
43
|
+
"pnpm": {
|
|
44
|
+
"overrides": {
|
|
45
|
+
"vite": "npm:rolldown-vite@^7.3.1"
|
|
46
|
+
}
|
|
47
|
+
},
|
|
48
|
+
"publishConfig": {
|
|
49
|
+
"exports": {
|
|
50
|
+
".": "./dist/index.mjs",
|
|
51
|
+
"./*": "./*"
|
|
52
|
+
},
|
|
53
|
+
"registry": "https://registry.npmjs.org"
|
|
54
|
+
}
|
|
55
|
+
}
|