@stll/text-search 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +25 -0
- package/README.md +183 -0
- package/package.json +51 -0
- package/src/index.ts +6 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 stella labs, s.r.o., a limited liability company
|
|
4
|
+
established under the laws of the Czech Republic, with its registered
|
|
5
|
+
office at Nad Porubkou 2355, Poruba, 708 00 Ostrava, Czech Republic,
|
|
6
|
+
Company ID (IČO): 24632872, registered in the Commercial Register
|
|
7
|
+
maintained by the Regional Court in Ostrava, Section C, File No. 103233.
|
|
8
|
+
|
|
9
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
10
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
11
|
+
in the Software without restriction, including without limitation the rights
|
|
12
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
13
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
14
|
+
furnished to do so, subject to the following conditions:
|
|
15
|
+
|
|
16
|
+
The above copyright notice and this permission notice shall be included in all
|
|
17
|
+
copies or substantial portions of the Software.
|
|
18
|
+
|
|
19
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
20
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
21
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
22
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
23
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
24
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
25
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src=".github/assets/banner.png" alt="Stella" width="100%" />
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
# @stll/text-search
|
|
6
|
+
|
|
7
|
+
Multi-engine text search orchestrator for
|
|
8
|
+
Node.js and Bun. Routes patterns to the optimal
|
|
9
|
+
engine automatically: Aho-Corasick for literals,
|
|
10
|
+
RegexSet for regex, FuzzySearch for approximate
|
|
11
|
+
matching, with auto-optimization for large
|
|
12
|
+
alternations.
|
|
13
|
+
|
|
14
|
+
Part of the
|
|
15
|
+
[@stll text search ecosystem](https://github.com/stella):
|
|
16
|
+
[@stll/regex-set](https://github.com/stella/regex-set),
|
|
17
|
+
[@stll/aho-corasick](https://github.com/stella/aho-corasick),
|
|
18
|
+
[@stll/fuzzy-search](https://github.com/stella/fuzzy-search).
|
|
19
|
+
|
|
20
|
+
## Install
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
npm install @stll/text-search
|
|
24
|
+
# or
|
|
25
|
+
bun add @stll/text-search
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Requires `@stll/regex-set`, `@stll/aho-corasick`,
|
|
29
|
+
and `@stll/fuzzy-search` as regular dependencies
|
|
30
|
+
(installed automatically).
|
|
31
|
+
|
|
32
|
+
## Usage
|
|
33
|
+
|
|
34
|
+
```typescript
|
|
35
|
+
import { TextSearch } from "@stll/text-search";
|
|
36
|
+
|
|
37
|
+
const ts = new TextSearch([
|
|
38
|
+
// Regex patterns → RegexSet (DFA)
|
|
39
|
+
/\b\d{2}\.\d{2}\.\d{4}\b/,
|
|
40
|
+
/\b[\w.+-]+@[\w-]+\.[\w]+\b/,
|
|
41
|
+
|
|
42
|
+
// Pure literals → Aho-Corasick (SIMD)
|
|
43
|
+
"Confidential",
|
|
44
|
+
"Attorney-Client Privilege",
|
|
45
|
+
|
|
46
|
+
// Fuzzy patterns → FuzzySearch (Levenshtein)
|
|
47
|
+
{ pattern: "Novák", distance: 1, name: "person" },
|
|
48
|
+
|
|
49
|
+
// Large alternation → auto-isolated RegexSet
|
|
50
|
+
`(?:${titles.join("|")})\\s+[A-Z][a-z]+`,
|
|
51
|
+
|
|
52
|
+
// Named patterns
|
|
53
|
+
{ pattern: /\+?\d{9,12}/, name: "phone" },
|
|
54
|
+
]);
|
|
55
|
+
|
|
56
|
+
ts.findIter("Ing. Jan Novak, born 15.03.1990");
|
|
57
|
+
// [
|
|
58
|
+
// { pattern: 5, text: "Ing. Jan", ... },
|
|
59
|
+
// { pattern: 4, text: "Novak", distance: 1, ... },
|
|
60
|
+
// { pattern: 0, text: "15.03.1990", ... },
|
|
61
|
+
// ]
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Engine routing
|
|
65
|
+
|
|
66
|
+
Patterns are classified and routed to the optimal
|
|
67
|
+
engine at construction time:
|
|
68
|
+
|
|
69
|
+
| Engine | Condition | Performance |
|
|
70
|
+
| --- | --- | --- |
|
|
71
|
+
| Aho-Corasick | Pure literal strings | SIMD-accelerated |
|
|
72
|
+
| RegexSet (shared) | Normal regex patterns | Single-pass DFA |
|
|
73
|
+
| RegexSet (isolated) | >50 alternation branches | Prevents DFA explosion |
|
|
74
|
+
| FuzzySearch | `distance` field present | Levenshtein/Damerau |
|
|
75
|
+
|
|
76
|
+
Large alternation patterns (e.g., 80+ title
|
|
77
|
+
prefixes) are automatically isolated into their
|
|
78
|
+
own RegexSet instance, preventing DFA state
|
|
79
|
+
explosion when combined with other patterns.
|
|
80
|
+
|
|
81
|
+
```typescript
|
|
82
|
+
// Without text-search: 73ms (DFA state explosion)
|
|
83
|
+
new RegexSet([hugePattern, simplePattern]);
|
|
84
|
+
|
|
85
|
+
// With text-search: 0.4ms (auto-split)
|
|
86
|
+
new TextSearch([hugePattern, simplePattern]);
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Options
|
|
90
|
+
|
|
91
|
+
```typescript
|
|
92
|
+
new TextSearch(patterns, {
|
|
93
|
+
// Unicode word boundaries (default: true)
|
|
94
|
+
unicodeBoundaries: true,
|
|
95
|
+
|
|
96
|
+
// Only match whole words (default: false)
|
|
97
|
+
wholeWords: false,
|
|
98
|
+
|
|
99
|
+
// Max alternation branches before auto-split
|
|
100
|
+
// (default: 50)
|
|
101
|
+
maxAlternations: 50,
|
|
102
|
+
|
|
103
|
+
// Fuzzy matching options
|
|
104
|
+
fuzzyMetric: "levenshtein", // or "damerau-levenshtein"
|
|
105
|
+
normalizeDiacritics: false,
|
|
106
|
+
caseInsensitive: false,
|
|
107
|
+
});
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## API
|
|
111
|
+
|
|
112
|
+
| Method | Returns | Description |
|
|
113
|
+
| --- | --- | --- |
|
|
114
|
+
| `findIter(text)` | `Match[]` | All non-overlapping matches |
|
|
115
|
+
| `isMatch(text)` | `boolean` | Any pattern matches? |
|
|
116
|
+
| `whichMatch(text)` | `number[]` | Which pattern indices matched |
|
|
117
|
+
| `replaceAll(text, replacements)` | `string` | Replace matches |
|
|
118
|
+
| `length` | `number` | Number of patterns |
|
|
119
|
+
|
|
120
|
+
## Pattern entry types
|
|
121
|
+
|
|
122
|
+
```typescript
|
|
123
|
+
// Simple string (literal → AC, regex → RegexSet)
|
|
124
|
+
"foo"
|
|
125
|
+
|
|
126
|
+
// RegExp object → RegexSet
|
|
127
|
+
/\btest\b/i
|
|
128
|
+
|
|
129
|
+
// Named pattern
|
|
130
|
+
{ pattern: "\\d+", name: "number" }
|
|
131
|
+
|
|
132
|
+
// Fuzzy pattern → FuzzySearch
|
|
133
|
+
{ pattern: "Novák", distance: 1 }
|
|
134
|
+
{ pattern: "Smith", distance: "auto", name: "person" }
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Match type
|
|
138
|
+
|
|
139
|
+
```typescript
|
|
140
|
+
type Match = {
|
|
141
|
+
pattern: number; // original pattern index
|
|
142
|
+
start: number; // UTF-16 offset
|
|
143
|
+
end: number; // exclusive
|
|
144
|
+
text: string; // matched substring
|
|
145
|
+
name?: string; // pattern name (if provided)
|
|
146
|
+
};
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Same `Match` shape as `@stll/regex-set`,
|
|
150
|
+
`@stll/aho-corasick`, and `@stll/fuzzy-search`.
|
|
151
|
+
|
|
152
|
+
## How it works
|
|
153
|
+
|
|
154
|
+
1. **Classify**: detect literals, count alternation
|
|
155
|
+
branches, identify fuzzy patterns
|
|
156
|
+
2. **Route**: literals → AC, fuzzy → FuzzySearch,
|
|
157
|
+
large alternations → isolated RegexSet,
|
|
158
|
+
normal regex → shared RegexSet
|
|
159
|
+
3. **Search**: each engine scans the text
|
|
160
|
+
4. **Merge**: combine results, sort by position,
|
|
161
|
+
select non-overlapping (longest match at ties)
|
|
162
|
+
|
|
163
|
+
## Development
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
bun install
|
|
167
|
+
bun test
|
|
168
|
+
bun run lint
|
|
169
|
+
bun run format
|
|
170
|
+
bun run build
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Built on
|
|
174
|
+
- [@stll/regex-set](https://github.com/stella/regex-set) —
|
|
175
|
+
NAPI-RS bindings to Rust regex-automata
|
|
176
|
+
- [@stll/aho-corasick](https://github.com/stella/aho-corasick) —
|
|
177
|
+
NAPI-RS bindings to Rust aho-corasick
|
|
178
|
+
- [@stll/fuzzy-search](https://github.com/stella/fuzzy-search) —
|
|
179
|
+
NAPI-RS Levenshtein/Damerau-Levenshtein matcher
|
|
180
|
+
|
|
181
|
+
## License
|
|
182
|
+
|
|
183
|
+
[MIT](./LICENSE)
|
package/package.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@stll/text-search",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Multi-engine text search orchestrator. Routes patterns to optimal engines: Aho-Corasick, RegexSet, or FuzzySearch.",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"text-search",
|
|
7
|
+
"multi-pattern",
|
|
8
|
+
"aho-corasick",
|
|
9
|
+
"regex",
|
|
10
|
+
"fuzzy",
|
|
11
|
+
"orchestrator"
|
|
12
|
+
],
|
|
13
|
+
"homepage": "https://github.com/stella/text-search#readme",
|
|
14
|
+
"bugs": {
|
|
15
|
+
"url": "https://github.com/stella/text-search/issues"
|
|
16
|
+
},
|
|
17
|
+
"license": "MIT",
|
|
18
|
+
"repository": {
|
|
19
|
+
"type": "git",
|
|
20
|
+
"url": "https://github.com/stella/text-search"
|
|
21
|
+
},
|
|
22
|
+
"type": "module",
|
|
23
|
+
"main": "src/index.ts",
|
|
24
|
+
"module": "src/index.ts",
|
|
25
|
+
"exports": {
|
|
26
|
+
".": "./src/index.ts"
|
|
27
|
+
},
|
|
28
|
+
"files": [
|
|
29
|
+
"dist"
|
|
30
|
+
],
|
|
31
|
+
"scripts": {
|
|
32
|
+
"build": "bun build src/index.ts --outdir dist --target node",
|
|
33
|
+
"test": "bun test",
|
|
34
|
+
"lint": "oxlint .",
|
|
35
|
+
"format": "oxfmt ."
|
|
36
|
+
},
|
|
37
|
+
"dependencies": {
|
|
38
|
+
"@stll/aho-corasick": "^0.2.0",
|
|
39
|
+
"@stll/fuzzy-search": "^0.1.0",
|
|
40
|
+
"@stll/regex-set": "^0.4.0"
|
|
41
|
+
},
|
|
42
|
+
"devDependencies": {
|
|
43
|
+
"@types/node": "^22.0.0",
|
|
44
|
+
"bun-types": "^1.3.10",
|
|
45
|
+
"oxfmt": "^0.40.0",
|
|
46
|
+
"oxlint": "^1.55.0"
|
|
47
|
+
},
|
|
48
|
+
"engines": {
|
|
49
|
+
"node": ">= 18"
|
|
50
|
+
}
|
|
51
|
+
}
|