@stll/text-search 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index-BIQ38WSF.d.ts +153 -0
- package/dist/index.js +389 -439
- package/dist/index.js.map +1 -0
- package/package.json +6 -6
- package/dist/index.d.ts +0 -150
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
//#region src/types.d.ts
|
|
2
|
+
/**
 * A single match result. Same shape as
 * @stll/regex-set and @stll/aho-corasick.
 *
 * `start`/`end` are usable directly with `String.prototype.slice`
 * on the searched haystack (that is how `replaceAll` consumes them).
 */
type Match = {
  /**
   * Index of the pattern that matched, expressed as a position in
   * the pattern array handed to the `TextSearch` constructor.
   */
  pattern: number;
  /** Start UTF-16 code unit offset. */
  start: number;
  /** End offset (exclusive). */
  end: number;
  /** The matched text. */
  text: string;
  /** Pattern name (if provided). */
  name?: string;
  /** Edit distance (fuzzy matches only). */
  distance?: number;
};
|
|
20
|
+
/**
 * A pattern entry for TextSearch.
 *
 * Accepted forms:
 * - a bare `string` or `RegExp`;
 * - an object wrapping a `string`/`RegExp` with an optional name;
 * - a fuzzy entry (object carrying `distance`);
 * - a forced-literal entry (object carrying `literal: true`).
 */
type PatternEntry = string | RegExp | {
  /** The pattern itself: a string or a RegExp. */
  pattern: string | RegExp;
  /** Optional name, reported back on matches as `Match.name`. */
  name?: string;
} | {
  /** The text to match approximately. */
  pattern: string;
  /** Optional name, reported back on matches as `Match.name`. */
  name?: string;
  /** Fuzzy matching distance. Routes to
   * @stll/fuzzy-search instead of regex. */
  distance: number | "auto";
} | {
  /** The literal text to match. */
  pattern: string;
  /** Optional name, reported back on matches as `Match.name`. */
  name?: string;
  /** Force literal matching via Aho-Corasick.
   * Skips regex metacharacter detection so
   * patterns like "č.p." or "s.r.o." are
   * matched literally, not as regex. */
  literal: true;
  /** Per-pattern case-insensitive for AC.
   * Overrides the global option for this
   * pattern only. */
  caseInsensitive?: boolean;
  /** Per-pattern whole-word matching for AC.
   * When omitted, the global `wholeWords` option applies. */
  wholeWords?: boolean;
};
|
|
45
|
+
/**
 * Options for TextSearch. Every field is optional; the documented
 * defaults are applied by the constructor.
 */
type TextSearchOptions = {
  /**
   * Use Unicode word boundaries.
   * @default true
   */
  unicodeBoundaries?: boolean;
  /**
   * Only match whole words.
   * @default false
   */
  wholeWords?: boolean;
  /**
   * Max alternation branches before auto-splitting
   * into a separate engine instance. Prevents DFA
   * state explosion when large-alternation patterns
   * are combined with other patterns.
   *
   * The comparison is strict: a pattern is isolated only when its
   * branch count is greater than this value.
   * @default 50
   */
  maxAlternations?: number;
  /**
   * Fuzzy matching metric.
   * @default "levenshtein"
   */
  fuzzyMetric?: "levenshtein" | "damerau-levenshtein";
  /**
   * Normalize diacritics for fuzzy matching.
   * @default false
   */
  normalizeDiacritics?: boolean;
  /**
   * Case-insensitive matching for AC literals
   * and fuzzy patterns.
   *
   * NOTE(review): the bundled constructor also forwards this flag
   * in the options object given to RegexSet; confirm whether regex
   * patterns honour it.
   * @default false
   */
  caseInsensitive?: boolean;
  /**
   * How to handle overlapping matches from
   * different engines or patterns.
   *
   * - "longest": keep longest non-overlapping match
   *   at each position (default).
   * - "all": return all matches including overlaps.
   *   Useful when the caller applies its own dedup.
   *
   * NOTE(review): in the bundled implementation this option is only
   * consulted by `findIter` when more than one engine was built; with
   * a single engine that engine's own result is returned as-is.
   * `replaceAll` always uses the non-overlapping selection.
   *
   * @default "longest"
   */
  overlapStrategy?: "longest" | "all";
  /**
   * Treat ALL string patterns as literals (route
   * to AC, skip metacharacter detection). Useful
   * for deny-list patterns where "s.r.o." means
   * the literal string, not a regex with wildcards.
   * @default false
   */
  allLiteral?: boolean;
};
|
|
102
|
+
//#endregion
|
|
103
|
+
//#region src/text-search.d.ts
|
|
104
|
+
/**
 * Multi-engine text search orchestrator.
 *
 * Routes patterns to the optimal engine
 * configuration:
 * - Fuzzy entries (those carrying `distance`) go to
 *   a FuzzySearch engine
 * - Literal patterns go to Aho-Corasick engines
 * - Large alternation patterns get their own
 *   RegexSet instance (prevents DFA state explosion)
 * - Normal patterns share a single RegexSet
 *   (single-pass multi-pattern DFA)
 *
 * Merges results from all engines into a unified
 * non-overlapping Match[] sorted by position.
 */
declare class TextSearch {
  /** Engines built by the constructor, in the order they are queried. */
  private engines;
  /** Number of pattern entries passed to the constructor. */
  private patternCount;
  /** True when `overlapStrategy` is "all". */
  private overlapAll;
  /**
   * Fast-path flag for `findIter`. The implementation sets it when
   * exactly one engine was built and none of that engine's patterns
   * has a name; `findIter` then returns the engine's output directly,
   * without remapping or allocating new match objects.
   */
  private zeroOverhead;
  /**
   * @param patterns Pattern entries; a match's `pattern` field refers
   *   to positions in this array.
   * @param options Optional tuning, see `TextSearchOptions`.
   */
  constructor(patterns: PatternEntry[], options?: TextSearchOptions);
  /** Number of patterns. */
  get length(): number;
  /** Returns true if any pattern matches. */
  isMatch(haystack: string): boolean;
  /**
   * Find matches in text.
   *
   * With `overlapStrategy: "longest"` (default):
   * returns non-overlapping matches, longest wins.
   *
   * With `overlapStrategy: "all"`: returns all
   * matches including overlaps, sorted by position.
   */
  findIter(haystack: string): Match[];
  /**
   * Which pattern indices matched (not where).
   * Each index appears once, in first-seen order (not sorted).
   */
  whichMatch(haystack: string): number[];
  /**
   * Replace all non-overlapping matches.
   * replacements[i] replaces pattern i.
   *
   * @throws Error when `replacements.length` differs from the
   *   number of patterns.
   */
  replaceAll(haystack: string, replacements: string[]): string;
}
|
|
151
|
+
//#endregion
|
|
152
|
+
export { type Match, type PatternEntry, TextSearch, type TextSearchOptions };
|
|
153
|
+
//# sourceMappingURL=index-BIQ38WSF.d.ts.map
|
package/dist/index.js
CHANGED
|
@@ -1,464 +1,414 @@
|
|
|
1
|
-
// src/text-search.ts
|
|
2
1
|
import { AhoCorasick } from "@stll/aho-corasick";
|
|
3
2
|
import { FuzzySearch } from "@stll/fuzzy-search";
|
|
4
3
|
import { RegexSet } from "@stll/regex-set";
|
|
5
|
-
|
|
6
|
-
|
|
4
|
+
//#region src/classify.ts
|
|
5
|
+
/**
 * Tell whether `pattern` is a plain literal: a non-empty string
 * containing none of the regex metacharacters
 * `\ . ^ $ * + ? { } ( ) [ ] |`.
 *
 * Plain literals are routed to Aho-Corasick rather than the
 * regex DFA. The empty string is never considered a literal.
 *
 * @param pattern The candidate pattern string.
 * @returns `true` for a non-empty, metacharacter-free string.
 */
function isLiteralPattern(pattern) {
  if (pattern.length === 0) return false;
  const metacharacters = "\\.^$*+?{}()[]|";
  for (const symbol of pattern) {
    if (metacharacters.includes(symbol)) return false;
  }
  return true;
}
|
|
17
|
+
/**
 * Count the maximum alternation branches at any
 * depth in a regex string. Used to detect patterns
 * with large alternations (even nested inside
 * groups) that should be isolated into their own
 * RegexSet to prevent DFA state explosion.
 *
 * "a|b|c" → 3
 * "(a|b)|c" → 2 (max of top=2, depth1=2)
 * "(?:Ing\\.|Mgr\\.|Dr\\.)" → 3 (depth 1)
 *
 * Escaped characters (`\x`) and everything between `[` and `]`
 * are ignored, so `a\|b` and `[a|b]` both count as 1.
 *
 * @param pattern Regex source text.
 * @returns The largest branch count of any group (or of the top
 *   level); at least 1.
 */
function countAlternations(pattern) {
  let inClass = false;
  // Largest branch count of any group closed so far.
  let max = 1;
  // Branch count of the group currently being scanned.
  let currentCount = 1;
  // Branch counts of the enclosing groups, innermost last.
  const stack = [];
  for (let i = 0; i < pattern.length; i++) {
    const ch = pattern[i];
    if (ch === "\\" && i + 1 < pattern.length) {
      // Skip the escaped character; it can never be structural.
      i++;
      continue;
    }
    if (ch === "[") inClass = true;
    else if (ch === "]") inClass = false;
    if (inClass) continue;
    if (ch === "(") {
      stack.push(currentCount);
      currentCount = 1;
    } else if (ch === ")") {
      if (currentCount > max) max = currentCount;
      // An unbalanced ")" has nothing to restore; fall back to 1.
      currentCount = stack.pop() ?? 1;
    } else if (ch === "|") {
      currentCount++;
    }
  }
  // Account for the top level (or a group left unclosed).
  return currentCount > max ? currentCount : max;
}
|
|
61
|
+
/**
 * Normalize a string/RegExp pattern (bare or wrapped in an object)
 * into a classified record.
 *
 * `allLiteral` only affects string patterns: a RegExp is never a
 * literal and always has its alternations counted, whether it was
 * passed bare or wrapped in `{ pattern }`.
 */
function classifyPlainPattern(pattern, originalIndex, name, allLiteral) {
  const isString = typeof pattern === "string";
  const source = pattern instanceof RegExp ? pattern.source : pattern;
  return {
    originalIndex,
    pattern,
    alternationCount: isString && allLiteral ? 0 : countAlternations(source),
    isLiteral: isString && (allLiteral || isLiteralPattern(pattern)),
    ...(name !== void 0 ? { name } : {})
  };
}
/**
 * Classify and normalize pattern entries.
 *
 * Each entry becomes a record with `originalIndex`, `pattern`,
 * `alternationCount`, `isLiteral` and, where applicable, `name`,
 * `fuzzyDistance` (fuzzy entries) and `acOptions` (forced-literal
 * entries with per-pattern options).
 *
 * An entry is treated as fuzzy only when `distance` holds a value;
 * an explicit `distance: undefined` falls through to the regular
 * literal/regex classification. Likewise, `acOptions` is attached
 * only when at least one per-pattern option has a value.
 *
 * @param entries Pattern entries as passed to TextSearch.
 * @param allLiteral Treat every string pattern as a literal.
 * @returns One classified record per entry, in input order.
 */
function classifyPatterns(entries, allLiteral = false) {
  return entries.map((entry, originalIndex) => {
    if (typeof entry === "string" || entry instanceof RegExp) {
      return classifyPlainPattern(entry, originalIndex, void 0, allLiteral);
    }
    const named = entry.name !== void 0 ? { name: entry.name } : {};
    if (entry.distance !== void 0) {
      return {
        originalIndex,
        pattern: entry.pattern,
        alternationCount: 0,
        isLiteral: false,
        fuzzyDistance: entry.distance,
        ...named
      };
    }
    if (entry.literal) {
      const acOptions = {
        ...(entry.caseInsensitive !== void 0 ? { caseInsensitive: entry.caseInsensitive } : {}),
        ...(entry.wholeWords !== void 0 ? { wholeWords: entry.wholeWords } : {})
      };
      return {
        originalIndex,
        pattern: entry.pattern,
        alternationCount: 0,
        isLiteral: true,
        ...named,
        ...(Object.keys(acOptions).length > 0 ? { acOptions } : {})
      };
    }
    return classifyPlainPattern(entry.pattern, originalIndex, entry.name, allLiteral);
  });
}
|
|
114
|
-
|
|
115
|
-
|
|
118
|
+
//#endregion
|
|
119
|
+
//#region src/merge.ts
|
|
120
|
+
/**
|
|
121
|
+
* Merge matches from multiple engines, sort by
|
|
122
|
+
* position, and select non-overlapping (longest
|
|
123
|
+
* first at ties). Same algorithm as regex-set's
|
|
124
|
+
* internal select_non_overlapping.
|
|
125
|
+
*/
|
|
116
126
|
function mergeAndSelect(matches) {
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
lastEnd = m.end;
|
|
130
|
-
}
|
|
131
|
-
}
|
|
132
|
-
return selected;
|
|
127
|
+
if (matches.length <= 1) return matches;
|
|
128
|
+
matches.sort((a, b) => {
|
|
129
|
+
if (a.start !== b.start) return a.start - b.start;
|
|
130
|
+
return b.end - b.start - (a.end - a.start);
|
|
131
|
+
});
|
|
132
|
+
const selected = [];
|
|
133
|
+
let lastEnd = 0;
|
|
134
|
+
for (const m of matches) if (m.start >= lastEnd) {
|
|
135
|
+
selected.push(m);
|
|
136
|
+
lastEnd = m.end;
|
|
137
|
+
}
|
|
138
|
+
return selected;
|
|
133
139
|
}
|
|
134
|
-
|
|
135
|
-
|
|
140
|
+
//#endregion
|
|
141
|
+
//#region src/text-search.ts
|
|
142
|
+
/**
 * Multi-engine text search orchestrator.
 *
 * Routes patterns to the optimal engine
 * configuration:
 * - Fuzzy entries go to a FuzzySearch engine
 * - Literal patterns go to Aho-Corasick engines,
 *   grouped by their effective case/whole-word options
 * - Large alternation patterns get their own
 *   RegexSet instance (prevents DFA state explosion)
 * - Normal patterns share a single RegexSet
 *   (single-pass multi-pattern DFA), unless the
 *   construction-time probe favours one engine per pattern
 *
 * Merges results from all engines into a unified
 * non-overlapping Match[] sorted by position.
 */
var TextSearch = class {
  /** Engines in query order: fuzzy, AC groups, shared regex, isolated regex. */
  engines = [];
  /** Number of entries in the constructor's pattern array. */
  patternCount;
  /** True when `options.overlapStrategy === "all"`. */
  overlapAll;
  /**
   * Fast-path flag for findIter. Set when exactly one engine was
   * built and none of its patterns carries a name; findIter then
   * returns the engine's output without remapping or object
   * allocation.
   */
  zeroOverhead = false;
  /**
   * @param patterns Pattern entries to search for.
   * @param options Optional TextSearchOptions.
   */
  constructor(patterns, options) {
    this.patternCount = patterns.length;
    this.overlapAll = options?.overlapStrategy === "all";
    const maxAlt = options?.maxAlternations ?? 50;
    const classified = classifyPatterns(patterns, options?.allLiteral ?? false);
    const fuzzy = [];
    const literals = [];
    const shared = [];
    const isolated = [];
    // Routing priority: fuzzy > literal > oversized alternation > shared.
    for (const cp of classified) if (cp.fuzzyDistance !== void 0) fuzzy.push(cp);
    else if (cp.isLiteral) literals.push(cp);
    else if (cp.alternationCount > maxAlt) isolated.push(cp);
    else shared.push(cp);
    // Global options with their defaults applied.
    const rsOptions = {
      unicodeBoundaries: options?.unicodeBoundaries ?? true,
      wholeWords: options?.wholeWords ?? false,
      caseInsensitive: options?.caseInsensitive ?? false
    };
    if (fuzzy.length > 0) {
      const fuzzyOpts = {
        unicodeBoundaries: rsOptions.unicodeBoundaries,
        wholeWords: rsOptions.wholeWords
      };
      // Forward fuzzy-specific options only when the caller set them.
      if (options?.fuzzyMetric !== void 0) fuzzyOpts.metric = options.fuzzyMetric;
      if (options?.normalizeDiacritics !== void 0) fuzzyOpts.normalizeDiacritics = options.normalizeDiacritics;
      if (options?.caseInsensitive !== void 0) fuzzyOpts.caseInsensitive = options.caseInsensitive;
      this.engines.push(buildFuzzyEngine(fuzzy, fuzzyOpts));
    }
    if (literals.length > 0) {
      // One AC engine per distinct (caseInsensitive, wholeWords) pair;
      // per-pattern options take precedence over the global ones.
      const groups = /* @__PURE__ */ new Map();
      for (const cp of literals) {
        const ci = cp.acOptions?.caseInsensitive ?? rsOptions.caseInsensitive;
        const ww = cp.acOptions?.wholeWords ?? rsOptions.wholeWords;
        // Key encodes the pair as "0:1", "1:0", ...
        const key = `${ci ? 1 : 0}:${ww ? 1 : 0}`;
        const group = groups.get(key);
        if (group) group.push(cp);
        else groups.set(key, [cp]);
      }
      for (const [key, group] of groups) {
        // Decode the pair back out of the group key.
        const [ci, ww] = key.split(":");
        this.engines.push(buildAcEngine(group, {
          ...rsOptions,
          caseInsensitive: ci === "1",
          wholeWords: ww === "1"
        }));
      }
    }
    if (shared.length > 1) {
      // Time one combined RegexSet against one RegexSet per pattern on
      // a fixed sample text and keep whichever layout the timing
      // favours. The decision depends on wall-clock measurements, so
      // the resulting engine layout may differ between runs.
      const combined = buildRegexEngine(shared, rsOptions);
      const probe = "Hello World 123 test@example.com 2025-01-01 +420 123 456 789 Ing. Jan Novák, s.r.o. Praha 1 ".repeat(10);
      const t0 = performance.now();
      combined.rs.findIter(probe);
      const combinedMs = performance.now() - t0;
      let individualMs = 0;
      const individualEngines = [];
      for (const cp of shared) {
        const eng = buildRegexEngine([cp], rsOptions);
        const t1 = performance.now();
        eng.rs.findIter(probe);
        individualMs += performance.now() - t1;
        individualEngines.push(eng);
      }
      // Split only when the combined engine is more than 1.5x slower.
      if (combinedMs > individualMs * 1.5) for (const eng of individualEngines) this.engines.push(eng);
      else this.engines.push(combined);
    } else if (shared.length === 1) this.engines.push(buildRegexEngine(shared, rsOptions));
    // Every oversized-alternation pattern gets an engine of its own.
    for (const cp of isolated) this.engines.push(buildRegexEngine([cp], rsOptions));
    if (this.engines.length === 1) {
      // Single engine and no names to attach: raw output can be returned.
      if (!this.engines[0].nameMap.some((n) => n !== void 0)) this.zeroOverhead = true;
    }
  }
  /** Number of patterns. */
  get length() {
    return this.patternCount;
  }
  /** Returns true if any pattern matches. Stops at the first engine that matches. */
  isMatch(haystack) {
    for (const engine of this.engines) if (engineIsMatch(engine, haystack)) return true;
    return false;
  }
  /**
   * Find matches in text.
   *
   * With `overlapStrategy: "longest"` (default):
   * returns non-overlapping matches, longest wins.
   *
   * With `overlapStrategy: "all"`: returns all
   * matches including overlaps, sorted by position.
   *
   * NOTE(review): with a single engine the overlap strategy is not
   * consulted here; the engine's own result is returned (remapped
   * when names are present). Confirm this is intended.
   */
  findIter(haystack) {
    if (this.zeroOverhead) return engineFindIter(this.engines[0], haystack);
    if (this.engines.length === 1) return remapMatches(engineFindIter(this.engines[0], haystack), this.engines[0]);
    const all = [];
    for (const engine of this.engines) {
      const matches = engineFindIter(engine, haystack);
      for (const m of remapMatches(matches, engine)) all.push(m);
    }
    if (this.overlapAll) return all.sort((a, b) => a.start - b.start);
    return mergeAndSelect(all);
  }
  /**
   * Which pattern indices matched (not where).
   * Indices are unique and listed in first-seen order.
   */
  whichMatch(haystack) {
    const seen = /* @__PURE__ */ new Set();
    for (const engine of this.engines) {
      const matches = engineFindIter(engine, haystack);
      // Translate engine-local indices to original pattern indices.
      for (const m of matches) seen.add(engine.indexMap[m.pattern]);
    }
    return [...seen];
  }
  /**
   * Replace all non-overlapping matches.
   * replacements[i] replaces pattern i.
   *
   * Always uses the non-overlapping selection, regardless of the
   * overlap strategy. Throws an Error when the number of
   * replacements differs from the number of patterns.
   */
  replaceAll(haystack, replacements) {
    if (replacements.length !== this.patternCount) throw new Error(`Expected ${this.patternCount} replacements, got ${replacements.length}`);
    const all = [];
    for (const engine of this.engines) {
      const matches = engineFindIter(engine, haystack);
      for (const m of remapMatches(matches, engine)) all.push(m);
    }
    const matches = mergeAndSelect(all);
    let result = "";
    // End offset of the last replaced match; text before it is already emitted.
    let last = 0;
    for (const m of matches) {
      result += haystack.slice(last, m.start);
      result += replacements[m.pattern];
      last = m.end;
    }
    // Copy whatever follows the final match.
    result += haystack.slice(last);
    return result;
  }
};
|
|
297
|
+
/**
 * Wrap a group of classified patterns in a RegexSet engine record.
 *
 * Named patterns are handed to RegexSet as `{ pattern, name }`,
 * unnamed ones as the bare pattern. `indexMap[k]` and `nameMap[k]`
 * give the original index and optional name of the engine's k-th
 * pattern.
 *
 * @param patterns Classified patterns destined for this engine.
 * @param options Options object passed through to RegexSet.
 * @returns `{ type: "regex", rs, indexMap, nameMap }`.
 */
function buildRegexEngine(patterns, options) {
  const rsPatterns = patterns.map((cp) => cp.name !== void 0 ? { pattern: cp.pattern, name: cp.name } : cp.pattern);
  const indexMap = patterns.map((cp) => cp.originalIndex);
  const nameMap = patterns.map((cp) => cp.name);
  const rs = new RegexSet(rsPatterns, options);
  return { type: "regex", rs, indexMap, nameMap };
}
|
|
320
|
+
/**
 * Wrap a group of literal patterns in an Aho-Corasick engine record.
 *
 * Only `wholeWords`, `unicodeBoundaries` and `caseInsensitive` are
 * taken from `options`. `indexMap[k]` and `nameMap[k]` give the
 * original index and optional name of the engine's k-th literal.
 *
 * @param patterns Classified literal patterns for this engine.
 * @param options Effective options for this group of literals.
 * @returns `{ type: "ac", ac, indexMap, nameMap }`.
 */
function buildAcEngine(patterns, options) {
  const literals = patterns.map((cp) => cp.pattern);
  const indexMap = patterns.map((cp) => cp.originalIndex);
  const nameMap = patterns.map((cp) => cp.name);
  const { wholeWords, unicodeBoundaries, caseInsensitive } = options;
  const ac = new AhoCorasick(literals, { wholeWords, unicodeBoundaries, caseInsensitive });
  return { type: "ac", ac, indexMap, nameMap };
}
|
|
343
|
+
/**
 * Wrap a group of fuzzy patterns in a FuzzySearch engine record.
 *
 * Each pattern is passed as `{ pattern, distance?, name? }`.
 * `metric`, `normalizeDiacritics` and `caseInsensitive` are
 * forwarded only when present on `options`. `indexMap[k]` and
 * `nameMap[k]` give the original index and optional name of the
 * engine's k-th pattern.
 *
 * @param patterns Classified fuzzy patterns.
 * @param options Fuzzy engine options.
 * @returns `{ type: "fuzzy", fs, indexMap, nameMap }`.
 */
function buildFuzzyEngine(patterns, options) {
  const fsPatterns = patterns.map((cp) => ({
    pattern: cp.pattern,
    ...(cp.fuzzyDistance !== void 0 ? { distance: cp.fuzzyDistance } : {}),
    ...(cp.name !== void 0 ? { name: cp.name } : {})
  }));
  const indexMap = patterns.map((cp) => cp.originalIndex);
  const nameMap = patterns.map((cp) => cp.name);
  const fsOptions = {
    unicodeBoundaries: options.unicodeBoundaries,
    wholeWords: options.wholeWords,
    ...(options.metric !== void 0 ? { metric: options.metric } : {}),
    ...(options.normalizeDiacritics !== void 0 ? { normalizeDiacritics: options.normalizeDiacritics } : {}),
    ...(options.caseInsensitive !== void 0 ? { caseInsensitive: options.caseInsensitive } : {})
  };
  const fs = new FuzzySearch(fsPatterns, fsOptions);
  return { type: "fuzzy", fs, indexMap, nameMap };
}
|
|
372
|
+
/**
|
|
373
|
+
* Dispatch isMatch to the correct engine.
|
|
374
|
+
*/
|
|
423
375
|
function engineIsMatch(engine, haystack) {
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
case "regex":
|
|
430
|
-
return engine.rs.isMatch(haystack);
|
|
431
|
-
}
|
|
376
|
+
switch (engine.type) {
|
|
377
|
+
case "ac": return engine.ac.isMatch(haystack);
|
|
378
|
+
case "fuzzy": return engine.fs.isMatch(haystack);
|
|
379
|
+
case "regex": return engine.rs.isMatch(haystack);
|
|
380
|
+
}
|
|
432
381
|
}
|
|
382
|
+
/**
|
|
383
|
+
* Dispatch findIter to the correct engine.
|
|
384
|
+
*/
|
|
433
385
|
function engineFindIter(engine, haystack) {
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
case "regex":
|
|
440
|
-
return engine.rs.findIter(haystack);
|
|
441
|
-
}
|
|
386
|
+
switch (engine.type) {
|
|
387
|
+
case "ac": return engine.ac.findIter(haystack);
|
|
388
|
+
case "fuzzy": return engine.fs.findIter(haystack);
|
|
389
|
+
case "regex": return engine.rs.findIter(haystack);
|
|
390
|
+
}
|
|
442
391
|
}
|
|
392
|
+
/**
 * Translate engine-local matches into caller-facing ones.
 *
 * For every match, the engine-local pattern index is replaced by
 * the original input index from `engine.indexMap`, the pattern's
 * name from `engine.nameMap` is attached when it has one, and a
 * defined `distance` is carried over. Input matches are not
 * modified; fresh objects are returned.
 *
 * @param matches Matches as produced by the engine.
 * @param engine Engine record providing `indexMap` and `nameMap`.
 * @returns New match objects, in the same order as `matches`.
 */
function remapMatches(matches, engine) {
  const { indexMap, nameMap } = engine;
  return matches.map((hit) => {
    const label = nameMap[hit.pattern];
    const hasDistance = "distance" in hit && hit.distance !== void 0;
    return {
      pattern: indexMap[hit.pattern],
      start: hit.start,
      end: hit.end,
      text: hit.text,
      ...(label !== void 0 ? { name: label } : {}),
      ...(hasDistance ? { distance: hit.distance } : {})
    };
  });
}
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
411
|
+
//#endregion
|
|
412
|
+
export { TextSearch };
|
|
413
|
+
|
|
414
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","names":[],"sources":["../src/classify.ts","../src/merge.ts","../src/text-search.ts"],"sourcesContent":["import type { PatternEntry } from \"./types\";\n\n/**\n * Normalized pattern with metadata for routing.\n */\nexport type ClassifiedPattern = {\n /** Original index in the input array. */\n originalIndex: number;\n /** The regex-compatible pattern string. */\n pattern: string | RegExp;\n /** Optional name. */\n name?: string;\n /**\n * Number of top-level alternation branches.\n * Used to detect large alternations that should\n * be isolated into their own RegexSet instance.\n */\n alternationCount: number;\n /**\n * True if the pattern is a pure literal string\n * (no regex metacharacters). These can be routed\n * to Aho-Corasick for SIMD-accelerated matching.\n */\n isLiteral: boolean;\n /**\n * Fuzzy distance if this is a fuzzy pattern.\n * Routes to @stll/fuzzy-search.\n */\n fuzzyDistance?: number | \"auto\";\n /**\n * Per-pattern AC options. When set, this literal\n * is grouped with others that have the same\n * options into a separate AC engine instance.\n */\n acOptions?: {\n caseInsensitive?: boolean;\n wholeWords?: boolean;\n };\n};\n\n/**\n * Check if a string is a pure literal (no regex\n * metacharacters). 
Pure literals are routed to\n * Aho-Corasick instead of the regex DFA.\n */\nexport function isLiteralPattern(\n pattern: string,\n): boolean {\n // All standard regex metacharacters cause a\n // pattern to be classified as regex (→ RegexSet).\n // To force literal AC routing for patterns with\n // dots/parens (e.g., \"s.r.o.\", \"č.p.\"), use the\n // explicit { literal: true } PatternEntry flag.\n for (let i = 0; i < pattern.length; i++) {\n const ch = pattern[i]!;\n if (\n ch === \"\\\\\" ||\n ch === \".\" ||\n ch === \"^\" ||\n ch === \"$\" ||\n ch === \"*\" ||\n ch === \"+\" ||\n ch === \"?\" ||\n ch === \"{\" ||\n ch === \"}\" ||\n ch === \"(\" ||\n ch === \")\" ||\n ch === \"[\" ||\n ch === \"]\" ||\n ch === \"|\"\n ) {\n return false;\n }\n }\n return pattern.length > 0;\n}\n\n/**\n * Count the maximum alternation branches at any\n * depth in a regex string. Used to detect patterns\n * with large alternations (even nested inside\n * groups) that should be isolated into their own\n * RegexSet to prevent DFA state explosion.\n *\n * \"a|b|c\" → 3\n * \"(a|b)|c\" → 2 (max of top=2, depth1=2)\n * \"(?:Ing\\\\.|Mgr\\\\.|Dr\\\\.)\" → 3 (depth 1)\n */\nexport function countAlternations(\n pattern: string,\n): number {\n let depth = 0;\n let inClass = false;\n let i = 0;\n\n // Track max alternation count seen at any depth.\n // Each time we enter a group, start a fresh count.\n // When we leave, update the global max.\n let max = 1;\n let currentCount = 1; // count for current group\n const stack: number[] = []; // saved counts\n\n while (i < pattern.length) {\n const ch = pattern[i];\n\n if (ch === \"\\\\\" && i + 1 < pattern.length) {\n i += 2;\n continue;\n }\n\n if (ch === \"[\") inClass = true;\n if (ch === \"]\") inClass = false;\n\n if (!inClass) {\n if (ch === \"(\") {\n stack.push(currentCount);\n currentCount = 1;\n depth++;\n }\n if (ch === \")\") {\n if (currentCount > max) max = currentCount;\n currentCount = stack.pop() ?? 
1;\n depth--;\n }\n if (ch === \"|\") {\n currentCount++;\n }\n }\n\n i++;\n }\n // Check top-level count too\n if (currentCount > max) max = currentCount;\n return max;\n}\n\n/**\n * Classify and normalize pattern entries.\n */\nexport function classifyPatterns(\n entries: PatternEntry[],\n allLiteral = false,\n): ClassifiedPattern[] {\n return entries.map((entry, i) => {\n if (typeof entry === \"string\") {\n return {\n originalIndex: i,\n pattern: entry,\n alternationCount: allLiteral\n ? 0\n : countAlternations(entry),\n isLiteral: allLiteral ||\n isLiteralPattern(entry),\n };\n }\n\n if (entry instanceof RegExp) {\n return {\n originalIndex: i,\n pattern: entry,\n alternationCount: countAlternations(\n entry.source,\n ),\n isLiteral: false, // RegExp is never literal\n };\n }\n\n // Fuzzy pattern: has `distance` field\n if (\"distance\" in entry) {\n const result: ClassifiedPattern = {\n originalIndex: i,\n pattern: entry.pattern,\n alternationCount: 0,\n isLiteral: false,\n fuzzyDistance: entry.distance,\n };\n if (entry.name !== undefined) result.name = entry.name;\n return result;\n }\n\n // Explicit literal: skip metachar detection\n if (\"literal\" in entry && entry.literal) {\n const hasPerPatternOpts =\n \"caseInsensitive\" in entry ||\n \"wholeWords\" in entry;\n const result: ClassifiedPattern = {\n originalIndex: i,\n pattern: entry.pattern,\n alternationCount: 0,\n isLiteral: true,\n };\n if (entry.name !== undefined) result.name = entry.name;\n if (hasPerPatternOpts) {\n const opts: NonNullable<\n ClassifiedPattern[\"acOptions\"]\n > = {};\n if (entry.caseInsensitive !== undefined)\n opts.caseInsensitive = entry.caseInsensitive;\n if (entry.wholeWords !== undefined)\n opts.wholeWords = entry.wholeWords;\n result.acOptions = opts;\n }\n return result;\n }\n\n const pat = entry.pattern;\n const source =\n pat instanceof RegExp ? 
pat.source : pat;\n\n const result: ClassifiedPattern = {\n originalIndex: i,\n pattern: pat,\n alternationCount: allLiteral\n ? 0\n : countAlternations(source),\n isLiteral:\n typeof pat === \"string\" &&\n (allLiteral || isLiteralPattern(pat)),\n };\n if (entry.name !== undefined) result.name = entry.name;\n return result;\n });\n}\n","import type { Match } from \"./types\";\n\n/**\n * Merge matches from multiple engines, sort by\n * position, and select non-overlapping (longest\n * first at ties). Same algorithm as regex-set's\n * internal select_non_overlapping.\n */\nexport function mergeAndSelect(\n matches: Match[],\n): Match[] {\n if (matches.length <= 1) return matches;\n\n // Sort: start ascending, longest first at ties\n matches.sort((a, b) => {\n if (a.start !== b.start) {\n return a.start - b.start;\n }\n return b.end - b.start - (a.end - a.start);\n });\n\n // Greedily select non-overlapping\n const selected: Match[] = [];\n let lastEnd = 0;\n\n for (const m of matches) {\n if (m.start >= lastEnd) {\n selected.push(m);\n lastEnd = m.end;\n }\n }\n\n return selected;\n}\n","import { AhoCorasick } from \"@stll/aho-corasick\";\nimport { FuzzySearch } from \"@stll/fuzzy-search\";\nimport { RegexSet } from \"@stll/regex-set\";\n\nimport type { ClassifiedPattern } from \"./classify\";\nimport { classifyPatterns } from \"./classify\";\nimport { mergeAndSelect } from \"./merge\";\nimport type {\n Match,\n PatternEntry,\n TextSearchOptions,\n} from \"./types\";\n\n/**\n * An engine instance with pattern index mapping.\n */\ntype RegexSlot = {\n type: \"regex\";\n rs: RegexSet;\n indexMap: number[];\n nameMap: (string | undefined)[];\n};\n\ntype AcSlot = {\n type: \"ac\";\n ac: AhoCorasick;\n indexMap: number[];\n nameMap: (string | undefined)[];\n};\n\ntype FuzzySlot = {\n type: \"fuzzy\";\n fs: FuzzySearch;\n indexMap: number[];\n nameMap: (string | undefined)[];\n};\n\ntype EngineSlot = RegexSlot | AcSlot | FuzzySlot;\n\n/**\n * Multi-engine text search 
orchestrator.\n *\n * Routes patterns to the optimal engine\n * configuration:\n * - Large alternation patterns get their own\n * RegexSet instance (prevents DFA state explosion)\n * - Normal patterns share a single RegexSet\n * (single-pass multi-pattern DFA)\n *\n * Merges results from all engines into a unified\n * non-overlapping Match[] sorted by position.\n */\nexport class TextSearch {\n private engines: EngineSlot[] = [];\n private patternCount: number;\n private overlapAll: boolean;\n /**\n * True when there's exactly one engine and all\n * patterns map to identity indices (0→0, 1→1, ...).\n * Enables zero-overhead findIter: return raw engine\n * output without remapping or object allocation.\n */\n private zeroOverhead: boolean = false;\n\n constructor(\n patterns: PatternEntry[],\n options?: TextSearchOptions,\n ) {\n this.patternCount = patterns.length;\n this.overlapAll =\n options?.overlapStrategy === \"all\";\n const maxAlt = options?.maxAlternations ?? 50;\n const classified = classifyPatterns(\n patterns,\n options?.allLiteral ?? false,\n );\n\n // Four buckets:\n // 1. Fuzzy patterns → FuzzySearch (Levenshtein)\n // 2. Pure literals → Aho-Corasick (SIMD)\n // 3. Normal regex → shared RegexSet (DFA)\n // 4. Large alternations → isolated RegexSet\n const fuzzy: ClassifiedPattern[] = [];\n const literals: ClassifiedPattern[] = [];\n const shared: ClassifiedPattern[] = [];\n const isolated: ClassifiedPattern[] = [];\n\n for (const cp of classified) {\n if (cp.fuzzyDistance !== undefined) {\n fuzzy.push(cp);\n } else if (cp.isLiteral) {\n literals.push(cp);\n } else if (cp.alternationCount > maxAlt) {\n isolated.push(cp);\n } else {\n shared.push(cp);\n }\n }\n\n const rsOptions = {\n unicodeBoundaries:\n options?.unicodeBoundaries ?? true,\n wholeWords: options?.wholeWords ?? false,\n caseInsensitive:\n options?.caseInsensitive ?? 
false,\n };\n\n // Build fuzzy engine\n if (fuzzy.length > 0) {\n const fuzzyOpts: Parameters<\n typeof buildFuzzyEngine\n >[1] = {\n unicodeBoundaries:\n rsOptions.unicodeBoundaries,\n wholeWords: rsOptions.wholeWords,\n };\n if (options?.fuzzyMetric !== undefined)\n fuzzyOpts.metric = options.fuzzyMetric;\n if (options?.normalizeDiacritics !== undefined)\n fuzzyOpts.normalizeDiacritics =\n options.normalizeDiacritics;\n if (options?.caseInsensitive !== undefined)\n fuzzyOpts.caseInsensitive =\n options.caseInsensitive;\n this.engines.push(\n buildFuzzyEngine(fuzzy, fuzzyOpts),\n );\n }\n\n // Build AC engine(s) for pure literals.\n // Group by per-pattern AC options so patterns\n // with different caseInsensitive/wholeWords\n // settings get separate AC instances.\n if (literals.length > 0) {\n const groups = new Map<\n string,\n ClassifiedPattern[]\n >();\n for (const cp of literals) {\n const ci =\n cp.acOptions?.caseInsensitive ??\n rsOptions.caseInsensitive;\n const ww =\n cp.acOptions?.wholeWords ??\n rsOptions.wholeWords;\n const key = `${ci ? 1 : 0}:${ww ? 1 : 0}`;\n const group = groups.get(key);\n if (group) {\n group.push(cp);\n } else {\n groups.set(key, [cp]);\n }\n }\n for (const [key, group] of groups) {\n const [ci, ww] = key.split(\":\");\n this.engines.push(\n buildAcEngine(group, {\n ...rsOptions,\n caseInsensitive: ci === \"1\",\n wholeWords: ww === \"1\",\n }),\n );\n }\n }\n\n // Adaptive regex grouping: try combining shared\n // patterns, measure actual search time on a\n // probe string. If combined is slower than\n // individual, fall back to isolation.\n if (shared.length > 1) {\n const combined = buildRegexEngine(\n shared,\n rsOptions,\n );\n // Probe: 1KB of mixed content\n const probe = (\n \"Hello World 123 test@example.com \" +\n \"2025-01-01 +420 123 456 789 \" +\n \"Ing. Jan Novák, s.r.o. 
Praha 1 \"\n ).repeat(10);\n const t0 = performance.now();\n combined.rs.findIter(probe);\n const combinedMs = performance.now() - t0;\n\n // Individual baseline (sum of isolated scans)\n let individualMs = 0;\n const individualEngines: RegexSlot[] = [];\n for (const cp of shared) {\n const eng = buildRegexEngine(\n [cp],\n rsOptions,\n );\n const t1 = performance.now();\n eng.rs.findIter(probe);\n individualMs += performance.now() - t1;\n individualEngines.push(eng);\n }\n\n if (combinedMs > individualMs * 1.5) {\n // Combined is >1.5x slower — isolate\n for (const eng of individualEngines) {\n this.engines.push(eng);\n }\n } else {\n this.engines.push(combined);\n }\n } else if (shared.length === 1) {\n this.engines.push(\n buildRegexEngine(shared, rsOptions),\n );\n }\n\n for (const cp of isolated) {\n this.engines.push(\n buildRegexEngine([cp], rsOptions),\n );\n }\n\n // Zero-overhead fast path: when all patterns\n // land in a single engine, the indexMap is\n // identity (0→0, 1→1, ...) and no names need\n // attaching. findIter can return raw engine\n // output without any JS-side remapping.\n if (this.engines.length === 1) {\n const engine = this.engines[0]!;\n const hasNames = engine.nameMap.some(\n (n) => n !== undefined,\n );\n if (!hasNames) {\n this.zeroOverhead = true;\n }\n }\n }\n\n /** Number of patterns. */\n get length(): number {\n return this.patternCount;\n }\n\n /** Returns true if any pattern matches. 
*/\n isMatch(haystack: string): boolean {\n for (const engine of this.engines) {\n if (engineIsMatch(engine, haystack)) {\n return true;\n }\n }\n return false;\n }\n\n /**\n * Find matches in text.\n *\n * With `overlapStrategy: \"longest\"` (default):\n * returns non-overlapping matches, longest wins.\n *\n * With `overlapStrategy: \"all\"`: returns all\n * matches including overlaps, sorted by position.\n */\n findIter(haystack: string): Match[] {\n // Fast path: single engine, identity indexMap,\n // no names → return raw engine output directly.\n // Zero JS overhead: no remapping, no allocation.\n if (this.zeroOverhead) {\n return engineFindIter(\n this.engines[0]!,\n haystack,\n );\n }\n\n // Single engine but needs name remapping\n if (this.engines.length === 1) {\n return remapMatches(\n engineFindIter(this.engines[0]!, haystack),\n this.engines[0]!,\n );\n }\n\n // Multi-engine: collect from all, remap in-place\n const all: Match[] = [];\n for (const engine of this.engines) {\n const matches = engineFindIter(\n engine,\n haystack,\n );\n // In-place remapping avoids .map() allocation\n for (const m of remapMatches(matches, engine)) {\n all.push(m);\n }\n }\n\n if (this.overlapAll) {\n return all.sort(\n (a, b) => a.start - b.start,\n );\n }\n\n return mergeAndSelect(all);\n }\n\n /** Which pattern indices matched (not where). 
*/\n whichMatch(haystack: string): number[] {\n const seen = new Set<number>();\n\n for (const engine of this.engines) {\n // AC doesn't have whichMatch — use findIter\n const matches = engineFindIter(\n engine,\n haystack,\n );\n for (const m of matches) {\n seen.add(engine.indexMap[m.pattern]!);\n }\n }\n\n return [...seen];\n }\n\n /**\n * Replace all non-overlapping matches.\n * replacements[i] replaces pattern i.\n */\n replaceAll(\n haystack: string,\n replacements: string[],\n ): string {\n if (replacements.length !== this.patternCount) {\n throw new Error(\n `Expected ${this.patternCount} ` +\n `replacements, got ${replacements.length}`,\n );\n }\n\n // Always use non-overlapping matches for\n // replacement, even if overlapStrategy is \"all\".\n const all: Match[] = [];\n for (const engine of this.engines) {\n const matches = engineFindIter(\n engine,\n haystack,\n );\n for (const m of remapMatches(matches, engine)) {\n all.push(m);\n }\n }\n const matches = mergeAndSelect(all);\n\n let result = \"\";\n let last = 0;\n\n for (const m of matches) {\n result += haystack.slice(last, m.start);\n result += replacements[m.pattern]!;\n last = m.end;\n }\n\n result += haystack.slice(last);\n return result;\n }\n}\n\n/**\n * Build a RegexSet engine from classified patterns.\n */\nfunction buildRegexEngine(\n patterns: ClassifiedPattern[],\n options: {\n unicodeBoundaries: boolean;\n wholeWords: boolean;\n caseInsensitive: boolean;\n },\n): RegexSlot {\n const rsPatterns: (string | RegExp | {\n pattern: string | RegExp;\n name?: string;\n })[] = [];\n const indexMap: number[] = [];\n const nameMap: (string | undefined)[] = [];\n\n for (const cp of patterns) {\n if (cp.name !== undefined) {\n rsPatterns.push({\n pattern: cp.pattern,\n name: cp.name,\n });\n } else {\n rsPatterns.push(cp.pattern);\n }\n indexMap.push(cp.originalIndex);\n nameMap.push(cp.name);\n }\n\n const rs = new RegexSet(rsPatterns, options);\n\n return { type: \"regex\", rs, indexMap, nameMap 
};\n}\n\n/**\n * Build an Aho-Corasick engine from literal patterns.\n */\nfunction buildAcEngine(\n patterns: ClassifiedPattern[],\n options: {\n unicodeBoundaries: boolean;\n wholeWords: boolean;\n caseInsensitive: boolean;\n },\n): AcSlot {\n const literals: string[] = [];\n const indexMap: number[] = [];\n const nameMap: (string | undefined)[] = [];\n\n for (const cp of patterns) {\n literals.push(cp.pattern as string);\n indexMap.push(cp.originalIndex);\n nameMap.push(cp.name);\n }\n\n const ac = new AhoCorasick(literals, {\n wholeWords: options.wholeWords,\n unicodeBoundaries: options.unicodeBoundaries,\n caseInsensitive: options.caseInsensitive,\n });\n\n return { type: \"ac\", ac, indexMap, nameMap };\n}\n\n/**\n * Build a FuzzySearch engine from fuzzy patterns.\n */\nfunction buildFuzzyEngine(\n patterns: ClassifiedPattern[],\n options: {\n unicodeBoundaries: boolean;\n wholeWords: boolean;\n metric?: \"levenshtein\" | \"damerau-levenshtein\";\n normalizeDiacritics?: boolean;\n caseInsensitive?: boolean;\n },\n): FuzzySlot {\n const fsPatterns: {\n pattern: string;\n distance?: number | \"auto\";\n name?: string;\n }[] = [];\n const indexMap: number[] = [];\n const nameMap: (string | undefined)[] = [];\n\n for (const cp of patterns) {\n const entry: (typeof fsPatterns)[number] = {\n pattern: cp.pattern as string,\n };\n if (cp.fuzzyDistance !== undefined)\n entry.distance = cp.fuzzyDistance;\n if (cp.name !== undefined) entry.name = cp.name;\n fsPatterns.push(entry);\n indexMap.push(cp.originalIndex);\n nameMap.push(cp.name);\n }\n\n const fsOptions: ConstructorParameters<\n typeof FuzzySearch\n >[1] = {\n unicodeBoundaries: options.unicodeBoundaries,\n wholeWords: options.wholeWords,\n };\n if (options.metric !== undefined)\n fsOptions.metric = options.metric;\n if (options.normalizeDiacritics !== undefined)\n fsOptions.normalizeDiacritics =\n options.normalizeDiacritics;\n if (options.caseInsensitive !== undefined)\n fsOptions.caseInsensitive =\n 
options.caseInsensitive;\n const fs = new FuzzySearch(fsPatterns, fsOptions);\n\n return { type: \"fuzzy\", fs, indexMap, nameMap };\n}\n\n/**\n * Dispatch isMatch to the correct engine.\n */\nfunction engineIsMatch(\n engine: EngineSlot,\n haystack: string,\n): boolean {\n switch (engine.type) {\n case \"ac\":\n return engine.ac.isMatch(haystack);\n case \"fuzzy\":\n return engine.fs.isMatch(haystack);\n case \"regex\":\n return engine.rs.isMatch(haystack);\n }\n}\n\n/**\n * Dispatch findIter to the correct engine.\n */\nfunction engineFindIter(\n engine: EngineSlot,\n haystack: string,\n): Match[] {\n switch (engine.type) {\n case \"ac\":\n return engine.ac.findIter(haystack);\n case \"fuzzy\":\n return engine.fs.findIter(haystack);\n case \"regex\":\n return engine.rs.findIter(haystack);\n }\n}\n\n/**\n * Remap engine-local match indices to original\n * input indices and add names.\n */\nfunction remapMatches(\n matches: Match[],\n engine: EngineSlot,\n): Match[] {\n return matches.map((m) => {\n const originalIdx =\n engine.indexMap[m.pattern]!;\n const name = engine.nameMap[m.pattern];\n const result: Match = {\n pattern: originalIdx,\n start: m.start,\n end: m.end,\n text: m.text,\n };\n if (name !== undefined) {\n result.name = name;\n }\n // Preserve edit distance from fuzzy matches\n if (\"distance\" in m && m.distance !== undefined) {\n result.distance = m.distance as number;\n }\n return result;\n 
});\n}\n"],"mappings":";;;;;;;;;AA6CA,SAAgB,iBACd,SACS;AAMT,MAAK,IAAI,IAAI,GAAG,IAAI,QAAQ,QAAQ,KAAK;EACvC,MAAM,KAAK,QAAQ;AACnB,MACE,OAAO,QACP,OAAO,OACP,OAAO,OACP,OAAO,OACP,OAAO,OACP,OAAO,OACP,OAAO,OACP,OAAO,OACP,OAAO,OACP,OAAO,OACP,OAAO,OACP,OAAO,OACP,OAAO,OACP,OAAO,IAEP,QAAO;;AAGX,QAAO,QAAQ,SAAS;;;;;;;;;;;;;AAc1B,SAAgB,kBACd,SACQ;CACR,IAAI,QAAQ;CACZ,IAAI,UAAU;CACd,IAAI,IAAI;CAKR,IAAI,MAAM;CACV,IAAI,eAAe;CACnB,MAAM,QAAkB,EAAE;AAE1B,QAAO,IAAI,QAAQ,QAAQ;EACzB,MAAM,KAAK,QAAQ;AAEnB,MAAI,OAAO,QAAQ,IAAI,IAAI,QAAQ,QAAQ;AACzC,QAAK;AACL;;AAGF,MAAI,OAAO,IAAK,WAAU;AAC1B,MAAI,OAAO,IAAK,WAAU;AAE1B,MAAI,CAAC,SAAS;AACZ,OAAI,OAAO,KAAK;AACd,UAAM,KAAK,aAAa;AACxB,mBAAe;AACf;;AAEF,OAAI,OAAO,KAAK;AACd,QAAI,eAAe,IAAK,OAAM;AAC9B,mBAAe,MAAM,KAAK,IAAI;AAC9B;;AAEF,OAAI,OAAO,IACT;;AAIJ;;AAGF,KAAI,eAAe,IAAK,OAAM;AAC9B,QAAO;;;;;AAMT,SAAgB,iBACd,SACA,aAAa,OACQ;AACrB,QAAO,QAAQ,KAAK,OAAO,MAAM;AAC/B,MAAI,OAAO,UAAU,SACnB,QAAO;GACL,eAAe;GACf,SAAS;GACT,kBAAkB,aACd,IACA,kBAAkB,MAAM;GAC5B,WAAW,cACT,iBAAiB,MAAM;GAC1B;AAGH,MAAI,iBAAiB,OACnB,QAAO;GACL,eAAe;GACf,SAAS;GACT,kBAAkB,kBAChB,MAAM,OACP;GACD,WAAW;GACZ;AAIH,MAAI,cAAc,OAAO;GACvB,MAAM,SAA4B;IAChC,eAAe;IACf,SAAS,MAAM;IACf,kBAAkB;IAClB,WAAW;IACX,eAAe,MAAM;IACtB;AACD,OAAI,MAAM,SAAS,KAAA,EAAW,QAAO,OAAO,MAAM;AAClD,UAAO;;AAIT,MAAI,aAAa,SAAS,MAAM,SAAS;GACvC,MAAM,oBACJ,qBAAqB,SACrB,gBAAgB;GAClB,MAAM,SAA4B;IAChC,eAAe;IACf,SAAS,MAAM;IACf,kBAAkB;IAClB,WAAW;IACZ;AACD,OAAI,MAAM,SAAS,KAAA,EAAW,QAAO,OAAO,MAAM;AAClD,OAAI,mBAAmB;IACrB,MAAM,OAEF,EAAE;AACN,QAAI,MAAM,oBAAoB,KAAA,EAC5B,MAAK,kBAAkB,MAAM;AAC/B,QAAI,MAAM,eAAe,KAAA,EACvB,MAAK,aAAa,MAAM;AAC1B,WAAO,YAAY;;AAErB,UAAO;;EAGT,MAAM,MAAM,MAAM;EAClB,MAAM,SACJ,eAAe,SAAS,IAAI,SAAS;EAEvC,MAAM,SAA4B;GAChC,eAAe;GACf,SAAS;GACT,kBAAkB,aACd,IACA,kBAAkB,OAAO;GAC7B,WACE,OAAO,QAAQ,aACd,cAAc,iBAAiB,IAAI;GACvC;AACD,MAAI,MAAM,SAAS,KAAA,EAAW,QAAO,OAAO,MAAM;AAClD,SAAO;GACP;;;;;;;;;;ACrNJ,SAAgB,eACd,SACS;AACT,KAAI,QAAQ,UAAU,EAAG,QAAO;AAGhC,SAAQ,MAAM,GAAG,MAAM;AACrB,MAAI,EAAE,UAAU,EAAE,MAChB,QAAO,EAAE,QAAQ,EAAE;AAErB,SAAO,EAAE,MAAM,EAAE
,SAAS,EAAE,MAAM,EAAE;GACpC;CAGF,MAAM,WAAoB,EAAE;CAC5B,IAAI,UAAU;AAEd,MAAK,MAAM,KAAK,QACd,KAAI,EAAE,SAAS,SAAS;AACtB,WAAS,KAAK,EAAE;AAChB,YAAU,EAAE;;AAIhB,QAAO;;;;;;;;;;;;;;;;;ACoBT,IAAa,aAAb,MAAwB;CACtB,UAAgC,EAAE;CAClC;CACA;;;;;;;CAOA,eAAgC;CAEhC,YACE,UACA,SACA;AACA,OAAK,eAAe,SAAS;AAC7B,OAAK,aACH,SAAS,oBAAoB;EAC/B,MAAM,SAAS,SAAS,mBAAmB;EAC3C,MAAM,aAAa,iBACjB,UACA,SAAS,cAAc,MACxB;EAOD,MAAM,QAA6B,EAAE;EACrC,MAAM,WAAgC,EAAE;EACxC,MAAM,SAA8B,EAAE;EACtC,MAAM,WAAgC,EAAE;AAExC,OAAK,MAAM,MAAM,WACf,KAAI,GAAG,kBAAkB,KAAA,EACvB,OAAM,KAAK,GAAG;WACL,GAAG,UACZ,UAAS,KAAK,GAAG;WACR,GAAG,mBAAmB,OAC/B,UAAS,KAAK,GAAG;MAEjB,QAAO,KAAK,GAAG;EAInB,MAAM,YAAY;GAChB,mBACE,SAAS,qBAAqB;GAChC,YAAY,SAAS,cAAc;GACnC,iBACE,SAAS,mBAAmB;GAC/B;AAGD,MAAI,MAAM,SAAS,GAAG;GACpB,MAAM,YAEC;IACL,mBACE,UAAU;IACZ,YAAY,UAAU;IACvB;AACD,OAAI,SAAS,gBAAgB,KAAA,EAC3B,WAAU,SAAS,QAAQ;AAC7B,OAAI,SAAS,wBAAwB,KAAA,EACnC,WAAU,sBACR,QAAQ;AACZ,OAAI,SAAS,oBAAoB,KAAA,EAC/B,WAAU,kBACR,QAAQ;AACZ,QAAK,QAAQ,KACX,iBAAiB,OAAO,UAAU,CACnC;;AAOH,MAAI,SAAS,SAAS,GAAG;GACvB,MAAM,yBAAS,IAAI,KAGhB;AACH,QAAK,MAAM,MAAM,UAAU;IACzB,MAAM,KACJ,GAAG,WAAW,mBACd,UAAU;IACZ,MAAM,KACJ,GAAG,WAAW,cACd,UAAU;IACZ,MAAM,MAAM,GAAG,KAAK,IAAI,EAAE,GAAG,KAAK,IAAI;IACtC,MAAM,QAAQ,OAAO,IAAI,IAAI;AAC7B,QAAI,MACF,OAAM,KAAK,GAAG;QAEd,QAAO,IAAI,KAAK,CAAC,GAAG,CAAC;;AAGzB,QAAK,MAAM,CAAC,KAAK,UAAU,QAAQ;IACjC,MAAM,CAAC,IAAI,MAAM,IAAI,MAAM,IAAI;AAC/B,SAAK,QAAQ,KACX,cAAc,OAAO;KACnB,GAAG;KACH,iBAAiB,OAAO;KACxB,YAAY,OAAO;KACpB,CAAC,CACH;;;AAQL,MAAI,OAAO,SAAS,GAAG;GACrB,MAAM,WAAW,iBACf,QACA,UACD;GAED,MAAM,QACJ,+FAGA,OAAO,GAAG;GACZ,MAAM,KAAK,YAAY,KAAK;AAC5B,YAAS,GAAG,SAAS,MAAM;GAC3B,MAAM,aAAa,YAAY,KAAK,GAAG;GAGvC,IAAI,eAAe;GACnB,MAAM,oBAAiC,EAAE;AACzC,QAAK,MAAM,MAAM,QAAQ;IACvB,MAAM,MAAM,iBACV,CAAC,GAAG,EACJ,UACD;IACD,MAAM,KAAK,YAAY,KAAK;AAC5B,QAAI,GAAG,SAAS,MAAM;AACtB,oBAAgB,YAAY,KAAK,GAAG;AACpC,sBAAkB,KAAK,IAAI;;AAG7B,OAAI,aAAa,eAAe,IAE9B,MAAK,MAAM,OAAO,kBAChB,MAAK,QAAQ,KAAK,IAAI;OAGxB,MAAK,QAAQ,KAAK,SAAS;aAEpB,OAAO,WAAW,EAC3B,MAAK,QAAQ,KACX,iBAAiB,QAAQ,UAAU,CACpC;AAGH,OAAK,
MAAM,MAAM,SACf,MAAK,QAAQ,KACX,iBAAiB,CAAC,GAAG,EAAE,UAAU,CAClC;AAQH,MAAI,KAAK,QAAQ,WAAW;OAKtB,CAJW,KAAK,QAAQ,GACJ,QAAQ,MAC7B,MAAM,MAAM,KAAA,EACd,CAEC,MAAK,eAAe;;;;CAM1B,IAAI,SAAiB;AACnB,SAAO,KAAK;;;CAId,QAAQ,UAA2B;AACjC,OAAK,MAAM,UAAU,KAAK,QACxB,KAAI,cAAc,QAAQ,SAAS,CACjC,QAAO;AAGX,SAAO;;;;;;;;;;;CAYT,SAAS,UAA2B;AAIlC,MAAI,KAAK,aACP,QAAO,eACL,KAAK,QAAQ,IACb,SACD;AAIH,MAAI,KAAK,QAAQ,WAAW,EAC1B,QAAO,aACL,eAAe,KAAK,QAAQ,IAAK,SAAS,EAC1C,KAAK,QAAQ,GACd;EAIH,MAAM,MAAe,EAAE;AACvB,OAAK,MAAM,UAAU,KAAK,SAAS;GACjC,MAAM,UAAU,eACd,QACA,SACD;AAED,QAAK,MAAM,KAAK,aAAa,SAAS,OAAO,CAC3C,KAAI,KAAK,EAAE;;AAIf,MAAI,KAAK,WACP,QAAO,IAAI,MACR,GAAG,MAAM,EAAE,QAAQ,EAAE,MACvB;AAGH,SAAO,eAAe,IAAI;;;CAI5B,WAAW,UAA4B;EACrC,MAAM,uBAAO,IAAI,KAAa;AAE9B,OAAK,MAAM,UAAU,KAAK,SAAS;GAEjC,MAAM,UAAU,eACd,QACA,SACD;AACD,QAAK,MAAM,KAAK,QACd,MAAK,IAAI,OAAO,SAAS,EAAE,SAAU;;AAIzC,SAAO,CAAC,GAAG,KAAK;;;;;;CAOlB,WACE,UACA,cACQ;AACR,MAAI,aAAa,WAAW,KAAK,aAC/B,OAAM,IAAI,MACR,YAAY,KAAK,aAAa,qBACP,aAAa,SACrC;EAKH,MAAM,MAAe,EAAE;AACvB,OAAK,MAAM,UAAU,KAAK,SAAS;GACjC,MAAM,UAAU,eACd,QACA,SACD;AACD,QAAK,MAAM,KAAK,aAAa,SAAS,OAAO,CAC3C,KAAI,KAAK,EAAE;;EAGf,MAAM,UAAU,eAAe,IAAI;EAEnC,IAAI,SAAS;EACb,IAAI,OAAO;AAEX,OAAK,MAAM,KAAK,SAAS;AACvB,aAAU,SAAS,MAAM,MAAM,EAAE,MAAM;AACvC,aAAU,aAAa,EAAE;AACzB,UAAO,EAAE;;AAGX,YAAU,SAAS,MAAM,KAAK;AAC9B,SAAO;;;;;;AAOX,SAAS,iBACP,UACA,SAKW;CACX,MAAM,aAGC,EAAE;CACT,MAAM,WAAqB,EAAE;CAC7B,MAAM,UAAkC,EAAE;AAE1C,MAAK,MAAM,MAAM,UAAU;AACzB,MAAI,GAAG,SAAS,KAAA,EACd,YAAW,KAAK;GACd,SAAS,GAAG;GACZ,MAAM,GAAG;GACV,CAAC;MAEF,YAAW,KAAK,GAAG,QAAQ;AAE7B,WAAS,KAAK,GAAG,cAAc;AAC/B,UAAQ,KAAK,GAAG,KAAK;;AAKvB,QAAO;EAAE,MAAM;EAAS,IAFb,IAAI,SAAS,YAAY,QAAQ;EAEhB;EAAU;EAAS;;;;;AAMjD,SAAS,cACP,UACA,SAKQ;CACR,MAAM,WAAqB,EAAE;CAC7B,MAAM,WAAqB,EAAE;CAC7B,MAAM,UAAkC,EAAE;AAE1C,MAAK,MAAM,MAAM,UAAU;AACzB,WAAS,KAAK,GAAG,QAAkB;AACnC,WAAS,KAAK,GAAG,cAAc;AAC/B,UAAQ,KAAK,GAAG,KAAK;;AASvB,QAAO;EAAE,MAAM;EAAM,IANV,IAAI,YAAY,UAAU;GACnC,YAAY,QAAQ;GACpB,mBAAmB,QAAQ;GAC3B,iBAAiB,QAAQ;GAC1B,CAAC;EAEuB;EAAU;EAAS;;;;;AAM9C,SAAS,iBACP,UACA,SA
OW;CACX,MAAM,aAIA,EAAE;CACR,MAAM,WAAqB,EAAE;CAC7B,MAAM,UAAkC,EAAE;AAE1C,MAAK,MAAM,MAAM,UAAU;EACzB,MAAM,QAAqC,EACzC,SAAS,GAAG,SACb;AACD,MAAI,GAAG,kBAAkB,KAAA,EACvB,OAAM,WAAW,GAAG;AACtB,MAAI,GAAG,SAAS,KAAA,EAAW,OAAM,OAAO,GAAG;AAC3C,aAAW,KAAK,MAAM;AACtB,WAAS,KAAK,GAAG,cAAc;AAC/B,UAAQ,KAAK,GAAG,KAAK;;CAGvB,MAAM,YAEC;EACL,mBAAmB,QAAQ;EAC3B,YAAY,QAAQ;EACrB;AACD,KAAI,QAAQ,WAAW,KAAA,EACrB,WAAU,SAAS,QAAQ;AAC7B,KAAI,QAAQ,wBAAwB,KAAA,EAClC,WAAU,sBACR,QAAQ;AACZ,KAAI,QAAQ,oBAAoB,KAAA,EAC9B,WAAU,kBACR,QAAQ;AAGZ,QAAO;EAAE,MAAM;EAAS,IAFb,IAAI,YAAY,YAAY,UAAU;EAErB;EAAU;EAAS;;;;;AAMjD,SAAS,cACP,QACA,UACS;AACT,SAAQ,OAAO,MAAf;EACE,KAAK,KACH,QAAO,OAAO,GAAG,QAAQ,SAAS;EACpC,KAAK,QACH,QAAO,OAAO,GAAG,QAAQ,SAAS;EACpC,KAAK,QACH,QAAO,OAAO,GAAG,QAAQ,SAAS;;;;;;AAOxC,SAAS,eACP,QACA,UACS;AACT,SAAQ,OAAO,MAAf;EACE,KAAK,KACH,QAAO,OAAO,GAAG,SAAS,SAAS;EACrC,KAAK,QACH,QAAO,OAAO,GAAG,SAAS,SAAS;EACrC,KAAK,QACH,QAAO,OAAO,GAAG,SAAS,SAAS;;;;;;;AAQzC,SAAS,aACP,SACA,QACS;AACT,QAAO,QAAQ,KAAK,MAAM;EACxB,MAAM,cACJ,OAAO,SAAS,EAAE;EACpB,MAAM,OAAO,OAAO,QAAQ,EAAE;EAC9B,MAAM,SAAgB;GACpB,SAAS;GACT,OAAO,EAAE;GACT,KAAK,EAAE;GACP,MAAM,EAAE;GACT;AACD,MAAI,SAAS,KAAA,EACX,QAAO,OAAO;AAGhB,MAAI,cAAc,KAAK,EAAE,aAAa,KAAA,EACpC,QAAO,WAAW,EAAE;AAEtB,SAAO;GACP"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@stll/text-search",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "Multi-engine text search orchestrator. Routes patterns to optimal engines: Aho-Corasick, RegexSet, or FuzzySearch.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"text-search",
|
|
@@ -35,23 +35,23 @@
|
|
|
35
35
|
"dist"
|
|
36
36
|
],
|
|
37
37
|
"scripts": {
|
|
38
|
-
"build": "
|
|
38
|
+
"build": "tsdown",
|
|
39
39
|
"prepublishOnly": "bun run build",
|
|
40
40
|
"test": "bun test",
|
|
41
41
|
"lint": "oxlint .",
|
|
42
42
|
"format": "oxfmt ."
|
|
43
43
|
},
|
|
44
44
|
"dependencies": {
|
|
45
|
-
"@stll/aho-corasick": "^0.
|
|
46
|
-
"@stll/fuzzy-search": "^0.
|
|
47
|
-
"@stll/regex-set": "^0.
|
|
45
|
+
"@stll/aho-corasick": "^0.3.0",
|
|
46
|
+
"@stll/fuzzy-search": "^0.2.0",
|
|
47
|
+
"@stll/regex-set": "^0.5.0"
|
|
48
48
|
},
|
|
49
49
|
"devDependencies": {
|
|
50
50
|
"@types/node": "^22.0.0",
|
|
51
51
|
"bun-types": "^1.3.10",
|
|
52
52
|
"oxfmt": "^0.40.0",
|
|
53
53
|
"oxlint": "^1.55.0",
|
|
54
|
-
"
|
|
54
|
+
"tsdown": "^0.12.4",
|
|
55
55
|
"typescript": "^5.9.3"
|
|
56
56
|
},
|
|
57
57
|
"engines": {
|
package/dist/index.d.ts
DELETED
|
@@ -1,150 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* A single match result. Same shape as
|
|
3
|
-
* @stll/regex-set and @stll/aho-corasick.
|
|
4
|
-
*/
|
|
5
|
-
type Match = {
|
|
6
|
-
/** Index of the pattern that matched. */
|
|
7
|
-
pattern: number;
|
|
8
|
-
/** Start UTF-16 code unit offset. */
|
|
9
|
-
start: number;
|
|
10
|
-
/** End offset (exclusive). */
|
|
11
|
-
end: number;
|
|
12
|
-
/** The matched text. */
|
|
13
|
-
text: string;
|
|
14
|
-
/** Pattern name (if provided). */
|
|
15
|
-
name?: string;
|
|
16
|
-
/** Edit distance (fuzzy matches only). */
|
|
17
|
-
distance?: number;
|
|
18
|
-
};
|
|
19
|
-
/** A pattern entry for TextSearch. */
|
|
20
|
-
type PatternEntry = string | RegExp | {
|
|
21
|
-
pattern: string | RegExp;
|
|
22
|
-
name?: string;
|
|
23
|
-
} | {
|
|
24
|
-
pattern: string;
|
|
25
|
-
name?: string;
|
|
26
|
-
/** Fuzzy matching distance. Routes to
|
|
27
|
-
* @stll/fuzzy-search instead of regex. */
|
|
28
|
-
distance: number | "auto";
|
|
29
|
-
} | {
|
|
30
|
-
pattern: string;
|
|
31
|
-
name?: string;
|
|
32
|
-
/** Force literal matching via Aho-Corasick.
|
|
33
|
-
* Skips regex metacharacter detection so
|
|
34
|
-
* patterns like "č.p." or "s.r.o." are
|
|
35
|
-
* matched literally, not as regex. */
|
|
36
|
-
literal: true;
|
|
37
|
-
/** Per-pattern case-insensitive for AC.
|
|
38
|
-
* Overrides the global option for this
|
|
39
|
-
* pattern only. */
|
|
40
|
-
caseInsensitive?: boolean;
|
|
41
|
-
/** Per-pattern whole-word matching for AC. */
|
|
42
|
-
wholeWords?: boolean;
|
|
43
|
-
};
|
|
44
|
-
/** Options for TextSearch. */
|
|
45
|
-
type TextSearchOptions = {
|
|
46
|
-
/**
|
|
47
|
-
* Use Unicode word boundaries.
|
|
48
|
-
* @default true
|
|
49
|
-
*/
|
|
50
|
-
unicodeBoundaries?: boolean;
|
|
51
|
-
/**
|
|
52
|
-
* Only match whole words.
|
|
53
|
-
* @default false
|
|
54
|
-
*/
|
|
55
|
-
wholeWords?: boolean;
|
|
56
|
-
/**
|
|
57
|
-
* Max alternation branches before auto-splitting
|
|
58
|
-
* into a separate engine instance. Prevents DFA
|
|
59
|
-
* state explosion when large-alternation patterns
|
|
60
|
-
* are combined with other patterns.
|
|
61
|
-
* @default 50
|
|
62
|
-
*/
|
|
63
|
-
maxAlternations?: number;
|
|
64
|
-
/**
|
|
65
|
-
* Fuzzy matching metric.
|
|
66
|
-
* @default "levenshtein"
|
|
67
|
-
*/
|
|
68
|
-
fuzzyMetric?: "levenshtein" | "damerau-levenshtein";
|
|
69
|
-
/**
|
|
70
|
-
* Normalize diacritics for fuzzy matching.
|
|
71
|
-
* @default false
|
|
72
|
-
*/
|
|
73
|
-
normalizeDiacritics?: boolean;
|
|
74
|
-
/**
|
|
75
|
-
* Case-insensitive matching for AC literals
|
|
76
|
-
* and fuzzy patterns.
|
|
77
|
-
* @default false
|
|
78
|
-
*/
|
|
79
|
-
caseInsensitive?: boolean;
|
|
80
|
-
/**
|
|
81
|
-
* How to handle overlapping matches from
|
|
82
|
-
* different engines or patterns.
|
|
83
|
-
*
|
|
84
|
-
* - "longest": keep longest non-overlapping match
|
|
85
|
-
* at each position (default).
|
|
86
|
-
* - "all": return all matches including overlaps.
|
|
87
|
-
* Useful when the caller applies its own dedup.
|
|
88
|
-
*
|
|
89
|
-
* @default "longest"
|
|
90
|
-
*/
|
|
91
|
-
overlapStrategy?: "longest" | "all";
|
|
92
|
-
/**
|
|
93
|
-
* Treat ALL string patterns as literals (route
|
|
94
|
-
* to AC, skip metacharacter detection). Useful
|
|
95
|
-
* for deny-list patterns where "s.r.o." means
|
|
96
|
-
* the literal string, not a regex with wildcards.
|
|
97
|
-
* @default false
|
|
98
|
-
*/
|
|
99
|
-
allLiteral?: boolean;
|
|
100
|
-
};
|
|
101
|
-
|
|
102
|
-
/**
|
|
103
|
-
* Multi-engine text search orchestrator.
|
|
104
|
-
*
|
|
105
|
-
* Routes patterns to the optimal engine
|
|
106
|
-
* configuration:
|
|
107
|
-
* - Large alternation patterns get their own
|
|
108
|
-
* RegexSet instance (prevents DFA state explosion)
|
|
109
|
-
* - Normal patterns share a single RegexSet
|
|
110
|
-
* (single-pass multi-pattern DFA)
|
|
111
|
-
*
|
|
112
|
-
* Merges results from all engines into a unified
|
|
113
|
-
* non-overlapping Match[] sorted by position.
|
|
114
|
-
*/
|
|
115
|
-
declare class TextSearch {
|
|
116
|
-
private engines;
|
|
117
|
-
private patternCount;
|
|
118
|
-
private overlapAll;
|
|
119
|
-
/**
|
|
120
|
-
* True when there's exactly one engine and all
|
|
121
|
-
* patterns map to identity indices (0→0, 1→1, ...).
|
|
122
|
-
* Enables zero-overhead findIter: return raw engine
|
|
123
|
-
* output without remapping or object allocation.
|
|
124
|
-
*/
|
|
125
|
-
private zeroOverhead;
|
|
126
|
-
constructor(patterns: PatternEntry[], options?: TextSearchOptions);
|
|
127
|
-
/** Number of patterns. */
|
|
128
|
-
get length(): number;
|
|
129
|
-
/** Returns true if any pattern matches. */
|
|
130
|
-
isMatch(haystack: string): boolean;
|
|
131
|
-
/**
|
|
132
|
-
* Find matches in text.
|
|
133
|
-
*
|
|
134
|
-
* With `overlapStrategy: "longest"` (default):
|
|
135
|
-
* returns non-overlapping matches, longest wins.
|
|
136
|
-
*
|
|
137
|
-
* With `overlapStrategy: "all"`: returns all
|
|
138
|
-
* matches including overlaps, sorted by position.
|
|
139
|
-
*/
|
|
140
|
-
findIter(haystack: string): Match[];
|
|
141
|
-
/** Which pattern indices matched (not where). */
|
|
142
|
-
whichMatch(haystack: string): number[];
|
|
143
|
-
/**
|
|
144
|
-
* Replace all non-overlapping matches.
|
|
145
|
-
* replacements[i] replaces pattern i.
|
|
146
|
-
*/
|
|
147
|
-
replaceAll(haystack: string, replacements: string[]): string;
|
|
148
|
-
}
|
|
149
|
-
|
|
150
|
-
export { type Match, type PatternEntry, TextSearch, type TextSearchOptions };
|