@stll/text-search 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@stll/text-search",
3
- "version": "0.1.0",
3
+ "version": "0.1.1",
4
4
  "description": "Multi-engine text search orchestrator. Routes patterns to optimal engines: Aho-Corasick, RegexSet, or FuzzySearch.",
5
5
  "keywords": [
6
6
  "text-search",
@@ -26,7 +26,7 @@
26
26
  ".": "./src/index.ts"
27
27
  },
28
28
  "files": [
29
- "dist"
29
+ "src"
30
30
  ],
31
31
  "scripts": {
32
32
  "build": "bun build src/index.ts --outdir dist --target node",
@@ -0,0 +1,217 @@
1
+ import type { PatternEntry } from "./types";
2
+
3
/**
 * A pattern entry after classification: the caller's pattern plus
 * the metadata used to decide which engine it is routed to.
 */
export type ClassifiedPattern = {
  /** Position of this entry in the caller's input array. */
  originalIndex: number;

  /** The pattern itself: a regex-compatible string or a RegExp. */
  pattern: string | RegExp;

  /** Caller-supplied name, when one was given. */
  name?: string;

  /**
   * Largest number of alternation branches found in any one
   * group (or at top level). Large values mark patterns that
   * should be isolated into a RegexSet instance of their own.
   * Set to 0 for entries whose alternations are not counted
   * (literal and fuzzy patterns).
   */
  alternationCount: number;

  /**
   * Whether the pattern is handled as a plain literal, i.e. it
   * contains no regex metacharacters or was explicitly flagged
   * as literal. Literals can be routed to Aho-Corasick.
   */
  isLiteral: boolean;

  /**
   * Fuzzy distance; present only for fuzzy patterns, which are
   * routed to @stll/fuzzy-search.
   */
  fuzzyDistance?: number | "auto";

  /**
   * Per-pattern Aho-Corasick overrides. Literals sharing the
   * same effective options are grouped into one AC instance.
   */
  acOptions?: {
    caseInsensitive?: boolean;
    wholeWords?: boolean;
  };
};
40
+
41
/**
 * Decide whether a pattern string is a pure literal.
 *
 * A pattern is literal when it is non-empty and contains none of
 * the standard regex metacharacters. Any metacharacter makes the
 * pattern count as regex, so strings such as "s.r.o." or "č.p."
 * are NOT literal here; callers that want them matched literally
 * must use the explicit `{ literal: true }` PatternEntry flag.
 *
 * @param pattern - The raw pattern string.
 * @returns `true` for a non-empty string free of metacharacters.
 */
export function isLiteralPattern(
  pattern: string,
): boolean {
  // Every character that forces regex classification.
  const metacharacters = "\\.^$*+?{}()[]|";

  for (const ch of pattern) {
    if (metacharacters.includes(ch)) {
      return false;
    }
  }

  // The empty string is never treated as a literal.
  return pattern.length > 0;
}
77
+
78
/**
 * Return the largest number of alternation branches found in any
 * single group of a regex source string (the top level counts as
 * a group too). Used to detect patterns with large alternations,
 * even nested inside groups, so they can be isolated into their
 * own RegexSet.
 *
 *   "a|b|c"                   → 3
 *   "(a|b)|c"                 → 2 (top level = 2, group = 2)
 *   "(?:Ing\\.|Mgr\\.|Dr\\.)" → 3 (inside the group)
 *
 * Escaped characters and everything inside a character class
 * `[...]` are ignored.
 *
 * @param pattern - Regex source text.
 * @returns The maximum branch count; at least 1.
 */
export function countAlternations(
  pattern: string,
): number {
  // Branch counts of the enclosing groups, innermost last.
  const outerCounts: number[] = [];
  let branches = 1; // branches seen in the current group
  let widest = 1; // largest count among closed groups
  let insideClass = false;

  for (let pos = 0; pos < pattern.length; pos++) {
    const ch = pattern[pos];

    // Skip the escaped character as well; a trailing lone
    // backslash has nothing to skip and falls through.
    if (ch === "\\" && pos + 1 < pattern.length) {
      pos++;
      continue;
    }

    if (ch === "[") {
      insideClass = true;
      continue;
    }
    if (ch === "]") {
      insideClass = false;
      continue;
    }
    if (insideClass) {
      continue;
    }

    switch (ch) {
      case "(":
        // Entering a group: park the outer count, start fresh.
        outerCounts.push(branches);
        branches = 1;
        break;
      case ")":
        // Leaving a group: record it, resume the outer count.
        widest = Math.max(widest, branches);
        branches = outerCounts.pop() ?? 1;
        break;
      case "|":
        branches++;
        break;
      default:
        break;
    }
  }

  // Whatever group is still open at the end counts too.
  return Math.max(widest, branches);
}
136
+
137
/**
 * Classify and normalize pattern entries.
 *
 * Each entry becomes a ClassifiedPattern carrying its original
 * index plus routing metadata:
 * - bare string: literal when `allLiteral` is set or the string
 *   has no regex metacharacters; alternations are counted unless
 *   `allLiteral` is set.
 * - bare RegExp: never literal; alternations always counted.
 * - object with `distance`: fuzzy pattern.
 * - object with a truthy `literal`: forced literal, with optional
 *   per-pattern AC options.
 * - any other object: handled like the bare string / bare RegExp
 *   case, keeping its `name`.
 *
 * @param entries - Pattern entries as supplied by the caller.
 * @param allLiteral - Treat every string pattern as a literal.
 *   RegExp patterns are unaffected.
 * @returns One classified pattern per entry, in input order.
 */
export function classifyPatterns(
  entries: PatternEntry[],
  allLiteral = false,
): ClassifiedPattern[] {
  return entries.map((entry, i) => {
    if (typeof entry === "string") {
      return {
        originalIndex: i,
        pattern: entry,
        alternationCount: allLiteral
          ? 0
          : countAlternations(entry),
        isLiteral: allLiteral || isLiteralPattern(entry),
      };
    }

    if (entry instanceof RegExp) {
      return {
        originalIndex: i,
        pattern: entry,
        alternationCount: countAlternations(entry.source),
        isLiteral: false, // RegExp is never literal
      };
    }

    // Fuzzy pattern: has a `distance` field.
    if ("distance" in entry) {
      return {
        originalIndex: i,
        pattern: entry.pattern,
        name: entry.name,
        alternationCount: 0,
        isLiteral: false,
        fuzzyDistance: entry.distance,
      };
    }

    // Explicit literal: skip metacharacter detection.
    if ("literal" in entry && entry.literal) {
      const hasPerPatternOpts =
        "caseInsensitive" in entry || "wholeWords" in entry;
      return {
        originalIndex: i,
        pattern: entry.pattern,
        name: entry.name,
        alternationCount: 0,
        isLiteral: true,
        acOptions: hasPerPatternOpts
          ? {
              caseInsensitive: entry.caseInsensitive,
              wholeWords: entry.wholeWords,
            }
          : undefined,
      };
    }

    const pat = entry.pattern;

    // A RegExp wrapped in an object is still a regex: it is never
    // literal, and `allLiteral` (which only concerns strings) must
    // not zero its alternation count, otherwise a large alternation
    // would never be isolated. This matches the bare-RegExp branch.
    if (pat instanceof RegExp) {
      return {
        originalIndex: i,
        pattern: pat,
        name: entry.name,
        alternationCount: countAlternations(pat.source),
        isLiteral: false,
      };
    }

    return {
      originalIndex: i,
      pattern: pat,
      name: entry.name,
      alternationCount: allLiteral
        ? 0
        : countAlternations(pat),
      isLiteral: allLiteral || isLiteralPattern(pat),
    };
  });
}
package/src/merge.ts ADDED
@@ -0,0 +1,34 @@
1
+ import type { Match } from "./types";
2
+
3
/**
 * Merge matches from multiple engines into one non-overlapping,
 * position-ordered list.
 *
 * Matches are ordered by start offset (longest first when starts
 * are equal) and then picked greedily: a match is kept only if it
 * starts at or after the end of the previously kept match. Per
 * the original author's note this mirrors regex-set's internal
 * select_non_overlapping.
 *
 * The input array is left untouched; sorting happens on a copy so
 * callers can keep using their array in its original order.
 *
 * @param matches - Matches collected from all engines, any order.
 * @returns A new array of the selected matches.
 */
export function mergeAndSelect(
  matches: Match[],
): Match[] {
  // Start ascending; at equal starts the longer match first.
  const ordered = [...matches].sort((a, b) => {
    if (a.start !== b.start) {
      return a.start - b.start;
    }
    return b.end - b.start - (a.end - a.start);
  });

  // Greedily keep matches that begin at or after the last kept end.
  const selected: Match[] = [];
  let lastEnd = 0;

  for (const m of ordered) {
    if (m.start >= lastEnd) {
      selected.push(m);
      lastEnd = m.end;
    }
  }

  return selected;
}
@@ -0,0 +1,525 @@
1
+ import { AhoCorasick } from "@stll/aho-corasick";
2
+ import { FuzzySearch } from "@stll/fuzzy-search";
3
+ import { RegexSet } from "@stll/regex-set";
4
+
5
+ import type { ClassifiedPattern } from "./classify";
6
+ import { classifyPatterns } from "./classify";
7
+ import { mergeAndSelect } from "./merge";
8
+ import type {
9
+ Match,
10
+ PatternEntry,
11
+ TextSearchOptions,
12
+ } from "./types";
13
+
14
/**
 * Fields shared by every engine slot: lookup tables indexed by the
 * engine-local pattern index.
 */
type SlotBase = {
  /** Engine-local pattern index → index in the caller's input. */
  indexMap: number[];
  /** Engine-local pattern index → pattern name, if one was given. */
  nameMap: (string | undefined)[];
};

/** A RegexSet instance together with its index/name tables. */
type RegexSlot = SlotBase & {
  type: "regex";
  rs: RegexSet;
};

/** An Aho-Corasick instance together with its index/name tables. */
type AcSlot = SlotBase & {
  type: "ac";
  ac: AhoCorasick;
};

/** A FuzzySearch instance together with its index/name tables. */
type FuzzySlot = SlotBase & {
  type: "fuzzy";
  fs: FuzzySearch;
};

/** Any engine slot; discriminated by the `type` tag. */
type EngineSlot = RegexSlot | AcSlot | FuzzySlot;
39
+
40
/** Options every engine builder in this class receives. */
type EngineBaseOptions = {
  unicodeBoundaries: boolean;
  wholeWords: boolean;
  caseInsensitive: boolean;
};

/**
 * Multi-engine text search orchestrator.
 *
 * Every pattern is classified and routed to one engine:
 * - fuzzy patterns (entries with a `distance`) share one
 *   FuzzySearch instance;
 * - literals go to Aho-Corasick, one instance per distinct
 *   caseInsensitive/wholeWords combination;
 * - regex patterns whose alternation count exceeds
 *   `maxAlternations` each get a RegexSet of their own;
 * - the remaining regex patterns share one RegexSet, unless a
 *   timing probe at construction finds the combined set more than
 *   1.5x slower than one RegexSet per pattern.
 *
 * Results from all engines are merged into a single Match[].
 */
export class TextSearch {
  private engines: EngineSlot[] = [];
  private patternCount: number;
  private overlapAll: boolean;
  /**
   * Set when there is exactly one engine and none of its patterns
   * is named. findIter then hands back the engine's own output
   * without remapping indices or attaching names.
   */
  private zeroOverhead = false;

  /**
   * @param patterns - Pattern entries to search for.
   * @param options - Routing and matching options.
   */
  constructor(
    patterns: PatternEntry[],
    options?: TextSearchOptions,
  ) {
    this.patternCount = patterns.length;
    this.overlapAll = options?.overlapStrategy === "all";

    const alternationLimit = options?.maxAlternations ?? 50;
    const classified = classifyPatterns(
      patterns,
      options?.allLiteral ?? false,
    );

    // Four buckets:
    //   1. fuzzy patterns     → FuzzySearch
    //   2. pure literals      → Aho-Corasick
    //   3. normal regex       → shared RegexSet
    //   4. large alternations → isolated RegexSet
    const fuzzyBucket: ClassifiedPattern[] = [];
    const literalBucket: ClassifiedPattern[] = [];
    const sharedBucket: ClassifiedPattern[] = [];
    const isolatedBucket: ClassifiedPattern[] = [];

    for (const cp of classified) {
      let bucket = sharedBucket;
      if (cp.fuzzyDistance !== undefined) {
        bucket = fuzzyBucket;
      } else if (cp.isLiteral) {
        bucket = literalBucket;
      } else if (cp.alternationCount > alternationLimit) {
        bucket = isolatedBucket;
      }
      bucket.push(cp);
    }

    const baseOptions: EngineBaseOptions = {
      unicodeBoundaries: options?.unicodeBoundaries ?? true,
      wholeWords: options?.wholeWords ?? false,
      caseInsensitive: options?.caseInsensitive ?? false,
    };

    // Engines are appended in a fixed order: fuzzy, AC group(s),
    // shared regex, isolated regex.
    if (fuzzyBucket.length > 0) {
      this.engines.push(
        buildFuzzyEngine(fuzzyBucket, {
          unicodeBoundaries: baseOptions.unicodeBoundaries,
          wholeWords: baseOptions.wholeWords,
          metric: options?.fuzzyMetric,
          normalizeDiacritics: options?.normalizeDiacritics,
          caseInsensitive: options?.caseInsensitive,
        }),
      );
    }

    this.addLiteralEngines(literalBucket, baseOptions);
    this.addSharedRegexEngines(sharedBucket, baseOptions);

    for (const cp of isolatedBucket) {
      this.engines.push(buildRegexEngine([cp], baseOptions));
    }

    // Fast path: with a single engine every pattern lives in it in
    // input order, so its indexMap is identity (0→0, 1→1, ...); if
    // no pattern is named either, raw engine output can be returned.
    const sole =
      this.engines.length === 1 ? this.engines[0] : undefined;
    this.zeroOverhead =
      sole !== undefined &&
      sole.nameMap.every((n) => n === undefined);
  }

  /**
   * Build one Aho-Corasick engine per distinct effective
   * caseInsensitive/wholeWords combination. Per-pattern acOptions
   * take precedence over the global options.
   */
  private addLiteralEngines(
    literals: ClassifiedPattern[],
    base: EngineBaseOptions,
  ): void {
    const groups = new Map<
      string,
      {
        caseInsensitive: boolean;
        wholeWords: boolean;
        members: ClassifiedPattern[];
      }
    >();

    for (const cp of literals) {
      const caseInsensitive = Boolean(
        cp.acOptions?.caseInsensitive ?? base.caseInsensitive,
      );
      const wholeWords = Boolean(
        cp.acOptions?.wholeWords ?? base.wholeWords,
      );
      const key = `${caseInsensitive ? 1 : 0}:${wholeWords ? 1 : 0}`;
      const existing = groups.get(key);
      if (existing === undefined) {
        groups.set(key, {
          caseInsensitive,
          wholeWords,
          members: [cp],
        });
      } else {
        existing.members.push(cp);
      }
    }

    // Map iteration follows insertion order, so engines appear in
    // the order their option combination was first seen.
    for (const group of groups.values()) {
      this.engines.push(
        buildAcEngine(group.members, {
          ...base,
          caseInsensitive: group.caseInsensitive,
          wholeWords: group.wholeWords,
        }),
      );
    }
  }

  /**
   * Adaptive regex grouping. With two or more shared patterns,
   * build the combined RegexSet and one RegexSet per pattern, time
   * a single findIter of each on a fixed probe string, and keep
   * the per-pattern engines only when the combined engine took
   * more than 1.5x the summed per-pattern time.
   */
  private addSharedRegexEngines(
    shared: ClassifiedPattern[],
    base: EngineBaseOptions,
  ): void {
    if (shared.length === 0) {
      return;
    }
    if (shared.length === 1) {
      this.engines.push(buildRegexEngine(shared, base));
      return;
    }

    const combined = buildRegexEngine(shared, base);

    // Probe: roughly 1 KB of mixed content.
    const probe = (
      "Hello World 123 test@example.com " +
      "2025-01-01 +420 123 456 789 " +
      "Ing. Jan Novák, s.r.o. Praha 1 "
    ).repeat(10);

    const combinedStart = performance.now();
    combined.rs.findIter(probe);
    const combinedMs = performance.now() - combinedStart;

    // Baseline: sum of the individual scans.
    let individualMs = 0;
    const perPattern: RegexSlot[] = [];
    for (const cp of shared) {
      const single = buildRegexEngine([cp], base);
      const singleStart = performance.now();
      single.rs.findIter(probe);
      individualMs += performance.now() - singleStart;
      perPattern.push(single);
    }

    if (combinedMs > individualMs * 1.5) {
      for (const single of perPattern) {
        this.engines.push(single);
      }
    } else {
      this.engines.push(combined);
    }
  }

  /**
   * Run every engine over the haystack and gather all matches,
   * remapped to original pattern indices and names.
   */
  private collectRemapped(haystack: string): Match[] {
    const gathered: Match[] = [];
    for (const engine of this.engines) {
      const remapped = remapMatches(
        engineFindIter(engine, haystack),
        engine,
      );
      for (const m of remapped) {
        gathered.push(m);
      }
    }
    return gathered;
  }

  /** Number of patterns. */
  get length(): number {
    return this.patternCount;
  }

  /** Returns true if any pattern matches. */
  isMatch(haystack: string): boolean {
    return this.engines.some((engine) =>
      engineIsMatch(engine, haystack),
    );
  }

  /**
   * Find matches in text.
   *
   * With a single engine the engine's own result is returned
   * (remapped only when names must be attached). With several
   * engines:
   * - `overlapStrategy: "longest"` (default) returns
   *   non-overlapping matches, longest wins at equal starts;
   * - `overlapStrategy: "all"` returns every match, overlaps
   *   included, sorted by start position.
   */
  findIter(haystack: string): Match[] {
    if (this.zeroOverhead) {
      // Raw engine output: no remapping needed.
      return engineFindIter(this.engines[0]!, haystack);
    }

    if (this.engines.length === 1) {
      // Single engine whose matches still need names attached.
      const only = this.engines[0]!;
      return remapMatches(engineFindIter(only, haystack), only);
    }

    const all = this.collectRemapped(haystack);
    if (this.overlapAll) {
      return all.sort((a, b) => a.start - b.start);
    }
    return mergeAndSelect(all);
  }

  /** Which pattern indices matched (not where). */
  whichMatch(haystack: string): number[] {
    const hits = new Set<number>();

    // findIter is used for every engine type, because the AC engine
    // has no whichMatch of its own.
    for (const engine of this.engines) {
      for (const m of engineFindIter(engine, haystack)) {
        hits.add(engine.indexMap[m.pattern]!);
      }
    }

    return Array.from(hits);
  }

  /**
   * Replace all non-overlapping matches.
   * replacements[i] replaces pattern i.
   *
   * Non-overlapping selection is always used here, even when
   * overlapStrategy is "all".
   *
   * @throws Error when `replacements` does not have exactly one
   *   entry per pattern.
   */
  replaceAll(
    haystack: string,
    replacements: string[],
  ): string {
    if (replacements.length !== this.patternCount) {
      throw new Error(
        `Expected ${this.patternCount} replacements, got ${replacements.length}`,
      );
    }

    const chosen = mergeAndSelect(this.collectRemapped(haystack));

    let output = "";
    let cursor = 0;
    for (const m of chosen) {
      // Copy the untouched gap, then the replacement text.
      output += haystack.slice(cursor, m.start);
      output += replacements[m.pattern]!;
      cursor = m.end;
    }

    return output + haystack.slice(cursor);
  }
}
354
+
355
/**
 * Build a RegexSet engine from classified patterns.
 *
 * Named patterns are handed to RegexSet as `{ pattern, name }`
 * objects, unnamed ones as the bare pattern. The returned slot
 * also records, per engine-local index, the pattern's original
 * index and its name.
 *
 * @param patterns - Patterns to compile into one RegexSet.
 * @param options - Options forwarded unchanged to RegexSet.
 */
function buildRegexEngine(
  patterns: ClassifiedPattern[],
  options: {
    unicodeBoundaries: boolean;
    wholeWords: boolean;
    caseInsensitive: boolean;
  },
): RegexSlot {
  const rsPatterns: (
    | string
    | RegExp
    | { pattern: string | RegExp; name?: string }
  )[] = patterns.map((cp) =>
    cp.name !== undefined
      ? { pattern: cp.pattern, name: cp.name }
      : cp.pattern,
  );

  return {
    type: "regex",
    rs: new RegexSet(rsPatterns, options),
    indexMap: patterns.map((cp) => cp.originalIndex),
    nameMap: patterns.map((cp) => cp.name),
  };
}
390
+
391
/**
 * Build an Aho-Corasick engine from literal patterns.
 *
 * Every pattern is passed to AhoCorasick as a string (callers only
 * route string literals here). The returned slot also records, per
 * engine-local index, the pattern's original index and its name.
 *
 * @param patterns - Literal patterns sharing the same AC options.
 * @param options - Matching options for this AC instance.
 */
function buildAcEngine(
  patterns: ClassifiedPattern[],
  options: {
    unicodeBoundaries: boolean;
    wholeWords: boolean;
    caseInsensitive: boolean;
  },
): AcSlot {
  const { wholeWords, unicodeBoundaries, caseInsensitive } = options;
  const literals = patterns.map((cp) => cp.pattern as string);

  return {
    type: "ac",
    ac: new AhoCorasick(literals, {
      wholeWords,
      unicodeBoundaries,
      caseInsensitive,
    }),
    indexMap: patterns.map((cp) => cp.originalIndex),
    nameMap: patterns.map((cp) => cp.name),
  };
}
420
+
421
/**
 * Build a FuzzySearch engine from fuzzy patterns.
 *
 * Each pattern is passed on as `{ pattern, distance, name }`. The
 * returned slot also records, per engine-local index, the
 * pattern's original index and its name.
 *
 * @param patterns - Fuzzy patterns (entries carrying a distance).
 * @param options - Matching options passed to FuzzySearch.
 */
function buildFuzzyEngine(
  patterns: ClassifiedPattern[],
  options: {
    unicodeBoundaries: boolean;
    wholeWords: boolean;
    metric?: "levenshtein" | "damerau-levenshtein";
    normalizeDiacritics?: boolean;
    caseInsensitive?: boolean;
  },
): FuzzySlot {
  const fsPatterns: {
    pattern: string;
    distance?: number | "auto";
    name?: string;
  }[] = patterns.map((cp) => ({
    pattern: cp.pattern as string,
    distance: cp.fuzzyDistance,
    name: cp.name,
  }));

  const {
    unicodeBoundaries,
    wholeWords,
    metric,
    normalizeDiacritics,
    caseInsensitive,
  } = options;

  return {
    type: "fuzzy",
    fs: new FuzzySearch(fsPatterns, {
      unicodeBoundaries,
      wholeWords,
      metric,
      normalizeDiacritics,
      caseInsensitive,
    }),
    indexMap: patterns.map((cp) => cp.originalIndex),
    nameMap: patterns.map((cp) => cp.name),
  };
}
463
+
464
/**
 * Ask the engine held by a slot whether the haystack contains any
 * match, dispatching on the slot's `type` tag.
 *
 * @param engine - The slot whose engine should be queried.
 * @param haystack - Text to search.
 */
function engineIsMatch(
  engine: EngineSlot,
  haystack: string,
): boolean {
  if (engine.type === "ac") {
    return engine.ac.isMatch(haystack);
  }
  if (engine.type === "fuzzy") {
    return engine.fs.isMatch(haystack);
  }
  return engine.rs.isMatch(haystack);
}
480
+
481
/**
 * Run findIter on the engine held by a slot, dispatching on the
 * slot's `type` tag. Pattern indices in the result are
 * engine-local; no remapping happens here.
 *
 * @param engine - The slot whose engine should be queried.
 * @param haystack - Text to search.
 */
function engineFindIter(
  engine: EngineSlot,
  haystack: string,
): Match[] {
  if (engine.type === "ac") {
    return engine.ac.findIter(haystack);
  }
  if (engine.type === "fuzzy") {
    return engine.fs.findIter(haystack);
  }
  return engine.rs.findIter(haystack);
}
497
+
498
/**
 * Translate engine-local matches into caller-facing ones.
 *
 * For every match a new object is created whose `pattern` is the
 * original input index (looked up in the slot's indexMap). A
 * `name` is attached when the slot has one for that pattern, and
 * a defined `distance` (fuzzy matches) is carried over. The input
 * matches are not modified.
 *
 * @param matches - Matches as returned by the slot's engine.
 * @param engine - The slot that produced them.
 * @returns A new array of remapped matches, in the same order.
 */
function remapMatches(
  matches: Match[],
  engine: EngineSlot,
): Match[] {
  const remapped: Match[] = [];

  for (const source of matches) {
    const local = source.pattern;
    const target: Match = {
      pattern: engine.indexMap[local]!,
      start: source.start,
      end: source.end,
      text: source.text,
    };

    const label = engine.nameMap[local];
    if (label !== undefined) {
      target.name = label;
    }

    // Keep the edit distance reported for fuzzy matches.
    if ("distance" in source && source.distance !== undefined) {
      target.distance = source.distance as number;
    }

    remapped.push(target);
  }

  return remapped;
}
package/src/types.ts ADDED
@@ -0,0 +1,114 @@
1
/**
 * One match reported by a search. Per the original note, this is
 * the same shape used by @stll/regex-set and @stll/aho-corasick.
 */
export type Match = {
  /** Index of the pattern that produced the match. */
  pattern: number;

  /** Offset of the first matched UTF-16 code unit. */
  start: number;

  /** Offset just past the match (exclusive). */
  end: number;

  /** The text that was matched. */
  text: string;

  /** Name of the pattern, when one was supplied. */
  name?: string;

  /** Edit distance; only set on fuzzy matches. */
  distance?: number;
};
19
+
20
/**
 * A pattern entry for TextSearch. One of:
 * - a bare string or RegExp;
 * - an object with a pattern and optional name;
 * - a fuzzy pattern (object with `distance`);
 * - a forced literal (object with `literal: true`).
 */
export type PatternEntry =
  | string
  | RegExp
  | {
      /** Regex-compatible string or RegExp. */
      pattern: string | RegExp;
      /** Optional name reported on matches. */
      name?: string;
    }
  | {
      pattern: string;
      name?: string;
      /**
       * Fuzzy matching distance. Its presence routes the pattern
       * to @stll/fuzzy-search instead of regex.
       */
      distance: number | "auto";
    }
  | {
      pattern: string;
      name?: string;
      /**
       * Force literal matching via Aho-Corasick. Regex
       * metacharacter detection is skipped, so patterns such as
       * "č.p." or "s.r.o." are matched literally, not as regex.
       */
      literal: true;
      /**
       * Case-insensitive matching for this pattern only;
       * overrides the global option.
       */
      caseInsensitive?: boolean;
      /**
       * Whole-word matching for this pattern only; overrides
       * the global option.
       */
      wholeWords?: boolean;
    };
50
+
51
/** Options accepted by the TextSearch constructor. */
export type TextSearchOptions = {
  /**
   * Use Unicode word boundaries.
   * @default true
   */
  unicodeBoundaries?: boolean;

  /**
   * Match whole words only.
   * @default false
   */
  wholeWords?: boolean;

  /**
   * Alternation-branch limit. A regex pattern with more branches
   * than this is given an engine instance of its own, which
   * prevents DFA state explosion when large alternations are
   * combined with other patterns.
   * @default 50
   */
  maxAlternations?: number;

  /**
   * Metric used for fuzzy matching. Passed through to
   * @stll/fuzzy-search; this package applies no default itself.
   * @default "levenshtein"
   */
  fuzzyMetric?: "levenshtein" | "damerau-levenshtein";

  /**
   * Normalize diacritics for fuzzy matching. Passed through to
   * @stll/fuzzy-search; this package applies no default itself.
   * @default false
   */
  normalizeDiacritics?: boolean;

  /**
   * Case-insensitive matching. The flag is passed to the AC,
   * fuzzy and RegexSet engines alike; literal entries may
   * override it per pattern.
   * @default false
   */
  caseInsensitive?: boolean;

  /**
   * How overlapping matches from different engines or patterns
   * are handled.
   *
   * - "longest": keep the longest non-overlapping match at each
   *   position (default).
   * - "all": return every match, overlaps included. Useful when
   *   the caller applies its own dedup.
   *
   * @default "longest"
   */
  overlapStrategy?: "longest" | "all";

  /**
   * Treat ALL string patterns as literals: route them to AC and
   * skip metacharacter detection. Useful for deny-lists where
   * "s.r.o." means the literal string, not a regex with
   * wildcards. RegExp patterns are not affected.
   * @default false
   */
  allLiteral?: boolean;
};