@stll/text-search 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +150 -0
- package/dist/index.js +464 -0
- package/package.json +15 -6
- package/src/classify.ts +223 -0
- package/src/merge.ts +34 -0
- package/src/text-search.ts +540 -0
- package/src/types.ts +114 -0
package/src/classify.ts
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
import type { PatternEntry } from "./types";
|
|
2
|
+
|
|
3
|
+
/**
 * A pattern entry after normalization, carrying the
 * metadata used to route it to a matching engine.
 */
export type ClassifiedPattern = {
  /** Original index in the input array. */
  originalIndex: number;
  /**
   * The pattern as supplied by the caller: either a
   * string (regex source or literal text) or a
   * RegExp object.
   */
  pattern: string | RegExp;
  /** Optional name. */
  name?: string;
  /**
   * Maximum number of alternation branches found at
   * any nesting depth of the pattern (not only the
   * top level), as computed by countAlternations.
   * Used to detect large alternations that should
   * be isolated into their own RegexSet instance.
   * Set to 0 when counting is skipped: fuzzy
   * entries, explicit-literal entries, and string
   * patterns classified in all-literal mode.
   */
  alternationCount: number;
  /**
   * True if the pattern is treated as literal text:
   * either it contains no regex metacharacters, or
   * literal handling was requested explicitly (per
   * entry or for the whole set). These can be routed
   * to Aho-Corasick for SIMD-accelerated matching.
   * Always false for RegExp and fuzzy patterns.
   */
  isLiteral: boolean;
  /**
   * Fuzzy distance if this is a fuzzy pattern.
   * Routes to @stll/fuzzy-search.
   */
  fuzzyDistance?: number | "auto";
  /**
   * Per-pattern AC options. When set, this literal
   * is grouped with others that have the same
   * options into a separate AC engine instance.
   */
  acOptions?: {
    caseInsensitive?: boolean;
    wholeWords?: boolean;
  };
};
|
|
40
|
+
|
|
41
|
+
/**
 * Decide whether a string can be matched as plain
 * text. A pattern qualifies when it is non-empty and
 * contains none of the regex metacharacters
 * \ . ^ $ * + ? { } ( ) [ ] |
 * Qualifying patterns are routed to Aho-Corasick
 * instead of the regex DFA.
 */
export function isLiteralPattern(
  pattern: string,
): boolean {
  // A single metacharacter is enough to classify
  // the pattern as regex (→ RegexSet). Patterns
  // that contain dots/parens but are meant as plain
  // text (e.g., "s.r.o.", "č.p.") must opt in via
  // the explicit { literal: true } PatternEntry flag.
  const metacharacters = "\\.^$*+?{}()[]|";

  for (const symbol of pattern) {
    if (metacharacters.includes(symbol)) {
      return false;
    }
  }

  // The empty string is never a literal.
  return pattern.length > 0;
}
|
|
77
|
+
|
|
78
|
+
/**
 * Count the maximum alternation branches at any
 * depth in a regex string. Used to detect patterns
 * with large alternations (even nested inside
 * groups) that should be isolated into their own
 * RegexSet to prevent DFA state explosion.
 *
 * "a|b|c" → 3
 * "(a|b)|c" → 2 (max of top=2, depth1=2)
 * "(?:Ing\\.|Mgr\\.|Dr\\.)" → 3 (depth 1)
 *
 * Escaped characters and anything inside a
 * character class ([...]) are ignored. A pattern
 * without any "|" (including "") yields 1.
 *
 * @param pattern Regex source text to scan.
 * @returns The largest branch count of any single
 *   group or of the top level; always >= 1.
 */
export function countAlternations(
  pattern: string,
): number {
  let inClass = false;

  // Largest branch count of any group closed so far.
  let max = 1;
  // Branch count of the group currently being read.
  let currentCount = 1;
  // Branch counts of the enclosing, still-open groups.
  const stack: number[] = [];

  for (let i = 0; i < pattern.length; i++) {
    const ch = pattern[i];

    // Skip the character following a backslash so
    // that "\|", "\(", "\[" ... are not interpreted.
    if (ch === "\\" && i + 1 < pattern.length) {
      i++;
      continue;
    }

    if (ch === "[") {
      inClass = true;
      continue;
    }
    if (ch === "]") {
      inClass = false;
      continue;
    }
    if (inClass) continue;

    switch (ch) {
      case "(":
        // Entering a group: park the outer count and
        // start a fresh one.
        stack.push(currentCount);
        currentCount = 1;
        break;
      case ")":
        // Leaving a group: record its count, then
        // resume the outer one. An unmatched ")"
        // has nothing to resume and restarts at 1.
        max = Math.max(max, currentCount);
        currentCount = stack.pop() ?? 1;
        break;
      case "|":
        currentCount++;
        break;
      default:
        break;
    }
  }

  // The top level (or a group left unclosed) is
  // never popped, so account for it here.
  return Math.max(max, currentCount);
}
|
|
136
|
+
|
|
137
|
+
/**
 * Classify and normalize pattern entries.
 *
 * Each entry is mapped, in order, to a
 * ClassifiedPattern whose `originalIndex` is its
 * position in `entries`:
 *
 * - plain string: literal if `allLiteral` is set or
 *   the string has no regex metacharacters;
 * - RegExp: never literal, alternations counted
 *   from its source;
 * - object with a `distance` key: fuzzy pattern;
 * - object with a truthy `literal` flag: literal
 *   without metacharacter detection, optionally
 *   carrying per-pattern AC options;
 * - any other object: classified by its `pattern`
 *   like a plain string or RegExp, keeping `name`.
 *
 * @param entries Patterns to classify.
 * @param allLiteral When true, string patterns are
 *   treated as literals without metacharacter
 *   detection and get an alternation count of 0.
 *   RegExp patterns are unaffected.
 * @returns One classified pattern per entry.
 */
export function classifyPatterns(
  entries: PatternEntry[],
  allLiteral = false,
): ClassifiedPattern[] {
  return entries.map((entry, i) => {
    if (typeof entry === "string") {
      return {
        originalIndex: i,
        pattern: entry,
        alternationCount: allLiteral
          ? 0
          : countAlternations(entry),
        isLiteral: allLiteral ||
          isLiteralPattern(entry),
      };
    }

    if (entry instanceof RegExp) {
      return {
        originalIndex: i,
        pattern: entry,
        alternationCount: countAlternations(
          entry.source,
        ),
        isLiteral: false, // RegExp is never literal
      };
    }

    // Fuzzy pattern: has `distance` field
    if ("distance" in entry) {
      const result: ClassifiedPattern = {
        originalIndex: i,
        pattern: entry.pattern,
        alternationCount: 0,
        isLiteral: false,
        fuzzyDistance: entry.distance,
      };
      if (entry.name !== undefined) result.name = entry.name;
      return result;
    }

    // Explicit literal: skip metachar detection
    if ("literal" in entry && entry.literal) {
      const hasPerPatternOpts =
        "caseInsensitive" in entry ||
        "wholeWords" in entry;
      const result: ClassifiedPattern = {
        originalIndex: i,
        pattern: entry.pattern,
        alternationCount: 0,
        isLiteral: true,
      };
      if (entry.name !== undefined) result.name = entry.name;
      if (hasPerPatternOpts) {
        // Copy only the options that carry a value.
        const opts: NonNullable<
          ClassifiedPattern["acOptions"]
        > = {};
        if (entry.caseInsensitive !== undefined)
          opts.caseInsensitive = entry.caseInsensitive;
        if (entry.wholeWords !== undefined)
          opts.wholeWords = entry.wholeWords;
        result.acOptions = opts;
      }
      return result;
    }

    const pat = entry.pattern;
    const source =
      pat instanceof RegExp ? pat.source : pat;
    // The all-literal shortcut applies to strings
    // only. A RegExp here is still classified as
    // regex, so its alternations must be counted,
    // just as for a bare RegExp entry.
    const skipCounting =
      allLiteral && typeof pat === "string";

    const result: ClassifiedPattern = {
      originalIndex: i,
      pattern: pat,
      alternationCount: skipCounting
        ? 0
        : countAlternations(source),
      isLiteral:
        typeof pat === "string" &&
        (allLiteral || isLiteralPattern(pat)),
    };
    if (entry.name !== undefined) result.name = entry.name;
    return result;
  });
}
|
package/src/merge.ts
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import type { Match } from "./types";
|
|
2
|
+
|
|
3
|
+
/**
 * Merge matches from multiple engines, sort by
 * position, and select non-overlapping (longest
 * first at ties). Same algorithm as regex-set's
 * internal select_non_overlapping.
 *
 * The input array is not reordered; sorting is done
 * on a shallow copy.
 *
 * @param matches Candidate matches, in any order.
 * @returns The selected matches in ascending start
 *   order. An input of zero or one match is
 *   returned as-is.
 */
export function mergeAndSelect(
  matches: Match[],
): Match[] {
  if (matches.length <= 1) return matches;

  // Sort: start ascending, longest first at ties.
  // Copy first so the caller's array keeps its order.
  const ordered = [...matches].sort((a, b) => {
    if (a.start !== b.start) {
      return a.start - b.start;
    }
    return b.end - b.start - (a.end - a.start);
  });

  // Greedily select non-overlapping: keep a match
  // only if it starts at or after the end of the
  // previously kept one.
  const selected: Match[] = [];
  let lastEnd = 0;

  for (const m of ordered) {
    if (m.start >= lastEnd) {
      selected.push(m);
      lastEnd = m.end;
    }
  }

  return selected;
}
|