@stll/text-search 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,223 @@
1
+ import type { PatternEntry } from "./types";
2
+
3
/**
 * A pattern entry after normalization, together with
 * the metadata used to route it to a matching engine.
 */
export type ClassifiedPattern = {
  /** Position of the entry in the caller's input array. */
  originalIndex: number;

  /** Regex-compatible pattern, as a string or a RegExp. */
  pattern: string | RegExp;

  /** Optional name carried over from the entry. */
  name?: string;

  /**
   * True when the pattern is a pure literal string
   * (no regex metacharacters). Literals can be routed
   * to Aho-Corasick for SIMD-accelerated matching.
   */
  isLiteral: boolean;

  /**
   * Alternation branch count as computed by
   * countAlternations (the maximum found in any one
   * group or at the top level); 0 when counting was
   * skipped. Used to detect large alternations that
   * should be isolated into their own RegexSet
   * instance.
   */
  alternationCount: number;

  /**
   * Edit distance when this is a fuzzy pattern.
   * Fuzzy patterns route to @stll/fuzzy-search.
   */
  fuzzyDistance?: number | "auto";

  /**
   * Per-pattern AC options. When present, the literal
   * is grouped with other literals that share the
   * same options into a separate AC engine instance.
   */
  acOptions?: { caseInsensitive?: boolean; wholeWords?: boolean };
};
40
+
41
/**
 * Decide whether a pattern string can be treated as a
 * pure literal. Pure literals are routed to
 * Aho-Corasick instead of the regex DFA.
 *
 * A pattern qualifies when it is non-empty and contains
 * none of the standard regex metacharacters:
 * \ . ^ $ * + ? { } ( ) [ ] |
 *
 * Strings that contain dots or parens but are meant
 * literally (e.g., "s.r.o.", "č.p.") do not pass this
 * check; use the explicit { literal: true }
 * PatternEntry flag to force literal AC routing.
 *
 * @param pattern - Candidate pattern string.
 * @returns true for a non-empty string that has no
 *   regex metacharacters, false otherwise.
 */
export function isLiteralPattern(
  pattern: string,
): boolean {
  // An empty pattern is never a literal.
  if (pattern.length === 0) return false;

  // Any one of these makes the pattern a regex
  // (→ RegexSet).
  const metachars = "\\.^$*+?{}()[]|";
  for (const symbol of pattern) {
    if (metachars.includes(symbol)) return false;
  }
  return true;
}
77
+
78
/**
 * Count the maximum alternation branches at any
 * depth in a regex string. Used to detect patterns
 * with large alternations (even nested inside
 * groups) that should be isolated into their own
 * RegexSet to prevent DFA state explosion.
 *
 * Escaped characters (`\x`) are skipped, and `(`,
 * `)` and `|` inside a `[...]` character class are
 * ignored.
 *
 * "a|b|c" → 3
 * "(a|b)|c" → 2 (max of top=2, depth1=2)
 * "(?:Ing\\.|Mgr\\.|Dr\\.)" → 3 (depth 1)
 * "a|b|c|(d" → 4 (unclosed group: the enclosing
 *   level is still counted)
 *
 * @param pattern - Regex source string to scan.
 * @returns The largest branch count seen in any one
 *   group or at the top level; at least 1.
 */
export function countAlternations(
  pattern: string,
): number {
  let inClass = false;

  // Largest branch count seen in any finished group.
  let max = 1;
  // Branch count of the group currently being scanned.
  let currentCount = 1;
  // Branch counts of the enclosing, still-open groups.
  const stack: number[] = [];

  let i = 0;
  while (i < pattern.length) {
    const ch = pattern[i];

    // Skip the escaped character so "\|", "\(" and
    // "\[" are not mistaken for syntax.
    if (ch === "\\" && i + 1 < pattern.length) {
      i += 2;
      continue;
    }

    if (ch === "[") {
      inClass = true;
    } else if (ch === "]") {
      inClass = false;
    } else if (!inClass) {
      if (ch === "(") {
        // Entering a group: start a fresh count.
        stack.push(currentCount);
        currentCount = 1;
      } else if (ch === ")") {
        // Leaving a group: record its count and
        // resume the enclosing one. A stray ")"
        // simply restarts the count at 1.
        if (currentCount > max) max = currentCount;
        currentCount = stack.pop() ?? 1;
      } else if (ch === "|") {
        currentCount++;
      }
    }

    i++;
  }

  // The innermost level still open at end of input
  // (the top level for well-formed patterns).
  if (currentCount > max) max = currentCount;

  // Groups that were never closed leave their
  // enclosing counts on the stack; include them so
  // malformed input cannot hide a large alternation.
  for (const saved of stack) {
    if (saved > max) max = saved;
  }

  return max;
}
136
+
137
/**
 * Classify and normalize pattern entries.
 *
 * Each entry is mapped, in input order, to a
 * ClassifiedPattern according to its shape:
 * - plain string: literal when `allLiteral` is set or
 *   the string has no regex metacharacters;
 * - RegExp: never literal, alternations always counted;
 * - object with a `distance` field: fuzzy pattern;
 * - object with a truthy `literal` field: forced
 *   literal, with optional per-pattern AC options;
 * - any other object: classified like its `pattern`
 *   (string or RegExp), keeping the optional name.
 *
 * @param entries - Pattern entries to classify.
 * @param allLiteral - When true, every string pattern
 *   is treated as a literal and its alternation count
 *   is reported as 0. RegExp patterns are unaffected.
 *   Defaults to false.
 * @returns One ClassifiedPattern per entry, with
 *   `originalIndex` set to the entry's input position.
 */
export function classifyPatterns(
  entries: PatternEntry[],
  allLiteral = false,
): ClassifiedPattern[] {
  return entries.map((entry, i) => {
    if (typeof entry === "string") {
      return {
        originalIndex: i,
        pattern: entry,
        alternationCount: allLiteral
          ? 0
          : countAlternations(entry),
        isLiteral: allLiteral ||
          isLiteralPattern(entry),
      };
    }

    if (entry instanceof RegExp) {
      return {
        originalIndex: i,
        pattern: entry,
        alternationCount: countAlternations(
          entry.source,
        ),
        isLiteral: false, // RegExp is never literal
      };
    }

    // Fuzzy pattern: has `distance` field
    if ("distance" in entry) {
      const result: ClassifiedPattern = {
        originalIndex: i,
        pattern: entry.pattern,
        alternationCount: 0,
        isLiteral: false,
        fuzzyDistance: entry.distance,
      };
      if (entry.name !== undefined) result.name = entry.name;
      return result;
    }

    // Explicit literal: skip metachar detection
    if ("literal" in entry && entry.literal) {
      const hasPerPatternOpts =
        "caseInsensitive" in entry ||
        "wholeWords" in entry;
      const result: ClassifiedPattern = {
        originalIndex: i,
        pattern: entry.pattern,
        alternationCount: 0,
        isLiteral: true,
      };
      if (entry.name !== undefined) result.name = entry.name;
      if (hasPerPatternOpts) {
        // Copy only the options that are actually
        // defined, so absent keys stay absent.
        const opts: NonNullable<
          ClassifiedPattern["acOptions"]
        > = {};
        if (entry.caseInsensitive !== undefined)
          opts.caseInsensitive = entry.caseInsensitive;
        if (entry.wholeWords !== undefined)
          opts.wholeWords = entry.wholeWords;
        result.acOptions = opts;
      }
      return result;
    }

    // Remaining object form: a string or RegExp
    // pattern with an optional name.
    const pat = entry.pattern;
    const source =
      pat instanceof RegExp ? pat.source : pat;

    const result: ClassifiedPattern = {
      originalIndex: i,
      pattern: pat,
      // `allLiteral` only turns *string* patterns into
      // literals. A RegExp is never literal, so its
      // alternations are still counted, exactly as for
      // a bare RegExp entry above.
      alternationCount:
        allLiteral && typeof pat === "string"
          ? 0
          : countAlternations(source),
      isLiteral:
        typeof pat === "string" &&
        (allLiteral || isLiteralPattern(pat)),
    };
    if (entry.name !== undefined) result.name = entry.name;
    return result;
  });
}
package/src/merge.ts ADDED
@@ -0,0 +1,34 @@
1
+ import type { Match } from "./types";
2
+
3
/**
 * Merge matches from multiple engines, sort by
 * position, and select non-overlapping (longest
 * first at ties). Same algorithm as regex-set's
 * internal select_non_overlapping.
 *
 * The input array is not modified: sorting happens on
 * a shallow copy, and a new array is returned even
 * for empty or single-element input.
 *
 * @param matches - Matches collected from all engines,
 *   in any order.
 * @returns The selected matches in ascending start
 *   order; each one starts at or after the end of the
 *   previously selected match.
 */
export function mergeAndSelect(
  matches: Match[],
): Match[] {
  // Nothing to sort or de-overlap.
  if (matches.length <= 1) return [...matches];

  // Sort a copy: start ascending, longest first at
  // ties. Copying keeps the caller's array intact.
  const ordered = [...matches].sort((a, b) => {
    if (a.start !== b.start) {
      return a.start - b.start;
    }
    return b.end - b.start - (a.end - a.start);
  });

  // Greedily select non-overlapping: keep a match only
  // if it begins at or after the end of the last kept
  // one.
  const selected: Match[] = [];
  let lastEnd = 0;

  for (const m of ordered) {
    if (m.start >= lastEnd) {
      selected.push(m);
      lastEnd = m.end;
    }
  }

  return selected;
}