@agenticmail/enterprise 0.4.2 → 0.4.3

This diff represents the contents of package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
@@ -0,0 +1,395 @@
+ // src/lib/text-search.ts
+ var BM25_K1 = 1.2;
+ var BM25_B = 0.75;
+ var FIELD_WEIGHT_TITLE = 3;
+ var FIELD_WEIGHT_TAGS = 2;
+ var FIELD_WEIGHT_CONTENT = 1;
+ var PREFIX_MATCH_PENALTY = 0.7;
+ var STOP_WORDS = /* @__PURE__ */ new Set([
+   "a",
+   "about",
+   "above",
+   "after",
+   "again",
+   "against",
+   "all",
+   "am",
+   "an",
+   "and",
+   "any",
+   "are",
+   "as",
+   "at",
+   "be",
+   "because",
+   "been",
+   "before",
+   "being",
+   "below",
+   "between",
+   "both",
+   "but",
+   "by",
+   "can",
+   "could",
+   "did",
+   "do",
+   "does",
+   "doing",
+   "down",
+   "during",
+   "each",
+   "either",
+   "every",
+   "few",
+   "for",
+   "from",
+   "further",
+   "get",
+   "got",
+   "had",
+   "has",
+   "have",
+   "having",
+   "he",
+   "her",
+   "here",
+   "hers",
+   "herself",
+   "him",
+   "himself",
+   "his",
+   "how",
+   "i",
+   "if",
+   "in",
+   "into",
+   "is",
+   "it",
+   "its",
+   "itself",
+   "just",
+   "may",
+   "me",
+   "might",
+   "more",
+   "most",
+   "must",
+   "my",
+   "myself",
+   "neither",
+   "no",
+   "nor",
+   "not",
+   "now",
+   "of",
+   "off",
+   "on",
+   "once",
+   "only",
+   "or",
+   "other",
+   "ought",
+   "our",
+   "ours",
+   "ourselves",
+   "out",
+   "over",
+   "own",
+   "same",
+   "shall",
+   "she",
+   "should",
+   "so",
+   "some",
+   "such",
+   "than",
+   "that",
+   "the",
+   "their",
+   "theirs",
+   "them",
+   "themselves",
+   "then",
+   "there",
+   "these",
+   "they",
+   "this",
+   "those",
+   "through",
+   "to",
+   "too",
+   "under",
+   "until",
+   "up",
+   "us",
+   "very",
+   "was",
+   "we",
+   "were",
+   "what",
+   "when",
+   "where",
+   "which",
+   "while",
+   "who",
+   "whom",
+   "why",
+   "will",
+   "with",
+   "would",
+   "yet",
+   "you",
+   "your",
+   "yours",
+   "yourself",
+   "yourselves"
+ ]);
+ var STEM_RULES = [
+   // Step 1: plurals and past participles
+   [/ies$/, "i", 3], // policies → polici, queries → queri
+   [/sses$/, "ss", 4], // addresses → address
+   [/([^s])s$/, "$1", 3], // items → item, but not "ss"
+   [/eed$/, "ee", 4], // agreed → agree
+   [/ed$/, "", 3], // configured → configur, but min length 3
+   [/ing$/, "", 4], // running → runn → run (handled below)
+   // Step 2: derivational suffixes
+   [/ational$/, "ate", 6], // relational → relate
+   [/tion$/, "t", 5], // adoption → adopt
+   [/ness$/, "", 5], // awareness → aware
+   [/ment$/, "", 5], // deployment → deploy
+   [/able$/, "", 5], // configurable → configur
+   [/ible$/, "", 5], // accessible → access
+   [/ful$/, "", 5], // powerful → power
+   [/ous$/, "", 5], // dangerous → danger
+   [/ive$/, "", 5], // interactive → interact
+   [/ize$/, "", 4], // normalize → normal
+   [/ise$/, "", 4], // organise → organ
+   [/ally$/, "", 5], // automatically → automat
+   [/ly$/, "", 4], // quickly → quick
+   [/er$/, "", 4] // handler → handl
+ ];
+ var DOUBLE_CONSONANT = /([^aeiou])\1$/;
+ function stem(word) {
+   if (word.length < 3) return word;
+   let stemmed = word;
+   for (const [pattern, replacement, minLen] of STEM_RULES) {
+     if (stemmed.length >= minLen && pattern.test(stemmed)) {
+       stemmed = stemmed.replace(pattern, replacement);
+       break;
+     }
+   }
+   if (stemmed.length > 2 && DOUBLE_CONSONANT.test(stemmed)) {
+     stemmed = stemmed.slice(0, -1);
+   }
+   return stemmed;
+ }
+ function tokenize(text) {
+   return text.toLowerCase().split(/[^a-z0-9]+/).filter((t) => t.length > 1 && !STOP_WORDS.has(t)).map(stem);
+ }
+ var MemorySearchIndex = class {
+   /** Posting lists: stemmed term → Set of memory IDs containing it */
+   postings = /* @__PURE__ */ new Map();
+   /** Per-document metadata for BM25 scoring */
+   docs = /* @__PURE__ */ new Map();
+   /** Pre-computed IDF values. Stale flag triggers lazy recomputation. */
+   idf = /* @__PURE__ */ new Map();
+   idfStale = true;
+   /** 3-character prefix map for prefix matching: prefix → Set of full stems */
+   prefixMap = /* @__PURE__ */ new Map();
+   /** Total weighted document length (for computing average) */
+   totalWeightedLen = 0;
+   get docCount() {
+     return this.docs.size;
+   }
+   get avgDocLen() {
+     return this.docs.size > 0 ? this.totalWeightedLen / this.docs.size : 1;
+   }
+   /**
+    * Index a memory entry. Extracts stems from title, content, and tags
+    * with field-specific weighting and builds posting lists.
+    */
+   addDocument(id, entry) {
+     if (this.docs.has(id)) this.removeDocument(id);
+     const titleTokens = tokenize(entry.title);
+     const contentTokens = tokenize(entry.content);
+     const tagTokens = entry.tags.flatMap((t) => tokenize(t));
+     const weightedTf = /* @__PURE__ */ new Map();
+     for (const t of titleTokens) weightedTf.set(t, (weightedTf.get(t) || 0) + FIELD_WEIGHT_TITLE);
+     for (const t of tagTokens) weightedTf.set(t, (weightedTf.get(t) || 0) + FIELD_WEIGHT_TAGS);
+     for (const t of contentTokens) weightedTf.set(t, (weightedTf.get(t) || 0) + FIELD_WEIGHT_CONTENT);
+     const weightedLen = titleTokens.length * FIELD_WEIGHT_TITLE + tagTokens.length * FIELD_WEIGHT_TAGS + contentTokens.length * FIELD_WEIGHT_CONTENT;
+     const allStems = /* @__PURE__ */ new Set();
+     for (const t of weightedTf.keys()) allStems.add(t);
+     const stemSequence = [...titleTokens, ...contentTokens];
+     const docRecord = { weightedTf, weightedLen, allStems, stemSequence };
+     this.docs.set(id, docRecord);
+     this.totalWeightedLen += weightedLen;
+     for (const term of allStems) {
+       let posting = this.postings.get(term);
+       if (!posting) {
+         posting = /* @__PURE__ */ new Set();
+         this.postings.set(term, posting);
+       }
+       posting.add(id);
+       if (term.length >= 3) {
+         const prefix = term.slice(0, 3);
+         let prefixSet = this.prefixMap.get(prefix);
+         if (!prefixSet) {
+           prefixSet = /* @__PURE__ */ new Set();
+           this.prefixMap.set(prefix, prefixSet);
+         }
+         prefixSet.add(term);
+       }
+     }
+     this.idfStale = true;
+   }
+   /** Remove a document from the index. */
+   removeDocument(id) {
+     const doc = this.docs.get(id);
+     if (!doc) return;
+     this.totalWeightedLen -= doc.weightedLen;
+     this.docs.delete(id);
+     for (const term of doc.allStems) {
+       const posting = this.postings.get(term);
+       if (posting) {
+         posting.delete(id);
+         if (posting.size === 0) {
+           this.postings.delete(term);
+           if (term.length >= 3) {
+             const prefixSet = this.prefixMap.get(term.slice(0, 3));
+             if (prefixSet) {
+               prefixSet.delete(term);
+               if (prefixSet.size === 0) this.prefixMap.delete(term.slice(0, 3));
+             }
+           }
+         }
+       }
+     }
+     this.idfStale = true;
+   }
+   /** Recompute IDF values for all terms. Called lazily before search. */
+   refreshIdf() {
+     if (!this.idfStale) return;
+     const N = this.docs.size;
+     this.idf.clear();
+     for (const [term, posting] of this.postings) {
+       const df = posting.size;
+       this.idf.set(term, Math.log((N - df + 0.5) / (df + 0.5) + 1));
+     }
+     this.idfStale = false;
+   }
+   /**
+    * Expand query terms with prefix matches.
+    * "deploy" → ["deploy", "deployment", "deploying", ...] (if they exist in the index)
+    */
+   expandQueryTerms(queryStems) {
+     const expanded = /* @__PURE__ */ new Map();
+     for (const qs of queryStems) {
+       if (this.postings.has(qs)) {
+         expanded.set(qs, Math.max(expanded.get(qs) || 0, 1));
+       }
+       if (qs.length >= 3) {
+         const prefix = qs.slice(0, 3);
+         const candidates = this.prefixMap.get(prefix);
+         if (candidates) {
+           for (const candidate of candidates) {
+             if (candidate !== qs && candidate.startsWith(qs)) {
+               expanded.set(candidate, Math.max(expanded.get(candidate) || 0, PREFIX_MATCH_PENALTY));
+             }
+           }
+         }
+       }
+     }
+     return expanded;
+   }
+   /**
+    * Compute bigram proximity boost: if two query terms appear adjacent
+    * in the document's stem sequence, boost the score.
+    */
+   bigramProximityBoost(docId, queryStems) {
+     if (queryStems.length < 2) return 0;
+     const doc = this.docs.get(docId);
+     if (!doc || doc.stemSequence.length < 2) return 0;
+     let boost = 0;
+     const seq = doc.stemSequence;
+     const querySet = new Set(queryStems);
+     for (let i = 0; i < seq.length - 1; i++) {
+       if (querySet.has(seq[i]) && querySet.has(seq[i + 1]) && seq[i] !== seq[i + 1]) {
+         boost += 0.5;
+       }
+     }
+     return Math.min(boost, 2);
+   }
+   /**
+    * Search the index for documents matching a query.
+    * Returns scored results sorted by BM25F relevance.
+    *
+    * @param query - Raw query string
+    * @param candidateIds - Optional: only score these document IDs (for agent-scoped search)
+    * @returns Array of { id, score } sorted by descending score
+    */
+   search(query, candidateIds) {
+     const queryStems = tokenize(query);
+     if (queryStems.length === 0) return [];
+     this.refreshIdf();
+     const expandedTerms = this.expandQueryTerms(queryStems);
+     if (expandedTerms.size === 0) return [];
+     const avgDl = this.avgDocLen;
+     const candidates = /* @__PURE__ */ new Set();
+     for (const term of expandedTerms.keys()) {
+       const posting = this.postings.get(term);
+       if (posting) {
+         for (const docId of posting) {
+           if (!candidateIds || candidateIds.has(docId)) candidates.add(docId);
+         }
+       }
+     }
+     const results = [];
+     for (const docId of candidates) {
+       const doc = this.docs.get(docId);
+       if (!doc) continue;
+       let score = 0;
+       for (const [term, weight] of expandedTerms) {
+         const tf = doc.weightedTf.get(term) || 0;
+         if (tf === 0) continue;
+         const termIdf = this.idf.get(term) || 0;
+         const numerator = tf * (BM25_K1 + 1);
+         const denominator = tf + BM25_K1 * (1 - BM25_B + BM25_B * (doc.weightedLen / avgDl));
+         score += termIdf * (numerator / denominator) * weight;
+       }
+       score += this.bigramProximityBoost(docId, queryStems);
+       if (score > 0) results.push({ id: docId, score });
+     }
+     results.sort((a, b) => b.score - a.score);
+     return results;
+   }
+   /** Check if a document exists in the index. */
+   has(id) {
+     return this.docs.has(id);
+   }
+ };
+
+ export {
+   MemorySearchIndex
+ };
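
Usage sketch (editor's illustration, not part of the published diff). The entry shape below, { title, content, tags }, mirrors the properties that addDocument reads; the import path is an assumption, so adjust it to wherever the package actually surfaces MemorySearchIndex.

// Hypothetical import path; the chunk above only shows the export,
// not where the package re-exports it.
import { MemorySearchIndex } from "@agenticmail/enterprise";

const index = new MemorySearchIndex();

index.addDocument("mem-1", {
  title: "Deployment checklist",
  content: "Steps for deploying the mail relay to production.",
  tags: ["deploy", "ops"]
});
index.addDocument("mem-2", {
  title: "Tag taxonomy",
  content: "How tags are assigned to memories.",
  tags: ["docs"]
});

// "deployment" and "deploying" both stem to "deploy", so the title,
// tag, and content of mem-1 all match; title hits count 3x, tag hits
// 2x, content hits 1x in the weighted term frequency.
index.search("deploy production");
// → [{ id: "mem-1", score: <positive> }]

// The optional second argument restricts scoring to a candidate set
// (the "agent-scoped search" mentioned in the doc comment).
index.search("deploy", new Set(["mem-2"]));
// → [] (mem-2 never mentions "deploy")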
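
Prefix expansion, continuing the same sketch: a query stem of three or more characters also matches longer indexed stems that start with it, at PREFIX_MATCH_PENALTY (0.7) of full weight. The document here is invented for illustration.

index.addDocument("mem-3", {
  title: "Normalizer settings",
  content: "",
  tags: []
});

// "normalizer" is indexed as the stem "normaliz" (the /er$/ rule).
// The query stem "normal" has no posting of its own, but it is a
// prefix of "normaliz", so expandQueryTerms matches it at weight 0.7.
index.search("normal");
// → [{ id: "mem-3", score: <positive, discounted by 0.7> }]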
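
The bigram proximity boost is also visible end to end: when two distinct query stems sit next to each other in a document's title/content stem sequence, search adds 0.5 per adjacent pair, capped at 2, on top of the BM25 score.

index.addDocument("mem-4", {
  title: "Mail relay runbook",
  content: "Restarting the mail relay safely.",
  tags: []
});

// "mail" and "relay" are adjacent in both the title and the content,
// so mem-4 gets a 2 × 0.5 = 1.0 boost on top of its higher term
// frequencies, and should rank above mem-1 for this query.
index.search("mail relay");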
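
For the scoring itself, search() applies standard BM25 per term over the field-weighted term frequencies. A worked example of that arithmetic, with invented numbers (N indexed documents, df of them containing the term, a weighted tf in the scored document, document length at the average):

// All numbers here are made up for illustration.
const K1 = 1.2, B = 0.75;  // BM25_K1, BM25_B above
const N = 100, df = 4;     // 100 docs, term appears in 4
const idf = Math.log((N - df + 0.5) / (df + 0.5) + 1);         // ≈ 3.11
const tf = 6, docLen = 40, avgLen = 40;                        // weighted
const numerator = tf * (K1 + 1);                               // 13.2
const denominator = tf + K1 * (1 - B + B * (docLen / avgLen)); // 7.2
const termScore = idf * (numerator / denominator);             // ≈ 5.70
// A full match multiplies this contribution by 1; a prefix-expanded
// match by PREFIX_MATCH_PENALTY (0.7). Per-term scores are summed,
// then the bigram proximity boost is added.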