@agenticmail/enterprise 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,358 @@
1
+ /**
2
+ * BM25F Full-Text Search Engine
3
+ *
4
+ * Comprehensive text relevance scoring with zero dependencies.
5
+ * Extracted as a shared module for use by both the engine memory
6
+ * system and the agent tool memory.
7
+ *
8
+ * Features:
9
+ * - Pre-built inverted index maintained incrementally (no re-indexing on query)
10
+ * - Lightweight Porter-style stemmer (suffix stripping for English)
11
+ * - Field weighting via BM25F: title x3, tags x2, content x1
12
+ * - Pre-computed IDF values updated on index mutations
13
+ * - Prefix matching: "deploy" matches "deployment", "deployments"
14
+ * - Per-agent partitioning for scoped searches
15
+ * - Bigram proximity boost: terms appearing adjacent score higher
16
+ */
17
+
18
// ── BM25 Parameters ──

export const BM25_K1 = 1.2; // Term frequency saturation: higher k1 lets repeated terms keep adding score
export const BM25_B = 0.75; // Document length normalization: 0 = no penalty, 1 = full penalty for long docs
export const FIELD_WEIGHT_TITLE = 3.0; // BM25F field boost: title terms count 3x toward weighted TF
export const FIELD_WEIGHT_TAGS = 2.0; // Tag terms count 2x
export const FIELD_WEIGHT_CONTENT = 1.0; // Content terms count 1x (baseline)
export const PREFIX_MATCH_PENALTY = 0.7; // Prefix matches score 70% of exact matches
26
+
27
+ // ── Stop Words ──
28
+
29
+ export const STOP_WORDS = new Set([
30
+ 'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an',
31
+ 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before',
32
+ 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'could', 'did',
33
+ 'do', 'does', 'doing', 'down', 'during', 'each', 'either', 'every',
34
+ 'few', 'for', 'from', 'further', 'get', 'got', 'had', 'has', 'have',
35
+ 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself',
36
+ 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'itself',
37
+ 'just', 'may', 'me', 'might', 'more', 'most', 'must', 'my', 'myself',
38
+ 'neither', 'no', 'nor', 'not', 'now', 'of', 'off', 'on', 'once', 'only',
39
+ 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
40
+ 'same', 'shall', 'she', 'should', 'so', 'some', 'such', 'than', 'that',
41
+ 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these',
42
+ 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up',
43
+ 'us', 'very', 'was', 'we', 'were', 'what', 'when', 'where', 'which',
44
+ 'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you',
45
+ 'your', 'yours', 'yourself', 'yourselves',
46
+ ]);
47
+
48
+ // ── Porter Stemmer (lightweight suffix stripping) ──
49
+ // Handles common English suffixes to normalize "deployments" → "deploy",
50
+ // "running" → "run", "policies" → "polici", "configured" → "configur".
51
+ // Not a full Porter stemmer — covers the 80/20 of suffixes that matter most.
52
+
53
+ const STEM_RULES: [RegExp, string, number][] = [
54
+ // Step 1: plurals and past participles
55
+ [/ies$/, 'i', 3], // policies → polici,eries → eri
56
+ [/sses$/, 'ss', 4], // addresses → address
57
+ [/([^s])s$/, '$1', 3], // items → item, but not "ss"
58
+ [/eed$/, 'ee', 4], // agreed → agree
59
+ [/ed$/, '', 3], // configured → configur, but min length 3
60
+ [/ing$/, '', 4], // running → runn → run (handled below)
61
+ // Step 2: derivational suffixes
62
+ [/ational$/, 'ate', 6], // relational → relate
63
+ [/tion$/, 't', 5], // adoption → adopt
64
+ [/ness$/, '', 5], // awareness → aware
65
+ [/ment$/, '', 5], // deployment → deploy
66
+ [/able$/, '', 5], // configurable → configur
67
+ [/ible$/, '', 5], // accessible → access
68
+ [/ful$/, '', 5], // powerful → power
69
+ [/ous$/, '', 5], // dangerous → danger
70
+ [/ive$/, '', 5], // interactive → interact
71
+ [/ize$/, '', 4], // normalize → normal
72
+ [/ise$/, '', 4], // organise → organ
73
+ [/ally$/, '', 5], // automatically → automat
74
+ [/ly$/, '', 4], // quickly → quick
75
+ [/er$/, '', 4], // handler → handl
76
+ ];
77
+
78
+ /** Clean up common doubling artifacts after suffix stripping. */
79
+ const DOUBLE_CONSONANT = /([^aeiou])\1$/;
80
+
81
+ export function stem(word: string): string {
82
+ if (word.length < 3) return word;
83
+ let stemmed = word;
84
+ for (const [pattern, replacement, minLen] of STEM_RULES) {
85
+ if (stemmed.length >= minLen && pattern.test(stemmed)) {
86
+ stemmed = stemmed.replace(pattern, replacement);
87
+ break; // Apply only the first matching rule
88
+ }
89
+ }
90
+ // Clean doubled consonants: runn → run, configurr → configur
91
+ if (stemmed.length > 2 && DOUBLE_CONSONANT.test(stemmed)) {
92
+ stemmed = stemmed.slice(0, -1);
93
+ }
94
+ return stemmed;
95
+ }
96
+
97
+ // ── Tokenizer ──
98
+
99
+ /** Tokenize text into stemmed, lowercase terms, filtering stop words. */
100
+ export function tokenize(text: string): string[] {
101
+ return text.toLowerCase()
102
+ .split(/[^a-z0-9]+/)
103
+ .filter((t) => t.length > 1 && !STOP_WORDS.has(t))
104
+ .map(stem);
105
+ }
106
+
107
+ /** Tokenize preserving original (unstemmed) forms alongside stems. */
108
+ export function tokenizeWithOriginals(text: string): { stem: string; original: string }[] {
109
+ return text.toLowerCase()
110
+ .split(/[^a-z0-9]+/)
111
+ .filter((t) => t.length > 1 && !STOP_WORDS.has(t))
112
+ .map((t) => ({ stem: stem(t), original: t }));
113
+ }
114
+
115
// ── Inverted Index Data Structures ──

/** Per-document record consumed by BM25F scoring in MemorySearchIndex. */
export interface DocRecord {
  /** Weighted term frequencies across all fields: title (3x), tags (2x), content (1x) */
  weightedTf: Map<string, number>;
  /** Total weighted document length (for BM25 length normalization) */
  weightedLen: number;
  /** All unique stems in the document (for prefix matching and posting-list cleanup) */
  allStems: Set<string>;
  /** Ordered list of stems — title tokens then content tokens, tags excluded — for bigram proximity detection */
  stemSequence: string[];
}
127
+
128
/**
 * Pre-built inverted index for fast text search.
 * Maintained incrementally — no re-indexing needed on queries.
 *
 * Structure:
 *   postings:  term → Set<docId>   (posting list — which docs contain this term)
 *   prefixMap: prefix → Set<stem>  (3-char prefixes → full stems for prefix matching)
 *   docs:      docId → DocRecord   (per-doc weighted TF and length)
 *   idf:       term → number       (pre-computed IDF, refreshed lazily on mutations)
 *
 * Invariant: postings, prefixMap, and totalWeightedLen are kept consistent by
 * addDocument/removeDocument; every mutation sets idfStale so search() refreshes IDF.
 */
export class MemorySearchIndex {
  /** Posting lists: stemmed term → Set of memory IDs containing it */
  private postings = new Map<string, Set<string>>();
  /** Per-document metadata for BM25 scoring */
  private docs = new Map<string, DocRecord>();
  /** Pre-computed IDF values. Stale flag triggers lazy recomputation. */
  private idf = new Map<string, number>();
  private idfStale = true;
  /** 3-character prefix map for prefix matching: prefix → Set of full stems */
  private prefixMap = new Map<string, Set<string>>();
  /** Total weighted document length (for computing average) */
  private totalWeightedLen = 0;

  get docCount(): number { return this.docs.size; }
  /** Average weighted doc length; falls back to 1 when empty so BM25 never divides by zero. */
  get avgDocLen(): number { return this.docs.size > 0 ? this.totalWeightedLen / this.docs.size : 1; }

  /**
   * Index a memory entry. Extracts stems from title, content, and tags
   * with field-specific weighting and builds posting lists.
   * Re-indexing an existing id replaces the previous version atomically
   * (old postings are removed first).
   */
  addDocument(id: string, entry: { title: string; content: string; tags: string[] }): void {
    // Remove old version if updating
    if (this.docs.has(id)) this.removeDocument(id);

    const titleTokens = tokenize(entry.title);
    const contentTokens = tokenize(entry.content);
    const tagTokens = entry.tags.flatMap((t) => tokenize(t));

    // Build weighted term frequency map (BM25F: field weight folded into TF)
    const weightedTf = new Map<string, number>();
    for (const t of titleTokens) weightedTf.set(t, (weightedTf.get(t) || 0) + FIELD_WEIGHT_TITLE);
    for (const t of tagTokens) weightedTf.set(t, (weightedTf.get(t) || 0) + FIELD_WEIGHT_TAGS);
    for (const t of contentTokens) weightedTf.set(t, (weightedTf.get(t) || 0) + FIELD_WEIGHT_CONTENT);

    // Weighted length mirrors the weighted TF sums, field by field
    const weightedLen = titleTokens.length * FIELD_WEIGHT_TITLE
      + tagTokens.length * FIELD_WEIGHT_TAGS
      + contentTokens.length * FIELD_WEIGHT_CONTENT;

    const allStems = new Set<string>();
    for (const t of weightedTf.keys()) allStems.add(t);

    // Stem sequence for bigram proximity (title first, then content — most important ordering)
    const stemSequence = [...titleTokens, ...contentTokens];

    const docRecord: DocRecord = { weightedTf, weightedLen, allStems, stemSequence };
    this.docs.set(id, docRecord);
    this.totalWeightedLen += weightedLen;

    // Update posting lists
    for (const term of allStems) {
      let posting = this.postings.get(term);
      if (!posting) { posting = new Set(); this.postings.set(term, posting); }
      posting.add(id);

      // Update prefix map (3-char prefixes for prefix matching)
      if (term.length >= 3) {
        const prefix = term.slice(0, 3);
        let prefixSet = this.prefixMap.get(prefix);
        if (!prefixSet) { prefixSet = new Set(); this.prefixMap.set(prefix, prefixSet); }
        prefixSet.add(term);
      }
    }

    this.idfStale = true;
  }

  /**
   * Remove a document from the index, unwinding everything addDocument
   * touched: docs, totalWeightedLen, posting lists, prefix map.
   * No-op when the id is unknown.
   */
  removeDocument(id: string): void {
    const doc = this.docs.get(id);
    if (!doc) return;

    this.totalWeightedLen -= doc.weightedLen;
    this.docs.delete(id);

    // Remove from posting lists
    for (const term of doc.allStems) {
      const posting = this.postings.get(term);
      if (posting) {
        posting.delete(id);
        if (posting.size === 0) {
          this.postings.delete(term);
          // Clean prefix map — only once the term has vanished from the whole index
          if (term.length >= 3) {
            const prefixSet = this.prefixMap.get(term.slice(0, 3));
            if (prefixSet) { prefixSet.delete(term); if (prefixSet.size === 0) this.prefixMap.delete(term.slice(0, 3)); }
          }
        }
      }
    }

    this.idfStale = true;
  }

  /** Recompute IDF values for all terms. Called lazily before search. */
  private refreshIdf(): void {
    if (!this.idfStale) return;
    const N = this.docs.size;
    this.idf.clear();
    for (const [term, posting] of this.postings) {
      const df = posting.size;
      // BM25 IDF: log((N - df + 0.5) / (df + 0.5) + 1) — the "+ 1" keeps it strictly positive
      this.idf.set(term, Math.log((N - df + 0.5) / (df + 0.5) + 1));
    }
    this.idfStale = false;
  }

  /**
   * Expand query terms with prefix matches.
   * "deploy" → ["deploy", "deployment", "deploying", ...] (if they exist in the index)
   *
   * @returns Map of index term → match weight (1.0 exact, PREFIX_MATCH_PENALTY for prefix).
   */
  private expandQueryTerms(queryStems: string[]): Map<string, number> {
    const expanded = new Map<string, number>();

    for (const qs of queryStems) {
      // Exact match always gets full weight
      if (this.postings.has(qs)) {
        expanded.set(qs, Math.max(expanded.get(qs) || 0, 1.0));
      }

      // Prefix expansion: find all stems that start with the query stem (min 3 chars)
      if (qs.length >= 3) {
        const prefix = qs.slice(0, 3);
        const candidates = this.prefixMap.get(prefix);
        if (candidates) {
          for (const candidate of candidates) {
            if (candidate !== qs && candidate.startsWith(qs)) {
              // Math.max keeps the strongest weight if a term matches several query stems
              expanded.set(candidate, Math.max(expanded.get(candidate) || 0, PREFIX_MATCH_PENALTY));
            }
          }
        }
      }
    }

    return expanded;
  }

  /**
   * Compute bigram proximity boost: if two query terms appear adjacent
   * in the document's stem sequence, boost the score.
   * Only distinct adjacent query stems count; capped at +2.0.
   */
  private bigramProximityBoost(docId: string, queryStems: string[]): number {
    if (queryStems.length < 2) return 0;
    const doc = this.docs.get(docId);
    if (!doc || doc.stemSequence.length < 2) return 0;

    let boost = 0;
    const seq = doc.stemSequence;
    const querySet = new Set(queryStems);

    for (let i = 0; i < seq.length - 1; i++) {
      if (querySet.has(seq[i]) && querySet.has(seq[i + 1]) && seq[i] !== seq[i + 1]) {
        boost += 0.5; // Each adjacent pair of query terms adds 0.5
      }
    }

    return Math.min(boost, 2.0); // Cap at 2.0 bonus
  }

  /**
   * Search the index for documents matching a query.
   * Returns scored results sorted by BM25F relevance.
   *
   * @param query - Raw query string
   * @param candidateIds - Optional: only score these document IDs (for agent-scoped search)
   * @returns Array of { id, score } sorted by descending score
   */
  search(query: string, candidateIds?: Set<string>): Array<{ id: string; score: number }> {
    const queryStems = tokenize(query);
    if (queryStems.length === 0) return [];

    this.refreshIdf();

    const expandedTerms = this.expandQueryTerms(queryStems);
    if (expandedTerms.size === 0) return [];

    const avgDl = this.avgDocLen;

    // Collect candidate document IDs from posting lists
    const candidates = new Set<string>();
    for (const term of expandedTerms.keys()) {
      const posting = this.postings.get(term);
      if (posting) {
        for (const docId of posting) {
          if (!candidateIds || candidateIds.has(docId)) candidates.add(docId);
        }
      }
    }

    // Score each candidate
    const results: Array<{ id: string; score: number }> = [];

    for (const docId of candidates) {
      const doc = this.docs.get(docId);
      if (!doc) continue;

      let score = 0;

      for (const [term, weight] of expandedTerms) {
        const tf = doc.weightedTf.get(term) || 0;
        if (tf === 0) continue;
        const termIdf = this.idf.get(term) || 0;

        // BM25F: IDF × (weightedTF × (k1 + 1)) / (weightedTF + k1 × (1 - b + b × docLen/avgDocLen))
        const numerator = tf * (BM25_K1 + 1);
        const denominator = tf + BM25_K1 * (1 - BM25_B + BM25_B * (doc.weightedLen / avgDl));
        // `weight` scales prefix matches down (PREFIX_MATCH_PENALTY); exact matches use 1.0
        score += termIdf * (numerator / denominator) * weight;
      }

      // Bigram proximity boost
      score += this.bigramProximityBoost(docId, queryStems);

      if (score > 0) results.push({ id: docId, score });
    }

    results.sort((a, b) => b.score - a.score);
    return results;
  }

  /** Check if a document exists in the index. */
  has(id: string): boolean { return this.docs.has(id); }
}
+ }