@agenticmail/enterprise 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-ANW4OHXR.js +764 -0
- package/dist/chunk-EVQPFQ55.js +9040 -0
- package/dist/chunk-JMTNHH7I.js +12666 -0
- package/dist/chunk-TYW5XTOW.js +395 -0
- package/dist/chunk-V2YIXYDJ.js +1943 -0
- package/dist/cli.js +1 -1
- package/dist/index.js +5 -4
- package/dist/routes-ALTC4I2R.js +5674 -0
- package/dist/runtime-JLFTHMIT.js +47 -0
- package/dist/server-OGQWCOT6.js +11 -0
- package/dist/setup-HCMMUEW6.js +20 -0
- package/package.json +1 -1
- package/src/agent-tools/tools/memory.ts +42 -15
- package/src/engine/agent-memory.ts +4 -355
- package/src/lib/text-search.ts +358 -0
@@ -0,0 +1,358 @@
+/**
+ * BM25F Full-Text Search Engine
+ *
+ * Comprehensive text relevance scoring with zero dependencies.
+ * Extracted as a shared module for use by both the engine memory
+ * system and the agent tool memory.
+ *
+ * Features:
+ * - Pre-built inverted index maintained incrementally (no re-indexing on query)
+ * - Lightweight Porter-style stemmer (suffix stripping for English)
+ * - Field weighting via BM25F: title x3, tags x2, content x1
+ * - Pre-computed IDF values updated on index mutations
+ * - Prefix matching: "deploy" matches "deployment", "deployments"
+ * - Per-agent partitioning for scoped searches
+ * - Bigram proximity boost: terms appearing adjacent score higher
+ */
+
+// ── BM25 Parameters ──
+
+export const BM25_K1 = 1.2; // Term frequency saturation
+export const BM25_B = 0.75; // Document length normalization
+export const FIELD_WEIGHT_TITLE = 3.0;
+export const FIELD_WEIGHT_TAGS = 2.0;
+export const FIELD_WEIGHT_CONTENT = 1.0;
+export const PREFIX_MATCH_PENALTY = 0.7; // Prefix matches score 70% of exact matches
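For intuition, a standalone sketch (not part of the package) of the saturation these constants produce. It evaluates the same per-term formula that search() applies further down, for a document at average length (docLen/avgDocLen = 1):

// Illustrative only — mirrors the BM25 term formula used in MemorySearchIndex.search().
const k1 = 1.2, b = 0.75;
function termScore(tf: number, lenRatio = 1): number {
  return (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * lenRatio));
}
console.log(termScore(1));  // ≈ 1.00
console.log(termScore(3));  // ≈ 1.57
console.log(termScore(10)); // ≈ 1.96 — repeating a term quickly stops paying off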
+
+// ── Stop Words ──
+
+export const STOP_WORDS = new Set([
+  'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an',
+  'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before',
+  'being', 'below', 'between', 'both', 'but', 'by', 'can', 'could', 'did',
+  'do', 'does', 'doing', 'down', 'during', 'each', 'either', 'every',
+  'few', 'for', 'from', 'further', 'get', 'got', 'had', 'has', 'have',
+  'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself',
+  'his', 'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'itself',
+  'just', 'may', 'me', 'might', 'more', 'most', 'must', 'my', 'myself',
+  'neither', 'no', 'nor', 'not', 'now', 'of', 'off', 'on', 'once', 'only',
+  'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
+  'same', 'shall', 'she', 'should', 'so', 'some', 'such', 'than', 'that',
+  'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these',
+  'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up',
+  'us', 'very', 'was', 'we', 'were', 'what', 'when', 'where', 'which',
+  'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you',
+  'your', 'yours', 'yourself', 'yourselves',
+]);
+
+// ── Porter Stemmer (lightweight suffix stripping) ──
+// Handles common English suffixes to normalize "deployment" → "deploy",
+// "running" → "run", "policies" → "polici", "configured" → "configur".
+// Not a full Porter stemmer — covers the 80/20 of suffixes that matter most.
+
+const STEM_RULES: [RegExp, string, number][] = [
+  // Step 1: plurals and past participles
+  [/ies$/, 'i', 3],       // policies → polici, queries → queri
+  [/sses$/, 'ss', 4],     // addresses → address
+  [/([^s])s$/, '$1', 3],  // items → item, but not "ss"
+  [/eed$/, 'ee', 4],      // agreed → agree
+  [/ed$/, '', 3],         // configured → configur, but min length 3
+  [/ing$/, '', 4],        // running → runn → run (handled below)
+  // Step 2: derivational suffixes
+  [/ational$/, 'ate', 6], // relational → relate
+  [/tion$/, 't', 5],      // adoption → adopt
+  [/ness$/, '', 5],       // awareness → aware
+  [/ment$/, '', 5],       // deployment → deploy
+  [/able$/, '', 5],       // configurable → configur
+  [/ible$/, '', 5],       // accessible → access
+  [/ful$/, '', 5],        // powerful → power
+  [/ous$/, '', 5],        // dangerous → danger
+  [/ive$/, '', 5],        // interactive → interact
+  [/ize$/, '', 4],        // normalize → normal
+  [/ise$/, '', 4],        // organise → organ
+  [/ally$/, '', 5],       // automatically → automatic
+  [/ly$/, '', 4],         // quickly → quick
+  [/er$/, '', 4],         // handler → handl
+];
+
+/** Clean up common doubling artifacts after suffix stripping. */
+const DOUBLE_CONSONANT = /([^aeiou])\1$/;
+
+export function stem(word: string): string {
+  if (word.length < 3) return word;
+  let stemmed = word;
+  for (const [pattern, replacement, minLen] of STEM_RULES) {
+    if (stemmed.length >= minLen && pattern.test(stemmed)) {
+      stemmed = stemmed.replace(pattern, replacement);
+      break; // Apply only the first matching rule
+    }
+  }
+  // Clean doubled consonants: runn → run, configurr → configur
+  if (stemmed.length > 2 && DOUBLE_CONSONANT.test(stemmed)) {
+    stemmed = stemmed.slice(0, -1);
+  }
+  return stemmed;
+}
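A few hand-traced outputs for intuition (illustrative, not part of the package; note that only the first matching rule fires before the doubled-consonant cleanup):

console.log(stem('running'));    // 'run'      — /ing$/ gives 'runn', doubling cleanup trims to 'run'
console.log(stem('policies'));   // 'polici'   — /ies$/ → 'i'
console.log(stem('deployment')); // 'deploy'   — /ment$/ stripped
console.log(stem('configured')); // 'configur' — /ed$/ stripped
console.log(stem('addresses'));  // 'addres'   — /sses$/ gives 'address', then 'ss' is trimmed
console.log(stem('address'));    // 'addres'   — both forms share a stem, so they still match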
+
+// ── Tokenizer ──
+
+/** Tokenize text into stemmed, lowercase terms, filtering stop words. */
+export function tokenize(text: string): string[] {
+  return text.toLowerCase()
+    .split(/[^a-z0-9]+/)
+    .filter((t) => t.length > 1 && !STOP_WORDS.has(t))
+    .map(stem);
+}
+
+/** Tokenize preserving original (unstemmed) forms alongside stems. */
+export function tokenizeWithOriginals(text: string): { stem: string; original: string }[] {
+  return text.toLowerCase()
+    .split(/[^a-z0-9]+/)
+    .filter((t) => t.length > 1 && !STOP_WORDS.has(t))
+    .map((t) => ({ stem: stem(t), original: t }));
+}
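An illustrative trace (hypothetical input, not from the package):

tokenize('Deploying the new API gateway');
// → ['deploy', 'new', 'api', 'gateway']
//   ('the' is a stop word; 'Deploying' lowercases and stems to 'deploy')

tokenizeWithOriginals('deployment policies');
// → [{ stem: 'deploy', original: 'deployment' },
//    { stem: 'polici', original: 'policies' }]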
+
+// ── Inverted Index Data Structures ──
+
+export interface DocRecord {
+  /** Weighted term frequencies across all fields: title (3x), tags (2x), content (1x) */
+  weightedTf: Map<string, number>;
+  /** Total weighted document length (for BM25 length normalization) */
+  weightedLen: number;
+  /** All unique stems in the document (for prefix matching) */
+  allStems: Set<string>;
+  /** Ordered list of stems for bigram proximity detection */
+  stemSequence: string[];
+}
+
+/**
+ * Pre-built inverted index for fast text search.
+ * Maintained incrementally — no re-indexing needed on queries.
+ *
+ * Structure:
+ *   term → Set<docId> (posting list — which docs contain this term)
+ *   prefixMap: prefix → Set<stem> (3-char prefixes → full stems for prefix matching)
+ *   docs: docId → DocRecord (per-doc weighted TF and length)
+ *   idf: term → number (pre-computed IDF, refreshed on mutations)
+ */
+export class MemorySearchIndex {
+  /** Posting lists: stemmed term → Set of memory IDs containing it */
+  private postings = new Map<string, Set<string>>();
+  /** Per-document metadata for BM25 scoring */
+  private docs = new Map<string, DocRecord>();
+  /** Pre-computed IDF values. Stale flag triggers lazy recomputation. */
+  private idf = new Map<string, number>();
+  private idfStale = true;
+  /** 3-character prefix map for prefix matching: prefix → Set of full stems */
+  private prefixMap = new Map<string, Set<string>>();
+  /** Total weighted document length (for computing average) */
+  private totalWeightedLen = 0;
+
+  get docCount(): number { return this.docs.size; }
+  get avgDocLen(): number { return this.docs.size > 0 ? this.totalWeightedLen / this.docs.size : 1; }
+
+  /**
+   * Index a memory entry. Extracts stems from title, content, and tags
+   * with field-specific weighting and builds posting lists.
+   */
+  addDocument(id: string, entry: { title: string; content: string; tags: string[] }): void {
+    // Remove old version if updating
+    if (this.docs.has(id)) this.removeDocument(id);
+
+    const titleTokens = tokenize(entry.title);
+    const contentTokens = tokenize(entry.content);
+    const tagTokens = entry.tags.flatMap((t) => tokenize(t));
+
+    // Build weighted term frequency map
+    const weightedTf = new Map<string, number>();
+    for (const t of titleTokens) weightedTf.set(t, (weightedTf.get(t) || 0) + FIELD_WEIGHT_TITLE);
+    for (const t of tagTokens) weightedTf.set(t, (weightedTf.get(t) || 0) + FIELD_WEIGHT_TAGS);
+    for (const t of contentTokens) weightedTf.set(t, (weightedTf.get(t) || 0) + FIELD_WEIGHT_CONTENT);
+
+    const weightedLen = titleTokens.length * FIELD_WEIGHT_TITLE
+      + tagTokens.length * FIELD_WEIGHT_TAGS
+      + contentTokens.length * FIELD_WEIGHT_CONTENT;
+
+    const allStems = new Set<string>();
+    for (const t of weightedTf.keys()) allStems.add(t);
+
+    // Stem sequence for bigram proximity (title first, then content — most important ordering)
+    const stemSequence = [...titleTokens, ...contentTokens];
+
+    const docRecord: DocRecord = { weightedTf, weightedLen, allStems, stemSequence };
+    this.docs.set(id, docRecord);
+    this.totalWeightedLen += weightedLen;
+
+    // Update posting lists
+    for (const term of allStems) {
+      let posting = this.postings.get(term);
+      if (!posting) { posting = new Set(); this.postings.set(term, posting); }
+      posting.add(id);
+
+      // Update prefix map (3-char prefixes for prefix matching)
+      if (term.length >= 3) {
+        const prefix = term.slice(0, 3);
+        let prefixSet = this.prefixMap.get(prefix);
+        if (!prefixSet) { prefixSet = new Set(); this.prefixMap.set(prefix, prefixSet); }
+        prefixSet.add(term);
+      }
+    }
+
+    this.idfStale = true;
+  }
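To make the field weighting concrete, a hand-worked example with a hypothetical entry (names invented for illustration):

const index = new MemorySearchIndex();
index.addDocument('mem-1', {
  title: 'Deploy checklist',          // title stems:   ['deploy', 'checklist']
  content: 'Steps to deploy safely.', // content stems: ['step', 'deploy', 'safe']
  tags: ['deploy'],                   // tag stems:     ['deploy']
});
// weightedTf for 'deploy': 3.0 (title) + 2.0 (tags) + 1.0 (content) = 6.0
// weightedLen: 2×3.0 + 1×2.0 + 3×1.0 = 11.0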
+
+  /** Remove a document from the index. */
+  removeDocument(id: string): void {
+    const doc = this.docs.get(id);
+    if (!doc) return;
+
+    this.totalWeightedLen -= doc.weightedLen;
+    this.docs.delete(id);
+
+    // Remove from posting lists
+    for (const term of doc.allStems) {
+      const posting = this.postings.get(term);
+      if (posting) {
+        posting.delete(id);
+        if (posting.size === 0) {
+          this.postings.delete(term);
+          // Clean prefix map
+          if (term.length >= 3) {
+            const prefixSet = this.prefixMap.get(term.slice(0, 3));
+            if (prefixSet) { prefixSet.delete(term); if (prefixSet.size === 0) this.prefixMap.delete(term.slice(0, 3)); }
+          }
+        }
+      }
+    }
+
+    this.idfStale = true;
+  }
+
+  /** Recompute IDF values for all terms. Called lazily before search. */
+  private refreshIdf(): void {
+    if (!this.idfStale) return;
+    const N = this.docs.size;
+    this.idf.clear();
+    for (const [term, posting] of this.postings) {
+      const df = posting.size;
+      // BM25 IDF: log((N - df + 0.5) / (df + 0.5) + 1)
+      this.idf.set(term, Math.log((N - df + 0.5) / (df + 0.5) + 1));
+    }
+    this.idfStale = false;
+  }
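Plugging numbers into that formula (illustrative, assuming N = 100 documents):

const idf = (N: number, df: number) => Math.log((N - df + 0.5) / (df + 0.5) + 1);
idf(100, 1);   // ≈ 4.21  — rare term, highly discriminative
idf(100, 50);  // ≈ 0.69  — term in half the corpus
idf(100, 100); // ≈ 0.005 — term everywhere, contributes almost nothing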
+
+  /**
+   * Expand query terms with prefix matches.
+   * "deploy" → ["deploy", "deployment", "deploying", ...] (if they exist in the index)
+   */
+  private expandQueryTerms(queryStems: string[]): Map<string, number> {
+    const expanded = new Map<string, number>();
+
+    for (const qs of queryStems) {
+      // Exact match always gets full weight
+      if (this.postings.has(qs)) {
+        expanded.set(qs, Math.max(expanded.get(qs) || 0, 1.0));
+      }
+
+      // Prefix expansion: find all stems that start with the query stem (min 3 chars)
+      if (qs.length >= 3) {
+        const prefix = qs.slice(0, 3);
+        const candidates = this.prefixMap.get(prefix);
+        if (candidates) {
+          for (const candidate of candidates) {
+            if (candidate !== qs && candidate.startsWith(qs)) {
+              expanded.set(candidate, Math.max(expanded.get(candidate) || 0, PREFIX_MATCH_PENALTY));
+            }
+          }
+        }
+      }
+    }
+
+    return expanded;
+  }
+
+  /**
+   * Compute bigram proximity boost: if two query terms appear adjacent
+   * in the document's stem sequence, boost the score.
+   */
+  private bigramProximityBoost(docId: string, queryStems: string[]): number {
+    if (queryStems.length < 2) return 0;
+    const doc = this.docs.get(docId);
+    if (!doc || doc.stemSequence.length < 2) return 0;
+
+    let boost = 0;
+    const seq = doc.stemSequence;
+    const querySet = new Set(queryStems);
+
+    for (let i = 0; i < seq.length - 1; i++) {
+      if (querySet.has(seq[i]) && querySet.has(seq[i + 1]) && seq[i] !== seq[i + 1]) {
+        boost += 0.5; // Each adjacent pair of query terms adds 0.5
+      }
+    }
+
+    return Math.min(boost, 2.0); // Cap at 2.0 bonus
+  }
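For example (hypothetical documents), the query 'rate limit' tokenizes to ['rate', 'limit']:

// Doc A stemSequence: ['api', 'rate', 'limit', 'polici']
//   → 'rate' and 'limit' adjacent at positions 1–2 → boost 0.5
// Doc B stemSequence: ['rate', 'error', 'budget', 'limit']
//   → both terms present but never adjacent        → boost 0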
+
+  /**
+   * Search the index for documents matching a query.
+   * Returns scored results sorted by BM25F relevance.
+   *
+   * @param query - Raw query string
+   * @param candidateIds - Optional: only score these document IDs (for agent-scoped search)
+   * @returns Array of { id, score } sorted by descending score
+   */
+  search(query: string, candidateIds?: Set<string>): Array<{ id: string; score: number }> {
+    const queryStems = tokenize(query);
+    if (queryStems.length === 0) return [];
+
+    this.refreshIdf();
+
+    const expandedTerms = this.expandQueryTerms(queryStems);
+    if (expandedTerms.size === 0) return [];
+
+    const avgDl = this.avgDocLen;
+
+    // Collect candidate document IDs from posting lists
+    const candidates = new Set<string>();
+    for (const term of expandedTerms.keys()) {
+      const posting = this.postings.get(term);
+      if (posting) {
+        for (const docId of posting) {
+          if (!candidateIds || candidateIds.has(docId)) candidates.add(docId);
+        }
+      }
+    }
+
+    // Score each candidate
+    const results: Array<{ id: string; score: number }> = [];
+
+    for (const docId of candidates) {
+      const doc = this.docs.get(docId);
+      if (!doc) continue;
+
+      let score = 0;
+
+      for (const [term, weight] of expandedTerms) {
+        const tf = doc.weightedTf.get(term) || 0;
+        if (tf === 0) continue;
+        const termIdf = this.idf.get(term) || 0;
+
+        // BM25F: IDF × (weightedTF × (k1 + 1)) / (weightedTF + k1 × (1 - b + b × docLen/avgDocLen))
+        const numerator = tf * (BM25_K1 + 1);
+        const denominator = tf + BM25_K1 * (1 - BM25_B + BM25_B * (doc.weightedLen / avgDl));
+        score += termIdf * (numerator / denominator) * weight;
+      }
+
+      // Bigram proximity boost
+      score += this.bigramProximityBoost(docId, queryStems);
+
+      if (score > 0) results.push({ id: docId, score });
+    }
+
+    results.sort((a, b) => b.score - a.score);
+    return results;
+  }
+
+  /** Check if a document exists in the index. */
+  has(id: string): boolean { return this.docs.has(id); }
+}
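Putting it all together — a minimal end-to-end usage sketch with hypothetical data (the import path is assumed from the file's location under src/lib):

import { MemorySearchIndex } from './lib/text-search';

const index = new MemorySearchIndex();
index.addDocument('m1', {
  title: 'Deployment runbook',
  content: 'How to deploy the gateway and roll back safely.',
  tags: ['ops', 'deploy'],
});
index.addDocument('m2', {
  title: 'Holiday schedule',
  content: 'Office closures for the year.',
  tags: ['hr'],
});

// 'deploying' stems to 'deploy', matching m1's title, tag, and content stems;
// m2 shares no terms with the query.
console.log(index.search('deploying the gateway'));
// → [{ id: 'm1', score: /* positive BM25F score */ }]

// Agent-scoped search: restrict scoring to a caller-supplied candidate set.
console.log(index.search('deploy', new Set(['m2']))); // → []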