@icex-labs/openclaw-memory-engine 4.2.2 → 5.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/auto-capture.js +31 -60
- package/lib/classifier.js +253 -0
- package/lib/consolidate.js +19 -35
- package/lib/quality.js +28 -168
- package/package.json +1 -1
package/lib/auto-capture.js
CHANGED
|
@@ -1,71 +1,36 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Auto-capture: hook into message
|
|
3
|
-
*
|
|
4
|
-
*
|
|
5
|
-
* No reliance on the agent calling tools — memory happens passively.
|
|
2
|
+
* Auto-capture: hook into message events, passively store facts.
|
|
3
|
+
* Uses embedding-based classification (v5.0) — no hardcoded keywords.
|
|
6
4
|
*/
|
|
7
5
|
|
|
8
6
|
import { loadArchival, appendRecord } from "./archival.js";
|
|
9
|
-
import { indexEmbedding } from "./embedding.js";
|
|
7
|
+
import { getEmbedding, indexEmbedding } from "./embedding.js";
|
|
10
8
|
import { extractTriples, addTriple } from "./graph.js";
|
|
11
|
-
import {
|
|
9
|
+
import { classify } from "./classifier.js";
|
|
12
10
|
|
|
13
|
-
/** Recent capture cache to prevent duplicates within short windows. */
|
|
14
|
-
const recentCaptures = new Map(); // content_hash → timestamp
|
|
15
|
-
const DEDUP_WINDOW_MS = 60_000; // 60 seconds
|
|
16
|
-
|
|
17
|
-
// Minimum message length to consider for fact extraction
|
|
18
11
|
const MIN_LENGTH = 20;
|
|
12
|
+
const MAX_LENGTH = 500;
|
|
19
13
|
|
|
20
|
-
// Skip
|
|
21
|
-
const
|
|
22
|
-
/^(hi|hello|hey|ok|thanks|good morning|good night|早|晚安|你好|嗯|好的|谢谢)/i,
|
|
23
|
-
/^HEARTBEAT_OK$/,
|
|
24
|
-
/^\//, // slash commands
|
|
25
|
-
/^(yes|no|yeah|nah|sure|maybe)$/i,
|
|
26
|
-
];
|
|
27
|
-
|
|
28
|
-
// High-value content patterns — always store these
|
|
29
|
-
const HIGH_VALUE_PATTERNS = [
|
|
30
|
-
/\b(decided|decision|plan|scheduled|booked|bought|sold|paid|签|买|卖|预约|决定)\b/i,
|
|
31
|
-
/\b(doctor|lawyer|immigration|IRCC|IBKR|account|password|address|phone|email)\b/i,
|
|
32
|
-
/\b(remember|don't forget|提醒|记住|别忘)\b/i,
|
|
33
|
-
/\$\d{2,}/, // dollar amounts
|
|
34
|
-
/\b\d{4}-\d{2}-\d{2}\b/, // dates
|
|
35
|
-
];
|
|
14
|
+
// Skip very short / obvious non-fact messages (language-agnostic via length)
|
|
15
|
+
const SKIP_EXACT = new Set(["heartbeat_ok", "ok", "yes", "no", "y", "n"]);
|
|
36
16
|
|
|
37
|
-
|
|
38
|
-
const
|
|
39
|
-
|
|
40
|
-
[/\b(immigration|PR|IRCC|CBSA|visa|lawyer|律师)/i, "immigration"],
|
|
41
|
-
[/\b(doctor|医生|hospital|health|medication|药)/i, "health"],
|
|
42
|
-
[/\b(car|vehicle|Escalade|GX550|ES350|Tesla|tire|车)/i, "vehicles"],
|
|
43
|
-
[/\b(school|homework|exam|swimming|lesson|学校|课)/i, "education"],
|
|
44
|
-
[/\b(deploy|k3d|ArgoCD|kubectl|CI|cluster)/i, "infrastructure"],
|
|
45
|
-
[/\b(quant|trading|backtest|signal|strategy)/i, "quant"],
|
|
46
|
-
];
|
|
47
|
-
|
|
48
|
-
function inferEntity(text) {
|
|
49
|
-
for (const [pat, name] of ENTITY_PATTERNS) {
|
|
50
|
-
if (pat.test(text)) return name;
|
|
51
|
-
}
|
|
52
|
-
return "conversation";
|
|
53
|
-
}
|
|
17
|
+
/** Recent capture cache to prevent duplicates. */
|
|
18
|
+
const recentCaptures = new Map();
|
|
19
|
+
const DEDUP_WINDOW_MS = 60_000;
|
|
54
20
|
|
|
55
21
|
/**
 * Cheap pre-filter deciding whether a message should go on to dedup and
 * classification.
 *
 * The length check is applied to the trimmed text so that whitespace padding
 * cannot push a trivial (or blank) message over MIN_LENGTH.
 *
 * @param {string} content - raw message text
 * @returns {boolean} true when the message is a capture candidate
 */
function shouldCapture(content) {
  // Non-string input has no text to capture; reject instead of throwing on .trim().
  if (typeof content !== "string") return false;

  const normalized = content.trim();
  if (normalized.length < MIN_LENGTH) return false;

  const lower = normalized.toLowerCase();
  // Kept as an explicit guard so lowering MIN_LENGTH later does not start
  // capturing bare acknowledgements.
  if (SKIP_EXACT.has(lower)) return false;
  if (lower.startsWith("/")) return false; // slash commands
  return true;
}
|
|
60
28
|
|
|
61
|
-
function isHighValue(content) {
|
|
62
|
-
return HIGH_VALUE_PATTERNS.some((p) => p.test(content));
|
|
63
|
-
}
|
|
64
|
-
|
|
65
29
|
/**
|
|
66
|
-
* Process
|
|
30
|
+
* Process a message and auto-store if valuable.
|
|
31
|
+
* Classification is embedding-based — works with any language.
|
|
67
32
|
*/
|
|
68
|
-
export function captureMessage(ws, content, source = "auto-capture") {
|
|
33
|
+
export async function captureMessage(ws, content, source = "auto-capture") {
|
|
69
34
|
if (!shouldCapture(content)) return null;
|
|
70
35
|
|
|
71
36
|
// Dedup: skip if same content captured in last 60s
|
|
@@ -76,14 +41,14 @@ export function captureMessage(ws, content, source = "auto-capture") {
|
|
|
76
41
|
}
|
|
77
42
|
recentCaptures.set(contentHash, now);
|
|
78
43
|
|
|
79
|
-
// Clean old
|
|
44
|
+
// Clean old dedup entries
|
|
80
45
|
if (recentCaptures.size > 200) {
|
|
81
46
|
for (const [key, ts] of recentCaptures) {
|
|
82
47
|
if (now - ts > DEDUP_WINDOW_MS) recentCaptures.delete(key);
|
|
83
48
|
}
|
|
84
49
|
}
|
|
85
50
|
|
|
86
|
-
//
|
|
51
|
+
// Check against recent archival records (keyword overlap)
|
|
87
52
|
const existing = loadArchival(ws);
|
|
88
53
|
const contentLower = content.toLowerCase();
|
|
89
54
|
const contentWords = new Set(contentLower.split(/\s+/).filter((w) => w.length > 2));
|
|
@@ -93,15 +58,15 @@ export function captureMessage(ws, content, source = "auto-capture") {
|
|
|
93
58
|
const exWords = new Set(ex.split(/\s+/).filter((w) => w.length > 2));
|
|
94
59
|
let overlap = 0;
|
|
95
60
|
for (const w of contentWords) { if (exWords.has(w)) overlap++; }
|
|
96
|
-
if (overlap / contentWords.size > 0.7) return null;
|
|
61
|
+
if (overlap / contentWords.size > 0.7) return null;
|
|
97
62
|
}
|
|
98
63
|
}
|
|
99
64
|
|
|
100
|
-
|
|
101
|
-
const
|
|
65
|
+
// Trim long messages
|
|
66
|
+
const trimmed = content.length > MAX_LENGTH ? content.slice(0, MAX_LENGTH - 3) + "..." : content;
|
|
102
67
|
|
|
103
|
-
//
|
|
104
|
-
const
|
|
68
|
+
// Classify using embeddings (language-agnostic)
|
|
69
|
+
const { entity, importance, embedding } = await classify(trimmed, ws);
|
|
105
70
|
|
|
106
71
|
const record = appendRecord(ws, {
|
|
107
72
|
content: trimmed,
|
|
@@ -110,9 +75,15 @@ export function captureMessage(ws, content, source = "auto-capture") {
|
|
|
110
75
|
importance,
|
|
111
76
|
});
|
|
112
77
|
|
|
113
|
-
//
|
|
114
|
-
|
|
78
|
+
// Reuse embedding for search indexing (no duplicate API call)
|
|
79
|
+
if (embedding) {
|
|
80
|
+
const { loadEmbeddingCache, saveEmbeddingCache } = await import("./embedding.js");
|
|
81
|
+
const cache = loadEmbeddingCache(ws);
|
|
82
|
+
cache[record.id] = embedding;
|
|
83
|
+
saveEmbeddingCache(ws);
|
|
84
|
+
}
|
|
115
85
|
|
|
86
|
+
// Extract graph triples
|
|
116
87
|
const triples = extractTriples(trimmed);
|
|
117
88
|
for (const t of triples) {
|
|
118
89
|
addTriple(ws, t.s, t.r, t.o, record.id);
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding-based classifier — replaces all hardcoded regex patterns.
|
|
3
|
+
* Language-agnostic: works with any language the embedding model supports.
|
|
4
|
+
*
|
|
5
|
+
* Uses "anchor embeddings" — short descriptions of each category.
|
|
6
|
+
* Classifies by cosine similarity against anchors.
|
|
7
|
+
* Anchors are computed once and cached to disk.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "node:fs";
|
|
11
|
+
import { join } from "node:path";
|
|
12
|
+
import { getEmbedding, cosineSimilarity } from "./embedding.js";
|
|
13
|
+
|
|
14
|
+
// ═══════════════════════════════════════════════════════════════════
|
|
15
|
+
// Category anchors — short descriptions, NOT keywords
|
|
16
|
+
// The embedding model understands semantics, so these work in ANY language
|
|
17
|
+
// ═══════════════════════════════════════════════════════════════════
|
|
18
|
+
|
|
19
|
+
const ENTITY_ANCHORS = {
|
|
20
|
+
health: "medical health doctor hospital clinic medication prescription treatment diagnosis symptom illness disease checkup appointment therapy",
|
|
21
|
+
finance: "money investment portfolio bank account mortgage loan interest rate tax income salary budget expense stock trading IBKR brokerage dividend",
|
|
22
|
+
immigration: "immigration visa permanent resident citizenship passport border agency lawyer legal court petition complaint refugee asylum",
|
|
23
|
+
legal: "lawyer attorney lawsuit court tribunal legal complaint case hearing judgment ruling contract",
|
|
24
|
+
vehicles: "car vehicle automobile SUV sedan truck van tire wheel maintenance repair insurance collision driving license",
|
|
25
|
+
property: "house home apartment condo real estate mortgage rent property landlord tenant renovation",
|
|
26
|
+
education: "school student class homework exam test grade teacher professor university college kindergarten lesson tutorial",
|
|
27
|
+
family: "wife husband spouse child son daughter parent mother father sibling family relative wedding anniversary",
|
|
28
|
+
career: "job work career company employer employee salary promotion interview resume hiring office meeting boss manager",
|
|
29
|
+
infrastructure: "server cluster kubernetes docker container deployment pipeline CI CD devops cloud hosting database",
|
|
30
|
+
technology: "code programming software AI machine learning LLM model API plugin framework library",
|
|
31
|
+
shopping: "buy purchase order shop store online delivery coupon discount price sale",
|
|
32
|
+
travel: "flight airline airport hotel trip vacation travel booking passport luggage destination",
|
|
33
|
+
food: "restaurant meal dinner lunch breakfast cooking recipe food grocery kitchen chef",
|
|
34
|
+
entertainment: "movie music game sport hobby concert show streaming video book reading",
|
|
35
|
+
};
|
|
36
|
+
|
|
37
|
+
const IMPORTANCE_ANCHORS = {
|
|
38
|
+
critical: "lawsuit court immigration visa legal case medical emergency surgery hospital critical urgent deadline account number password credential secret key",
|
|
39
|
+
high: "investment portfolio large amount financial planning doctor appointment medical treatment insurance policy contract agreement major decision career change",
|
|
40
|
+
medium: "project task deployment code fix feature technical work meeting schedule plan discussion regular maintenance",
|
|
41
|
+
low: "casual chat greeting small talk weather joke daily routine trivial minor note acknowledgment ok thanks yes no",
|
|
42
|
+
};
|
|
43
|
+
|
|
44
|
+
const IMPORTANCE_SCORES = { critical: 9, high: 7, medium: 5, low: 3 };
|
|
45
|
+
|
|
46
|
+
// Threshold: if no anchor scores above this, keep default
|
|
47
|
+
const ENTITY_THRESHOLD = 0.3;
|
|
48
|
+
const IMPORTANCE_THRESHOLD = 0.25;
|
|
49
|
+
|
|
50
|
+
// ═══════════════════════════════════════════════════════════════════
|
|
51
|
+
// Anchor cache — compute once, reuse forever
|
|
52
|
+
// ═══════════════════════════════════════════════════════════════════
|
|
53
|
+
|
|
54
|
+
/** Anchor embeddings memoized for the lifetime of the process. */
let anchorCache = null;

/** In-flight load, shared so concurrent cold-start callers trigger one computation. */
let pendingAnchorLoad = null;

/** Path of the on-disk anchor cache inside workspace `ws`. */
function getAnchorCachePath(ws) {
  return join(ws, "memory", "classifier-anchors.json");
}

/** True when `group` holds an embedding for every name declared in `expected`. */
function hasAllAnchors(group, expected) {
  if (!group || typeof group !== "object") return false;
  return Object.keys(expected).every((name) => group[name] != null);
}

/**
 * Read the on-disk anchor cache.
 * @returns {object|null} the parsed cache, or null when the file is missing,
 *   unreadable, or lacks any entity/importance anchor currently configured.
 */
function readAnchorsFromDisk(cachePath) {
  if (!existsSync(cachePath)) return null;
  try {
    const parsed = JSON.parse(readFileSync(cachePath, "utf-8"));
    const complete =
      hasAllAnchors(parsed?.entities, ENTITY_ANCHORS) &&
      hasAllAnchors(parsed?.importance, IMPORTANCE_ANCHORS);
    return complete ? parsed : null;
  } catch {
    return null; // corrupt or unreadable cache: recompute
  }
}

/** Embed every description, leaving out names whose embedding came back empty. */
async function embedAnchors(descriptions) {
  const embedded = {};
  // Sequential on purpose: one request at a time, as in the original loop.
  for (const [name, desc] of Object.entries(descriptions)) {
    const emb = await getEmbedding(desc);
    if (emb) embedded[name] = emb;
  }
  return embedded;
}

/** Load anchors from disk, or compute them and persist the result when complete. */
async function buildAnchors(ws) {
  const cachePath = getAnchorCachePath(ws);
  const fromDisk = readAnchorsFromDisk(cachePath);
  if (fromDisk) return fromDisk;

  console.error("[memory-engine] Computing classifier anchor embeddings...");
  const entities = await embedAnchors(ENTITY_ANCHORS);
  const importance = await embedAnchors(IMPORTANCE_ANCHORS);
  const anchors = { entities, importance, version: 2 };

  const complete =
    hasAllAnchors(entities, ENTITY_ANCHORS) &&
    hasAllAnchors(importance, IMPORTANCE_ANCHORS);
  if (!complete) {
    // A partial set would be rejected by the next disk read anyway; skip the write.
    console.error("[memory-engine] Some anchor embeddings were unavailable; anchor cache not written to disk");
    return anchors;
  }

  try {
    mkdirSync(join(ws, "memory"), { recursive: true });
    writeFileSync(cachePath, JSON.stringify(anchors), "utf-8");
    console.error(`[memory-engine] Anchor embeddings cached (${Object.keys(entities).length} entities, ${Object.keys(importance).length} importance levels)`);
  } catch (err) {
    // Persistence is an optimisation; classification can proceed from memory.
    console.error(`[memory-engine] Could not persist anchor cache: ${err.message}`);
  }
  return anchors;
}

/**
 * Return the anchor embeddings `{ entities, importance, version }`.
 *
 * The result is memoized in memory after the first successful load (from disk
 * or freshly computed). Callers that arrive while a load is still running
 * share that load instead of starting another one. The memo is only assigned
 * once a load has finished, so a failed load never leaves a half-validated
 * object behind.
 *
 * @param {string} ws - workspace path (the cache lives under `<ws>/memory`)
 * @returns {Promise<{ entities: object, importance: object, version: number }>}
 */
async function loadAnchors(ws) {
  if (anchorCache) return anchorCache;

  if (!pendingAnchorLoad) {
    pendingAnchorLoad = (async () => {
      try {
        anchorCache = await buildAnchors(ws);
        return anchorCache;
      } finally {
        pendingAnchorLoad = null;
      }
    })();
  }
  return pendingAnchorLoad;
}
|
|
104
|
+
|
|
105
|
+
// ═══════════════════════════════════════════════════════════════════
|
|
106
|
+
// Classification functions
|
|
107
|
+
// ═══════════════════════════════════════════════════════════════════
|
|
108
|
+
|
|
109
|
+
/**
 * Pick the entity category whose anchor embedding is most similar to the content.
 *
 * A category only wins when its cosine similarity is strictly greater than
 * ENTITY_THRESHOLD (and than every earlier candidate); otherwise the result
 * is "general".
 *
 * @param {number[]} contentEmbedding - pre-computed embedding of the content
 * @param {string} ws - workspace path (for anchor cache)
 * @returns {Promise<string>} entity category or "general"
 */
export async function classifyEntity(contentEmbedding, ws) {
  if (!contentEmbedding) return "general";

  const anchors = await loadAnchors(ws);
  const candidates = Object.entries(anchors?.entities || {});

  // Seeding the fold with the threshold makes "nothing cleared the bar" and
  // "no anchors at all" both resolve to "general".
  const winner = candidates.reduce(
    (best, [category, anchorEmb]) => {
      const score = cosineSimilarity(contentEmbedding, anchorEmb);
      return score > best.score ? { category, score } : best;
    },
    { category: "general", score: ENTITY_THRESHOLD },
  );

  return winner.category;
}
|
|
134
|
+
|
|
135
|
+
/**
 * Rate importance by finding the importance anchor most similar to the content.
 *
 * The level starts at "medium" and is replaced only by an anchor whose cosine
 * similarity is strictly greater than IMPORTANCE_THRESHOLD (and than every
 * earlier candidate). The level is then mapped through IMPORTANCE_SCORES.
 *
 * @param {number[]} contentEmbedding - pre-computed embedding
 * @param {string} ws - workspace path
 * @returns {Promise<number>} the IMPORTANCE_SCORES value for the winning level,
 *   or 5 when there is no embedding, no anchors, or no score for the level
 */
export async function classifyImportance(contentEmbedding, ws) {
  const DEFAULT_SCORE = 5;
  if (!contentEmbedding) return DEFAULT_SCORE;

  const anchors = await loadAnchors(ws);
  const levels = anchors?.importance ? Object.entries(anchors.importance) : [];
  if (levels.length === 0) return DEFAULT_SCORE;

  let winner = "medium";
  let topSimilarity = IMPORTANCE_THRESHOLD;
  for (const [level, anchorEmb] of levels) {
    const similarity = cosineSimilarity(contentEmbedding, anchorEmb);
    if (!(similarity > topSimilarity)) continue;
    topSimilarity = similarity;
    winner = level;
  }

  return IMPORTANCE_SCORES[winner] || DEFAULT_SCORE;
}
|
|
160
|
+
|
|
161
|
+
/**
 * Lightweight fallback classifier — no embedding API needed.
 * Uses format/symbol signals that work across languages:
 *   - currency amounts / long digit runs → finance (importance 7)
 *   - URLs, code fences, paths, source-file extensions → technology
 *   - ISO dates or clock times → general, importance 6
 *   - messages under 30 chars are capped at importance 3
 *   - messages over 200 chars are raised to at least importance 6
 *
 * Digits that belong to an ISO date or sit inside a URL are not treated as
 * amounts. Without that, the year of every date (and any numeric id in a
 * link) matched the "long digit run" rule, so such messages were always
 * labelled finance and the date rule could never fire for dates.
 *
 * @param {string} content - text to classify
 * @returns {{ entity: string, importance: number }}
 */
function fallbackClassify(content) {
  const isoDate = /\b\d{4}-\d{2}-\d{2}(?!\d)/;
  const isoDateAll = /\b\d{4}-\d{2}-\d{2}(?!\d)/g;
  const urlAll = /https?:\/\/\S+/g;

  // Text used only for the money check, with date and URL digits masked out.
  const amountText = content.replace(urlAll, " ").replace(isoDateAll, " ");

  const looksFinancial = /[\$€£¥₹]\s*[\d,.]+|\b\d{4,}[\d,.]*\b/.test(amountText);
  const looksTechnical = /https?:\/\/|```|\/\w+\/\w+|\.(js|py|ts|json|yaml|md)\b/i.test(content);
  const hasDateOrTime = isoDate.test(content) || /\b\d{1,2}:\d{2}\b/.test(content);

  let entity = "general";
  let importance = 5;

  // Precedence is unchanged: finance, then technology, then date/time.
  if (looksFinancial) {
    entity = "finance";
    importance = 7;
  } else if (looksTechnical) {
    entity = "technology";
  } else if (hasDateOrTime) {
    // Dates with context → likely scheduling/planning
    importance = 6;
  }

  // Short messages are less important
  if (content.length < 30) importance = Math.min(importance, 3);
  // Long detailed messages are more important
  if (content.length > 200) importance = Math.max(importance, 6);

  return { entity, importance };
}
|
|
194
|
+
|
|
195
|
+
/**
 * Full classification: entity + importance in one call.
 * Uses embedding similarity when an embedding is available and falls back to
 * format-based heuristics otherwise.
 *
 * @param {string} content - text to classify
 * @param {string} ws - workspace path
 * @param {number[]|null} [existingEmbedding] - reuse if already computed
 * @returns {Promise<{ entity: string, importance: number, embedding: number[]|null }>}
 *   `embedding` is the vector used for classification, or null when the
 *   fallback heuristics were used.
 */
export async function classify(content, ws, existingEmbedding = null) {
  const emb = existingEmbedding || (await getEmbedding(content));

  // No embedding available (e.g. no API key): use the format-based fallback.
  if (!emb) {
    const { entity, importance } = fallbackClassify(content);
    return { entity, importance, embedding: null };
  }

  // Deliberately sequential. Both classifiers await the same anchor cache, so
  // starting them together on a cold cache makes each one begin its own anchor
  // computation. Once anchors are loaded the remaining work is synchronous
  // similarity math, so running them in parallel would gain nothing.
  const entity = await classifyEntity(emb, ws);
  const importance = await classifyImportance(emb, ws);

  return { entity, importance, embedding: emb };
}
|
|
218
|
+
|
|
219
|
+
/**
 * Batch re-classify existing records in place.
 *
 * Only records that still look unclassified are touched:
 *   - entity is re-classified when it is missing, empty, or "general";
 *   - importance is re-rated when it is missing or the flat default 5.
 * Records without an entry in `embeddingCache` are skipped.
 *
 * @param {string} ws - workspace path
 * @param {object[]} records - archival records (mutated in place)
 * @param {object} embeddingCache - { id: number[] }
 * @returns {Promise<{ reclassified: number, rerated: number }>}
 */
export async function batchReclassify(ws, records, embeddingCache) {
  await loadAnchors(ws); // ensure anchors are cached

  let reclassified = 0;
  let rerated = 0;

  for (const record of records) {
    const emb = embeddingCache?.[record.id];
    if (!emb) continue;

    // Empty or missing entity counts as unclassified, same as "general".
    // Classifying only these records also skips the similarity pass for
    // records that could never be updated.
    const unclassified = !record.entity || record.entity === "general";
    if (unclassified) {
      const newEntity = await classifyEntity(emb, ws);
      if (newEntity !== "general") {
        record.entity = newEntity;
        reclassified++;
      }
    }

    const currentImp = record.importance ?? 5;
    if (currentImp === 5) { // only re-rate flat defaults
      const newImp = await classifyImportance(emb, ws);
      if (newImp !== 5) {
        record.importance = newImp;
        rerated++;
      }
    }
  }

  return { reclassified, rerated };
}
|
package/lib/consolidate.js
CHANGED
|
@@ -1,30 +1,11 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
3
|
-
*
|
|
2
|
+
* Extract structured facts from text blocks.
|
|
3
|
+
* v5.0: embedding-based classification — no hardcoded keywords.
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
6
|
import { loadArchival, appendRecord } from "./archival.js";
|
|
7
|
-
import {
|
|
8
|
-
|
|
9
|
-
/** Generic entity inference patterns (no personal data). */
|
|
10
|
-
const ENTITY_PATTERNS = [
|
|
11
|
-
[/\b(IBKR|Interactive Brokers)\b/i, "IBKR"],
|
|
12
|
-
[/\b(immigration|PR|IRCC|CBSA|visa)\b/i, "immigration"],
|
|
13
|
-
[/\b(quant|trading|backtest|portfolio)\b/i, "trading"],
|
|
14
|
-
[/\b(doctor|医生|hospital|医院|clinic)\b/i, "health"],
|
|
15
|
-
[/\b(car|vehicle|SUV|sedan|truck|Tesla|Toyota|Lexus|BMW)\b/i, "vehicles"],
|
|
16
|
-
[/\b(house|home|mortgage|rent|property)\b/i, "property"],
|
|
17
|
-
[/\b(school|university|college|学校)\b/i, "education"],
|
|
18
|
-
[/\b(insurance|保险)\b/i, "insurance"],
|
|
19
|
-
[/\b(lawyer|律师|attorney|legal)\b/i, "legal"],
|
|
20
|
-
];
|
|
21
|
-
|
|
22
|
-
function inferEntity(text, fallback) {
|
|
23
|
-
for (const [pat, name] of ENTITY_PATTERNS) {
|
|
24
|
-
if (pat.test(text)) return name;
|
|
25
|
-
}
|
|
26
|
-
return fallback;
|
|
27
|
-
}
|
|
7
|
+
import { loadEmbeddingCache, saveEmbeddingCache } from "./embedding.js";
|
|
8
|
+
import { classify } from "./classifier.js";
|
|
28
9
|
|
|
29
10
|
/** Split text into sentence-level fact candidates. */
|
|
30
11
|
function extractCandidates(text) {
|
|
@@ -42,32 +23,27 @@ function extractCandidates(text) {
|
|
|
42
23
|
return segments
|
|
43
24
|
.filter((seg) => {
|
|
44
25
|
if (seg.startsWith("#") || seg.length < 10) return false;
|
|
45
|
-
if (/^(
|
|
26
|
+
if (/^(##|===|---|\*\*\*|```|>|\|)/.test(seg)) return false;
|
|
46
27
|
return true;
|
|
47
28
|
})
|
|
48
29
|
.map((seg) => seg.replace(/^[-*•]\s*/, "").replace(/^\d+\.\s*/, "").trim())
|
|
49
30
|
.filter((s) => s.length >= 10);
|
|
50
31
|
}
|
|
51
32
|
|
|
52
|
-
/** Check if a fact is a near-duplicate of existing content (keyword overlap >70%). */
|
|
53
33
|
/**
 * Near-duplicate check based on word overlap.
 *
 * Words are whitespace-separated tokens longer than two characters. The fact
 * is a duplicate when more than 70% of its distinct words also occur in any
 * single existing text. A fact with no qualifying words is never a duplicate.
 *
 * @param {string} factLower - lower-cased candidate fact
 * @param {string[]} existingTexts - lower-cased texts already stored
 * @returns {boolean}
 */
function isDuplicate(factLower, existingTexts) {
  const significantWords = (text) => text.split(/\s+/).filter((word) => word.length > 2);

  const factWords = new Set(significantWords(factLower));
  if (factWords.size === 0) return false;
  const distinctFactWords = [...factWords];

  return existingTexts.some((existing) => {
    const known = new Set(significantWords(existing));
    const shared = distinctFactWords.filter((word) => known.has(word)).length;
    return shared / factWords.size > 0.7;
  });
}
|
|
67
44
|
|
|
68
45
|
/**
|
|
69
|
-
* Extract facts from text, deduplicate, and insert
|
|
70
|
-
* @returns {{ inserted: string[], skipped: string[], total: number }}
|
|
46
|
+
* Extract facts from text, classify via embeddings, deduplicate, and insert.
|
|
71
47
|
*/
|
|
72
48
|
export async function consolidateText(ws, text, defaultEntity = "", defaultTags = []) {
|
|
73
49
|
const candidates = extractCandidates(text);
|
|
@@ -80,20 +56,28 @@ export async function consolidateText(ws, text, defaultEntity = "", defaultTags
|
|
|
80
56
|
|
|
81
57
|
for (const fact of candidates) {
|
|
82
58
|
const factLower = fact.toLowerCase();
|
|
83
|
-
|
|
84
59
|
if (isDuplicate(factLower, existingTexts)) {
|
|
85
60
|
skipped.push(fact.slice(0, 60));
|
|
86
61
|
continue;
|
|
87
62
|
}
|
|
88
63
|
|
|
89
|
-
const entity =
|
|
64
|
+
const { entity, importance, embedding } = await classify(fact, ws);
|
|
65
|
+
const finalEntity = (entity !== "general") ? entity : defaultEntity || "general";
|
|
66
|
+
|
|
90
67
|
const record = appendRecord(ws, {
|
|
91
68
|
content: fact,
|
|
92
|
-
entity,
|
|
69
|
+
entity: finalEntity,
|
|
93
70
|
tags: defaultTags,
|
|
71
|
+
importance,
|
|
94
72
|
source: "consolidate",
|
|
95
73
|
});
|
|
96
|
-
|
|
74
|
+
|
|
75
|
+
if (embedding) {
|
|
76
|
+
const cache = loadEmbeddingCache(ws);
|
|
77
|
+
cache[record.id] = embedding;
|
|
78
|
+
saveEmbeddingCache(ws);
|
|
79
|
+
}
|
|
80
|
+
|
|
97
81
|
inserted.push(record.id);
|
|
98
82
|
existingTexts.push(factLower);
|
|
99
83
|
}
|
package/lib/quality.js
CHANGED
|
@@ -1,180 +1,43 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Data quality engine:
|
|
3
|
-
*
|
|
2
|
+
* Data quality engine v5.0: embedding-based classification.
|
|
3
|
+
* Replaces hardcoded regex patterns with semantic similarity.
|
|
4
|
+
* Language-agnostic — works with any language.
|
|
4
5
|
*/
|
|
5
6
|
|
|
6
7
|
import { loadArchival, rewriteArchival } from "./archival.js";
|
|
7
8
|
import { addTriple, extractTriples, loadGraph } from "./graph.js";
|
|
8
9
|
import { saveEpisode, loadEpisodes } from "./episodes.js";
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
// Extended entity patterns (much richer than consolidate.js)
|
|
12
|
-
// ═══════════════════════════════════════════════════════════════════
|
|
13
|
-
|
|
14
|
-
const ENTITY_PATTERNS = [
|
|
15
|
-
// People / family
|
|
16
|
-
[/\b(wife|husband|spouse|老婆|老公|太太|丈夫|妻子)\b/i, "family"],
|
|
17
|
-
[/\b(son|daughter|child|kid|儿子|女儿|孩子)\b/i, "family"],
|
|
18
|
-
[/\b(mom|dad|mother|father|parent|妈|爸|父母)\b/i, "family"],
|
|
19
|
-
|
|
20
|
-
// Finance
|
|
21
|
-
[/\b(IBKR|Interactive Brokers|broker|brokerage)\b/i, "finance"],
|
|
22
|
-
[/\b(TFSA|RRSP|RESP|401k|IRA|pension)\b/i, "finance"],
|
|
23
|
-
[/\b(invest|portfolio|NAV|stock|ETF|QQQ|VOO|dividend)\b/i, "finance"],
|
|
24
|
-
[/\b(HELOC|mortgage|loan|credit|debt|利率|rate)\b/i, "finance"],
|
|
25
|
-
[/\b(bank|RBC|TD|BMO|Scotiabank|CIBC)\b/i, "finance"],
|
|
26
|
-
[/\b(budget|expense|income|salary|payment|pay|报税|tax)\b/i, "finance"],
|
|
27
|
-
[/\b(accountant|bookkeep|会计)\b/i, "finance"],
|
|
28
|
-
[/\b(\$\d|CAD|USD|万|千)\b/i, "finance"],
|
|
29
|
-
|
|
30
|
-
// Immigration / legal
|
|
31
|
-
[/\b(immigration|immigrant|PR|permanent resident|移民)\b/i, "immigration"],
|
|
32
|
-
[/\b(IRCC|CBSA|ATIP|NSIRA|Mandamus|IMM-)\b/i, "immigration"],
|
|
33
|
-
[/\b(visa|work permit|签证|工签)\b/i, "immigration"],
|
|
34
|
-
[/\b(lawyer|attorney|paralegal|律师|法律)\b/i, "legal"],
|
|
35
|
-
[/\b(petition|complaint|CHRC|tribunal|court|案)\b/i, "legal"],
|
|
36
|
-
|
|
37
|
-
// Health
|
|
38
|
-
[/\b(doctor|physician|GP|医生|主治|Dr\.)\b/i, "health"],
|
|
39
|
-
[/\b(hospital|clinic|medical|诊所|医院)\b/i, "health"],
|
|
40
|
-
[/\b(medication|medicine|drug|pill|tablet|药|处方)\b/i, "health"],
|
|
41
|
-
[/\b(cetirizine|urticaria|荨麻疹|allergy|过敏)\b/i, "health"],
|
|
42
|
-
[/\b(health|symptom|diagnosis|体检|检查|screening)\b/i, "health"],
|
|
43
|
-
[/\b(dental|dentist|vision|eye|牙|眼)\b/i, "health"],
|
|
44
|
-
|
|
45
|
-
// Vehicles
|
|
46
|
-
[/\b(car|vehicle|SUV|sedan|truck|van|minivan|车)\b/i, "vehicles"],
|
|
47
|
-
[/\b(Tesla|Toyota|Lexus|BMW|Mercedes|Cadillac|Honda|Audi)\b/i, "vehicles"],
|
|
48
|
-
[/\b(Escalade|GX550|ES350|Sienna|Model [3SXY])\b/i, "vehicles"],
|
|
49
|
-
[/\b(tire|tyre|PPF|wrap|oil change|maintenance|保养|轮胎)\b/i, "vehicles"],
|
|
50
|
-
[/\b(insurance|保险|Desjardins|policy)\b/i, "vehicles"],
|
|
51
|
-
|
|
52
|
-
// Infrastructure / DevOps
|
|
53
|
-
[/\b(k3d|k3s|k8s|kubernetes|cluster|pod|deploy)\b/i, "infrastructure"],
|
|
54
|
-
[/\b(ArgoCD|Helm|kubectl|GitOps|CI|CD|pipeline)\b/i, "infrastructure"],
|
|
55
|
-
[/\b(Docker|container|image|registry|GHCR)\b/i, "infrastructure"],
|
|
56
|
-
[/\b(U9|prod|production|staging|dev cluster)\b/i, "infrastructure"],
|
|
57
|
-
[/\b(SOPS|secret|encrypt|cert|SSL|TLS)\b/i, "infrastructure"],
|
|
58
|
-
|
|
59
|
-
// OpenClaw / AI
|
|
60
|
-
[/\b(OpenClaw|openclaw|gateway|plugin|hook)\b/i, "openclaw"],
|
|
61
|
-
[/\b(agent|session|compaction|memory|embedding)\b/i, "openclaw"],
|
|
62
|
-
[/\b(LLM|Claude|Anthropic|GPT|OpenAI|AI|token)\b/i, "ai"],
|
|
63
|
-
[/\b(prompt|context window|model|inference)\b/i, "ai"],
|
|
64
|
-
|
|
65
|
-
// Quant / trading
|
|
66
|
-
[/\b(quant|quantitative|backtest|backtesting)\b/i, "quant"],
|
|
67
|
-
[/\b(trading|trade|signal|strategy|turtle|海龟)\b/i, "quant"],
|
|
68
|
-
[/\b(Sharpe|drawdown|回撤|年化|annualized)\b/i, "quant"],
|
|
69
|
-
[/\b(paper trading|live trading|order|position)\b/i, "quant"],
|
|
70
|
-
|
|
71
|
-
// Messaging
|
|
72
|
-
[/\b(Telegram|Discord|WhatsApp|Slack|bot|channel)\b/i, "messaging"],
|
|
73
|
-
|
|
74
|
-
// Property / home
|
|
75
|
-
[/\b(house|home|condo|apartment|property|房|租)\b/i, "property"],
|
|
76
|
-
[/\b(NAS|Synology|backup|Time Machine)\b/i, "property"],
|
|
77
|
-
[/\b(lawn|garden|yard|snow|草坪|铲雪)\b/i, "property"],
|
|
78
|
-
|
|
79
|
-
// Education / kids
|
|
80
|
-
[/\b(school|class|homework|exam|test|学校|作业)\b/i, "education"],
|
|
81
|
-
[/\b(kindergarten|grade|teacher|老师)\b/i, "education"],
|
|
82
|
-
[/\b(swimming|skating|skiing|hockey|lesson|课)\b/i, "education"],
|
|
83
|
-
[/\b(Science Fair|concert|recital|表演)\b/i, "education"],
|
|
84
|
-
|
|
85
|
-
// Projects / SaaS
|
|
86
|
-
[/\b(icex|SaaS|MVP|startup|product|launch)\b/i, "project"],
|
|
87
|
-
[/\b(ESP32|Arduino|IoT|hardware|sensor)\b/i, "project"],
|
|
88
|
-
|
|
89
|
-
// Shopping / daily
|
|
90
|
-
[/\b(Costco|Amazon|Walmart|shopping|购物|买)\b/i, "shopping"],
|
|
91
|
-
[/\b(flight|airline|Air Canada|travel|trip|机票|飞)\b/i, "travel"],
|
|
92
|
-
[/\b(restaurant|food|meal|dinner|lunch|吃|饭)\b/i, "daily"],
|
|
93
|
-
];
|
|
94
|
-
|
|
95
|
-
// ═══════════════════════════════════════════════════════════════════
|
|
96
|
-
// Importance rules
|
|
97
|
-
// ═══════════════════════════════════════════════════════════════════
|
|
98
|
-
|
|
99
|
-
const IMPORTANCE_RULES = [
|
|
100
|
-
// High (8-9): critical life matters
|
|
101
|
-
{ match: /\b(immigration|PR|IRCC|CBSA|Mandamus|visa|NSIRA|CHRC|petition|lawsuit|court)\b/i, importance: 9 },
|
|
102
|
-
{ match: /\b(IBKR|NAV|portfolio|invest|\$\d{4,}|万|HELOC|mortgage)\b/i, importance: 8 },
|
|
103
|
-
{ match: /\b(doctor|hospital|medication|diagnosis|surgery|health insurance|AHCIP)\b/i, importance: 8 },
|
|
104
|
-
{ match: /\b(lawyer|attorney|legal|律师)\b/i, importance: 8 },
|
|
105
|
-
{ match: /\b(永远不要|NEVER|CRITICAL|严禁|必须|MUST)\b/i, importance: 9 },
|
|
106
|
-
{ match: /\b(VIN|policy number|case number|account number|IMM-)\b/i, importance: 8 },
|
|
107
|
-
|
|
108
|
-
// Medium-high (7): important but not critical
|
|
109
|
-
{ match: /\b(ArgoCD|GitOps|k3d|U9|prod|deploy|CI)\b/i, importance: 6 },
|
|
110
|
-
{ match: /\b(quant|backtest|trading|signal|Sharpe)\b/i, importance: 7 },
|
|
111
|
-
{ match: /\b(GX550|Escalade|ES350|car insurance)\b/i, importance: 6 },
|
|
112
|
-
{ match: /\b(OpenClaw|gateway|plugin|config)\b/i, importance: 6 },
|
|
113
|
-
{ match: /\b(icex|SaaS|MVP|ESP32)\b/i, importance: 6 },
|
|
114
|
-
|
|
115
|
-
// Low (3): ephemeral
|
|
116
|
-
{ match: /\b(swimming lesson|concert|recital|playdate)\b/i, importance: 3 },
|
|
117
|
-
{ match: /\b(weather|天气)\b/i, importance: 2 },
|
|
118
|
-
{ match: /\b(heartbeat|HEARTBEAT_OK|session start|daily log)\b/i, importance: 2 },
|
|
119
|
-
{ match: /\b(good morning|good night|早上好|晚安)\b/i, importance: 2 },
|
|
120
|
-
];
|
|
121
|
-
|
|
122
|
-
function inferImportance(content, currentImportance) {
|
|
123
|
-
// Only re-rate if currently at default (5)
|
|
124
|
-
if (currentImportance !== 5) return currentImportance;
|
|
125
|
-
|
|
126
|
-
for (const rule of IMPORTANCE_RULES) {
|
|
127
|
-
if (rule.match.test(content)) return rule.importance;
|
|
128
|
-
}
|
|
129
|
-
return 5; // keep default if no rule matches
|
|
130
|
-
}
|
|
131
|
-
|
|
132
|
-
function inferEntity(content, currentEntity) {
|
|
133
|
-
// Only re-classify if currently "general" or empty
|
|
134
|
-
if (currentEntity && currentEntity !== "general") return currentEntity;
|
|
135
|
-
|
|
136
|
-
for (const [pattern, entity] of ENTITY_PATTERNS) {
|
|
137
|
-
if (pattern.test(content)) return entity;
|
|
138
|
-
}
|
|
139
|
-
return currentEntity || "general";
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
// ═══════════════════════════════════════════════════════════════════
|
|
143
|
-
// Quality pass
|
|
144
|
-
// ═══════════════════════════════════════════════════════════════════
|
|
10
|
+
import { loadEmbeddingCache } from "./embedding.js";
|
|
11
|
+
import { batchReclassify } from "./classifier.js";
|
|
145
12
|
|
|
146
13
|
/**
|
|
147
14
|
* Run a full quality pass over archival records.
|
|
15
|
+
* Uses embedding-based classification (no regex).
|
|
148
16
|
* @returns {{ reclassified, rerated, triplesAdded, episodesGenerated }}
|
|
149
17
|
*/
|
|
150
|
-
export function runQualityPass(ws, options = {}) {
|
|
18
|
+
export async function runQualityPass(ws, options = {}) {
|
|
151
19
|
const records = loadArchival(ws);
|
|
152
|
-
const
|
|
153
|
-
const existingTripleSet = new Set(
|
|
154
|
-
existingGraph.map((t) => `${t.s}|${t.r}|${t.o}`.toLowerCase()),
|
|
155
|
-
);
|
|
20
|
+
const embCache = loadEmbeddingCache(ws);
|
|
156
21
|
|
|
22
|
+
// 1. Embedding-based re-classification (entity + importance)
|
|
157
23
|
let reclassified = 0;
|
|
158
24
|
let rerated = 0;
|
|
159
|
-
let triplesAdded = 0;
|
|
160
|
-
|
|
161
|
-
for (const record of records) {
|
|
162
|
-
// 1. Re-classify entity
|
|
163
|
-
const newEntity = inferEntity(record.content, record.entity);
|
|
164
|
-
if (newEntity !== record.entity) {
|
|
165
|
-
record.entity = newEntity;
|
|
166
|
-
reclassified++;
|
|
167
|
-
}
|
|
168
25
|
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
26
|
+
const embeddedRecords = records.filter((r) => embCache[r.id]);
|
|
27
|
+
if (embeddedRecords.length > 0) {
|
|
28
|
+
const result = await batchReclassify(ws, embeddedRecords, embCache);
|
|
29
|
+
reclassified = result.reclassified;
|
|
30
|
+
rerated = result.rerated;
|
|
31
|
+
}
|
|
175
32
|
|
|
176
|
-
|
|
177
|
-
|
|
33
|
+
// 2. Extract graph triples
|
|
34
|
+
let triplesAdded = 0;
|
|
35
|
+
if (!options.skipGraph) {
|
|
36
|
+
const existingGraph = loadGraph(ws);
|
|
37
|
+
const existingTripleSet = new Set(
|
|
38
|
+
existingGraph.map((t) => `${t.s}|${t.r}|${t.o}`.toLowerCase()),
|
|
39
|
+
);
|
|
40
|
+
for (const record of records) {
|
|
178
41
|
const triples = extractTriples(record.content);
|
|
179
42
|
for (const t of triples) {
|
|
180
43
|
const key = `${t.s}|${t.r}|${t.o}`.toLowerCase();
|
|
@@ -189,12 +52,12 @@ export function runQualityPass(ws, options = {}) {
|
|
|
189
52
|
}
|
|
190
53
|
}
|
|
191
54
|
|
|
192
|
-
// Save updated records
|
|
55
|
+
// 3. Save updated records
|
|
193
56
|
if (reclassified > 0 || rerated > 0) {
|
|
194
57
|
rewriteArchival(ws, records);
|
|
195
58
|
}
|
|
196
59
|
|
|
197
|
-
// 4. Generate episodes from
|
|
60
|
+
// 4. Generate episodes from daily record clusters
|
|
198
61
|
let episodesGenerated = 0;
|
|
199
62
|
if (!options.skipEpisodes) {
|
|
200
63
|
episodesGenerated = generateEpisodesFromRecords(ws, records);
|
|
@@ -210,7 +73,6 @@ function generateEpisodesFromRecords(ws, records) {
|
|
|
210
73
|
const episodes = loadEpisodes(ws);
|
|
211
74
|
const existingDates = new Set(episodes.map((e) => e.ts?.slice(0, 10)));
|
|
212
75
|
|
|
213
|
-
// Group records by date
|
|
214
76
|
const byDate = {};
|
|
215
77
|
for (const r of records) {
|
|
216
78
|
if (!r.ts) continue;
|
|
@@ -221,10 +83,8 @@ function generateEpisodesFromRecords(ws, records) {
|
|
|
221
83
|
|
|
222
84
|
let generated = 0;
|
|
223
85
|
for (const [date, dayRecords] of Object.entries(byDate)) {
|
|
224
|
-
// Skip if episode already exists for this date, or too few records
|
|
225
86
|
if (existingDates.has(date) || dayRecords.length < 3) continue;
|
|
226
87
|
|
|
227
|
-
// Aggregate topics and entities
|
|
228
88
|
const topics = [...new Set(dayRecords.map((r) => r.entity).filter((e) => e && e !== "general"))];
|
|
229
89
|
const topContent = dayRecords
|
|
230
90
|
.sort((a, b) => (b.importance || 5) - (a.importance || 5))
|
|
@@ -251,11 +111,11 @@ function generateEpisodesFromRecords(ws, records) {
|
|
|
251
111
|
*/
|
|
252
112
|
export function formatQualityReport(result) {
|
|
253
113
|
const lines = [
|
|
254
|
-
`📊 Memory Quality Pass Complete`,
|
|
114
|
+
`📊 Memory Quality Pass Complete (embedding-based v5.0)`,
|
|
255
115
|
``,
|
|
256
116
|
` Records scanned: ${result.total}`,
|
|
257
|
-
` Entities re-classified: ${result.reclassified}`,
|
|
258
|
-
` Importance re-rated: ${result.rerated}`,
|
|
117
|
+
` Entities re-classified: ${result.reclassified} (via semantic similarity)`,
|
|
118
|
+
` Importance re-rated: ${result.rerated} (via semantic similarity)`,
|
|
259
119
|
` Graph triples extracted: ${result.triplesAdded}`,
|
|
260
120
|
` Episodes generated: ${result.episodesGenerated}`,
|
|
261
121
|
];
|
package/package.json
CHANGED