memshell 0.2.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/ingest.js ADDED
@@ -0,0 +1,348 @@
1
+ 'use strict';
2
+
3
+ const fs = require('fs');
4
+ const path = require('path');
5
+ const os = require('os');
6
+
7
// ── LLM Extraction ────────────────────────────────────────────

/**
 * Pull a JSON array of fact strings out of a raw LLM completion.
 * Models frequently wrap the array in prose or code fences, so locate
 * the outermost [...] span instead of parsing the whole reply.
 * @param {string} content - raw completion text
 * @returns {Array} the parsed JSON array
 * @throws {Error} when no JSON array can be found or it fails to parse
 */
function parseFactArray(content) {
  const match = content.match(/\[[\s\S]*\]/);
  if (!match) throw new Error('LLM did not return a valid JSON array');
  return JSON.parse(match[0]);
}

/**
 * Extract standalone facts from conversation text via an LLM.
 * Uses Anthropic when an Anthropic key is available and either a claude
 * model was requested or no OpenAI key exists; otherwise uses OpenAI.
 * @param {string} text - conversation text to mine for facts
 * @param {object} [config] - { anthropicKey, apiKey, openaiKey, model };
 *   falls back to ANTHROPIC_API_KEY / OPENAI_API_KEY env vars
 * @returns {Promise<string[]>} extracted fact strings
 * @throws {Error} on missing keys, HTTP errors, or unparsable replies
 */
async function callLLM(text, config = {}) {
  const anthropicKey = config.anthropicKey || process.env.ANTHROPIC_API_KEY;
  const openaiKey = config.apiKey || config.openaiKey || process.env.OPENAI_API_KEY;
  const model = config.model || 'gpt-4o-mini';

  const systemPrompt = 'Extract key facts, user preferences, decisions, and important context from this conversation. Return as a JSON array of strings, each a standalone fact. Only return the JSON array, nothing else.';

  if (anthropicKey && (model.startsWith('claude') || !openaiKey)) {
    const res = await fetch('https://api.anthropic.com/v1/messages', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-api-key': anthropicKey,
        'anthropic-version': '2023-06-01'
      },
      body: JSON.stringify({
        // If the configured model is an OpenAI one, fall back to a cheap claude model.
        model: model.startsWith('claude') ? model : 'claude-3-haiku-20240307',
        max_tokens: 2048,
        system: systemPrompt,
        messages: [{ role: 'user', content: text }]
      })
    });
    if (!res.ok) throw new Error(`Anthropic API error: ${res.status} ${await res.text()}`);
    const data = await res.json();
    // FIX: previously this JSON.parse'd the raw reply; now tolerates
    // prose/code-fence wrapping, matching the OpenAI path's behavior.
    return parseFactArray(data.content[0].text);
  }

  if (openaiKey) {
    const res = await fetch('https://api.openai.com/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${openaiKey}`
      },
      body: JSON.stringify({
        // Never send a claude model name to OpenAI.
        model: model.startsWith('claude') ? 'gpt-4o-mini' : model,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: text }
        ],
        temperature: 0.3
      })
    });
    if (!res.ok) throw new Error(`OpenAI API error: ${res.status} ${await res.text()}`);
    const data = await res.json();
    return parseFactArray(data.choices[0].message.content);
  }

  throw new Error('No API key found. Set OPENAI_API_KEY or ANTHROPIC_API_KEY, or run: memshell config set apiKey <key>');
}
63
+
64
// ── Chunking ───────────────────────────────────────────────────

/**
 * Split text into line-aligned chunks of roughly maxTokens tokens,
 * using the rough heuristic of ~4 characters per token. A single line
 * longer than the budget is kept intact as its own chunk.
 * @param {string} text
 * @param {number} [maxTokens=2000]
 * @returns {string[]} one or more trimmed chunks
 */
function chunkText(text, maxTokens = 2000) {
  const budget = maxTokens * 4; // 1 token ≈ 4 chars
  if (text.length <= budget) return [text];

  const pieces = [];
  let buffer = '';

  for (const row of text.split('\n')) {
    const candidate = buffer === '' ? row : `${buffer}\n${row}`;
    if (candidate.length > budget && buffer.length > 0) {
      // Flush the full buffer and start a fresh chunk with this line.
      pieces.push(buffer.trim());
      buffer = row;
    } else {
      buffer = candidate;
    }
  }

  if (buffer.trim()) pieces.push(buffer.trim());
  return pieces;
}
85
+
86
// ── Similarity (simple word overlap for dedup) ─────────────────

/**
 * Normalize text into a Set of lowercase alphanumeric words.
 * Punctuation is stripped; empty tokens are discarded.
 * @param {string} text
 * @returns {Set<string>}
 */
function wordSet(text) {
  const cleaned = text.toLowerCase().replace(/[^a-z0-9\s]/g, '');
  return new Set(cleaned.split(/\s+/).filter(Boolean));
}

/**
 * Jaccard similarity (|A∩B| / |A∪B|) of the word sets of two strings.
 * Returns 0 when both strings contain no words.
 * @param {string} a
 * @param {string} b
 * @returns {number} similarity in [0, 1]
 */
function jaccardSimilarity(a, b) {
  const left = wordSet(a);
  const right = wordSet(b);

  let shared = 0;
  right.forEach((word) => {
    if (left.has(word)) shared += 1;
  });

  const union = left.size + right.size - shared;
  return union === 0 ? 0 : shared / union;
}
99
+
100
// ── Main Ingest Function ──────────────────────────────────────

/**
 * Extract facts from `text` via the LLM and store the novel ones.
 * The input is chunked, each chunk is run through callLLM, and any fact
 * whose word-overlap similarity with an already-known memory exceeds
 * 0.85 is counted as a duplicate and dropped.
 * @param {string} text - raw conversation/document text
 * @param {object} store - memory store exposing list({agent}) and set(text, meta)
 * @param {object} [opts] - overrides merged over the saved config; also
 *   recognizes `source` (default 'auto-ingest') and `agent` (default 'default')
 * @returns {Promise<{extracted:number, stored:number, duplicates:number}>}
 */
async function ingest(text, store, opts = {}) {
  const settings = { ...loadConfig(), ...opts };
  const source = opts.source || 'auto-ingest';
  const agent = opts.agent || 'default';

  // Seed the dedup pool with everything already stored for this agent.
  const priorMemories = await store.list({ agent });
  const known = priorMemories.map((m) => m.text);

  const counters = { extracted: 0, stored: 0, duplicates: 0 };

  for (const chunk of chunkText(text)) {
    if (chunk.trim().length < 20) continue; // skip tiny chunks

    let facts;
    try {
      facts = await callLLM(chunk, settings);
    } catch (e) {
      console.error(` Warning: LLM extraction failed for chunk: ${e.message}`);
      continue;
    }
    if (!Array.isArray(facts)) continue;

    counters.extracted += facts.length;

    for (const fact of facts) {
      if (typeof fact !== 'string' || fact.trim().length < 5) continue;

      // Near-duplicate check against stored memories and this batch.
      if (known.some((prev) => jaccardSimilarity(fact, prev) > 0.85)) {
        counters.duplicates += 1;
        continue;
      }

      const tags = [source, 'auto'].join(',');
      await store.set(fact, { agent, tags, source });
      known.push(fact); // prevent self-duplication within this run
      counters.stored += 1;
    }
  }

  return counters;
}
157
+
158
// ── JSONL Parser (OpenClaw format) ─────────────────────────────

/**
 * Flatten a JSONL transcript into newline-joined "role: text" lines.
 * Only user/assistant messages with truthy `content` are kept;
 * non-string content is serialized with JSON.stringify. Lines that
 * fail to parse are silently dropped.
 * @param {string} content - raw .jsonl file contents
 * @returns {string} the flattened transcript
 */
function parseJSONL(content) {
  const transcript = [];

  for (const raw of content.split('\n')) {
    if (!raw.trim()) continue;

    let obj;
    try {
      obj = JSON.parse(raw);
    } catch {
      continue; // skip invalid lines
    }

    if (!obj || !obj.role || !obj.content) continue;
    if (obj.role !== 'user' && obj.role !== 'assistant') continue;

    const body = typeof obj.content === 'string' ? obj.content : JSON.stringify(obj.content);
    transcript.push(`${obj.role}: ${body}`);
  }

  return transcript.join('\n');
}
179
+
180
// ── Config Management ──────────────────────────────────────────

/** @returns {string} absolute path of the user config file (~/.mem/config.json) */
function configPath() {
  const home = os.homedir();
  return path.join(home, '.mem', 'config.json');
}
184
+
185
/**
 * Read the saved config file.
 * A missing or unreadable/corrupt file is treated as an empty config.
 * @returns {object} the parsed config, or {} on any read/parse failure
 */
function loadConfig() {
  const file = configPath();
  try {
    const raw = fs.readFileSync(file, 'utf8');
    return JSON.parse(raw);
  } catch {
    return {};
  }
}
192
+
193
/**
 * Persist the config object as pretty-printed JSON,
 * creating the ~/.mem directory if it does not exist yet.
 * @param {object} config
 */
function saveConfig(config) {
  const target = configPath();
  fs.mkdirSync(path.dirname(target), { recursive: true });
  fs.writeFileSync(target, JSON.stringify(config, null, 2));
}
198
+
199
/**
 * Set a (possibly dotted) key in the saved config and write it back,
 * e.g. setConfigValue('watch.openclaw', true) → { watch: { openclaw: true } }.
 * Intermediate non-object values along the path are replaced with {}.
 * @param {string} key - dotted path, e.g. "watch.openclaw"
 * @param {*} value - value to assign at the path
 * @returns {object} the full updated config
 * @throws {Error} when a path segment could pollute Object.prototype
 */
function setConfigValue(key, value) {
  const parts = key.split('.');

  // FIX: guard against prototype pollution — keys like "__proto__.x"
  // would previously write into Object.prototype via the walker below.
  for (const part of parts) {
    if (part === '__proto__' || part === 'constructor' || part === 'prototype') {
      throw new Error(`Invalid config key segment: ${part}`);
    }
  }

  const config = loadConfig();
  let obj = config;
  for (let i = 0; i < parts.length - 1; i++) {
    if (!obj[parts[i]] || typeof obj[parts[i]] !== 'object') obj[parts[i]] = {};
    obj = obj[parts[i]];
  }
  obj[parts[parts.length - 1]] = value;
  saveConfig(config);
  return config;
}
212
+
213
// ── Processed Tracker ──────────────────────────────────────────

/** @returns {string} absolute path of the processed-files ledger (~/.mem/processed.json) */
function processedPath() {
  const base = os.homedir();
  return path.join(base, '.mem', 'processed.json');
}
217
+
218
/**
 * Read the processed-files ledger.
 * A missing or unreadable file yields an empty ledger.
 * @returns {{files: Object<string, {mtime: number, processedAt: string}>}}
 */
function loadProcessed() {
  const file = processedPath();
  try {
    return JSON.parse(fs.readFileSync(file, 'utf8'));
  } catch {
    return { files: {} };
  }
}
225
+
226
/**
 * Write the processed-files ledger as pretty-printed JSON,
 * creating the ~/.mem directory if needed.
 * @param {object} data - the full ledger object
 */
function saveProcessed(data) {
  const target = processedPath();
  fs.mkdirSync(path.dirname(target), { recursive: true });
  fs.writeFileSync(target, JSON.stringify(data, null, 2));
}
231
+
232
/**
 * Record that a file has been ingested, keeping its mtime so later
 * modifications can be detected by isProcessed().
 * @param {string} filePath - absolute path of the processed file
 * @param {number} [mtime] - file mtime in ms; defaults to Date.now()
 */
function markProcessed(filePath, mtime) {
  const ledger = loadProcessed();
  ledger.files[filePath] = {
    mtime: mtime || Date.now(),
    processedAt: new Date().toISOString(),
  };
  saveProcessed(ledger);
}
237
+
238
/**
 * Has this file already been ingested and not modified since?
 * @param {string} filePath - absolute path to check
 * @param {number} [mtime] - current file mtime; when newer than the
 *   recorded one the file counts as unprocessed again
 * @returns {boolean}
 */
function isProcessed(filePath, mtime) {
  const entry = loadProcessed().files[filePath];
  if (!entry) return false;
  // A newer mtime means the file changed after the last ingest.
  const modifiedSince = Boolean(mtime) && entry.mtime < mtime;
  return !modifiedSince;
}
245
+
246
// ── File Ingestion ─────────────────────────────────────────────

/**
 * Ingest a single file into the store, skipping files already processed
 * (unless opts.force) and marking them processed afterwards.
 * .jsonl files are flattened via parseJSONL; .json arrays become one
 * entry per line; everything else is ingested as raw text.
 * @param {string} filePath - path to the file (resolved to absolute)
 * @param {object} store - memory store passed through to ingest()
 * @param {object} [opts] - ingest options plus { force, source }
 * @returns {Promise<object>} ingest counters plus { file }, or a
 *   { skipped: true, file, reason? } marker
 */
async function ingestFile(filePath, store, opts = {}) {
  const absPath = path.resolve(filePath);
  const mtime = fs.statSync(absPath).mtimeMs;

  if (!opts.force && isProcessed(absPath, mtime)) {
    return { skipped: true, file: absPath };
  }

  const content = fs.readFileSync(absPath, 'utf8');
  const ext = path.extname(absPath).toLowerCase();

  let text = content;
  if (ext === '.jsonl') {
    text = parseJSONL(content);
  } else if (ext === '.json') {
    try {
      const parsed = JSON.parse(content);
      text = Array.isArray(parsed)
        ? parsed.map((d) => (typeof d === 'string' ? d : JSON.stringify(d))).join('\n')
        : JSON.stringify(parsed);
    } catch {
      // Unparsable .json: fall back to ingesting the raw text.
    }
  }

  if (!text || text.trim().length < 20) {
    return { skipped: true, file: absPath, reason: 'too short' };
  }

  const source = opts.source || `file:${path.basename(absPath)}`;
  const counters = await ingest(text, store, { ...opts, source });
  markProcessed(absPath, mtime);
  return { ...counters, file: absPath };
}
286
+
287
// ── Directory Watcher (polling) ────────────────────────────────

/**
 * Poll a directory and ingest any new/changed .txt/.md/.json/.jsonl
 * files. Per-file and per-scan errors are logged, never thrown.
 * @param {string} dir - directory to watch (resolved to absolute)
 * @param {object} store - memory store passed through to ingestFile()
 * @param {object} [opts] - ingest options plus { interval } in ms (default 10000)
 * @returns {NodeJS.Timeout} interval handle; stop with clearInterval()
 */
function watchDirectory(dir, store, opts = {}) {
  const interval = opts.interval || 10000;
  const absDir = path.resolve(dir);

  console.log(` Watching: ${absDir} (every ${interval / 1000}s)`);

  // FIX: a slow scan (LLM calls can outlast the interval) could overlap
  // with the next setInterval tick and double-process files; skip ticks
  // while a scan is still in flight.
  let scanning = false;

  async function scan() {
    if (scanning) return;
    scanning = true;
    try {
      const files = fs.readdirSync(absDir).filter((f) => {
        const ext = path.extname(f).toLowerCase();
        return ['.txt', '.md', '.json', '.jsonl'].includes(ext);
      });

      for (const file of files) {
        const filePath = path.join(absDir, file);
        try {
          const result = await ingestFile(filePath, store, opts);
          if (!result.skipped) {
            console.log(` Ingested: ${file} (${result.extracted} extracted, ${result.stored} stored, ${result.duplicates} duplicates)`);
          }
        } catch (e) {
          console.error(` Error processing ${file}: ${e.message}`);
        }
      }
    } catch (e) {
      console.error(` Watch error: ${e.message}`);
    } finally {
      scanning = false;
    }
  }

  scan(); // initial scan; fire-and-forget — scan() never rejects
  return setInterval(scan, interval);
}
320
+
321
// ── OpenClaw Connector ─────────────────────────────────────────

/** @returns {string} default OpenClaw sessions dir (~/.openclaw/agents/main/sessions) */
function defaultOpenClawPath() {
  const segments = ['.openclaw', 'agents', 'main', 'sessions'];
  return path.join(os.homedir(), ...segments);
}
325
+
326
/**
 * Watch an OpenClaw sessions directory, tagging stored memories with
 * source 'openclaw'.
 * @param {string} [sessionsPath] - defaults to defaultOpenClawPath()
 * @param {object} store - memory store passed through to watchDirectory()
 * @param {object} [opts] - watcher/ingest options
 * @returns {NodeJS.Timeout} interval handle from watchDirectory()
 */
function watchOpenClaw(sessionsPath, store, opts = {}) {
  const dir = sessionsPath || defaultOpenClawPath();
  console.log(` Connecting to OpenClaw sessions: ${dir}`);
  const watchOpts = { ...opts, source: 'openclaw' };
  return watchDirectory(dir, store, watchOpts);
}
331
+
332
+ module.exports = {
333
+ ingest,
334
+ ingestFile,
335
+ callLLM,
336
+ chunkText,
337
+ jaccardSimilarity,
338
+ parseJSONL,
339
+ loadConfig,
340
+ saveConfig,
341
+ setConfigValue,
342
+ watchDirectory,
343
+ watchOpenClaw,
344
+ defaultOpenClawPath,
345
+ loadProcessed,
346
+ isProcessed,
347
+ markProcessed
348
+ };