@shadowforge0/aquifer-memory 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -127,18 +127,28 @@ const results = await aquifer.recall('auth middleware decision', {
127
127
  ## Architecture
128
128
 
129
129
  ```
130
- ┌─────────────────────────────────────────────────────────────┐
131
- createAquifer (entry)
132
- Config · Migration · Ingest · Recall · Enrich
133
- └────────┬──────────┬──────────┬──────────┬───────────────────┘
130
+ ┌──────────────────────────────────────────────────────────────┐
131
+ Agent Hosts
132
+ Claude Code · OpenClaw · Codex · OpenCode · ...
133
+ └──────────────────────┬───────────────────────────────────────┘
134
+ │ MCP (stdio or HTTP)
135
+ ┌──────────────────────▼───────────────────────────────────────┐
136
+ │ Aquifer MCP Server (canonical API) │
137
+ │ session_recall · session_feedback · memory_stats · ... │
138
+ └──────────────────────┬───────────────────────────────────────┘
139
+
140
+ ┌──────────────────────▼───────────────────────────────────────┐
141
+ │ createAquifer (engine) │
142
+ │ Config · Migration · Ingest · Recall · Enrich │
143
+ └────────┬──────────┬──────────┬──────────┬────────────────────┘
134
144
  │ │ │ │
135
145
  ┌────▼───┐ ┌────▼────┐ ┌──▼───┐ ┌───▼──────────┐
136
146
  │storage │ │hybrid- │ │entity│ │ pipeline/ │
137
147
  │ .js │ │rank.js │ │ .js │ │summarize.js │
138
148
  └────────┘ └─────────┘ └──────┘ │embed.js │
139
149
  │ │ │extract-ent.js │
140
- ┌────▼───────────┐ ┌───▼──┐ └───────────────┘
141
- │ PostgreSQL │ │ LLM │
150
+ ┌────▼───────────┐ ┌───▼──┐ │rerank.js │
151
+ │ PostgreSQL │ │ LLM │ └───────────────┘
142
152
  │ + pgvector │ │ API │
143
153
  └────────────────┘ └──────┘
144
154
 
@@ -151,11 +161,13 @@ const results = await aquifer.recall('auth middleware decision', {
151
161
  └──────────────────────────────────┘
152
162
  ```
153
163
 
164
+ **Integration model:** MCP is the primary integration surface. Agent hosts connect to Aquifer through the MCP server (`consumers/mcp.js`), which exposes `session_recall`, `session_feedback`, `memory_stats`, and `memory_pending`. The CLI wraps the same engine for command-line use. The OpenClaw plugin (`consumers/openclaw-plugin.js`) is retained as a compatibility adapter for session capture but is not the primary tool delivery path.
165
+
154
166
  ### File Reference
155
167
 
156
168
  | File | Purpose |
157
169
  |------|---------|
158
- | `index.js` | Entry point — exports `createAquifer`, `createEmbedder` |
170
+ | `index.js` | Entry point — exports `createAquifer`, `createEmbedder`, `createReranker`, `normalizeSession`, `detectClient` |
159
171
  | `core/aquifer.js` | Main facade: `migrate()`, `ingest()`, `recall()`, `enrich()` |
160
172
  | `core/storage.js` | Session/summary/turn CRUD, FTS search, embedding search |
161
173
  | `core/entity.js` | Entity upsert, mention tracking, relation graph, normalization |
@@ -163,6 +175,8 @@ const results = await aquifer.recall('auth middleware decision', {
163
175
  | `pipeline/summarize.js` | LLM-powered session summarization with structured output |
164
176
  | `pipeline/embed.js` | Embedding client (any OpenAI-compatible API) |
165
177
  | `pipeline/extract-entities.js` | LLM-powered entity extraction (12 types) |
178
+ | `pipeline/rerank.js` | Cross-encoder reranking (TEI, Jina, OpenRouter) |
179
+ | `pipeline/normalize/` | Session normalization for Claude Code / gateway noise |
166
180
  | `schema/001-base.sql` | DDL: sessions, summaries, turn_embeddings, FTS indexes |
167
181
  | `schema/002-entities.sql` | DDL: entities, mentions, relations, entity_sessions |
168
182
  | `schema/003-trust-feedback.sql` | DDL: trust_score column, session_feedback audit trail |
@@ -395,9 +409,48 @@ createAquifer({
395
409
 
396
410
  Fallback chain: `config.entities.scope` → `'default'`.
397
411
 
398
- ### Consumers (CLI, MCP, OpenClaw plugin)
412
+ ### MCP Server (primary integration)
413
+
414
+ Agent hosts should connect through the Aquifer MCP server. For OpenClaw, add to `openclaw.json`:
415
+
416
+ ```json
417
+ {
418
+ "mcp": {
419
+ "servers": {
420
+ "aquifer": {
421
+ "command": "node",
422
+ "args": ["/path/to/aquifer/consumers/mcp.js"],
423
+ "env": {
424
+ "DATABASE_URL": "postgresql://...",
425
+ "AQUIFER_SCHEMA": "aquifer",
426
+ "AQUIFER_EMBED_BASE_URL": "http://localhost:11434/v1",
427
+ "AQUIFER_EMBED_MODEL": "bge-m3"
428
+ }
429
+ }
430
+ }
431
+ }
432
+ }
433
+ ```
434
+
435
+ Tools are exposed as `aquifer__session_recall`, `aquifer__session_feedback`, `aquifer__memory_stats`, `aquifer__memory_pending` (server name prefix is added by the host).
436
+
437
+ For Claude Code, add to `.claude.json`:
438
+
439
+ ```json
440
+ {
441
+ "mcpServers": {
442
+ "aquifer": {
443
+ "type": "stdio",
444
+ "command": "node",
445
+ "args": ["/path/to/aquifer/consumers/mcp.js"]
446
+ }
447
+ }
448
+ }
449
+ ```
450
+
451
+ ### CLI (secondary)
399
452
 
400
- For consumer-based setup using environment variables instead of code:
453
+ For command-line use with environment variables:
401
454
 
402
455
  ```bash
403
456
  export DATABASE_URL="postgresql://..."
package/consumers/cli.js CHANGED
@@ -14,7 +14,6 @@
14
14
  */
15
15
 
16
16
  const { createAquiferFromConfig } = require('./shared/factory');
17
- const { loadConfig } = require('./shared/config');
18
17
 
19
18
  // ---------------------------------------------------------------------------
20
19
  // Argument parser (minimal, no deps)
@@ -120,30 +119,12 @@ async function cmdBackfill(aquifer, args) {
120
119
  const skipTurnEmbed = !!args.flags['skip-turn-embed'];
121
120
  const skipEntities = !!args.flags['skip-entities'];
122
121
 
123
- const config = aquifer._config || {};
124
- const schema = config.schema || 'aquifer';
125
- const tenantId = config.tenantId || 'default';
126
- const pool = aquifer._pool;
122
+ const pending = await aquifer.getPendingSessions({ limit });
127
123
 
128
- if (!pool) {
129
- console.error('Backfill requires direct pool access.');
130
- process.exit(1);
131
- }
132
-
133
- const qi = (id) => `"${id}"`;
134
- const { rows } = await pool.query(`
135
- SELECT session_id, agent_id, processing_status
136
- FROM ${qi(schema)}.sessions
137
- WHERE tenant_id = $1
138
- AND processing_status IN ('pending', 'failed')
139
- ORDER BY started_at DESC
140
- LIMIT $2
141
- `, [tenantId, limit]);
142
-
143
- console.log(`Found ${rows.length} sessions to backfill${dryRun ? ' (dry-run)' : ''}`);
124
+ console.log(`Found ${pending.length} sessions to backfill${dryRun ? ' (dry-run)' : ''}`);
144
125
 
145
126
  let enriched = 0, failed = 0;
146
- for (const row of rows) {
127
+ for (const row of pending) {
147
128
  if (dryRun) {
148
129
  console.log(` [dry-run] ${row.session_id} (${row.agent_id}) status=${row.processing_status}`);
149
130
  continue;
@@ -164,40 +145,12 @@ async function cmdBackfill(aquifer, args) {
164
145
  }
165
146
  }
166
147
 
167
- console.log(`\nDone. enriched=${enriched} failed=${failed} total=${rows.length}`);
148
+ console.log(`\nDone. enriched=${enriched} failed=${failed} total=${pending.length}`);
168
149
  if (failed > 0) process.exitCode = 2;
169
150
  }
170
151
 
171
152
  async function cmdStats(aquifer, args) {
172
- const config = aquifer._config || {};
173
- const schema = config.schema || 'aquifer';
174
- const tenantId = config.tenantId || 'default';
175
- const pool = aquifer._pool;
176
-
177
- if (!pool) {
178
- console.error('Stats requires direct pool access.');
179
- process.exit(1);
180
- }
181
-
182
- const qi = (id) => `"${id}"`;
183
- const [sessions, summaries, turns, entities] = await Promise.all([
184
- pool.query(`SELECT processing_status, COUNT(*)::int as count FROM ${qi(schema)}.sessions WHERE tenant_id = $1 GROUP BY processing_status`, [tenantId]),
185
- pool.query(`SELECT COUNT(*)::int as count FROM ${qi(schema)}.session_summaries WHERE tenant_id = $1`, [tenantId]),
186
- pool.query(`SELECT COUNT(*)::int as count FROM ${qi(schema)}.turn_embeddings WHERE tenant_id = $1`, [tenantId]),
187
- pool.query(`SELECT COUNT(*)::int as count FROM ${qi(schema)}.entities WHERE tenant_id = $1`, [tenantId]).catch(() => ({ rows: [{ count: 0 }] })),
188
- ]);
189
-
190
- const timeRange = await pool.query(`SELECT MIN(started_at) as earliest, MAX(started_at) as latest FROM ${qi(schema)}.sessions WHERE tenant_id = $1`, [tenantId]);
191
-
192
- const stats = {
193
- sessions: Object.fromEntries(sessions.rows.map(r => [r.processing_status, r.count])),
194
- sessionTotal: sessions.rows.reduce((s, r) => s + r.count, 0),
195
- summaries: summaries.rows[0]?.count || 0,
196
- turnEmbeddings: turns.rows[0]?.count || 0,
197
- entities: entities.rows[0]?.count || 0,
198
- earliest: timeRange.rows[0]?.earliest || null,
199
- latest: timeRange.rows[0]?.latest || null,
200
- };
153
+ const stats = await aquifer.getStats();
201
154
 
202
155
  if (args.flags.json) {
203
156
  console.log(JSON.stringify(stats, null, 2));
@@ -211,34 +164,14 @@ async function cmdStats(aquifer, args) {
211
164
  }
212
165
 
213
166
  async function cmdExport(aquifer, args) {
214
- const config = aquifer._config || {};
215
- const schema = config.schema || 'aquifer';
216
- const tenantId = config.tenantId || 'default';
217
- const pool = aquifer._pool;
218
167
  const output = args.flags.output || null;
219
168
  const limit = parseInt(args.flags.limit || '1000', 10);
220
169
 
221
- if (!pool) {
222
- console.error('Export requires direct pool access.');
223
- process.exit(1);
224
- }
225
-
226
- const qi = (id) => `"${id}"`;
227
- const where = [`s.tenant_id = $1`];
228
- const params = [tenantId];
229
-
230
- if (args.flags['agent-id']) { params.push(args.flags['agent-id']); where.push(`s.agent_id = $${params.length}`); }
231
- if (args.flags.source) { params.push(args.flags.source); where.push(`s.source = $${params.length}`); }
232
- params.push(limit);
233
-
234
- const { rows } = await pool.query(`
235
- SELECT s.*, ss.summary_text, ss.structured_summary
236
- FROM ${qi(schema)}.sessions s
237
- LEFT JOIN ${qi(schema)}.session_summaries ss ON ss.session_row_id = s.id
238
- WHERE ${where.join(' AND ')}
239
- ORDER BY s.started_at DESC
240
- LIMIT $${params.length}
241
- `, params);
170
+ const rows = await aquifer.exportSessions({
171
+ agentId: args.flags['agent-id'],
172
+ source: args.flags.source,
173
+ limit,
174
+ });
242
175
 
243
176
  const stream = output ? require('fs').createWriteStream(output) : process.stdout;
244
177
  for (const row of rows) {
@@ -340,7 +273,7 @@ Options:
340
273
  process.exit(1);
341
274
  }
342
275
  } finally {
343
- if (aquifer._pool) await aquifer._pool.end();
276
+ await aquifer.close();
344
277
  }
345
278
  }
346
279
 
package/consumers/mcp.js CHANGED
@@ -2,7 +2,12 @@
2
2
  'use strict';
3
3
 
4
4
  /**
5
- * Aquifer MCP Server — session_recall tool via Model Context Protocol.
5
+ * Aquifer MCP Server — canonical external contract for agent host integration.
6
+ *
7
+ * This is the primary integration surface for Aquifer. Agent hosts (Claude Code,
8
+ * Codex, OpenCode, etc.) should integrate through this MCP server.
9
+ *
10
+ * Tools: session_recall, session_feedback, memory_stats, memory_pending
6
11
  *
7
12
  * Usage:
8
13
  * npx aquifer mcp
@@ -69,7 +74,7 @@ async function main() {
69
74
 
70
75
  const server = new McpServer({
71
76
  name: 'aquifer-memory',
72
- version: '0.6.0',
77
+ version: '0.8.0',
73
78
  });
74
79
 
75
80
  server.tool(
@@ -84,6 +89,7 @@ async function main() {
84
89
  dateTo: z.string().optional().describe('End date YYYY-MM-DD'),
85
90
  entities: z.array(z.string()).optional().describe('Entity names to match'),
86
91
  entityMode: z.enum(['any', 'all']).optional().describe('"any" (default, boost) or "all" (only sessions with every entity)'),
92
+ mode: z.enum(['fts', 'hybrid', 'vector']).optional().describe('Recall mode: "fts" (keyword only, no embed needed), "hybrid" (default, FTS + vector), "vector" (vector only)'),
87
93
  },
88
94
  async (params) => {
89
95
  try {
@@ -100,6 +106,7 @@ async function main() {
100
106
  recallOpts.entities = params.entities;
101
107
  recallOpts.entityMode = params.entityMode || 'any';
102
108
  }
109
+ if (params.mode) recallOpts.mode = params.mode;
103
110
 
104
111
  const results = await aquifer.recall(params.query, recallOpts);
105
112
  const text = formatResults(results, params.query);
@@ -120,6 +127,7 @@ async function main() {
120
127
  sessionId: z.string().min(1).describe('Session ID to give feedback on'),
121
128
  verdict: z.enum(['helpful', 'unhelpful']).describe('Was the recalled session useful?'),
122
129
  note: z.string().optional().describe('Optional reason'),
130
+ agentId: z.string().optional().describe('Agent ID the session was stored under (e.g. "main"). Defaults to "agent" if omitted.'),
123
131
  },
124
132
  async (params) => {
125
133
  try {
@@ -127,6 +135,7 @@ async function main() {
127
135
  const result = await aquifer.feedback(params.sessionId, {
128
136
  verdict: params.verdict,
129
137
  note: params.note || undefined,
138
+ agentId: params.agentId || undefined,
130
139
  });
131
140
  return {
132
141
  content: [{ type: 'text', text: `Feedback: ${result.verdict} (trust ${result.trustBefore.toFixed(2)} → ${result.trustAfter.toFixed(2)})` }],
@@ -140,9 +149,64 @@ async function main() {
140
149
  }
141
150
  );
142
151
 
152
+ server.tool(
153
+ 'memory_stats',
154
+ 'Return storage statistics for the Aquifer memory store (session counts by status, summaries, turn embeddings, entities, date range).',
155
+ {},
156
+ async () => {
157
+ try {
158
+ const aquifer = getAquifer();
159
+ const stats = await aquifer.getStats();
160
+ const lines = [
161
+ `Sessions: ${stats.sessionTotal} total`,
162
+ ];
163
+ for (const [status, count] of Object.entries(stats.sessions)) {
164
+ lines.push(` ${status}: ${count}`);
165
+ }
166
+ lines.push(`Summaries: ${stats.summaries}`);
167
+ lines.push(`Turn embeddings: ${stats.turnEmbeddings}`);
168
+ lines.push(`Entities: ${stats.entities}`);
169
+ if (stats.earliest) lines.push(`Date range: ${new Date(stats.earliest).toISOString().slice(0, 10)} → ${new Date(stats.latest).toISOString().slice(0, 10)}`);
170
+ return { content: [{ type: 'text', text: lines.join('\n') }] };
171
+ } catch (err) {
172
+ return {
173
+ content: [{ type: 'text', text: `memory_stats error: ${err.message}` }],
174
+ isError: true,
175
+ };
176
+ }
177
+ }
178
+ );
179
+
180
+ server.tool(
181
+ 'memory_pending',
182
+ 'List sessions with pending or failed processing status.',
183
+ {
184
+ limit: z.number().int().min(1).max(200).optional().describe('Max results (default 20)'),
185
+ },
186
+ async (params) => {
187
+ try {
188
+ const aquifer = getAquifer();
189
+ const rows = await aquifer.getPendingSessions({ limit: params.limit ?? 20 });
190
+ if (rows.length === 0) {
191
+ return { content: [{ type: 'text', text: 'No pending or failed sessions.' }] };
192
+ }
193
+ const lines = [`${rows.length} pending/failed session(s):\n`];
194
+ for (const row of rows) {
195
+ lines.push(`${row.session_id} [${row.processing_status}] agent=${row.agent_id}`);
196
+ }
197
+ return { content: [{ type: 'text', text: lines.join('\n') }] };
198
+ } catch (err) {
199
+ return {
200
+ content: [{ type: 'text', text: `memory_pending error: ${err.message}` }],
201
+ isError: true,
202
+ };
203
+ }
204
+ }
205
+ );
206
+
143
207
  // Graceful shutdown
144
208
  const cleanup = async () => {
145
- if (_aquifer?._pool) await _aquifer._pool.end().catch(() => {});
209
+ if (_aquifer) await _aquifer.close().catch(() => {});
146
210
  process.exit(0);
147
211
  };
148
212
  process.on('SIGINT', cleanup);
@@ -153,7 +217,7 @@ async function main() {
153
217
 
154
218
  // Clean up pool when transport closes (stdin EOF)
155
219
  transport.onclose = async () => {
156
- if (_aquifer?._pool) await _aquifer._pool.end().catch(() => {});
220
+ if (_aquifer) await _aquifer.close().catch(() => {});
157
221
  };
158
222
  }
159
223
 
@@ -1,11 +1,17 @@
1
1
  'use strict';
2
2
 
3
3
  /**
4
- * Aquifer Memory — OpenClaw Plugin
4
+ * Aquifer Memory — OpenClaw Host Adapter
5
5
  *
6
- * Auto-captures sessions on before_reset and provides session_recall tool.
7
- * Install: add to openclaw.json plugins or extensions directory.
6
+ * Ingest adapter: auto-captures sessions on before_reset.
7
+ * Tool adapter: exposes session_recall/session_feedback via OpenClaw registerTool().
8
+ *
9
+ * Status: COMPATIBILITY ONLY. The official tool delivery path is mcp.servers.aquifer
10
+ * (see consumers/mcp.js). registerTool() exposure has OpenClaw upstream limitations
11
+ * that prevent reliable tool visibility. This plugin is retained for before_reset
12
+ * session capture; tool registration code is kept for future upstream fixes.
8
13
  *
14
+ * Install: add to openclaw.json plugins or extensions directory.
9
15
  * Config via plugin config, environment variables, or aquifer.config.json.
10
16
  */
11
17
 
@@ -169,6 +175,10 @@ module.exports = {
169
175
  } catch (enrichErr) {
170
176
  api.logger.warn(`[aquifer-memory] enrich failed for ${sessionId}: ${enrichErr.message}`);
171
177
  }
178
+ } else {
179
+ try {
180
+ await aquifer.skip(sessionId, { agentId, reason: `user_count=${norm.userCount} < min=${minUserMessages}` });
181
+ } catch (e) { api.logger.warn(`[aquifer-memory] skip failed for ${sessionId}: ${e.message}`); }
172
182
  }
173
183
 
174
184
  recentlyProcessed.set(dedupKey, Date.now());
@@ -189,8 +199,6 @@ module.exports = {
189
199
 
190
200
  // --- session_recall tool ---
191
201
 
192
- // --- session_recall tool ---
193
-
194
202
  api.registerTool((ctx) => {
195
203
  if ((ctx?.sessionKey || '').includes('subagent')) return null;
196
204
 
@@ -208,6 +216,7 @@ module.exports = {
208
216
  dateTo: { type: 'string', description: 'End date YYYY-MM-DD' },
209
217
  entities: { type: 'array', items: { type: 'string' }, description: 'Entity names to match' },
210
218
  entityMode: { type: 'string', enum: ['any', 'all'], description: '"any" (default, boost) or "all" (only sessions with every entity)' },
219
+ mode: { type: 'string', enum: ['fts', 'hybrid', 'vector'], description: 'Recall mode: "fts" (keyword only), "hybrid" (default), "vector" (vector only)' },
211
220
  },
212
221
  required: ['query'],
213
222
  },
@@ -225,6 +234,7 @@ module.exports = {
225
234
  recallOpts.entities = params.entities;
226
235
  recallOpts.entityMode = params.entityMode || 'any';
227
236
  }
237
+ if (params.mode) recallOpts.mode = params.mode;
228
238
 
229
239
  const results = await aquifer.recall(params.query, recallOpts);
230
240
  const text = formatRecallResults(results);
@@ -253,14 +263,17 @@ module.exports = {
253
263
  sessionId: { type: 'string', description: 'Session ID to give feedback on' },
254
264
  verdict: { type: 'string', enum: ['helpful', 'unhelpful'], description: 'Was the recalled session useful?' },
255
265
  note: { type: 'string', description: 'Optional reason' },
266
+ agentId: { type: 'string', description: 'Agent ID the session was stored under (e.g. "main"). Defaults to context agent or "agent" if omitted.' },
256
267
  },
257
268
  required: ['sessionId', 'verdict'],
258
269
  },
259
270
  async execute(_toolCallId, params) {
260
271
  try {
272
+ const resolvedAgentId = params.agentId || ctx?.agentId || undefined;
261
273
  const result = await aquifer.feedback(params.sessionId, {
262
274
  verdict: params.verdict,
263
275
  note: params.note || undefined,
276
+ agentId: resolvedAgentId,
264
277
  });
265
278
  return {
266
279
  content: [{ type: 'text', text: `Feedback: ${result.verdict} (trust ${result.trustBefore.toFixed(2)} → ${result.trustAfter.toFixed(2)})` }],
@@ -33,10 +33,10 @@ const DEFAULTS = {
33
33
  rank: { rrf: 0.65, timeDecay: 0.25, access: 0.10, entityBoost: 0.18 },
34
34
  rerank: {
35
35
  enabled: false,
36
- provider: null, // 'tei' | 'jina' | 'custom'
36
+ provider: null, // 'tei' | 'jina' | 'openrouter' | 'custom'
37
37
  baseUrl: null, // TEI base URL
38
- apiKey: null, // Jina API key
39
- model: null, // Jina model override
38
+ apiKey: null, // Jina / OpenRouter API key
39
+ model: null, // model override (Jina / OpenRouter)
40
40
  topK: 20,
41
41
  maxChars: 1600,
42
42
  timeoutMs: 2000,
@@ -71,12 +71,18 @@ function createAquiferFromConfig(overrides) {
71
71
  if (rc.model) rerankConfig.jinaModel = rc.model;
72
72
  rerankConfig.timeout = rc.timeoutMs || 2000;
73
73
  rerankConfig.maxRetries = rc.maxRetries ?? 1;
74
+ } else if (rc.provider === 'openrouter') {
75
+ rerankConfig.openrouterApiKey = rc.apiKey;
76
+ if (rc.model) rerankConfig.model = rc.model;
77
+ rerankConfig.timeout = rc.timeoutMs || 5000;
78
+ rerankConfig.maxRetries = rc.maxRetries ?? 1;
74
79
  }
75
80
  rerankOpts = rerankConfig;
76
81
  }
77
82
 
78
83
  const aquifer = createAquifer({
79
84
  db: pool,
85
+ ownsPool: true,
80
86
  schema: config.schema,
81
87
  tenantId: config.tenantId,
82
88
  embed: embedFn ? { fn: embedFn, dim: config.embed.dim || null } : null,
@@ -86,10 +92,6 @@ function createAquiferFromConfig(overrides) {
86
92
  rerank: rerankOpts,
87
93
  });
88
94
 
89
- // Attach pool for lifecycle management
90
- aquifer._pool = pool;
91
- aquifer._config = config;
92
-
93
95
  return aquifer;
94
96
  }
95
97
 
package/core/aquifer.js CHANGED
@@ -77,6 +77,7 @@ function createAquifer(config) {
77
77
  ownsPool = true;
78
78
  } else {
79
79
  pool = config.db;
80
+ ownsPool = !!config.ownsPool; // allow factory to claim ownership
80
81
  }
81
82
 
82
83
  // Embed config (lazy — only required for recall/enrich)
@@ -99,8 +100,18 @@ function createAquifer(config) {
99
100
  const entityPromptFn = config.entities && config.entities.prompt ? config.entities.prompt : null;
100
101
  const entityScope = (config.entities && config.entities.scope) || 'default';
101
102
 
102
- // FTS config (default: 'simple'; set to 'zhcfg' for Chinese tokenization)
103
- const ftsConfig = config.ftsConfig || 'simple';
103
+ // FTS config locked to 'simple'.
104
+ // The search_tsv trigger always uses to_tsvector('simple', ...), so query-time
105
+ // config must match. Warn and override if someone passes anything else.
106
+ const _rawFtsConfig = config.ftsConfig || 'simple';
107
+ if (_rawFtsConfig !== 'simple') {
108
+ console.warn(
109
+ `[aquifer] ftsConfig '${_rawFtsConfig}' is not currently supported. ` +
110
+ `The search_tsv index is built with 'simple'; only 'simple' is valid at query time. ` +
111
+ `Overriding to 'simple'.`
112
+ );
113
+ }
114
+ const ftsConfig = 'simple';
104
115
 
105
116
  // Rank weights
106
117
  const rankWeights = {
@@ -551,7 +562,16 @@ function createAquifer(config) {
551
562
 
552
563
  async recall(query, opts = {}) {
553
564
  if (!query) return [];
554
- requireEmbed('recall');
565
+
566
+ const VALID_MODES = ['fts', 'hybrid', 'vector'];
567
+ const mode = opts.mode !== undefined ? opts.mode : 'hybrid';
568
+ if (!VALID_MODES.includes(mode)) {
569
+ throw new Error(`Invalid recall mode: "${mode}". Must be one of: ${VALID_MODES.join(', ')}`);
570
+ }
571
+
572
+ if (mode === 'hybrid' || mode === 'vector') {
573
+ requireEmbed('recall');
574
+ }
555
575
 
556
576
  const {
557
577
  agentId,
@@ -582,10 +602,13 @@ function createAquifer(config) {
582
602
  const rerankTopK = rerankEnabled ? Math.max(limit, opts.rerankTopK || defaultRerankTopK) : limit;
583
603
  const fetchLimit = rerankTopK * 4;
584
604
 
585
- // 1. Embed query
586
- const queryVecResult = await embedFn([query]);
587
- const queryVec = queryVecResult[0];
588
- if (!queryVec || !queryVec.length) return []; // m3: guard empty array too
605
+ // 1. Embed query (only needed for hybrid/vector modes)
606
+ let queryVec = null;
607
+ if (mode === 'hybrid' || mode === 'vector') {
608
+ const queryVecResult = await embedFn([query]);
609
+ queryVec = queryVecResult[0];
610
+ if (!queryVec || !queryVec.length) return []; // m3: guard empty array too
611
+ }
589
612
 
590
613
  // 2. Entity intersection pre-filter (when entityMode === 'all')
591
614
  let candidateSessionIds = null; // null = no filter
@@ -661,17 +684,26 @@ function createAquifer(config) {
661
684
  } catch (_) { /* entity search failure non-fatal */ }
662
685
  }
663
686
 
664
- // 3. Run 3 search paths in parallel
687
+ // 3. Run search paths in parallel (conditioned on mode)
688
+ const runFts = mode === 'fts' || mode === 'hybrid';
689
+ const runVector = mode === 'vector' || mode === 'hybrid';
690
+
665
691
  const [ftsRows, embRows, turnResult] = await Promise.all([
666
- storage.searchSessions(pool, query, {
667
- schema, tenantId, agentIds: resolvedAgentIds, source, dateFrom, dateTo, limit: fetchLimit, ftsConfig,
668
- }).catch(() => []),
669
- embeddingSearchSummaries(queryVec, {
670
- agentIds: resolvedAgentIds, source, dateFrom, dateTo, limit: fetchLimit,
671
- }).catch(() => []),
672
- storage.searchTurnEmbeddings(pool, {
673
- schema, tenantId, queryVec, dateFrom, dateTo, agentIds: resolvedAgentIds, source, limit: fetchLimit,
674
- }).catch(() => ({ rows: [] })),
692
+ runFts
693
+ ? storage.searchSessions(pool, query, {
694
+ schema, tenantId, agentIds: resolvedAgentIds, source, dateFrom, dateTo, limit: fetchLimit, ftsConfig,
695
+ }).catch(() => [])
696
+ : Promise.resolve([]),
697
+ runVector
698
+ ? embeddingSearchSummaries(queryVec, {
699
+ agentIds: resolvedAgentIds, source, dateFrom, dateTo, limit: fetchLimit,
700
+ }).catch(() => [])
701
+ : Promise.resolve([]),
702
+ runVector
703
+ ? storage.searchTurnEmbeddings(pool, {
704
+ schema, tenantId, queryVec, dateFrom, dateTo, agentIds: resolvedAgentIds, source, limit: fetchLimit,
705
+ }).catch(() => ({ rows: [] }))
706
+ : Promise.resolve({ rows: [] }),
675
707
  ]);
676
708
 
677
709
  const turnRows = turnResult.rows || [];
@@ -836,6 +868,27 @@ function createAquifer(config) {
836
868
  return storage.getSession(pool, sessionId, agentId, opts, { schema, tenantId });
837
869
  },
838
870
 
871
+ async skip(sessionId, opts = {}) {
872
+ const agentId = opts.agentId || 'agent';
873
+ const reason = opts.reason || null;
874
+ // Atomic CAS: only skip if still pending (avoids race with concurrent enrich)
875
+ const result = await pool.query(
876
+ `UPDATE ${qi(schema)}.sessions
877
+ SET processing_status = 'skipped', processing_error = $1
878
+ WHERE session_id = $2 AND agent_id = $3 AND tenant_id = $4
879
+ AND processing_status = 'pending'
880
+ RETURNING id`,
881
+ [reason, sessionId, agentId, tenantId]
882
+ );
883
+ if (result.rows.length === 0) {
884
+ // Check if session exists at all
885
+ const existing = await storage.getSession(pool, sessionId, agentId, {}, { schema, tenantId });
886
+ if (!existing) throw new Error(`Session not found: ${sessionId} (agentId=${agentId})`);
887
+ return null; // exists but not pending — no-op
888
+ }
889
+ return { id: result.rows[0].id, sessionId, agentId, status: 'skipped' };
890
+ },
891
+
839
892
  async getSessionFull(sessionId) {
840
893
  // Try to find the session across agents by querying directly
841
894
  const result = await pool.query(
@@ -868,6 +921,93 @@ function createAquifer(config) {
868
921
  summary: sumResult.rows[0] || null,
869
922
  };
870
923
  },
924
+
925
+ // --- public config accessor ---
926
+
927
+ getConfig() {
928
+ return { schema, tenantId };
929
+ },
930
+
931
+ // --- admin query helpers ---
932
+
933
+ async getStats() {
934
+ const [sessions, summaries, turns, timeRange] = await Promise.all([
935
+ pool.query(
936
+ `SELECT processing_status, COUNT(*)::int as count
937
+ FROM ${qi(schema)}.sessions WHERE tenant_id = $1
938
+ GROUP BY processing_status`,
939
+ [tenantId]
940
+ ),
941
+ pool.query(
942
+ `SELECT COUNT(*)::int as count FROM ${qi(schema)}.session_summaries WHERE tenant_id = $1`,
943
+ [tenantId]
944
+ ),
945
+ pool.query(
946
+ `SELECT COUNT(*)::int as count FROM ${qi(schema)}.turn_embeddings WHERE tenant_id = $1`,
947
+ [tenantId]
948
+ ),
949
+ pool.query(
950
+ `SELECT MIN(started_at) as earliest, MAX(started_at) as latest
951
+ FROM ${qi(schema)}.sessions WHERE tenant_id = $1`,
952
+ [tenantId]
953
+ ),
954
+ ]);
955
+
956
+ let entityCount = 0;
957
+ try {
958
+ const entResult = await pool.query(
959
+ `SELECT COUNT(*)::int as count FROM ${qi(schema)}.entities WHERE tenant_id = $1`,
960
+ [tenantId]
961
+ );
962
+ entityCount = entResult.rows[0]?.count || 0;
963
+ } catch (_) { /* entities table may not exist */ }
964
+
965
+ return {
966
+ sessions: Object.fromEntries(sessions.rows.map(r => [r.processing_status, r.count])),
967
+ sessionTotal: sessions.rows.reduce((s, r) => s + r.count, 0),
968
+ summaries: summaries.rows[0]?.count || 0,
969
+ turnEmbeddings: turns.rows[0]?.count || 0,
970
+ entities: entityCount,
971
+ earliest: timeRange.rows[0]?.earliest || null,
972
+ latest: timeRange.rows[0]?.latest || null,
973
+ };
974
+ },
975
+
976
+ async getPendingSessions(opts = {}) {
977
+ const limit = opts.limit !== undefined ? opts.limit : 100;
978
+ const result = await pool.query(
979
+ `SELECT session_id, agent_id, processing_status
980
+ FROM ${qi(schema)}.sessions
981
+ WHERE tenant_id = $1
982
+ AND processing_status IN ('pending', 'failed')
983
+ ORDER BY started_at DESC
984
+ LIMIT $2`,
985
+ [tenantId, limit]
986
+ );
987
+ return result.rows;
988
+ },
989
+
990
+ async exportSessions(opts = {}) {
991
+ const { agentId, source, limit = 1000 } = opts;
992
+ const where = [`s.tenant_id = $1`];
993
+ const params = [tenantId];
994
+
995
+ if (agentId) { params.push(agentId); where.push(`s.agent_id = $${params.length}`); }
996
+ if (source) { params.push(source); where.push(`s.source = $${params.length}`); }
997
+ params.push(limit);
998
+
999
+ const result = await pool.query(
1000
+ `SELECT s.session_id, s.agent_id, s.source, s.started_at, s.msg_count,
1001
+ s.processing_status, ss.summary_text, ss.structured_summary
1002
+ FROM ${qi(schema)}.sessions s
1003
+ LEFT JOIN ${qi(schema)}.session_summaries ss ON ss.session_row_id = s.id
1004
+ WHERE ${where.join(' AND ')}
1005
+ ORDER BY s.started_at DESC
1006
+ LIMIT $${params.length}`,
1007
+ params
1008
+ );
1009
+ return result.rows;
1010
+ },
871
1011
  };
872
1012
 
873
1013
  return aquifer;
package/core/storage.js CHANGED
@@ -31,7 +31,7 @@ const TURN_NOISE_RE = [
31
31
  /^A new session was started via \/new/,
32
32
  ];
33
33
 
34
- const VALID_STATUSES = new Set(['pending', 'processing', 'succeeded', 'partial', 'failed']);
34
+ const VALID_STATUSES = new Set(['pending', 'processing', 'succeeded', 'partial', 'failed', 'skipped']);
35
35
 
36
36
  // ---------------------------------------------------------------------------
37
37
  // upsertSession
@@ -339,8 +339,17 @@ async function searchSessions(pool, query, {
339
339
  ftsConfig = 'simple',
340
340
  } = {}) {
341
341
  const clampedLimit = Math.max(1, Math.min(100, limit));
342
- // Sanitize ftsConfig to prevent SQL injection (must be a valid regconfig name)
343
- const safeFts = /^[a-zA-Z_][a-zA-Z0-9_]*$/.test(ftsConfig) ? ftsConfig : 'simple';
342
+ // FTS config is locked to 'simple' — the search_tsv trigger always uses
343
+ // to_tsvector('simple', ...) so query semantics must match. Warn callers
344
+ // that pass a different value rather than silently honouring it.
345
+ if (ftsConfig !== 'simple') {
346
+ console.warn(
347
+ `[aquifer/storage] searchSessions: ftsConfig '${ftsConfig}' ignored. ` +
348
+ `Only 'simple' is supported (index is built with simple tokenizer). ` +
349
+ `Using 'simple'.`
350
+ );
351
+ }
352
+ const safeFts = 'simple';
344
353
 
345
354
  // Normalize agentId/agentIds
346
355
  const agentIds = rawAgentIds && rawAgentIds.length > 0
package/index.js CHANGED
@@ -3,5 +3,6 @@
3
3
  const { createAquifer } = require('./core/aquifer');
4
4
  const { createEmbedder } = require('./pipeline/embed');
5
5
  const { createReranker } = require('./pipeline/rerank');
6
+ const { normalizeSession, detectClient } = require('./pipeline/normalize');
6
7
 
7
- module.exports = { createAquifer, createEmbedder, createReranker };
8
+ module.exports = { createAquifer, createEmbedder, createReranker, normalizeSession, detectClient };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@shadowforge0/aquifer-memory",
3
- "version": "0.7.0",
3
+ "version": "0.8.0",
4
4
  "description": "PG-native long-term memory for AI agents. Turn-level embedding, hybrid RRF ranking, optional knowledge graph. Includes CLI, MCP server, and OpenClaw plugin.",
5
5
  "main": "index.js",
6
6
  "files": [
@@ -35,8 +35,8 @@
35
35
  "pg": "^8.13.0"
36
36
  },
37
37
  "optionalDependencies": {
38
- "@modelcontextprotocol/sdk": "^1.12.0",
39
- "zod": "^3.24.0"
38
+ "@modelcontextprotocol/sdk": "^1.29.0",
39
+ "zod": "^3.25.76"
40
40
  },
41
41
  "engines": {
42
42
  "node": ">=18.0.0"
@@ -0,0 +1,90 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Claude Code adapter — for Claude Code CLI sessions.
5
+ * Entry types are 'user'/'assistant' (split format: one content type per entry).
6
+ * Text and tool_use are separate entries, enabling narration detection via look-ahead.
7
+ */
8
+
9
+ const { extractContent } = require('../extract');
10
+ const { parseTimestamp } = require('../timestamp');
11
+ const { MAX_NARRATION_CHARS } = require('../constants');
12
+
13
+ module.exports = {
14
+ name: 'claude-code',
15
+
16
+ detect(entry) {
17
+ // Only count entry types that participate in normalize
18
+ return entry.type === 'user' || entry.type === 'assistant';
19
+ },
20
+
21
+ toIntermediate(entry, ctx) {
22
+ const { idx, rawEntries } = ctx;
23
+ const entryType = entry.type;
24
+
25
+ if (entryType !== 'user' && entryType !== 'assistant') {
26
+ return { idx, toolNames: [], adapterSkip: 'nonMessage' };
27
+ }
28
+
29
+ const role = entry.message?.role || entryType;
30
+
31
+ if (role === 'toolResult') {
32
+ return { idx, toolNames: [], adapterSkip: 'toolResult' };
33
+ }
34
+ if (role !== 'user' && role !== 'assistant') {
35
+ return { idx, role: null, toolNames: [], adapterSkip: 'noRole' };
36
+ }
37
+ if (entry.isMeta) {
38
+ return { idx, toolNames: [], adapterSkip: 'meta' };
39
+ }
40
+
41
+ const { text, commandName, toolNames } = extractContent(entry.message);
42
+
43
+ // CLI internal command output tags
44
+ if (text.includes('<local-command-caveat>') || text.includes('<local-command-stdout>') || text.includes('<local-command-stderr>')) {
45
+ return { idx, toolNames, adapterSkip: 'caveat' };
46
+ }
47
+
48
+ const isInterrupt = text.startsWith('[Request interrupted by user');
49
+
50
+ // Tool-use-only assistant entry (no visible text, only tool calls)
51
+ if (!text && toolNames.length > 0 && role === 'assistant') {
52
+ return { idx, toolNames, adapterSkip: 'toolOnly' };
53
+ }
54
+
55
+ // Narration detection: short text entry immediately followed by a tool_use entry.
56
+ // Claude Code splits text and tool_use into separate JSONL entries.
57
+ // A short text before a tool call is narration ("Now reading X...", "Let me check...").
58
+ if (role === 'assistant' && text && text.length < MAX_NARRATION_CHARS) {
59
+ let nextIsTool = false;
60
+ for (let j = idx + 1; j < rawEntries.length && j < idx + 3; j++) {
61
+ const ne = rawEntries[j];
62
+ if (ne.type === 'assistant') {
63
+ const nc = ne.message?.content;
64
+ if (Array.isArray(nc) && nc.some(x => x.type === 'tool_use')) nextIsTool = true;
65
+ break;
66
+ }
67
+ }
68
+ if (nextIsTool) {
69
+ return { idx, toolNames, adapterSkip: 'narration' };
70
+ }
71
+ }
72
+
73
+ return {
74
+ idx, role, text,
75
+ timestamp: parseTimestamp(entry),
76
+ toolNames, commandName, isInterrupt,
77
+ adapterSkip: null,
78
+ };
79
+ },
80
+
81
+ routinePatterns: [
82
+ /^<task-notification>/,
83
+ ],
84
+
85
+ skipCommands: [
86
+ '/model', '/cost', '/memory', '/permissions', '/diff', '/review',
87
+ '/doctor', '/login', '/logout', '/mcp', '/context', '/fast',
88
+ '/think', '/vim', '/exit',
89
+ ],
90
+ };
@@ -0,0 +1,67 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Gateway adapter — for AI gateway servers that produce type='message' entries.
5
+ * Content blocks combine text + thinking + toolCall in a single entry.
6
+ * Supports channel metadata stripping (Discord, Telegram, etc.).
7
+ */
8
+
9
+ const { extractContent } = require('../extract');
10
+ const { parseTimestamp } = require('../timestamp');
11
+
12
// Optional leading "Conversation info" and "Sender" metadata sections (each
// ending with a closing ``` fence and a newline) that gateway routing layers
// put in front of the actual user text.
const METADATA_PREFIX_RE = /^(?:Conversation info \(untrusted metadata\):[\s\S]*?```\s*\n\s*)?(?:Sender \(untrusted metadata\):[\s\S]*?```\s*\n\s*)?/;

/**
 * Remove the channel metadata prefix from a message and trim the remainder.
 * When nothing is left after stripping, the original text is returned untouched.
 * @param {string} text - Message text, possibly starting with metadata sections
 * @returns {string} Trimmed text without the prefix, or `text` itself if the result would be empty
 */
function stripChannelMetadata(text) {
  const withoutPrefix = text.replace(METADATA_PREFIX_RE, '');
  const cleaned = withoutPrefix.trim();
  if (cleaned) {
    return cleaned;
  }
  return text;
}
19
+
20
+ module.exports = {
21
+ name: 'gateway',
22
+
23
+ detect(entry) {
24
+ return entry.type === 'message';
25
+ },
26
+
27
+ toIntermediate(entry, ctx) {
28
+ const { idx } = ctx;
29
+
30
+ if (entry.type !== 'message') {
31
+ return { idx, toolNames: [], adapterSkip: 'nonMessage' };
32
+ }
33
+
34
+ const msg = entry.message;
35
+ const role = msg?.role;
36
+
37
+ if (role === 'toolResult') {
38
+ return { idx, toolNames: [], adapterSkip: 'toolResult' };
39
+ }
40
+ if (role !== 'user' && role !== 'assistant') {
41
+ return { idx, role: null, toolNames: [], adapterSkip: 'noRole' };
42
+ }
43
+
44
+ const { text, commandName, toolNames } = extractContent(msg);
45
+
46
+ let finalText = text;
47
+ const isInterrupt = text.startsWith('[Request interrupted by user');
48
+ if (role === 'user' && finalText && !isInterrupt) {
49
+ finalText = stripChannelMetadata(finalText);
50
+ }
51
+
52
+ return {
53
+ idx, role, text: finalText,
54
+ timestamp: parseTimestamp(entry),
55
+ toolNames, commandName, isInterrupt,
56
+ adapterSkip: null,
57
+ };
58
+ },
59
+
60
+ routinePatterns: [
61
+ /^HEARTBEAT_OK$/,
62
+ /^THINK_OK$/,
63
+ /^\[Queued messages while agent was busy\]/,
64
+ ],
65
+
66
+ skipCommands: [],
67
+ };
@@ -0,0 +1,12 @@
1
+ 'use strict';
2
+
3
// Slash commands with no conversational value; messages carrying them are skipped entirely.
const SKIP_COMMANDS = new Set([
  '/clear',
  '/compact',
  '/help',
  '/status',
  '/config',
]);

// Slash commands that mark a session boundary; they are kept as boundary markers.
const RESET_COMMANDS = new Set([
  '/new',
  '/reset',
]);

// Length limit (string `.length`) beyond which a message text is truncated.
const MAX_MSG_CHARS = 8000;

// Assistant texts shorter than this are candidates for narration detection.
const MAX_NARRATION_CHARS = 200;
11
+
12
+ module.exports = { SKIP_COMMANDS, RESET_COMMANDS, MAX_MSG_CHARS, MAX_NARRATION_CHARS };
@@ -0,0 +1,52 @@
1
+ 'use strict';
2
+
3
+ const gatewayAdapter = require('./adapters/gateway');
4
+ const claudeCodeAdapter = require('./adapters/claude-code');
5
+
6
+ const ADAPTERS = [gatewayAdapter, claudeCodeAdapter];
7
+
8
/**
 * Auto-detect the client type from raw session entries.
 *
 * Samples the first 5 entries, asks every registered adapter how many of them
 * it recognises, and returns the name of the adapter with the most matches.
 * Entries that are not objects (null, undefined, primitives) never match any
 * adapter and are ignored instead of crashing detection.
 *
 * @param {any[]} rawEntries
 * @returns {string} Client name ('gateway' | 'claude-code')
 * @throws {Error} If entries are empty, no adapter matches, or detection is ambiguous
 */
function detectClient(rawEntries) {
  if (!rawEntries || rawEntries.length === 0) {
    throw new Error('Cannot detect client: empty entries');
  }

  // slice() already clamps to the array length, so no explicit Math.min is needed.
  const sample = rawEntries.slice(0, 5);

  // Adapters read `entry.type` without guarding, so only real objects are passed to detect().
  const isEntryObject = (entry) => entry !== null && typeof entry === 'object';

  const scores = ADAPTERS
    .map((adapter) => ({
      name: adapter.name,
      count: sample.filter((entry) => isEntryObject(entry) && adapter.detect(entry)).length,
    }))
    .sort((a, b) => b.count - a.count);

  if (scores[0].count === 0) {
    throw new Error('Cannot detect session client type. Pass opts.client explicitly.');
  }
  // A tie between the two best adapters cannot be resolved automatically.
  if (scores.length > 1 && scores[0].count === scores[1].count) {
    throw new Error(`Ambiguous client detection (${scores[0].name}=${scores[0].count}, ${scores[1].name}=${scores[1].count}). Pass opts.client explicitly.`);
  }

  return scores[0].name;
}
38
+
39
/**
 * Look up a registered adapter by its client name.
 * @param {string} clientType - Adapter name to look for
 * @returns {object} The first registered adapter whose `name` equals `clientType`
 * @throws {Error} If no registered adapter has that name; the message lists the known names
 */
function getAdapter(clientType) {
  const found = ADAPTERS.find((candidate) => candidate.name === clientType);
  if (found !== undefined) {
    return found;
  }
  const knownNames = ADAPTERS.map((a) => a.name).join(', ');
  throw new Error(`Unknown client type: "${clientType}". Known: ${knownNames}`);
}
51
+
52
+ module.exports = { detectClient, getAdapter, ADAPTERS };
@@ -0,0 +1,49 @@
1
+ 'use strict';
2
+
3
+ // Content extraction utilities shared across adapters
4
+
5
/**
 * Find the first `<command-name>/xyz</command-name>` tag in a string and
 * return the slash command it contains.
 * @param {*} content - Candidate text; anything that is not a string yields null
 * @returns {string|null} The command including its leading slash, or null if no tag is found
 */
function extractCommandName(content) {
  if (typeof content !== 'string') {
    return null;
  }
  const found = /<command-name>(\/\w+)<\/command-name>/.exec(content);
  if (found === null) {
    return null;
  }
  return found[1];
}
11
+
12
/**
 * Extract text, command name, and tool names from a message object.
 *
 * Handles both string content and content block arrays:
 * - string content: returned trimmed, with the command name parsed from it;
 * - array content: the `text` of every 'text' block is joined with newlines and
 *   trimmed, the command name comes from the last text block that contains one,
 *   and the `name` of every 'tool_use' / 'toolCall' block is collected.
 * Array items that are not objects (null, undefined, primitives) are skipped.
 * Any other content shape yields empty text.
 *
 * @param {object} msg - Message object with .content field
 * @returns {{ text: string, commandName: string|null, toolNames: string[] }}
 */
function extractContent(msg) {
  if (!msg) return { text: '', commandName: null, toolNames: [] };

  const content = msg.content;
  const toolNames = [];

  if (typeof content === 'string') {
    return { text: content.trim(), commandName: extractCommandName(content), toolNames };
  }

  if (!Array.isArray(content)) {
    return { text: '', commandName: null, toolNames };
  }

  let commandName = null;
  const texts = [];
  for (const item of content) {
    // A malformed block (null or a bare primitive) carries nothing to extract;
    // skip it rather than throwing on `item.type`.
    if (item === null || typeof item !== 'object') continue;

    if (item.type === 'text' && item.text) {
      const cmd = extractCommandName(item.text);
      if (cmd) commandName = cmd;
      texts.push(item.text);
    }
    // tool_use: Claude Code / Anthropic API format
    // toolCall: gateway / OpenAI-style format
    if ((item.type === 'tool_use' || item.type === 'toolCall') && item.name) {
      toolNames.push(item.name);
    }
  }

  return { text: texts.join('\n').trim(), commandName, toolNames };
}
48
+
49
+ module.exports = { extractContent, extractCommandName };
@@ -0,0 +1,129 @@
1
+ 'use strict';
2
+
3
+ const { SKIP_COMMANDS, RESET_COMMANDS, MAX_MSG_CHARS } = require('./constants');
4
+ const { detectClient, getAdapter } = require('./detect');
5
+
6
/**
 * Create a zeroed skip-statistics record: `total` plus one counter per skip reason.
 * @returns {object}
 */
function createSkipStats() {
  return {
    total: 0, nonMessage: 0, noRole: 0, meta: 0, caveat: 0,
    empty: 0, toolOnly: 0, narration: 0, toolResult: 0, routine: 0, command: 0,
  };
}

/**
 * Cap `text` at `maxChars` UTF-16 code units and append the '[truncated]' marker.
 * Text within the limit is returned unchanged.
 * @param {string} text
 * @param {number} maxChars
 * @returns {string}
 */
function truncateMessageText(text, maxChars) {
  if (text.length <= maxChars) return text;
  let end = maxChars;
  // If the cut would fall right after a high surrogate, drop it as well so the
  // stored text never ends in half of a surrogate pair.
  const lastKept = text.charCodeAt(end - 1);
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) end -= 1;
  return text.slice(0, end) + '\n[truncated]';
}

/**
 * Derive boundaries from normalized messages: one 'command' boundary per reset
 * command and one 'idle_gap' boundary wherever two consecutive timestamped
 * messages are more than `idleGapMs` apart.
 * @param {object[]} messages - Normalized messages in order
 * @param {number} idleGapMs
 * @returns {object[]}
 */
function detectBoundaries(messages, idleGapMs) {
  const boundaries = [];
  for (let i = 0; i < messages.length; i++) {
    const cur = messages[i];
    const prev = i > 0 ? messages[i - 1] : null;

    if (cur.isResetCommand) {
      boundaries.push({ type: 'command', at_index: i, reason: cur.commandName });
    }

    if (prev?.timestamp && cur.timestamp) {
      const gapMs = new Date(cur.timestamp).getTime() - new Date(prev.timestamp).getTime();
      if (gapMs > idleGapMs) {
        boundaries.push({ type: 'idle_gap', at_index: i, gap_minutes: Math.round(gapMs / 60000) });
      }
    }
  }
  return boundaries;
}

/**
 * Normalize raw session entries into effective messages.
 *
 * Accepts raw JSONL entries from any supported client (gateway, Claude Code, etc.)
 * and produces a clean, uniform array of conversational messages suitable for
 * summarization, embedding, and recall.
 *
 * Each entry goes through the selected adapter's `toIntermediate`, then through
 * the shared filters (role, empty text, routine patterns, skip commands). Tool
 * names are collected even from skipped entries. Reset commands are kept with
 * empty text so they can serve as boundary markers.
 *
 * @param {any[]} rawEntries - Raw JSONL entries from a session file
 * @param {object} [opts]
 * @param {string} [opts.client] - Client type: 'gateway' | 'claude-code'. Auto-detected if omitted.
 * @param {number} [opts.idleGapMs] - Idle gap threshold for boundary detection
 *   (default: 2 hours; falsy values such as 0 also fall back to the default)
 * @returns {{ normalized: object[], skipStats: object, boundaries: object[], toolsUsed: string[] }}
 * @throws {Error} If an adapter reports an `adapterSkip` reason that is not a known skip counter
 */
function normalizeSession(rawEntries, opts = {}) {
  if (!rawEntries || rawEntries.length === 0) {
    return { normalized: [], skipStats: createSkipStats(), boundaries: [], toolsUsed: [] };
  }

  // An explicit null is treated like "no options".
  const options = opts ?? {};
  const idleGapMs = options.idleGapMs || 2 * 60 * 60 * 1000;

  // 1. Select adapter
  const clientType = options.client || detectClient(rawEntries);
  const adapter = getAdapter(clientType);

  // 2. Merge adapter-specific constants with shared constants
  const allSkipCommands = new Set([...SKIP_COMMANDS, ...(adapter.skipCommands || [])]);
  const allRoutinePatterns = [...(adapter.routinePatterns || [])];

  // 3. Main loop: adapter.toIntermediate → shared filter → collect
  const normalized = [];
  const skipStats = createSkipStats();
  const toolsUsed = new Set();

  for (let idx = 0; idx < rawEntries.length; idx++) {
    skipStats.total++;
    const parsed = adapter.toIntermediate(rawEntries[idx], { idx, rawEntries });

    // Collect tool names even from skipped entries
    if (parsed.toolNames?.length) {
      for (const toolName of parsed.toolNames) toolsUsed.add(toolName);
    }

    // Adapter-determined skip. Only own counters other than `total` are valid
    // reasons; inherited keys such as 'toString' must not pass the check.
    const reason = parsed.adapterSkip;
    if (reason) {
      const isKnownReason = reason !== 'total'
        && Object.prototype.hasOwnProperty.call(skipStats, reason);
      if (!isKnownReason) {
        throw new Error(`Unknown adapterSkip reason: "${reason}" from ${clientType} adapter`);
      }
      skipStats[reason]++;
      continue;
    }

    // Shared: invalid role
    if (parsed.role !== 'user' && parsed.role !== 'assistant') {
      skipStats.noRole++;
      continue;
    }

    // Shared: empty text (but keep interrupts)
    if (!parsed.text && !parsed.isInterrupt) {
      skipStats.empty++;
      continue;
    }

    // Shared: routine patterns
    if (!parsed.isInterrupt && parsed.text && allRoutinePatterns.some((re) => re.test(parsed.text.trim()))) {
      skipStats.routine++;
      continue;
    }

    // Shared: skip commands
    if (parsed.commandName && allSkipCommands.has(parsed.commandName)) {
      skipStats.command++;
      continue;
    }

    // Shared: truncate + reset command handling
    const isResetCommand = !!(parsed.commandName && RESET_COMMANDS.has(parsed.commandName));
    const finalText = truncateMessageText(isResetCommand ? '' : (parsed.text || ''), MAX_MSG_CHARS);

    const msg = {
      idx: parsed.idx,
      role: parsed.role,
      timestamp: parsed.timestamp,
      text: finalText,
      commandName: parsed.commandName || null,
      isResetCommand,
    };
    if (parsed.isInterrupt) msg.isInterrupt = true;

    normalized.push(msg);
  }

  // 4. Boundary detection
  const boundaries = detectBoundaries(normalized, idleGapMs);

  return { normalized, skipStats, boundaries, toolsUsed: [...toolsUsed] };
}
128
+
129
+ module.exports = { normalizeSession, detectClient };
@@ -0,0 +1,33 @@
1
+ 'use strict';
2
+
3
/**
 * Parse the timestamp of a raw session entry into an ISO 8601 string.
 *
 * Candidates are tried in this order:
 *   1. `entry.timestamp` when it is a string (outer timestamp);
 *   2. `entry.message.timestamp` when it is a number (epoch milliseconds);
 *   3. `entry.message.timestamp` when it is a string.
 * A candidate that does not produce a valid Date (unparseable text, NaN,
 * infinite or out-of-range epoch value) is treated as absent, so malformed
 * timestamps yield null instead of a RangeError from `toISOString()`.
 *
 * @param {object} entry - Raw session entry
 * @returns {string|null} ISO8601 string or null
 */
function parseTimestamp(entry) {
  // Convert a raw value to ISO form, or null when the resulting Date is invalid.
  const toIso = (value) => {
    const date = new Date(value);
    return Number.isNaN(date.getTime()) ? null : date.toISOString();
  };

  // Outer timestamp (ISO string) — common in CLI-based clients
  const outerTs = entry?.timestamp;
  if (typeof outerTs === 'string') {
    const iso = toIso(outerTs);
    if (iso !== null) return iso;
  }

  // Inner timestamp — epoch ms number (gateway/server-side clients) or ISO string
  const innerTs = entry?.message?.timestamp;
  if (typeof innerTs === 'number' || typeof innerTs === 'string') {
    return toIso(innerTs);
  }

  return null;
}
32
+
33
+ module.exports = { parseTimestamp };