@shadowforge0/aquifer-memory 0.7.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +167 -83
- package/consumers/cli.js +74 -78
- package/consumers/mcp.js +69 -5
- package/consumers/openclaw-plugin.js +18 -5
- package/consumers/shared/config.js +3 -3
- package/consumers/shared/factory.js +6 -4
- package/core/aquifer.js +157 -17
- package/core/storage.js +12 -3
- package/docs/setup.md +194 -0
- package/index.js +2 -1
- package/package.json +8 -8
- package/pipeline/normalize/adapters/claude-code.js +90 -0
- package/pipeline/normalize/adapters/gateway.js +67 -0
- package/pipeline/normalize/constants.js +12 -0
- package/pipeline/normalize/detect.js +52 -0
- package/pipeline/normalize/extract.js +49 -0
- package/pipeline/normalize/index.js +129 -0
- package/pipeline/normalize/timestamp.js +33 -0
- package/scripts/smoke.mjs +115 -0
package/docs/setup.md
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# Aquifer Setup Guide
|
|
2
|
+
|
|
3
|
+
This guide walks you through installing Aquifer and verifying a complete write → enrich → recall cycle. By the end, you will have a working MCP memory server that an agent host can connect to.
|
|
4
|
+
|
|
5
|
+
## Prerequisites
|
|
6
|
+
|
|
7
|
+
You need three things running before Aquifer can work:
|
|
8
|
+
|
|
9
|
+
1. **PostgreSQL 15+** with the **pgvector** extension installed
|
|
10
|
+
2. **Node.js 18+**
|
|
11
|
+
3. **An embedding endpoint** — Ollama (local), OpenAI, or any OpenAI-compatible API
|
|
12
|
+
|
|
13
|
+
## Step 1: Database
|
|
14
|
+
|
|
15
|
+
### Option A: Docker (recommended for local dev)
|
|
16
|
+
|
|
17
|
+
The repo includes a `docker-compose.yml` that starts PostgreSQL 16 with pgvector and Ollama with bge-m3 auto-pulled:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
cd /path/to/aquifer
|
|
21
|
+
docker compose up -d
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
This gives you a database at `postgresql://aquifer:aquifer@localhost:5432/aquifer` with pgvector ready, plus an Ollama server with bge-m3 for embeddings. First run takes a few minutes while the model downloads.
|
|
25
|
+
|
|
26
|
+
### Option B: Existing PostgreSQL
|
|
27
|
+
|
|
28
|
+
Make sure pgvector is installed. Connect as a superuser and run:
|
|
29
|
+
|
|
30
|
+
```sql
|
|
31
|
+
CREATE EXTENSION IF NOT EXISTS vector;
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
If your PostgreSQL was installed from a package manager, you may need to install the pgvector package separately. See [pgvector installation](https://github.com/pgvector/pgvector#installation).
|
|
35
|
+
|
|
36
|
+
## Step 2: Install Aquifer
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
npm install @shadowforge0/aquifer-memory
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
All dependencies, including the MCP SDK and zod, are installed automatically.
|
|
43
|
+
|
|
44
|
+
## Step 3: Configure
|
|
45
|
+
|
|
46
|
+
Aquifer reads configuration from three sources (in priority order):
|
|
47
|
+
|
|
48
|
+
1. Config file: `aquifer.config.json` in the working directory, or set `AQUIFER_CONFIG=/path/to/config.json`
|
|
49
|
+
2. Environment variables (see below)
|
|
50
|
+
3. Programmatic overrides via `createAquifer()`
|
|
51
|
+
|
|
52
|
+
### Minimum env vars for MCP recall
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
export DATABASE_URL="postgresql://aquifer:aquifer@localhost:5432/aquifer"
|
|
56
|
+
export AQUIFER_EMBED_BASE_URL="http://localhost:11434/v1"
|
|
57
|
+
export AQUIFER_EMBED_MODEL="bge-m3"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Optional but common
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# PG schema (default: aquifer) — useful for running multiple instances in one database
|
|
64
|
+
export AQUIFER_SCHEMA="aquifer"
|
|
65
|
+
|
|
66
|
+
# LLM for built-in summarization — without this, enrich requires a custom summaryFn
|
|
67
|
+
export AQUIFER_LLM_BASE_URL="http://localhost:11434/v1"
|
|
68
|
+
export AQUIFER_LLM_MODEL="llama3.1"
|
|
69
|
+
|
|
70
|
+
# Knowledge graph
|
|
71
|
+
export AQUIFER_ENTITIES_ENABLED="true"
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Copy `.env.example` from the repo root for a full annotated list.
|
|
75
|
+
|
|
76
|
+
## Step 4: Verify everything works
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
npx aquifer quickstart
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
This single command runs migrations, commits a test session, embeds it, recalls it, and cleans up. If it prints `✓ Aquifer is working`, your setup is correct.
|
|
83
|
+
|
|
84
|
+
You can also run individual steps manually: `npx aquifer migrate`, `npx aquifer stats`, etc.
|
|
85
|
+
|
|
86
|
+
## Step 5: Start the MCP server
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
npx aquifer mcp
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
The server starts on stdio and waits for MCP client connections. There is no visible output on success — the server is ready when the process stays running without error.
|
|
93
|
+
|
|
94
|
+
### Verify with the library API (optional)
|
|
95
|
+
|
|
96
|
+
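If you want to test the library directly instead of the CLI, wrap the following in an `async` function — the snippet uses `await`, which a CommonJS script (one that uses `require`) does not allow at the top level: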
|
|
97
|
+
|
|
98
|
+
```javascript
|
|
99
|
+
const { createAquifer, createEmbedder } = require('@shadowforge0/aquifer-memory');
|
|
100
|
+
|
|
101
|
+
const embedder = createEmbedder({
|
|
102
|
+
provider: 'ollama',
|
|
103
|
+
ollamaUrl: 'http://localhost:11434',
|
|
104
|
+
model: 'bge-m3',
|
|
105
|
+
});
|
|
106
|
+
|
|
107
|
+
const aquifer = createAquifer({
|
|
108
|
+
db: process.env.DATABASE_URL,
|
|
109
|
+
schema: 'aquifer',
|
|
110
|
+
embed: { fn: (texts) => embedder.embedBatch(texts) },
|
|
111
|
+
});
|
|
112
|
+
|
|
113
|
+
await aquifer.migrate();
|
|
114
|
+
|
|
115
|
+
// Commit a test session
|
|
116
|
+
await aquifer.commit('test-001', [
|
|
117
|
+
{ role: 'user', content: 'We decided to use PostgreSQL for the memory store.' },
|
|
118
|
+
{ role: 'assistant', content: 'Good choice — PG gives us ACID, FTS, and pgvector in one place.' },
|
|
119
|
+
], { agentId: 'test' });
|
|
120
|
+
|
|
121
|
+
// Enrich (embed turns — summarization needs LLM config)
|
|
122
|
+
await aquifer.enrich('test-001', { agentId: 'test', skipSummary: true });
|
|
123
|
+
|
|
124
|
+
// Recall
|
|
125
|
+
const results = await aquifer.recall('PostgreSQL memory', { limit: 3 });
|
|
126
|
+
console.log('Results:', results.length); // Should be >= 1
|
|
127
|
+
|
|
128
|
+
await aquifer.close();
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Connecting a host
|
|
132
|
+
|
|
133
|
+
Once the MCP server is verified, connect your agent host:
|
|
134
|
+
|
|
135
|
+
### Claude Code
|
|
136
|
+
|
|
137
|
+
Add to `.mcp.json` in the project root (project-level) or to your user-level MCP config (`~/.claude.json`):
|
|
138
|
+
|
|
139
|
+
```json
|
|
140
|
+
{
|
|
141
|
+
"mcpServers": {
|
|
142
|
+
"aquifer": {
|
|
143
|
+
"type": "stdio",
|
|
144
|
+
"command": "node",
|
|
145
|
+
"args": ["/absolute/path/to/aquifer/consumers/mcp.js"],
|
|
146
|
+
"env": {
|
|
147
|
+
"DATABASE_URL": "postgresql://aquifer:aquifer@localhost:5432/aquifer",
|
|
148
|
+
"AQUIFER_EMBED_BASE_URL": "http://localhost:11434/v1",
|
|
149
|
+
"AQUIFER_EMBED_MODEL": "bge-m3"
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Tools appear as `mcp__aquifer__session_recall`, `mcp__aquifer__session_feedback`, `mcp__aquifer__memory_stats`, `mcp__aquifer__memory_pending`.
|
|
157
|
+
|
|
158
|
+
### OpenClaw
|
|
159
|
+
|
|
160
|
+
Add to `openclaw.json`:
|
|
161
|
+
|
|
162
|
+
```json
|
|
163
|
+
{
|
|
164
|
+
"mcp": {
|
|
165
|
+
"servers": {
|
|
166
|
+
"aquifer": {
|
|
167
|
+
"command": "node",
|
|
168
|
+
"args": ["/absolute/path/to/aquifer/consumers/mcp.js"],
|
|
169
|
+
"env": {
|
|
170
|
+
"DATABASE_URL": "postgresql://...",
|
|
171
|
+
"AQUIFER_EMBED_BASE_URL": "http://localhost:11434/v1",
|
|
172
|
+
"AQUIFER_EMBED_MODEL": "bge-m3"
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
Tools materialize as `aquifer__session_recall`, `aquifer__session_feedback`, `aquifer__memory_stats`, `aquifer__memory_pending`.
|
|
181
|
+
|
|
182
|
+
Do **not** use the OpenClaw plugin (`consumers/openclaw-plugin.js`) for tool delivery. The plugin is retained for session capture via `before_reset` only.
|
|
183
|
+
|
|
184
|
+
## Troubleshooting
|
|
185
|
+
|
|
186
|
+
**`error: type "vector" does not exist`** — pgvector is not installed. Use the `pgvector/pgvector` Docker image, or install the extension manually: `CREATE EXTENSION IF NOT EXISTS vector;` (requires superuser).
|
|
187
|
+
|
|
188
|
+
**`aquifer mcp requires @modelcontextprotocol/sdk and zod`** — These are regular dependencies and should be installed automatically. Run `npm install` again to ensure all deps are present.
|
|
189
|
+
|
|
190
|
+
**Recall returns empty results** — Sessions must be enriched before they are searchable. Run `npx aquifer stats` and check that summaries and/or turn embeddings exist. If not, run `npx aquifer backfill` to enrich pending sessions.
|
|
191
|
+
|
|
192
|
+
**`ECONNREFUSED` on embed calls** — Your embedding endpoint is not reachable. For Ollama: make sure it is running (`ollama serve`) and the model is pulled (`ollama pull bge-m3`).
|
|
193
|
+
|
|
194
|
+
**Enrich fails with "no LLM configured"** — The built-in summarizer needs `AQUIFER_LLM_BASE_URL` + `AQUIFER_LLM_MODEL`. Alternatively, pass `skipSummary: true` to enrich without summarization (turn embeddings still work), or provide your own `summaryFn`.
|
package/index.js
CHANGED
|
@@ -3,5 +3,6 @@
|
|
|
3
3
|
const { createAquifer } = require('./core/aquifer');
|
|
4
4
|
const { createEmbedder } = require('./pipeline/embed');
|
|
5
5
|
const { createReranker } = require('./pipeline/rerank');
|
|
6
|
+
const { normalizeSession, detectClient } = require('./pipeline/normalize');
|
|
6
7
|
|
|
7
|
-
module.exports = { createAquifer, createEmbedder, createReranker };
|
|
8
|
+
module.exports = { createAquifer, createEmbedder, createReranker, normalizeSession, detectClient };
|
package/package.json
CHANGED
|
@@ -1,14 +1,16 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@shadowforge0/aquifer-memory",
|
|
3
|
-
"version": "0.
|
|
4
|
-
"description": "PG-native long-term memory for AI agents. Turn-level embedding, hybrid RRF ranking, optional knowledge graph.
|
|
3
|
+
"version": "0.9.0",
|
|
4
|
+
"description": "PG-native long-term memory for AI agents. Turn-level embedding, hybrid RRF ranking, optional knowledge graph. MCP server, CLI, and library API.",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"files": [
|
|
7
7
|
"index.js",
|
|
8
8
|
"core/",
|
|
9
9
|
"pipeline/",
|
|
10
10
|
"schema/",
|
|
11
|
-
"consumers/"
|
|
11
|
+
"consumers/",
|
|
12
|
+
"docs/",
|
|
13
|
+
"scripts/"
|
|
12
14
|
],
|
|
13
15
|
"bin": {
|
|
14
16
|
"aquifer": "./consumers/cli.js"
|
|
@@ -32,11 +34,9 @@
|
|
|
32
34
|
},
|
|
33
35
|
"author": "shadowforge0",
|
|
34
36
|
"dependencies": {
|
|
35
|
-
"
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
"@modelcontextprotocol/sdk": "^1.12.0",
|
|
39
|
-
"zod": "^3.24.0"
|
|
37
|
+
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
38
|
+
"pg": "^8.13.0",
|
|
39
|
+
"zod": "^3.25.76"
|
|
40
40
|
},
|
|
41
41
|
"engines": {
|
|
42
42
|
"node": ">=18.0.0"
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Claude Code adapter — for Claude Code CLI sessions.
|
|
5
|
+
* Entry types are 'user'/'assistant' (split format: one content type per entry).
|
|
6
|
+
* Text and tool_use are separate entries, enabling narration detection via look-ahead.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
const { extractContent } = require('../extract');
|
|
10
|
+
const { parseTimestamp } = require('../timestamp');
|
|
11
|
+
const { MAX_NARRATION_CHARS } = require('../constants');
|
|
12
|
+
|
|
13
|
+
module.exports = {
|
|
14
|
+
name: 'claude-code',
|
|
15
|
+
|
|
16
|
+
detect(entry) {
|
|
17
|
+
// Only count entry types that participate in normalize
|
|
18
|
+
return entry.type === 'user' || entry.type === 'assistant';
|
|
19
|
+
},
|
|
20
|
+
|
|
21
|
+
toIntermediate(entry, ctx) {
|
|
22
|
+
const { idx, rawEntries } = ctx;
|
|
23
|
+
const entryType = entry.type;
|
|
24
|
+
|
|
25
|
+
if (entryType !== 'user' && entryType !== 'assistant') {
|
|
26
|
+
return { idx, toolNames: [], adapterSkip: 'nonMessage' };
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
const role = entry.message?.role || entryType;
|
|
30
|
+
|
|
31
|
+
if (role === 'toolResult') {
|
|
32
|
+
return { idx, toolNames: [], adapterSkip: 'toolResult' };
|
|
33
|
+
}
|
|
34
|
+
if (role !== 'user' && role !== 'assistant') {
|
|
35
|
+
return { idx, role: null, toolNames: [], adapterSkip: 'noRole' };
|
|
36
|
+
}
|
|
37
|
+
if (entry.isMeta) {
|
|
38
|
+
return { idx, toolNames: [], adapterSkip: 'meta' };
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
const { text, commandName, toolNames } = extractContent(entry.message);
|
|
42
|
+
|
|
43
|
+
// CLI internal command output tags
|
|
44
|
+
if (text.includes('<local-command-caveat>') || text.includes('<local-command-stdout>') || text.includes('<local-command-stderr>')) {
|
|
45
|
+
return { idx, toolNames, adapterSkip: 'caveat' };
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
const isInterrupt = text.startsWith('[Request interrupted by user');
|
|
49
|
+
|
|
50
|
+
// Tool-use-only assistant entry (no visible text, only tool calls)
|
|
51
|
+
if (!text && toolNames.length > 0 && role === 'assistant') {
|
|
52
|
+
return { idx, toolNames, adapterSkip: 'toolOnly' };
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
// Narration detection: short text entry immediately followed by a tool_use entry.
|
|
56
|
+
// Claude Code splits text and tool_use into separate JSONL entries.
|
|
57
|
+
// A short text before a tool call is narration ("Now reading X...", "Let me check...").
|
|
58
|
+
if (role === 'assistant' && text && text.length < MAX_NARRATION_CHARS) {
|
|
59
|
+
let nextIsTool = false;
|
|
60
|
+
for (let j = idx + 1; j < rawEntries.length && j < idx + 3; j++) {
|
|
61
|
+
const ne = rawEntries[j];
|
|
62
|
+
if (ne.type === 'assistant') {
|
|
63
|
+
const nc = ne.message?.content;
|
|
64
|
+
if (Array.isArray(nc) && nc.some(x => x.type === 'tool_use')) nextIsTool = true;
|
|
65
|
+
break;
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
if (nextIsTool) {
|
|
69
|
+
return { idx, toolNames, adapterSkip: 'narration' };
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
return {
|
|
74
|
+
idx, role, text,
|
|
75
|
+
timestamp: parseTimestamp(entry),
|
|
76
|
+
toolNames, commandName, isInterrupt,
|
|
77
|
+
adapterSkip: null,
|
|
78
|
+
};
|
|
79
|
+
},
|
|
80
|
+
|
|
81
|
+
routinePatterns: [
|
|
82
|
+
/^<task-notification>/,
|
|
83
|
+
],
|
|
84
|
+
|
|
85
|
+
skipCommands: [
|
|
86
|
+
'/model', '/cost', '/memory', '/permissions', '/diff', '/review',
|
|
87
|
+
'/doctor', '/login', '/logout', '/mcp', '/context', '/fast',
|
|
88
|
+
'/think', '/vim', '/exit',
|
|
89
|
+
],
|
|
90
|
+
};
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Gateway adapter — for AI gateway servers that produce type='message' entries.
|
|
5
|
+
* Content blocks combine text + thinking + toolCall in a single entry.
|
|
6
|
+
* Supports channel metadata stripping (Discord, Telegram, etc.).
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
const { extractContent } = require('../extract');
|
|
10
|
+
const { parseTimestamp } = require('../timestamp');
|
|
11
|
+
|
|
12
|
+
// Leading "Conversation info" / "Sender" metadata sections (each optional, each
// ending at a closing code fence) that gateway routing layers prepend to user text.
const METADATA_PREFIX_RE = /^(?:Conversation info \(untrusted metadata\):[\s\S]*?```\s*\n\s*)?(?:Sender \(untrusted metadata\):[\s\S]*?```\s*\n\s*)?/;

/**
 * Remove the channel metadata prefix from a message and trim the remainder.
 * When nothing is left after stripping, the original text is returned unchanged.
 * @param {string} text - Raw message text
 * @returns {string} Message text without the metadata prefix
 */
function stripChannelMetadata(text) {
  const withoutPrefix = text.replace(METADATA_PREFIX_RE, '').trim();
  if (withoutPrefix) {
    return withoutPrefix;
  }
  return text;
}
|
|
19
|
+
|
|
20
|
+
module.exports = {
|
|
21
|
+
name: 'gateway',
|
|
22
|
+
|
|
23
|
+
detect(entry) {
|
|
24
|
+
return entry.type === 'message';
|
|
25
|
+
},
|
|
26
|
+
|
|
27
|
+
toIntermediate(entry, ctx) {
|
|
28
|
+
const { idx } = ctx;
|
|
29
|
+
|
|
30
|
+
if (entry.type !== 'message') {
|
|
31
|
+
return { idx, toolNames: [], adapterSkip: 'nonMessage' };
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
const msg = entry.message;
|
|
35
|
+
const role = msg?.role;
|
|
36
|
+
|
|
37
|
+
if (role === 'toolResult') {
|
|
38
|
+
return { idx, toolNames: [], adapterSkip: 'toolResult' };
|
|
39
|
+
}
|
|
40
|
+
if (role !== 'user' && role !== 'assistant') {
|
|
41
|
+
return { idx, role: null, toolNames: [], adapterSkip: 'noRole' };
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
const { text, commandName, toolNames } = extractContent(msg);
|
|
45
|
+
|
|
46
|
+
let finalText = text;
|
|
47
|
+
const isInterrupt = text.startsWith('[Request interrupted by user');
|
|
48
|
+
if (role === 'user' && finalText && !isInterrupt) {
|
|
49
|
+
finalText = stripChannelMetadata(finalText);
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
return {
|
|
53
|
+
idx, role, text: finalText,
|
|
54
|
+
timestamp: parseTimestamp(entry),
|
|
55
|
+
toolNames, commandName, isInterrupt,
|
|
56
|
+
adapterSkip: null,
|
|
57
|
+
};
|
|
58
|
+
},
|
|
59
|
+
|
|
60
|
+
routinePatterns: [
|
|
61
|
+
/^HEARTBEAT_OK$/,
|
|
62
|
+
/^THINK_OK$/,
|
|
63
|
+
/^\[Queued messages while agent was busy\]/,
|
|
64
|
+
],
|
|
65
|
+
|
|
66
|
+
skipCommands: [],
|
|
67
|
+
};
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Slash commands with no conversational value: entries carrying one of
// these are skipped entirely.
const SKIP_COMMANDS = new Set([
  '/clear',
  '/compact',
  '/help',
  '/status',
  '/config',
]);

// Slash commands that mark a session boundary: entries carrying one of
// these are kept as boundary markers.
const RESET_COMMANDS = new Set([
  '/new',
  '/reset',
]);

// Upper bound on the characters kept from a single message.
const MAX_MSG_CHARS = 8_000;

// Assistant text shorter than this may be classified as narration.
const MAX_NARRATION_CHARS = 200;
|
|
11
|
+
|
|
12
|
+
module.exports = { SKIP_COMMANDS, RESET_COMMANDS, MAX_MSG_CHARS, MAX_NARRATION_CHARS };
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const gatewayAdapter = require('./adapters/gateway');
|
|
4
|
+
const claudeCodeAdapter = require('./adapters/claude-code');
|
|
5
|
+
|
|
6
|
+
const ADAPTERS = [gatewayAdapter, claudeCodeAdapter];
|
|
7
|
+
|
|
8
|
+
/**
 * Auto-detect the client type from raw session entries.
 * Samples the first 5 entries and picks the adapter with the most matches.
 * Entries in the sample that are not objects (for example a stray `null`
 * JSONL line) match no adapter and are ignored instead of crashing detection.
 * @param {any[]} rawEntries
 * @returns {string} Client name ('gateway' | 'claude-code')
 * @throws {Error} If entries are empty, no adapter matches, or detection is ambiguous
 */
function detectClient(rawEntries) {
  if (!rawEntries || rawEntries.length === 0) {
    throw new Error('Cannot detect client: empty entries');
  }

  // slice() already clamps to the array length; drop non-objects so
  // adapter.detect() never dereferences null or a primitive.
  const sample = rawEntries
    .slice(0, 5)
    .filter(e => e !== null && typeof e === 'object');

  // One score per adapter, best match first.
  const scores = ADAPTERS
    .map(adapter => ({
      name: adapter.name,
      count: sample.filter(e => adapter.detect(e)).length,
    }))
    .sort((a, b) => b.count - a.count);

  if (scores[0].count === 0) {
    throw new Error('Cannot detect session client type. Pass opts.client explicitly.');
  }
  // A tie between the two best adapters cannot be resolved automatically.
  if (scores.length > 1 && scores[0].count === scores[1].count) {
    throw new Error(`Ambiguous client detection (${scores[0].name}=${scores[0].count}, ${scores[1].name}=${scores[1].count}). Pass opts.client explicitly.`);
  }

  return scores[0].name;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Look up a registered adapter by its client name.
 * @param {string} clientType - Adapter name to find
 * @returns {object} The adapter whose `name` equals `clientType`
 * @throws {Error} If no registered adapter has that name
 */
function getAdapter(clientType) {
  const found = ADAPTERS.find(candidate => candidate.name === clientType);
  if (found !== undefined) {
    return found;
  }
  const known = ADAPTERS.map(a => a.name).join(', ');
  throw new Error(`Unknown client type: "${clientType}". Known: ${known}`);
}
|
|
51
|
+
|
|
52
|
+
module.exports = { detectClient, getAdapter, ADAPTERS };
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Content extraction utilities shared across adapters
|
|
4
|
+
|
|
5
|
+
/**
 * Extract the slash-command name from a `<command-name>…</command-name>` tag.
 * Accepts word characters plus `-` and `:` after the slash, so hyphenated
 * (`/add-dir`) and namespaced (`/plugin:cmd`) commands are recognised too.
 * @param {*} content - Text to search; non-string values yield null
 * @returns {string|null} Command name including the leading slash, or null
 */
function extractCommandName(content) {
  if (typeof content !== 'string') return null;
  const match = content.match(/<command-name>(\/[\w:-]+)<\/command-name>/);
  return match ? match[1] : null;
}
|
|
11
|
+
|
|
12
|
+
/**
 * Extract text, command name, and tool names from a message object.
 * Handles both string content and content block arrays; any other content
 * shape yields empty text. Array items that are not objects (null,
 * undefined, primitives) are ignored.
 * @param {object} msg - Message object with .content field
 * @returns {{ text: string, commandName: string|null, toolNames: string[] }}
 */
function extractContent(msg) {
  if (!msg) return { text: '', commandName: null, toolNames: [] };
  const content = msg.content;
  let commandName = null;
  const toolNames = [];

  if (typeof content === 'string') {
    commandName = extractCommandName(content);
    return { text: content.trim(), commandName, toolNames };
  }

  if (Array.isArray(content)) {
    const texts = [];
    for (const item of content) {
      // Guard against malformed blocks so one bad item cannot abort the whole message.
      if (item === null || typeof item !== 'object') continue;

      if (item.type === 'text' && item.text) {
        // The last text block that carries a command tag wins.
        const cmd = extractCommandName(item.text);
        if (cmd) commandName = cmd;
        texts.push(item.text);
      }
      // tool_use: Claude Code / Anthropic API format
      // toolCall: gateway / OpenAI-style format
      if ((item.type === 'tool_use' || item.type === 'toolCall') && item.name) {
        toolNames.push(item.name);
      }
    }
    return { text: texts.join('\n').trim(), commandName, toolNames };
  }

  return { text: '', commandName, toolNames };
}
|
|
48
|
+
|
|
49
|
+
module.exports = { extractContent, extractCommandName };
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { SKIP_COMMANDS, RESET_COMMANDS, MAX_MSG_CHARS } = require('./constants');
|
|
4
|
+
const { detectClient, getAdapter } = require('./detect');
|
|
5
|
+
|
|
6
|
+
/**
 * Build a fresh, zeroed skip-statistics record (one counter per skip reason
 * plus `total`).
 * @returns {object}
 */
function createSkipStats() {
  return {
    total: 0, nonMessage: 0, noRole: 0, meta: 0, caveat: 0,
    empty: 0, toolOnly: 0, narration: 0, toolResult: 0, routine: 0, command: 0,
  };
}

/**
 * Normalize raw session entries into effective messages.
 *
 * Accepts raw JSONL entries from any supported client (gateway, Claude Code, etc.)
 * and produces a clean, uniform array of conversational messages suitable for
 * summarization, embedding, and recall.
 *
 * Entries that are not objects are counted as `nonMessage` without being
 * handed to the adapter.
 *
 * @param {any[]} rawEntries - Raw JSONL entries from a session file
 * @param {object} [opts]
 * @param {string} [opts.client] - Client type: 'gateway' | 'claude-code'. Auto-detected if omitted.
 * @param {number} [opts.idleGapMs] - Idle gap threshold for boundary detection (default: 2 hours)
 * @returns {{ normalized: object[], skipStats: object, boundaries: object[], toolsUsed: string[] }}
 * @throws {Error} If the client cannot be detected or is unknown, or an adapter
 *   reports a skip reason that has no counter
 */
function normalizeSession(rawEntries, opts = {}) {
  if (!rawEntries || rawEntries.length === 0) {
    return {
      normalized: [],
      skipStats: createSkipStats(),
      boundaries: [],
      toolsUsed: [],
    };
  }

  // Default parameters do not cover an explicit null.
  const options = opts ?? {};
  const idleGapMs = options.idleGapMs || 2 * 60 * 60 * 1000;

  // 1. Select adapter
  const clientType = options.client || detectClient(rawEntries);
  const adapter = getAdapter(clientType);

  // 2. Merge adapter-specific constants with shared constants
  const allSkipCommands = new Set([...SKIP_COMMANDS, ...(adapter.skipCommands || [])]);
  const allRoutinePatterns = [...(adapter.routinePatterns || [])];

  // 3. Main loop: adapter.toIntermediate → shared filter → collect
  const normalized = [];
  const skipStats = createSkipStats();
  const toolsUsed = new Set();

  for (let idx = 0; idx < rawEntries.length; idx++) {
    skipStats.total++;

    const entry = rawEntries[idx];
    if (entry === null || typeof entry !== 'object') {
      skipStats.nonMessage++;
      continue;
    }

    const parsed = adapter.toIntermediate(entry, { idx, rawEntries });

    // Collect tool names even from skipped entries
    if (parsed.toolNames?.length) {
      for (const tn of parsed.toolNames) toolsUsed.add(tn);
    }

    // Adapter-determined skip
    const skipReason = parsed.adapterSkip;
    if (skipReason) {
      // Own-property check: `in` would also accept inherited keys such as
      // "toString". "total" is the entry counter, not a skip reason.
      if (skipReason === 'total' || !Object.hasOwn(skipStats, skipReason)) {
        throw new Error(`Unknown adapterSkip reason: "${skipReason}" from ${clientType} adapter`);
      }
      skipStats[skipReason]++;
      continue;
    }

    // Shared: invalid role
    if (parsed.role !== 'user' && parsed.role !== 'assistant') {
      skipStats.noRole++;
      continue;
    }

    // Shared: empty text (but keep interrupts)
    if (!parsed.text && !parsed.isInterrupt) {
      skipStats.empty++;
      continue;
    }

    // Shared: routine patterns
    if (!parsed.isInterrupt && parsed.text && allRoutinePatterns.some(re => re.test(parsed.text.trim()))) {
      skipStats.routine++;
      continue;
    }

    // Shared: skip commands
    if (parsed.commandName && allSkipCommands.has(parsed.commandName)) {
      skipStats.command++;
      continue;
    }

    // Shared: truncate + reset command handling.
    // A reset command is kept as an empty-text boundary marker.
    const isResetCommand = !!(parsed.commandName && RESET_COMMANDS.has(parsed.commandName));
    let finalText = isResetCommand ? '' : (parsed.text || '');
    if (finalText.length > MAX_MSG_CHARS) {
      finalText = finalText.slice(0, MAX_MSG_CHARS) + '\n[truncated]';
    }

    const msg = {
      idx: parsed.idx,
      role: parsed.role,
      timestamp: parsed.timestamp,
      text: finalText,
      commandName: parsed.commandName || null,
      isResetCommand,
    };
    if (parsed.isInterrupt) msg.isInterrupt = true;

    normalized.push(msg);
  }

  // 4. Boundary detection (at_index refers to positions in `normalized`)
  const boundaries = [];
  for (let i = 0; i < normalized.length; i++) {
    const cur = normalized[i];
    const prev = i > 0 ? normalized[i - 1] : null;

    if (cur.isResetCommand) {
      boundaries.push({ type: 'command', at_index: i, reason: cur.commandName });
    }

    if (prev?.timestamp && cur.timestamp) {
      const gapMs = new Date(cur.timestamp).getTime() - new Date(prev.timestamp).getTime();
      if (gapMs > idleGapMs) {
        boundaries.push({ type: 'idle_gap', at_index: i, gap_minutes: Math.round(gapMs / 60000) });
      }
    }
  }

  return { normalized, skipStats, boundaries, toolsUsed: [...toolsUsed] };
}
|
|
128
|
+
|
|
129
|
+
module.exports = { normalizeSession, detectClient };
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
 * Parse timestamp from a raw session entry.
 * Handles multiple formats: ISO string (outer `entry.timestamp`), epoch ms
 * number or ISO string (inner `entry.message.timestamp`). The outer string
 * wins when it parses to a valid date.
 * Unified across all adapters to ensure consistent boundary detection.
 *
 * Never throws: a missing entry, an unparseable string, or a numeric value
 * that is not a valid date (NaN, Infinity, out of range) yields null.
 *
 * @param {object} entry - Raw session entry
 * @returns {string|null} ISO8601 string or null
 */
function parseTimestamp(entry) {
  if (entry === null || typeof entry !== 'object') return null;

  // Outer timestamp (ISO string) — common in CLI-based clients
  const outerTs = entry.timestamp;
  if (typeof outerTs === 'string') {
    const d = new Date(outerTs);
    if (!Number.isNaN(d.getTime())) return d.toISOString();
  }

  // Inner timestamp — epoch ms (common in gateway/server-side clients) or ISO string.
  const innerTs = entry.message?.timestamp;
  if (typeof innerTs === 'number' || typeof innerTs === 'string') {
    const d = new Date(innerTs);
    // Validate before formatting: toISOString() throws RangeError on an invalid Date.
    if (!Number.isNaN(d.getTime())) return d.toISOString();
  }

  return null;
}
|
|
32
|
+
|
|
33
|
+
module.exports = { parseTimestamp };
|