memory-lancedb-pro 1.0.4 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/README.md +130 -0
- package/README_CN.md +111 -1
- package/openclaw.plugin.json +1 -1
- package/package.json +2 -2
- package/scripts/jsonl_distill.py +471 -0
- package/src/adaptive-retrieval.ts +27 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,15 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 1.0.6
|
|
4
|
+
|
|
5
|
+
- Fix: auto-recall injection now correctly skips cron prompts wrapped as `[cron:...] run ...` (reduces token usage for cron jobs).
|
|
6
|
+
- Fix: JSONL distill extractor filters more transcript/system noise (BOOT.md, HEARTBEAT, CLAUDE_CODE_DONE, queued blocks) to avoid polluting distillation batches.
|
|
7
|
+
|
|
8
|
+
## 1.0.5
|
|
9
|
+
|
|
10
|
+
- Add: optional JSONL session distillation workflow (incremental cursor + batch format) via `scripts/jsonl_distill.py`.
|
|
11
|
+
- Docs: document the JSONL distiller setup in README (EN) and README_CN (ZH).
|
|
12
|
+
|
|
3
13
|
## 1.0.4
|
|
4
14
|
|
|
5
15
|
- Fix: `embedding.dimensions` is now parsed robustly (number / numeric string / env-var string), so it properly overrides hardcoded model dims (fixes Ollama `nomic-embed-text` dimension mismatch).
|
package/README.md
CHANGED
|
@@ -374,6 +374,136 @@ Cross-encoder reranking supports multiple providers via `rerankProvider`:
|
|
|
374
374
|
|
|
375
375
|
---
|
|
376
376
|
|
|
377
|
+
## Optional: JSONL Session Distillation (Auto-memories from chat logs)
|
|
378
|
+
|
|
379
|
+
OpenClaw already persists **full session transcripts** as JSONL files:
|
|
380
|
+
|
|
381
|
+
- `~/.openclaw/agents/<agentId>/sessions/*.jsonl`
|
|
382
|
+
|
|
383
|
+
This plugin focuses on **high-quality long-term memory**. If you dump raw transcripts into LanceDB, retrieval quality quickly degrades.
|
|
384
|
+
|
|
385
|
+
Instead, you can run an **hourly distiller** that:
|
|
386
|
+
|
|
387
|
+
1) Incrementally reads only the **newly appended tail** of each session JSONL (byte-offset cursor)
|
|
388
|
+
2) Filters noise (tool output, injected `<relevant-memories>`, logs, boilerplate)
|
|
389
|
+
3) Uses a dedicated agent to **distill** reusable lessons / rules / preferences into short atomic memories
|
|
390
|
+
4) Stores them via `memory_store` into the right **scope** (`global` or `agent:<agentId>`)
|
|
391
|
+
|
|
392
|
+
### What you get
|
|
393
|
+
|
|
394
|
+
- ✅ Fully automatic (cron)
|
|
395
|
+
- ✅ Multi-agent support (main + bots)
|
|
396
|
+
- ✅ No re-reading: cursor ensures the next run only processes new lines
|
|
397
|
+
- ✅ Memory hygiene: quality gate + dedupe + per-run caps
|
|
398
|
+
|
|
399
|
+
### Script
|
|
400
|
+
|
|
401
|
+
This repo includes the extractor script:
|
|
402
|
+
|
|
403
|
+
- `scripts/jsonl_distill.py`
|
|
404
|
+
|
|
405
|
+
It produces a small **batch JSON** file under:
|
|
406
|
+
|
|
407
|
+
- `~/.openclaw/state/jsonl-distill/batches/`
|
|
408
|
+
|
|
409
|
+
and keeps a cursor here:
|
|
410
|
+
|
|
411
|
+
- `~/.openclaw/state/jsonl-distill/cursor.json`
|
|
412
|
+
|
|
413
|
+
The script is **safe**: it never modifies session logs.
|
|
414
|
+
|
|
415
|
+
By default it skips historical reset snapshots (`*.reset.*`) and excludes the distiller agent itself (`memory-distiller`) to prevent self-ingestion loops.
|
|
416
|
+
|
|
417
|
+
### Recommended setup (dedicated distiller agent)
|
|
418
|
+
|
|
419
|
+
#### 1) Create a dedicated agent
|
|
420
|
+
|
|
421
|
+
```bash
|
|
422
|
+
openclaw agents add memory-distiller \
|
|
423
|
+
--non-interactive \
|
|
424
|
+
--workspace ~/.openclaw/workspace-memory-distiller \
|
|
425
|
+
--model openai-codex/gpt-5.2
|
|
426
|
+
```
|
|
427
|
+
|
|
428
|
+
#### 2) Initialize cursor (Mode A: start from now)
|
|
429
|
+
|
|
430
|
+
This marks all existing JSONL files as "already read" by setting offsets to EOF.
|
|
431
|
+
|
|
432
|
+
```bash
|
|
433
|
+
# Set PLUGIN_DIR to where this plugin is installed.
|
|
434
|
+
# - If you cloned into your OpenClaw workspace (recommended):
|
|
435
|
+
# PLUGIN_DIR="$HOME/.openclaw/workspace/plugins/memory-lancedb-pro"
|
|
436
|
+
# - Otherwise, check: `openclaw plugins info memory-lancedb-pro` and locate the directory.
|
|
437
|
+
PLUGIN_DIR="/path/to/memory-lancedb-pro"
|
|
438
|
+
|
|
439
|
+
python3 "$PLUGIN_DIR/scripts/jsonl_distill.py" init
|
|
440
|
+
```
|
|
441
|
+
|
|
442
|
+
#### 3) Create an hourly cron job (Asia/Shanghai)
|
|
443
|
+
|
|
444
|
+
Tip: start the message with `run ...` so `memory-lancedb-pro`'s adaptive retrieval will skip auto-recall injection (saves tokens).
|
|
445
|
+
|
|
446
|
+
```bash
|
|
447
|
+
# IMPORTANT: replace <PLUGIN_DIR> in the template below with your actual plugin path.
|
|
448
|
+
MSG=$(cat <<'EOF'
|
|
449
|
+
run jsonl memory distill
|
|
450
|
+
|
|
451
|
+
Goal: distill NEW chat content from OpenClaw session JSONL files into high-quality LanceDB memories using memory_store.
|
|
452
|
+
|
|
453
|
+
Hard rules:
|
|
454
|
+
- Incremental only: call the extractor script; do NOT scan full history.
|
|
455
|
+
- Store only reusable memories; skip routine chatter.
|
|
456
|
+
- English memory text + final line: Keywords (zh): ...
|
|
457
|
+
- < 500 chars, atomic.
|
|
458
|
+
- <= 3 memories per agent per run; <= 3 global per run.
|
|
459
|
+
- Scope: global for broadly reusable; otherwise agent:<agentId>.
|
|
460
|
+
|
|
461
|
+
Workflow:
|
|
462
|
+
1) exec: python3 <PLUGIN_DIR>/scripts/jsonl_distill.py run
|
|
463
|
+
2) If noop: stop.
|
|
464
|
+
3) Read batchFile (created/pending)
|
|
465
|
+
4) memory_store(...) for selected memories
|
|
466
|
+
5) exec: python3 <PLUGIN_DIR>/scripts/jsonl_distill.py commit --batch-file <batchFile>
|
|
467
|
+
EOF
|
|
468
|
+
)
|
|
469
|
+
|
|
470
|
+
openclaw cron add \
|
|
471
|
+
--agent memory-distiller \
|
|
472
|
+
--name "jsonl-memory-distill (hourly)" \
|
|
473
|
+
--cron "0 * * * *" \
|
|
474
|
+
--tz "Asia/Shanghai" \
|
|
475
|
+
--session isolated \
|
|
476
|
+
--wake now \
|
|
477
|
+
--timeout-seconds 420 \
|
|
478
|
+
--stagger 5m \
|
|
479
|
+
--no-deliver \
|
|
480
|
+
--message "$MSG"
|
|
481
|
+
```
|
|
482
|
+
|
|
483
|
+
#### 4) Debug run
|
|
484
|
+
|
|
485
|
+
```bash
|
|
486
|
+
openclaw cron run <jobId> --expect-final --timeout 180000
|
|
487
|
+
openclaw cron runs --id <jobId> --limit 5
|
|
488
|
+
```
|
|
489
|
+
|
|
490
|
+
### Scope strategy (recommended)
|
|
491
|
+
|
|
492
|
+
When distilling **all agents**, always set `scope` explicitly when calling `memory_store`:
|
|
493
|
+
|
|
494
|
+
- Broadly reusable → `scope=global`
|
|
495
|
+
- Agent-specific → `scope=agent:<agentId>`
|
|
496
|
+
|
|
497
|
+
This prevents cross-bot memory pollution.
|
|
498
|
+
|
|
499
|
+
### Rollback
|
|
500
|
+
|
|
501
|
+
- Disable/remove cron job: `openclaw cron disable <jobId>` / `openclaw cron rm <jobId>`
|
|
502
|
+
- Delete agent: `openclaw agents delete memory-distiller`
|
|
503
|
+
- Remove cursor state: `rm -rf ~/.openclaw/state/jsonl-distill/`
|
|
504
|
+
|
|
505
|
+
---
|
|
506
|
+
|
|
377
507
|
## CLI Commands
|
|
378
508
|
|
|
379
509
|
```bash
|
package/README_CN.md
CHANGED
|
@@ -325,7 +325,117 @@ openclaw config get plugins.slots.memory
|
|
|
325
325
|
| **Jina**(推荐) | `jina-embeddings-v5-text-small` | `https://api.jina.ai/v1` | 1024 |
|
|
326
326
|
| **OpenAI** | `text-embedding-3-small` | `https://api.openai.com/v1` | 1536 |
|
|
327
327
|
| **Google Gemini** | `gemini-embedding-001` | `https://generativelanguage.googleapis.com/v1beta/openai/` | 3072 |
|
|
328
|
-
| **Ollama**(本地) | `nomic-embed-text` | `http://localhost:11434/v1` |
|
|
328
|
+
| **Ollama**(本地) | `nomic-embed-text` | `http://localhost:11434/v1` | _与本地模型输出一致_(建议显式设置 `embedding.dimensions`) |
|
|
329
|
+
|
|
330
|
+
---
|
|
331
|
+
|
|
332
|
+
## (可选)从 Session JSONL 自动蒸馏记忆(全自动)
|
|
333
|
+
|
|
334
|
+
OpenClaw 会把每个 Agent 的完整会话自动落盘为 JSONL:
|
|
335
|
+
|
|
336
|
+
- `~/.openclaw/agents/<agentId>/sessions/*.jsonl`
|
|
337
|
+
|
|
338
|
+
但 JSONL 含大量噪声(tool 输出、系统块、重复回调等),**不建议直接把原文塞进 LanceDB**。
|
|
339
|
+
|
|
340
|
+
本插件提供一个安全的 extractor 脚本 `scripts/jsonl_distill.py`,配合 OpenClaw 的 `cron` + 独立 distiller agent,实现“增量蒸馏 → 高质量记忆入库”:
|
|
341
|
+
|
|
342
|
+
- 只读取每个 JSONL 文件**新增尾巴**(byte offset cursor),避免重复和 token 浪费
|
|
343
|
+
- 生成一个小型 batch JSON
|
|
344
|
+
- 由 distiller agent 把 batch 蒸馏成短、原子、可复用的记忆,再用 `memory_store` 写入
|
|
345
|
+
|
|
346
|
+
### 你会得到什么
|
|
347
|
+
|
|
348
|
+
- ✅ 全自动(每小时)
|
|
349
|
+
- ✅ 多 Agent 支持(main + 各 bot)
|
|
350
|
+
- ✅ 只处理新增内容(不回读)
|
|
351
|
+
- ✅ 防自我吞噬:默认排除 `memory-distiller` 自己的 session
|
|
352
|
+
|
|
353
|
+
### 脚本输出位置
|
|
354
|
+
|
|
355
|
+
- Cursor:`~/.openclaw/state/jsonl-distill/cursor.json`
|
|
356
|
+
- Batches:`~/.openclaw/state/jsonl-distill/batches/`
|
|
357
|
+
|
|
358
|
+
> 脚本只读 session JSONL,不会修改原始日志。
|
|
359
|
+
|
|
360
|
+
### 推荐部署(独立 distiller agent)
|
|
361
|
+
|
|
362
|
+
#### 1)创建 distiller agent(示例用 gpt-5.2)
|
|
363
|
+
|
|
364
|
+
```bash
|
|
365
|
+
openclaw agents add memory-distiller \
|
|
366
|
+
--non-interactive \
|
|
367
|
+
--workspace ~/.openclaw/workspace-memory-distiller \
|
|
368
|
+
--model openai-codex/gpt-5.2
|
|
369
|
+
```
|
|
370
|
+
|
|
371
|
+
#### 2)初始化 cursor(模式 A:从现在开始,不回溯历史)
|
|
372
|
+
|
|
373
|
+
先确定插件目录(PLUGIN_DIR):
|
|
374
|
+
|
|
375
|
+
```bash
|
|
376
|
+
# 如果你按推荐方式 clone 到 workspace:
|
|
377
|
+
# PLUGIN_DIR="$HOME/.openclaw/workspace/plugins/memory-lancedb-pro"
|
|
378
|
+
PLUGIN_DIR="/path/to/memory-lancedb-pro"
|
|
379
|
+
|
|
380
|
+
python3 "$PLUGIN_DIR/scripts/jsonl_distill.py" init
|
|
381
|
+
```
|
|
382
|
+
|
|
383
|
+
#### 3)创建每小时 Cron(Asia/Shanghai)
|
|
384
|
+
|
|
385
|
+
建议 cron message 以 `run ...` 开头,这样本插件的自适应检索会跳过自动 recall 注入(节省 token)。
|
|
386
|
+
|
|
387
|
+
```bash
|
|
388
|
+
MSG=$(cat <<'EOF'
|
|
389
|
+
run jsonl memory distill
|
|
390
|
+
|
|
391
|
+
Goal: Distill ONLY new content from OpenClaw session JSONL tails into high-quality LanceDB memories.
|
|
392
|
+
|
|
393
|
+
Hard rules:
|
|
394
|
+
- Incremental only: exec the extractor. Do NOT scan full history.
|
|
395
|
+
- If extractor returns action=noop: stop immediately.
|
|
396
|
+
- Store only reusable memories (rules, pitfalls, decisions, preferences, stable facts). Skip routine chatter.
|
|
397
|
+
- Each memory: idiomatic English + final line `Keywords (zh): ...` (3-8 short phrases).
|
|
398
|
+
- Keep each memory < 500 chars and atomic.
|
|
399
|
+
- Caps: <= 3 memories per agent per run; <= 3 global per run.
|
|
400
|
+
- Scope:
|
|
401
|
+
- broadly reusable -> global
|
|
402
|
+
- agent-specific -> agent:<agentId>
|
|
403
|
+
|
|
404
|
+
Workflow:
|
|
405
|
+
1) exec: python3 <PLUGIN_DIR>/scripts/jsonl_distill.py run
|
|
406
|
+
2) Determine batch file (created/pending)
|
|
407
|
+
3) memory_store(...) for selected memories
|
|
408
|
+
4) exec: python3 <PLUGIN_DIR>/scripts/jsonl_distill.py commit --batch-file <batchFile>
|
|
409
|
+
EOF
|
|
410
|
+
)
|
|
411
|
+
|
|
412
|
+
openclaw cron add \
|
|
413
|
+
--agent memory-distiller \
|
|
414
|
+
--name "jsonl-memory-distill (hourly)" \
|
|
415
|
+
--cron "0 * * * *" \
|
|
416
|
+
--tz "Asia/Shanghai" \
|
|
417
|
+
--session isolated \
|
|
418
|
+
--wake now \
|
|
419
|
+
--timeout-seconds 420 \
|
|
420
|
+
--stagger 5m \
|
|
421
|
+
--no-deliver \
|
|
422
|
+
--message "$MSG"
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
### scope 策略(非常重要)
|
|
426
|
+
|
|
427
|
+
当蒸馏“所有 agents”时,务必显式设置 scope:
|
|
428
|
+
|
|
429
|
+
- 跨 agent 通用规则/偏好/坑 → `scope=global`
|
|
430
|
+
- agent 私有 → `scope=agent:<agentId>`
|
|
431
|
+
|
|
432
|
+
否则不同 bot 的记忆会相互污染。
|
|
433
|
+
|
|
434
|
+
### 回滚
|
|
435
|
+
|
|
436
|
+
- 禁用/删除 cron:`openclaw cron disable <jobId>` / `openclaw cron rm <jobId>`
|
|
437
|
+
- 删除 distiller agent:`openclaw agents delete memory-distiller`
|
|
438
|
+
- 删除 cursor 状态:`rm -rf ~/.openclaw/state/jsonl-distill/`
|
|
329
439
|
|
|
330
440
|
---
|
|
331
441
|
|
package/openclaw.plugin.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"id": "memory-lancedb-pro",
|
|
3
3
|
"name": "Memory (LanceDB Pro)",
|
|
4
4
|
"description": "Enhanced LanceDB-backed long-term memory with hybrid retrieval, multi-scope isolation, and management CLI",
|
|
5
|
-
"version": "1.0.
|
|
5
|
+
"version": "1.0.6",
|
|
6
6
|
"kind": "memory",
|
|
7
7
|
"configSchema": {
|
|
8
8
|
"type": "object",
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "memory-lancedb-pro",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.6",
|
|
4
4
|
"description": "OpenClaw enhanced LanceDB memory plugin with hybrid retrieval (Vector + BM25), cross-encoder rerank, multi-scope isolation, and management CLI",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.ts",
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
],
|
|
19
19
|
"repository": {
|
|
20
20
|
"type": "git",
|
|
21
|
-
"url": "https://github.com/win4r/memory-lancedb-pro"
|
|
21
|
+
"url": "git+https://github.com/win4r/memory-lancedb-pro.git"
|
|
22
22
|
},
|
|
23
23
|
"author": "win4r",
|
|
24
24
|
"license": "MIT",
|
|
@@ -0,0 +1,471 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""jsonl_distill.py
|
|
3
|
+
|
|
4
|
+
Incrementally extract new chat messages from OpenClaw session JSONL files and
|
|
5
|
+
write a compact batch file for a distiller agent to turn into LanceDB memories.
|
|
6
|
+
|
|
7
|
+
Design goals:
|
|
8
|
+
- Read only the newly-appended tail of each session file (byte-offset cursor).
|
|
9
|
+
- Avoid token waste: if there is no new content, produce no batch.
|
|
10
|
+
- Safety: never delete/modify session logs.
|
|
11
|
+
- Robustness: handle file rotation/truncation using inode+size checks.
|
|
12
|
+
|
|
13
|
+
This script does NOT call any LLM or write to LanceDB. It only prepares data
|
|
14
|
+
for the distiller agent.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import argparse
|
|
20
|
+
import hashlib
|
|
21
|
+
import json
|
|
22
|
+
import os
|
|
23
|
+
import re
|
|
24
|
+
import sys
|
|
25
|
+
import time
|
|
26
|
+
from dataclasses import dataclass
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
DEFAULT_STATE_DIR = Path.home() / ".openclaw" / "state" / "jsonl-distill"
|
|
32
|
+
DEFAULT_AGENTS_DIR = Path.home() / ".openclaw" / "agents"
|
|
33
|
+
|
|
34
|
+
# Prevent self-ingestion loops: the distiller agent itself should never be a source.
|
|
35
|
+
EXCLUDED_AGENT_IDS = {
|
|
36
|
+
"memory-distiller",
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
NOISE_PREFIXES = (
|
|
41
|
+
"✅ New session started",
|
|
42
|
+
"NO_REPLY",
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _now_ms() -> int:
|
|
47
|
+
return int(time.time() * 1000)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _sha256(s: str) -> str:
|
|
51
|
+
return hashlib.sha256(s.encode("utf-8", errors="ignore")).hexdigest()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _read_jsonl_lines(path: Path, start_offset: int, max_bytes: int) -> Tuple[List[str], int]:
|
|
55
|
+
"""Read up to max_bytes from path starting at start_offset. Returns (lines, end_offset)."""
|
|
56
|
+
lines: List[str] = []
|
|
57
|
+
with path.open("rb") as f:
|
|
58
|
+
f.seek(start_offset)
|
|
59
|
+
data = f.read(max_bytes)
|
|
60
|
+
end_offset = f.tell()
|
|
61
|
+
|
|
62
|
+
if not data:
|
|
63
|
+
return [], end_offset
|
|
64
|
+
|
|
65
|
+
# Ensure we end on a newline boundary to avoid partial JSON lines.
|
|
66
|
+
if not data.endswith(b"\n"):
|
|
67
|
+
last_nl = data.rfind(b"\n")
|
|
68
|
+
if last_nl == -1:
|
|
69
|
+
# No complete line in this chunk.
|
|
70
|
+
return [], start_offset
|
|
71
|
+
data = data[: last_nl + 1]
|
|
72
|
+
end_offset = start_offset + len(data)
|
|
73
|
+
|
|
74
|
+
text = data.decode("utf-8", errors="replace")
|
|
75
|
+
for line in text.splitlines():
|
|
76
|
+
line = line.strip()
|
|
77
|
+
if line:
|
|
78
|
+
lines.append(line)
|
|
79
|
+
return lines, end_offset
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _extract_text_blocks(content: Any) -> str:
|
|
83
|
+
if content is None:
|
|
84
|
+
return ""
|
|
85
|
+
if isinstance(content, str):
|
|
86
|
+
return content
|
|
87
|
+
if isinstance(content, list):
|
|
88
|
+
parts: List[str] = []
|
|
89
|
+
for block in content:
|
|
90
|
+
if isinstance(block, dict) and block.get("type") == "text":
|
|
91
|
+
t = block.get("text")
|
|
92
|
+
if isinstance(t, str) and t:
|
|
93
|
+
parts.append(t)
|
|
94
|
+
return "\n".join(parts)
|
|
95
|
+
return ""
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _clean_text(s: str) -> str:
|
|
99
|
+
s = s.strip()
|
|
100
|
+
if not s:
|
|
101
|
+
return ""
|
|
102
|
+
|
|
103
|
+
# Drop injected memory blocks entirely.
|
|
104
|
+
if "<relevant-memories>" in s:
|
|
105
|
+
s = re.sub(r"<relevant-memories>[\s\S]*?</relevant-memories>", "", s)
|
|
106
|
+
|
|
107
|
+
# Strip OpenClaw transcript headers that add noise but not meaning.
|
|
108
|
+
# Keep the actual user content that follows.
|
|
109
|
+
s = re.sub(r"^Conversation info \(untrusted metadata\):\s*\n+", "", s, flags=re.IGNORECASE)
|
|
110
|
+
s = re.sub(r"^Replied message \(untrusted, for context\):\s*\n+", "", s, flags=re.IGNORECASE)
|
|
111
|
+
|
|
112
|
+
# Drop embedded JSON blocks (often metadata) to reduce token waste.
|
|
113
|
+
s = re.sub(r"```json[\s\S]*?```", "", s)
|
|
114
|
+
|
|
115
|
+
# Collapse whitespace.
|
|
116
|
+
s = re.sub(r"\n{3,}", "\n\n", s)
|
|
117
|
+
return s.strip()
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _is_noise(s: str) -> bool:
|
|
121
|
+
if not s:
|
|
122
|
+
return True
|
|
123
|
+
for p in NOISE_PREFIXES:
|
|
124
|
+
if s.startswith(p):
|
|
125
|
+
return True
|
|
126
|
+
|
|
127
|
+
lower = s.lower()
|
|
128
|
+
|
|
129
|
+
# Drop transcript/system boilerplate that should never become memories.
|
|
130
|
+
if "[queued messages while agent was busy]" in lower:
|
|
131
|
+
return True
|
|
132
|
+
if "you are running a boot check" in lower or "boot.md — gateway startup health check" in lower:
|
|
133
|
+
return True
|
|
134
|
+
if "read heartbeat.md" in lower:
|
|
135
|
+
return True
|
|
136
|
+
if "[claude_code_done]" in lower or "claude_code_done" in lower:
|
|
137
|
+
return True
|
|
138
|
+
|
|
139
|
+
# Skip overly long blocks (logs / dumps). The distiller can still capture the essence later.
|
|
140
|
+
if len(s) > 2000:
|
|
141
|
+
return True
|
|
142
|
+
|
|
143
|
+
# Skip pure code fences (usually tool output).
|
|
144
|
+
if s.strip().startswith("```") and s.strip().endswith("```"):
|
|
145
|
+
return True
|
|
146
|
+
|
|
147
|
+
return False
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@dataclass
|
|
151
|
+
class CursorEntry:
|
|
152
|
+
inode: int
|
|
153
|
+
committed: int
|
|
154
|
+
pending: Optional[int] = None
|
|
155
|
+
pending_batch: Optional[str] = None
|
|
156
|
+
last_size: Optional[int] = None
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _load_cursor(cursor_path: Path) -> Dict[str, Any]:
|
|
160
|
+
if not cursor_path.exists():
|
|
161
|
+
return {"version": 1, "files": {}, "createdAtMs": _now_ms(), "updatedAtMs": _now_ms()}
|
|
162
|
+
return json.loads(cursor_path.read_text("utf-8"))
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _save_cursor(cursor_path: Path, cursor: Dict[str, Any]) -> None:
|
|
166
|
+
cursor["updatedAtMs"] = _now_ms()
|
|
167
|
+
cursor_path.parent.mkdir(parents=True, exist_ok=True)
|
|
168
|
+
tmp = cursor_path.with_suffix(".tmp")
|
|
169
|
+
tmp.write_text(json.dumps(cursor, ensure_ascii=False, indent=2) + "\n", "utf-8")
|
|
170
|
+
tmp.replace(cursor_path)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _list_session_files(agents_dir: Path) -> List[Tuple[str, Path]]:
|
|
174
|
+
results: List[Tuple[str, Path]] = []
|
|
175
|
+
if not agents_dir.exists():
|
|
176
|
+
return results
|
|
177
|
+
|
|
178
|
+
for agent_dir in sorted(agents_dir.iterdir()):
|
|
179
|
+
if not agent_dir.is_dir():
|
|
180
|
+
continue
|
|
181
|
+
agent_id = agent_dir.name
|
|
182
|
+
if agent_id in EXCLUDED_AGENT_IDS:
|
|
183
|
+
continue
|
|
184
|
+
sessions_dir = agent_dir / "sessions"
|
|
185
|
+
if not sessions_dir.exists():
|
|
186
|
+
continue
|
|
187
|
+
|
|
188
|
+
for f in sorted(sessions_dir.iterdir()):
|
|
189
|
+
name = f.name
|
|
190
|
+
if not f.is_file():
|
|
191
|
+
continue
|
|
192
|
+
if not name.endswith(".jsonl"):
|
|
193
|
+
continue
|
|
194
|
+
if ".reset." in name:
|
|
195
|
+
# Reset snapshots are historical; we start from now and focus on live session tails.
|
|
196
|
+
continue
|
|
197
|
+
if name.endswith(".lock") or ".deleted." in name:
|
|
198
|
+
continue
|
|
199
|
+
results.append((agent_id, f))
|
|
200
|
+
|
|
201
|
+
return results
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def init_from_now(state_dir: Path, agents_dir: Path) -> Dict[str, Any]:
|
|
205
|
+
cursor_path = state_dir / "cursor.json"
|
|
206
|
+
cursor = _load_cursor(cursor_path)
|
|
207
|
+
files = cursor.setdefault("files", {})
|
|
208
|
+
|
|
209
|
+
for agent_id, f in _list_session_files(agents_dir):
|
|
210
|
+
st = f.stat()
|
|
211
|
+
key = str(f)
|
|
212
|
+
files[key] = {
|
|
213
|
+
"agentId": agent_id,
|
|
214
|
+
"inode": int(st.st_ino),
|
|
215
|
+
"committed": int(st.st_size),
|
|
216
|
+
"pending": None,
|
|
217
|
+
"pendingBatch": None,
|
|
218
|
+
"lastSize": int(st.st_size),
|
|
219
|
+
"updatedAtMs": _now_ms(),
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
_save_cursor(cursor_path, cursor)
|
|
223
|
+
return {
|
|
224
|
+
"ok": True,
|
|
225
|
+
"action": "init",
|
|
226
|
+
"cursorPath": str(cursor_path),
|
|
227
|
+
"trackedFiles": len(files),
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def run_extract(state_dir: Path, agents_dir: Path, max_bytes_per_file: int, max_messages_per_agent: int) -> Dict[str, Any]:
|
|
232
|
+
cursor_path = state_dir / "cursor.json"
|
|
233
|
+
cursor = _load_cursor(cursor_path)
|
|
234
|
+
files: Dict[str, Any] = cursor.setdefault("files", {})
|
|
235
|
+
|
|
236
|
+
# If there is a pending batch, return it and do not read new data.
|
|
237
|
+
pending_batches = sorted({v.get("pendingBatch") for v in files.values() if v.get("pendingBatch")})
|
|
238
|
+
pending_batches = [b for b in pending_batches if b]
|
|
239
|
+
if pending_batches:
|
|
240
|
+
return {
|
|
241
|
+
"ok": True,
|
|
242
|
+
"action": "pending",
|
|
243
|
+
"batchFiles": pending_batches,
|
|
244
|
+
"cursorPath": str(cursor_path),
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
# Collect new messages.
|
|
248
|
+
per_agent_msgs: Dict[str, List[Dict[str, Any]]] = {}
|
|
249
|
+
touched_files: List[Dict[str, Any]] = []
|
|
250
|
+
|
|
251
|
+
for agent_id, f in _list_session_files(agents_dir):
|
|
252
|
+
key = str(f)
|
|
253
|
+
st = f.stat()
|
|
254
|
+
inode = int(st.st_ino)
|
|
255
|
+
size = int(st.st_size)
|
|
256
|
+
|
|
257
|
+
entry = files.get(key)
|
|
258
|
+
committed = 0
|
|
259
|
+
if entry and entry.get("inode") == inode:
|
|
260
|
+
committed = int(entry.get("committed") or 0)
|
|
261
|
+
# Handle truncation.
|
|
262
|
+
if size < committed:
|
|
263
|
+
committed = 0
|
|
264
|
+
else:
|
|
265
|
+
# New file not tracked yet: start from EOF (A-mode behavior).
|
|
266
|
+
committed = size
|
|
267
|
+
|
|
268
|
+
if size <= committed:
|
|
269
|
+
# Nothing new.
|
|
270
|
+
files[key] = {
|
|
271
|
+
"agentId": agent_id,
|
|
272
|
+
"inode": inode,
|
|
273
|
+
"committed": committed,
|
|
274
|
+
"pending": None,
|
|
275
|
+
"pendingBatch": None,
|
|
276
|
+
"lastSize": size,
|
|
277
|
+
"updatedAtMs": _now_ms(),
|
|
278
|
+
}
|
|
279
|
+
continue
|
|
280
|
+
|
|
281
|
+
lines, end_offset = _read_jsonl_lines(f, committed, max_bytes_per_file)
|
|
282
|
+
if not lines:
|
|
283
|
+
# Might have hit partial line boundary; do not advance.
|
|
284
|
+
continue
|
|
285
|
+
|
|
286
|
+
extracted: List[Dict[str, Any]] = []
|
|
287
|
+
for line in lines:
|
|
288
|
+
try:
|
|
289
|
+
obj = json.loads(line)
|
|
290
|
+
except Exception:
|
|
291
|
+
continue
|
|
292
|
+
if obj.get("type") != "message":
|
|
293
|
+
continue
|
|
294
|
+
msg = obj.get("message")
|
|
295
|
+
if not isinstance(msg, dict):
|
|
296
|
+
continue
|
|
297
|
+
role = msg.get("role")
|
|
298
|
+
if role not in ("user", "assistant"):
|
|
299
|
+
continue
|
|
300
|
+
|
|
301
|
+
text = _extract_text_blocks(msg.get("content"))
|
|
302
|
+
text = _clean_text(text)
|
|
303
|
+
if _is_noise(text):
|
|
304
|
+
continue
|
|
305
|
+
|
|
306
|
+
extracted.append({
|
|
307
|
+
"ts": obj.get("timestamp") or msg.get("timestamp"),
|
|
308
|
+
"role": role,
|
|
309
|
+
"text": text,
|
|
310
|
+
})
|
|
311
|
+
|
|
312
|
+
if not extracted:
|
|
313
|
+
# Advance committed to end_offset anyway to avoid re-reading pure noise.
|
|
314
|
+
files[key] = {
|
|
315
|
+
"agentId": agent_id,
|
|
316
|
+
"inode": inode,
|
|
317
|
+
"committed": end_offset,
|
|
318
|
+
"pending": None,
|
|
319
|
+
"pendingBatch": None,
|
|
320
|
+
"lastSize": size,
|
|
321
|
+
"updatedAtMs": _now_ms(),
|
|
322
|
+
}
|
|
323
|
+
continue
|
|
324
|
+
|
|
325
|
+
per_agent_msgs.setdefault(agent_id, []).extend(extracted)
|
|
326
|
+
touched_files.append({
|
|
327
|
+
"path": key,
|
|
328
|
+
"agentId": agent_id,
|
|
329
|
+
"inode": inode,
|
|
330
|
+
"committed": committed,
|
|
331
|
+
"pending": end_offset,
|
|
332
|
+
"size": size,
|
|
333
|
+
})
|
|
334
|
+
|
|
335
|
+
# Cap messages per agent to keep token usage stable.
|
|
336
|
+
for agent_id, msgs in per_agent_msgs.items():
|
|
337
|
+
if len(msgs) > max_messages_per_agent:
|
|
338
|
+
per_agent_msgs[agent_id] = msgs[-max_messages_per_agent:]
|
|
339
|
+
|
|
340
|
+
if not per_agent_msgs:
|
|
341
|
+
_save_cursor(cursor_path, cursor)
|
|
342
|
+
return {
|
|
343
|
+
"ok": True,
|
|
344
|
+
"action": "noop",
|
|
345
|
+
"cursorPath": str(cursor_path),
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
batches_dir = state_dir / "batches"
|
|
349
|
+
batches_dir.mkdir(parents=True, exist_ok=True)
|
|
350
|
+
batch_id = time.strftime("%Y%m%d-%H%M%S")
|
|
351
|
+
batch_path = batches_dir / f"batch-{batch_id}.json"
|
|
352
|
+
|
|
353
|
+
batch_obj = {
|
|
354
|
+
"version": 1,
|
|
355
|
+
"createdAtMs": _now_ms(),
|
|
356
|
+
"agents": [
|
|
357
|
+
{
|
|
358
|
+
"agentId": agent_id,
|
|
359
|
+
"messages": per_agent_msgs.get(agent_id, []),
|
|
360
|
+
}
|
|
361
|
+
for agent_id in sorted(per_agent_msgs.keys())
|
|
362
|
+
],
|
|
363
|
+
"touchedFiles": touched_files,
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
batch_path.write_text(json.dumps(batch_obj, ensure_ascii=False, indent=2) + "\n", "utf-8")
|
|
367
|
+
|
|
368
|
+
# Write pending offsets.
|
|
369
|
+
for tf in touched_files:
|
|
370
|
+
key = tf["path"]
|
|
371
|
+
files[key] = {
|
|
372
|
+
"agentId": tf["agentId"],
|
|
373
|
+
"inode": tf["inode"],
|
|
374
|
+
"committed": tf["committed"],
|
|
375
|
+
"pending": tf["pending"],
|
|
376
|
+
"pendingBatch": str(batch_path),
|
|
377
|
+
"lastSize": tf["size"],
|
|
378
|
+
"updatedAtMs": _now_ms(),
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
_save_cursor(cursor_path, cursor)
|
|
382
|
+
|
|
383
|
+
return {
|
|
384
|
+
"ok": True,
|
|
385
|
+
"action": "created",
|
|
386
|
+
"batchFile": str(batch_path),
|
|
387
|
+
"agents": len(per_agent_msgs),
|
|
388
|
+
"cursorPath": str(cursor_path),
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def commit_batch(state_dir: Path, batch_file: Path) -> Dict[str, Any]:
|
|
393
|
+
cursor_path = state_dir / "cursor.json"
|
|
394
|
+
cursor = _load_cursor(cursor_path)
|
|
395
|
+
files: Dict[str, Any] = cursor.setdefault("files", {})
|
|
396
|
+
|
|
397
|
+
committed_files = 0
|
|
398
|
+
for key, v in list(files.items()):
|
|
399
|
+
if v.get("pendingBatch") != str(batch_file):
|
|
400
|
+
continue
|
|
401
|
+
pending = v.get("pending")
|
|
402
|
+
if pending is None:
|
|
403
|
+
continue
|
|
404
|
+
v["committed"] = int(pending)
|
|
405
|
+
v["pending"] = None
|
|
406
|
+
v["pendingBatch"] = None
|
|
407
|
+
v["updatedAtMs"] = _now_ms()
|
|
408
|
+
files[key] = v
|
|
409
|
+
committed_files += 1
|
|
410
|
+
|
|
411
|
+
_save_cursor(cursor_path, cursor)
|
|
412
|
+
try:
|
|
413
|
+
batch_file.unlink()
|
|
414
|
+
except Exception:
|
|
415
|
+
pass
|
|
416
|
+
|
|
417
|
+
return {
|
|
418
|
+
"ok": True,
|
|
419
|
+
"action": "committed",
|
|
420
|
+
"cursorPath": str(cursor_path),
|
|
421
|
+
"committedFiles": committed_files,
|
|
422
|
+
"batchFile": str(batch_file),
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def main() -> int:
|
|
427
|
+
ap = argparse.ArgumentParser()
|
|
428
|
+
ap.add_argument("--state-dir", default=str(DEFAULT_STATE_DIR))
|
|
429
|
+
ap.add_argument("--agents-dir", default=str(DEFAULT_AGENTS_DIR))
|
|
430
|
+
|
|
431
|
+
sub = ap.add_subparsers(dest="cmd", required=True)
|
|
432
|
+
|
|
433
|
+
s_init = sub.add_parser("init", help="Initialize cursor to EOF for all current session files")
|
|
434
|
+
|
|
435
|
+
s_run = sub.add_parser("run", help="Extract incremental message tail and create a batch file")
|
|
436
|
+
s_run.add_argument("--max-bytes-per-file", type=int, default=256_000)
|
|
437
|
+
s_run.add_argument("--max-messages-per-agent", type=int, default=30)
|
|
438
|
+
|
|
439
|
+
s_commit = sub.add_parser("commit", help="Commit a processed batch (advance committed offsets)")
|
|
440
|
+
s_commit.add_argument("--batch-file", required=True)
|
|
441
|
+
|
|
442
|
+
args = ap.parse_args()
|
|
443
|
+
|
|
444
|
+
state_dir = Path(args.state_dir).expanduser().resolve()
|
|
445
|
+
agents_dir = Path(args.agents_dir).expanduser().resolve()
|
|
446
|
+
|
|
447
|
+
if args.cmd == "init":
|
|
448
|
+
out = init_from_now(state_dir, agents_dir)
|
|
449
|
+
print(json.dumps(out, ensure_ascii=False))
|
|
450
|
+
return 0
|
|
451
|
+
|
|
452
|
+
if args.cmd == "run":
|
|
453
|
+
out = run_extract(
|
|
454
|
+
state_dir,
|
|
455
|
+
agents_dir,
|
|
456
|
+
max_bytes_per_file=int(args.max_bytes_per_file),
|
|
457
|
+
max_messages_per_agent=int(args.max_messages_per_agent),
|
|
458
|
+
)
|
|
459
|
+
print(json.dumps(out, ensure_ascii=False))
|
|
460
|
+
return 0
|
|
461
|
+
|
|
462
|
+
if args.cmd == "commit":
|
|
463
|
+
out = commit_batch(state_dir, Path(args.batch_file).expanduser().resolve())
|
|
464
|
+
print(json.dumps(out, ensure_ascii=False))
|
|
465
|
+
return 0
|
|
466
|
+
|
|
467
|
+
raise RuntimeError("unreachable")
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
if __name__ == "__main__":
|
|
471
|
+
raise SystemExit(main())
|
|
@@ -32,12 +32,38 @@ const FORCE_RETRIEVE_PATTERNS = [
|
|
|
32
32
|
/(你记得|之前|上次|以前|还记得|提到过|说过)/i,
|
|
33
33
|
];
|
|
34
34
|
|
|
35
|
+
/**
|
|
36
|
+
* Normalize the raw prompt before applying skip/force rules.
|
|
37
|
+
*
|
|
38
|
+
* OpenClaw may wrap cron prompts like:
|
|
39
|
+
* "[cron:<jobId> <jobName>] run ..."
|
|
40
|
+
*
|
|
41
|
+
* We strip such prefixes so command-style prompts are properly detected and we
|
|
42
|
+
* can skip auto-recall injection (saves tokens).
|
|
43
|
+
*/
|
|
44
|
+
function normalizeQuery(query: string): string {
|
|
45
|
+
let s = query.trim();
|
|
46
|
+
|
|
47
|
+
// Strip OpenClaw cron wrapper prefix.
|
|
48
|
+
s = s.replace(/^\[cron:[^\]]+\]\s*/i, "");
|
|
49
|
+
|
|
50
|
+
// Strip OpenClaw injected metadata header used in some transcripts.
|
|
51
|
+
if (/^Conversation info \(untrusted metadata\):/i.test(s)) {
|
|
52
|
+
s = s.replace(/^Conversation info \(untrusted metadata\):\s*/i, "");
|
|
53
|
+
// If there is a blank-line separator, keep only the part after it.
|
|
54
|
+
const parts = s.split(/\n\s*\n/, 2);
|
|
55
|
+
if (parts.length === 2) s = parts[1];
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
return s.trim();
|
|
59
|
+
}
|
|
60
|
+
|
|
35
61
|
/**
|
|
36
62
|
* Determine if a query should skip memory retrieval.
|
|
37
63
|
* Returns true if retrieval should be skipped.
|
|
38
64
|
*/
|
|
39
65
|
export function shouldSkipRetrieval(query: string): boolean {
|
|
40
|
-
const trimmed = query.trim();
|
|
66
|
+
const trimmed = normalizeQuery(query);
|
|
41
67
|
|
|
42
68
|
// Force retrieve if query has memory-related intent (checked FIRST,
|
|
43
69
|
// before length check, so short CJK queries like "你记得吗" aren't skipped)
|