nautilus-compass 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nautilus_compass-1.0.0/LICENSE +21 -0
- nautilus_compass-1.0.0/LICENSE-ANCHORS +17 -0
- nautilus_compass-1.0.0/PKG-INFO +319 -0
- nautilus_compass-1.0.0/README.md +279 -0
- nautilus_compass-1.0.0/__init__.py +13 -0
- nautilus_compass-1.0.0/anchor_generator.py +186 -0
- nautilus_compass-1.0.0/anchors.json +67 -0
- nautilus_compass-1.0.0/anchors_adapted.json +403 -0
- nautilus_compass-1.0.0/anchors_finance.json +57 -0
- nautilus_compass-1.0.0/anchors_legal.json +57 -0
- nautilus_compass-1.0.0/anchors_medical.json +57 -0
- nautilus_compass-1.0.0/anchors_platform_base.json +57 -0
- nautilus_compass-1.0.0/anchors_vc.json +57 -0
- nautilus_compass-1.0.0/anchors_zenmind.json +57 -0
- nautilus_compass-1.0.0/audit_kpi.py +79 -0
- nautilus_compass-1.0.0/compass_http.py +578 -0
- nautilus_compass-1.0.0/compass_http_v09.py +1018 -0
- nautilus_compass-1.0.0/compass_raid.py +249 -0
- nautilus_compass-1.0.0/compass_verify.py +115 -0
- nautilus_compass-1.0.0/daemon.py +533 -0
- nautilus_compass-1.0.0/daemon_anchor_apply.py +100 -0
- nautilus_compass-1.0.0/daemon_anchor_loader.py +112 -0
- nautilus_compass-1.0.0/daemon_start.sh +38 -0
- nautilus_compass-1.0.0/daemon_stop.sh +12 -0
- nautilus_compass-1.0.0/deeptest.py +188 -0
- nautilus_compass-1.0.0/drift_history.py +191 -0
- nautilus_compass-1.0.0/feedback.py +300 -0
- nautilus_compass-1.0.0/hook.sh +25 -0
- nautilus_compass-1.0.0/install.sh +46 -0
- nautilus_compass-1.0.0/install_bge.sh +54 -0
- nautilus_compass-1.0.0/links_finder.py +154 -0
- nautilus_compass-1.0.0/llm_distill.py +133 -0
- nautilus_compass-1.0.0/mcp_client.py +343 -0
- nautilus_compass-1.0.0/mcp_server.py +1242 -0
- nautilus_compass-1.0.0/merkle_chain.py +362 -0
- nautilus_compass-1.0.0/mid_session_hook.py +196 -0
- nautilus_compass-1.0.0/nautilus_compass.egg-info/PKG-INFO +319 -0
- nautilus_compass-1.0.0/nautilus_compass.egg-info/SOURCES.txt +117 -0
- nautilus_compass-1.0.0/nautilus_compass.egg-info/dependency_links.txt +1 -0
- nautilus_compass-1.0.0/nautilus_compass.egg-info/entry_points.txt +7 -0
- nautilus_compass-1.0.0/nautilus_compass.egg-info/requires.txt +21 -0
- nautilus_compass-1.0.0/nautilus_compass.egg-info/top_level.txt +1 -0
- nautilus_compass-1.0.0/output_drift.py +144 -0
- nautilus_compass-1.0.0/pyproject.toml +90 -0
- nautilus_compass-1.0.0/recall.py +745 -0
- nautilus_compass-1.0.0/selftest.py +103 -0
- nautilus_compass-1.0.0/session_search.py +142 -0
- nautilus_compass-1.0.0/session_writer.py +380 -0
- nautilus_compass-1.0.0/setup.cfg +4 -0
- nautilus_compass-1.0.0/stake_publisher.py +190 -0
- nautilus_compass-1.0.0/stop_hook.py +152 -0
- nautilus_compass-1.0.0/strategy_store.py +274 -0
- nautilus_compass-1.0.0/tests/test_a2a_adapter.py +153 -0
- nautilus_compass-1.0.0/tests/test_a2a_peer_demo.py +210 -0
- nautilus_compass-1.0.0/tests/test_a2a_tls_demo.py +50 -0
- nautilus_compass-1.0.0/tests/test_auth_v091.py +167 -0
- nautilus_compass-1.0.0/tests/test_compass_v09.py +203 -0
- nautilus_compass-1.0.0/tests/test_crypto.py +212 -0
- nautilus_compass-1.0.0/tests/test_e2e_encryption.py +230 -0
- nautilus_compass-1.0.0/tests/test_full_capability_demo.py +55 -0
- nautilus_compass-1.0.0/tests/test_http_server_e2e.py +219 -0
- nautilus_compass-1.0.0/tests/test_mcp_client.py +157 -0
- nautilus_compass-1.0.0/tests/test_mcp_client_backoff.py +212 -0
- nautilus_compass-1.0.0/tests/test_mcp_logging.py +251 -0
- nautilus_compass-1.0.0/tests/test_mcp_progress.py +216 -0
- nautilus_compass-1.0.0/tests/test_mcp_rate_limit.py +313 -0
- nautilus_compass-1.0.0/tests/test_mcp_rbac.py +242 -0
- nautilus_compass-1.0.0/tests/test_mcp_resources.py +221 -0
- nautilus_compass-1.0.0/tests/test_mcp_server.py +409 -0
- nautilus_compass-1.0.0/tests/test_mcp_tls.py +263 -0
- nautilus_compass-1.0.0/tests/test_merkle_chain.py +245 -0
- nautilus_compass-1.0.0/tests/test_plugin_manifest.py +117 -0
- nautilus_compass-1.0.0/tests/test_release_notes_extract.py +79 -0
- nautilus_compass-1.0.0/tests/test_sqlite_migration.py +233 -0
- nautilus_compass-1.0.0/tests/test_stake_publisher.py +204 -0
- nautilus_compass-1.0.0/tests/test_third_party_client.py +73 -0
- nautilus_compass-1.0.0/tests/test_version_sync.py +44 -0
- nautilus_compass-1.0.0/time_anchor.py +144 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Chunxiao Wang (Yiluo Technology Co., Ltd.)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
CC0 1.0 Universal (Public Domain Dedication)
|
|
2
|
+
|
|
3
|
+
The behavioral anchor files in this repository (`anchors*.json`,
|
|
4
|
+
`positive_anchors.json`, `negative_anchors.json`, and any file matching
|
|
5
|
+
`anchors_*.json`) are released into the public domain under the Creative
|
|
6
|
+
Commons CC0 1.0 Universal Public Domain Dedication.
|
|
7
|
+
|
|
8
|
+
You are encouraged to replace these defaults with your own behavioral
|
|
9
|
+
anchors tuned to your specific deployment context. No attribution is
|
|
10
|
+
required when reusing or modifying the anchor files; attribution is
|
|
11
|
+
welcome but not necessary.
|
|
12
|
+
|
|
13
|
+
Full text: https://creativecommons.org/publicdomain/zero/1.0/
|
|
14
|
+
|
|
15
|
+
The rest of the codebase (Python source, Node MCP wrapper, documentation,
|
|
16
|
+
LaTeX papers under `paper/`, scripts, tests) remains MIT-licensed; see
|
|
17
|
+
`LICENSE` for those terms.
|
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nautilus-compass
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Cross-agent memory layer for the Nautilus platform · MCP/A2A protocol · drift-aware writer · LongMemEval-S benchmark · E2EE-ready
|
|
5
|
+
Author: chunxiaoxx
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/chunxiaoxx/nautilus-compass
|
|
8
|
+
Project-URL: Issues, https://github.com/chunxiaoxx/nautilus-compass/issues
|
|
9
|
+
Keywords: claude-code,memory,persona-vectors,drift-detection,rag,embedding,bge,agent-memory,mcp,a2a,nautilus-platform,cross-agent,longmemeval
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
License-File: LICENSE-ANCHORS
|
|
24
|
+
Requires-Dist: sentence-transformers>=2.7
|
|
25
|
+
Provides-Extra: modelscope
|
|
26
|
+
Requires-Dist: modelscope>=1.10; extra == "modelscope"
|
|
27
|
+
Provides-Extra: rerank
|
|
28
|
+
Requires-Dist: sentence-transformers>=2.7; extra == "rerank"
|
|
29
|
+
Provides-Extra: fast-download
|
|
30
|
+
Requires-Dist: hf_transfer>=0.1; extra == "fast-download"
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
34
|
+
Requires-Dist: ruff; extra == "dev"
|
|
35
|
+
Provides-Extra: e2ee
|
|
36
|
+
Requires-Dist: pynacl>=1.5; extra == "e2ee"
|
|
37
|
+
Provides-Extra: nautilus
|
|
38
|
+
Requires-Dist: nautilus-agent>=0.5; extra == "nautilus"
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
|
|
41
|
+
# nautilus-compass
|
|
42
|
+
|
|
43
|
+
> **Cross-agent memory layer with drift detection** for LLM agents.
|
|
44
|
+
> Memory plugin for Claude Code/Desktop · Cline · Cursor · Continue.dev · Zed —
|
|
45
|
+
> stops your AI from repeating mistakes you've already flagged.
|
|
46
|
+
|
|
47
|
+
🇬🇧 English (this file) · [🇨🇳 中文](README.zh-CN.md)
|
|
48
|
+
|
|
49
|
+
[CI](https://github.com/chunxiaoxx/nautilus-compass/actions/workflows/ci.yml)
|
|
50
|
+
[Paper build](https://github.com/chunxiaoxx/nautilus-compass/actions/workflows/build-paper.yml)
|
|
51
|
+
[LongMemEval-S results](paper/RESULTS_v0.8.md)
|
|
52
|
+
[EverMemBench results](paper/sections/paper2_06_5_evermembench.tex)
|
|
53
|
+
[How it works](#how-it-works)
|
|
54
|
+
[Changelog](CHANGELOG.md)
|
|
55
|
+
[MCP usage](docs/mcp-usage.md)
|
|
56
|
+
[A2A TLS demo](examples/a2a_tls_demo.py)
|
|
57
|
+
[License](LICENSE)
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## 30-second pitch
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
Traditional memory systems (mem0 / Letta / claude-mem / Zep):
|
|
65
|
+
"I can recall the right past memory more accurately."
|
|
66
|
+
|
|
67
|
+
nautilus-compass adds one more step:
|
|
68
|
+
"Memory recalled + detect if the AI is about to repeat a known mistake
|
|
69
|
+
+ remind it of what worked last time."
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
**In one line**: when the AI is about to forget a rule you set, take a
|
|
73
|
+
shortcut you flagged, or fabricate a prior agreement, it gets stopped
|
|
74
|
+
by its own history of failure patterns.
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## What problem does this solve
|
|
79
|
+
|
|
80
|
+
### A. Long sessions drift
|
|
81
|
+
|
|
82
|
+
You told Claude at session start: *"never claim deployment success
|
|
83
|
+
without verification."* Fifty prompts later Claude says *"deployed
|
|
84
|
+
successfully ✅"* — without verifying. The memory rule was there; the
|
|
85
|
+
AI forgot it under context pressure.
|
|
86
|
+
|
|
87
|
+
### B. White-box drift detection isn't reachable
|
|
88
|
+
|
|
89
|
+
[Persona Vectors (Anthropic, 2025)](https://arxiv.org/abs/2507.21509)
|
|
90
|
+
showed that LLM activations contain directions for sycophancy and
|
|
91
|
+
hallucination. But that requires model weights — closed APIs (Claude,
|
|
92
|
+
GPT-4) don't expose them. There has been no production black-box
|
|
93
|
+
equivalent that runs in a Claude Code hook.
|
|
94
|
+
|
|
95
|
+
### C. Memory plugins solve only half the problem
|
|
96
|
+
|
|
97
|
+
Mem0, Letta, claude-mem, Zep all compete on *"recall the most relevant
|
|
98
|
+
past memory."* But memory recalled doesn't stop the AI from breaking
|
|
99
|
+
the rule **this time** — that other half has been unsolved.
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## How it works
|
|
104
|
+
|
|
105
|
+
```
|
|
106
|
+
User prompt: "Fix bug X for me"
|
|
107
|
+
│
|
|
108
|
+
▼
|
|
109
|
+
┌─────────────────────────────────────┐
|
|
110
|
+
│ UserPromptSubmit Hook (this plugin)│
|
|
111
|
+
└─────────────────────────────────────┘
|
|
112
|
+
│
|
|
113
|
+
┌────────────┼────────────┐
|
|
114
|
+
▼ ▼ ▼
|
|
115
|
+
┌────────┐ ┌─────────┐ ┌──────────┐
|
|
116
|
+
│ recall │ │ drift │ │ profile │
|
|
117
|
+
│ memory │ │ check │ │ aggregate│
|
|
118
|
+
└────────┘ └─────────┘ └──────────┘
|
|
119
|
+
│
|
|
120
|
+
▼
|
|
121
|
+
Hooks inject results into Claude's system prompt:
|
|
122
|
+
- Time-bucketed past memory (BGE-m3 semantic recall)
|
|
123
|
+
- Drift score + nearest negative anchor (if score < threshold)
|
|
124
|
+
- Profile facts ("you have 3 unfinished tasks in this repo")
|
|
125
|
+
│
|
|
126
|
+
▼
|
|
127
|
+
Claude answers — with full context loaded
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
The drift detector compares each prompt against an anchor set
|
|
131
|
+
(25 positive + 35 negative behavioral patterns drawn from real failure
|
|
132
|
+
transcripts) using BGE-m3 cosine similarity. AUC 0.83 on held-out data, <50 ms
|
|
133
|
+
p95 hook latency.
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Headline numbers
|
|
138
|
+
|
|
139
|
+
| Benchmark | Score | Compare against |
|
|
140
|
+
|---|---|---|
|
|
141
|
+
| **LongMemEval-S** (n=500) | **56.6%** (locked at v0.8) | ties Zep SOTA band, +12 pts vs Gemini-2.5-pro baseline |
|
|
142
|
+
| **EverMemBench-Dynamic** (n=500) | **44.4% (Run 1) / 47.3% (Run 2)** | tops every reported Table 4 baseline (Mem0 37.09, Zep 39.97, MemOS 42.55) |
|
|
143
|
+
| **Drift detector AUC** | **0.83 held-out / 0.92 in-set** | first black-box drift score that runs in a Claude Code hook |
|
|
144
|
+
| **Reproduction cost** | **~$3.50** for 500 LongMemEval questions | under 1/15 of GPT-4o-judged stacks |
|
|
145
|
+
| **p95 hook latency** | **<50 ms** | safe for every-prompt invocation |
|
|
146
|
+
|
|
147
|
+
We deliberately report Run 1 (44.4%) as the abstract headline for
|
|
148
|
+
EverMemBench to avoid cherry-picking; the cross-run mean (45.84%) clears
|
|
149
|
+
MemOS by +3.3 pts. See `paper/sections/paper2_06_5_evermembench.tex`
|
|
150
|
+
for the full dual-run results and the Gemini cross-judge sensitivity analysis.
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## Quickstart
|
|
155
|
+
|
|
156
|
+
### Install in Claude Code
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
git clone https://github.com/chunxiaoxx/nautilus-compass ~/.claude/plugins/nautilus-compass
|
|
160
|
+
bash ~/.claude/plugins/nautilus-compass/install.sh
|
|
161
|
+
|
|
162
|
+
# Start the BGE-m3 daemon (once per boot)
|
|
163
|
+
bash ~/.claude/plugins/nautilus-compass/daemon_start.sh
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
The installer wires three hooks into `~/.claude/settings.json`:
|
|
167
|
+
- `UserPromptSubmit` → injects time-bucketed memory recall + drift
|
|
168
|
+
- `PostToolUse` → mid-session writer
|
|
169
|
+
- `Stop` → end-of-session summary writer
|
|
170
|
+
|
|
171
|
+
Five user-facing slash commands appear in Claude Code:
|
|
172
|
+
`/compass-verify` · `/compass-drift` · `/compass-recall` ·
|
|
173
|
+
`/compass-search` · `/compass-status`.
|
|
174
|
+
|
|
175
|
+
### Install in any other MCP client
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
python ~/.claude/plugins/nautilus-compass/scripts/install_to_agent.py
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
Auto-detects Claude Desktop, Cursor, Cline, Continue.dev, Zed Editor and
|
|
182
|
+
patches their MCP config. See [`docs/AGENT_ONBOARDING.md`](docs/AGENT_ONBOARDING.md)
|
|
183
|
+
for per-agent copy-paste configs and [`docs/mcp-usage.md`](docs/mcp-usage.md)
|
|
184
|
+
for the raw protocol specification.
|
|
185
|
+
|
|
186
|
+
### Cloud-hosted alternative (no local install)
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
curl https://compass.nautilus.social/.well-known/agent.json
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
Returns the standard A2A discovery descriptor. Sign up at
|
|
193
|
+
`compass.nautilus.social/signup` for a hosted gateway with multi-user
|
|
194
|
+
sync, audit log, and managed BGE-m3 deployment.
|
|
195
|
+
|
|
196
|
+
---
|
|
197
|
+
|
|
198
|
+
## What's exposed (7 MCP tools)
|
|
199
|
+
|
|
200
|
+
| Tool | Purpose | Latency |
|
|
201
|
+
|---|---|---|
|
|
202
|
+
| `ingest_obs(name, body, agent_id?)` | Write observation with auto-anchor + drift signal | ~150 ms |
|
|
203
|
+
| `recall(query, project?, top_k?)` | BGE-m3 semantic + keyword search | ~200 ms |
|
|
204
|
+
| `session_search(query, since?)` | Time-bucketed session-log search | ~80 ms |
|
|
205
|
+
| `profile(user_id?)` | Work-profile aggregate (topics, agents, drift trend) | ~100 ms |
|
|
206
|
+
| `drift_check(prompt, project?)` | Black-box drift score against anchors | <50 ms |
|
|
207
|
+
| `drift_history(since?, agent_id?)` | Drift score timeline for trend audit | ~30 ms |
|
|
208
|
+
| `feedback_log(direction, reason)` | Log positive/negative anchor signal | <20 ms |
|
|
209
|
+
|
|
210
|
+
The MCP server speaks JSON-RPC 2.0 over stdio / TCP / TLS / mTLS.
|
|
211
|
+
Per-token RBAC, per-token rate limiting, `notifications/{progress,
|
|
212
|
+
cancelled, message}`, `logging/setLevel`, and `resources/*` for session-log
|
|
213
|
+
streaming are all spec-complete.
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
## Comparison
|
|
218
|
+
|
|
219
|
+
| Capability | this | mem0 | Letta | Zep | claude-mem | MemOS | Smriti |
|
|
220
|
+
|---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
221
|
+
| Cross-agent memory | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | archive-only |
|
|
222
|
+
| Native MCP/A2A protocol support | ✅ TLS+mTLS+RBAC | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
|
223
|
+
| Drift detection | ✅ AUC 0.83 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
|
224
|
+
| Merkle integrity audit log | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
|
225
|
+
| LongMemEval-S verified | ✅ 56.6% (locked) | n/r | n/r | n/r | ❌ | n/r | ❌ |
|
|
226
|
+
| EverMemBench verified | ✅ 44.4-47.3% | 37.09 | n/r | 39.97 | n/r | 42.55 | ❌ |
|
|
227
|
+
| Self-hosted and cloud-hosted options | ✅ | ☁ only | ✅ | ☁ only | ✅ | OSS only | OSS only |
|
|
228
|
+
| License | MIT | Apache | Apache | proprietary | MIT | Apache | MIT |
|
|
229
|
+
|
|
230
|
+
`n/r` = not reported in their published evaluations. Smriti is a team
|
|
231
|
+
conversation archive with git-based sharing — different scope from a
|
|
232
|
+
runtime memory layer, so most rows are intentionally out-of-scope rather
|
|
233
|
+
than missing features.
|
|
234
|
+
|
|
235
|
+
---
|
|
236
|
+
|
|
237
|
+
## Documentation
|
|
238
|
+
|
|
239
|
+
- [`docs/AGENT_ONBOARDING.md`](docs/AGENT_ONBOARDING.md) — per-agent install configs (6 platforms + 3 frameworks)
|
|
240
|
+
- [`docs/mcp-usage.md`](docs/mcp-usage.md) — raw MCP protocol guide, TLS setup, RBAC
|
|
241
|
+
- [`docs/PLATFORM_HANDSHAKE.md`](docs/PLATFORM_HANDSHAKE.md) — OSS↔SaaS coordination contract
|
|
242
|
+
- [`paper/`](paper/) — two papers (drift detection + memory pipeline) and supporting eval scripts
|
|
243
|
+
- [`CHANGELOG.md`](CHANGELOG.md) — versioned release notes
|
|
244
|
+
- [`CONTRIBUTING.md`](CONTRIBUTING.md) — adding new domain anchors / running benchmarks
|
|
245
|
+
|
|
246
|
+
---
|
|
247
|
+
|
|
248
|
+
## Citation
|
|
249
|
+
|
|
250
|
+
If you use this work, please cite:
|
|
251
|
+
|
|
252
|
+
**Paper 1 · drift detection**:
|
|
253
|
+
|
|
254
|
+
```bibtex
|
|
255
|
+
@misc{nautiluscompass-drift-2026,
|
|
256
|
+
title = {Nautilus Compass: Black-box Persona Drift Detection
|
|
257
|
+
for Production LLM Agents},
|
|
258
|
+
author = {Chunxiao Wang},
|
|
259
|
+
year = {2026},
|
|
260
|
+
note = {Yiluo Technology Co., Ltd.},
|
|
261
|
+
howpublished = {\url{https://github.com/chunxiaoxx/nautilus-compass}}
|
|
262
|
+
}
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
**Paper 2 · memory pipeline + EverMemBench cross-bench**:
|
|
266
|
+
|
|
267
|
+
```bibtex
|
|
268
|
+
@misc{nautiluscompass-memrecall-2026,
|
|
269
|
+
title = {Closing the Memory Recall Gap with Chinese LLMs:
|
|
270
|
+
A Multi-Stage Retrieval Pipeline Achieving Zep-SOTA Performance
|
|
271
|
+
on LongMemEval-S at 1/15 Cost},
|
|
272
|
+
author = {Chunxiao Wang},
|
|
273
|
+
year = {2026},
|
|
274
|
+
note = {Yiluo Technology Co., Ltd.},
|
|
275
|
+
howpublished = {\url{https://github.com/chunxiaoxx/nautilus-compass}}
|
|
276
|
+
}
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
The `howpublished` field will be updated to the arXiv identifier once
|
|
280
|
+
the preprints are live.
|
|
281
|
+
|
|
282
|
+
We also build on prior work — please cite as appropriate:
|
|
283
|
+
|
|
284
|
+
- BGE-m3 / BGE-Reranker (Chen et al., BAAI 2024)
|
|
285
|
+
- **Persona Vectors** (Chen et al., Anthropic, [arXiv:2507.21509](https://arxiv.org/abs/2507.21509)) — *complementary white-box approach, not the same as ours*
|
|
286
|
+
- DPT-Agent strategy distillation ([arXiv:2502.11882](https://arxiv.org/abs/2502.11882))
|
|
287
|
+
- A-MEM dynamic links ([arXiv:2502.12110](https://arxiv.org/abs/2502.12110))
|
|
288
|
+
- LongMemEval (Wu et al., NeurIPS 2024)
|
|
289
|
+
- EverMemBench (Hu et al., 2026)
|
|
290
|
+
|
|
291
|
+
---
|
|
292
|
+
|
|
293
|
+
## License
|
|
294
|
+
|
|
295
|
+
- **Code, plugin, MCP wrapper, papers, scripts** — MIT (see [`LICENSE`](LICENSE))
|
|
296
|
+
- **Behavioral anchor files** (`anchors*.json`) — CC0 1.0 Universal (see [`LICENSE-ANCHORS`](LICENSE-ANCHORS))
|
|
297
|
+
|
|
298
|
+
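You may use this in any project, commercial or otherwise. The MIT-licensed code requires keeping the copyright and permission notice; the CC0 anchor files require no attribution.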
|
|
299
|
+
|
|
300
|
+
---
|
|
301
|
+
|
|
302
|
+
## Star history
|
|
303
|
+
|
|
304
|
+
[Star history chart](https://star-history.com/#chunxiaoxx/nautilus-compass&Date)
|
|
305
|
+
|
|
306
|
+
## Contributors
|
|
307
|
+
|
|
308
|
+
<a href="https://github.com/chunxiaoxx/nautilus-compass/graphs/contributors">
|
|
309
|
+
<img src="https://contrib.rocks/image?repo=chunxiaoxx/nautilus-compass" alt="Contributors" />
|
|
310
|
+
</a>
|
|
311
|
+
|
|
312
|
+
PRs welcome — see [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
313
|
+
|
|
314
|
+
## Contact
|
|
315
|
+
|
|
316
|
+
- **Author**: Chunxiao Wang · Yiluo Technology Co., Ltd. · `chunxiaoxx@gmail.com`
|
|
317
|
+
- **Issues**: [github.com/chunxiaoxx/nautilus-compass/issues](https://github.com/chunxiaoxx/nautilus-compass/issues)
|
|
318
|
+
- **Hosted gateway**: [compass.nautilus.social](https://compass.nautilus.social)
|
|
319
|
+
- **中文文档**: [README.zh-CN.md](README.zh-CN.md)
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
# nautilus-compass
|
|
2
|
+
|
|
3
|
+
> **Cross-agent memory layer with drift detection** for LLM agents.
|
|
4
|
+
> Memory plugin for Claude Code/Desktop · Cline · Cursor · Continue.dev · Zed —
|
|
5
|
+
> stops your AI from repeating mistakes you've already flagged.
|
|
6
|
+
|
|
7
|
+
🇬🇧 English (this file) · [🇨🇳 中文](README.zh-CN.md)
|
|
8
|
+
|
|
9
|
+
[CI](https://github.com/chunxiaoxx/nautilus-compass/actions/workflows/ci.yml)
|
|
10
|
+
[Paper build](https://github.com/chunxiaoxx/nautilus-compass/actions/workflows/build-paper.yml)
|
|
11
|
+
[LongMemEval-S results](paper/RESULTS_v0.8.md)
|
|
12
|
+
[EverMemBench results](paper/sections/paper2_06_5_evermembench.tex)
|
|
13
|
+
[How it works](#how-it-works)
|
|
14
|
+
[Changelog](CHANGELOG.md)
|
|
15
|
+
[MCP usage](docs/mcp-usage.md)
|
|
16
|
+
[A2A TLS demo](examples/a2a_tls_demo.py)
|
|
17
|
+
[License](LICENSE)
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## 30-second pitch
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
Traditional memory systems (mem0 / Letta / claude-mem / Zep):
|
|
25
|
+
"I can recall the right past memory more accurately."
|
|
26
|
+
|
|
27
|
+
nautilus-compass adds one more step:
|
|
28
|
+
"Memory recalled + detect if the AI is about to repeat a known mistake
|
|
29
|
+
+ remind it of what worked last time."
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
**In one line**: when the AI is about to forget a rule you set, take a
|
|
33
|
+
shortcut you flagged, or fabricate a prior agreement, it gets stopped
|
|
34
|
+
by its own history of failure patterns.
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## What problem does this solve
|
|
39
|
+
|
|
40
|
+
### A. Long sessions drift
|
|
41
|
+
|
|
42
|
+
You told Claude at session start: *"never claim deployment success
|
|
43
|
+
without verification."* Fifty prompts later Claude says *"deployed
|
|
44
|
+
successfully ✅"* — without verifying. The memory rule was there; the
|
|
45
|
+
AI forgot it under context pressure.
|
|
46
|
+
|
|
47
|
+
### B. White-box drift detection isn't reachable
|
|
48
|
+
|
|
49
|
+
[Persona Vectors (Anthropic, 2025)](https://arxiv.org/abs/2507.21509)
|
|
50
|
+
showed that LLM activations contain directions for sycophancy and
|
|
51
|
+
hallucination. But that requires model weights — closed APIs (Claude,
|
|
52
|
+
GPT-4) don't expose them. There has been no production black-box
|
|
53
|
+
equivalent that runs in a Claude Code hook.
|
|
54
|
+
|
|
55
|
+
### C. Memory plugins solve only half the problem
|
|
56
|
+
|
|
57
|
+
Mem0, Letta, claude-mem, Zep all compete on *"recall the most relevant
|
|
58
|
+
past memory."* But memory recalled doesn't stop the AI from breaking
|
|
59
|
+
the rule **this time** — that other half has been unsolved.
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## How it works
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
User prompt: "Fix bug X for me"
|
|
67
|
+
│
|
|
68
|
+
▼
|
|
69
|
+
┌─────────────────────────────────────┐
|
|
70
|
+
│ UserPromptSubmit Hook (this plugin)│
|
|
71
|
+
└─────────────────────────────────────┘
|
|
72
|
+
│
|
|
73
|
+
┌────────────┼────────────┐
|
|
74
|
+
▼ ▼ ▼
|
|
75
|
+
┌────────┐ ┌─────────┐ ┌──────────┐
|
|
76
|
+
│ recall │ │ drift │ │ profile │
|
|
77
|
+
│ memory │ │ check │ │ aggregate│
|
|
78
|
+
└────────┘ └─────────┘ └──────────┘
|
|
79
|
+
│
|
|
80
|
+
▼
|
|
81
|
+
Hooks inject results into Claude's system prompt:
|
|
82
|
+
- Time-bucketed past memory (BGE-m3 semantic recall)
|
|
83
|
+
- Drift score + nearest negative anchor (if score < threshold)
|
|
84
|
+
- Profile facts ("you have 3 unfinished tasks in this repo")
|
|
85
|
+
│
|
|
86
|
+
▼
|
|
87
|
+
Claude answers — with full context loaded
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
The drift detector compares each prompt against an anchor set
|
|
91
|
+
(25 positive + 35 negative behavioral patterns drawn from real failure
|
|
92
|
+
transcripts) using BGE-m3 cosine similarity. AUC 0.83 on held-out data, <50 ms
|
|
93
|
+
p95 hook latency.
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Headline numbers
|
|
98
|
+
|
|
99
|
+
| Benchmark | Score | Compare against |
|
|
100
|
+
|---|---|---|
|
|
101
|
+
| **LongMemEval-S** (n=500) | **56.6%** (locked at v0.8) | ties Zep SOTA band, +12 pts vs Gemini-2.5-pro baseline |
|
|
102
|
+
| **EverMemBench-Dynamic** (n=500) | **44.4% (Run 1) / 47.3% (Run 2)** | tops every reported Table 4 baseline (Mem0 37.09, Zep 39.97, MemOS 42.55) |
|
|
103
|
+
| **Drift detector AUC** | **0.83 held-out / 0.92 in-set** | first black-box drift score that runs in a Claude Code hook |
|
|
104
|
+
| **Reproduction cost** | **~$3.50** for 500 LongMemEval questions | under 1/15 of GPT-4o-judged stacks |
|
|
105
|
+
| **p95 hook latency** | **<50 ms** | safe for every-prompt invocation |
|
|
106
|
+
|
|
107
|
+
We deliberately report Run 1 (44.4%) as the abstract headline for
|
|
108
|
+
EverMemBench to avoid cherry-picking; the cross-run mean (45.84%) clears
|
|
109
|
+
MemOS by +3.3 pts. See `paper/sections/paper2_06_5_evermembench.tex`
|
|
110
|
+
for the full dual-run results and the Gemini cross-judge sensitivity analysis.
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## Quickstart
|
|
115
|
+
|
|
116
|
+
### Install in Claude Code
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
git clone https://github.com/chunxiaoxx/nautilus-compass ~/.claude/plugins/nautilus-compass
|
|
120
|
+
bash ~/.claude/plugins/nautilus-compass/install.sh
|
|
121
|
+
|
|
122
|
+
# Start the BGE-m3 daemon (once per boot)
|
|
123
|
+
bash ~/.claude/plugins/nautilus-compass/daemon_start.sh
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
The installer wires three hooks into `~/.claude/settings.json`:
|
|
127
|
+
- `UserPromptSubmit` → injects time-bucketed memory recall + drift
|
|
128
|
+
- `PostToolUse` → mid-session writer
|
|
129
|
+
- `Stop` → end-of-session summary writer
|
|
130
|
+
|
|
131
|
+
Five user-facing slash commands appear in Claude Code:
|
|
132
|
+
`/compass-verify` · `/compass-drift` · `/compass-recall` ·
|
|
133
|
+
`/compass-search` · `/compass-status`.
|
|
134
|
+
|
|
135
|
+
### Install in any other MCP client
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
python ~/.claude/plugins/nautilus-compass/scripts/install_to_agent.py
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
Auto-detects Claude Desktop, Cursor, Cline, Continue.dev, Zed Editor and
|
|
142
|
+
patches their MCP config. See [`docs/AGENT_ONBOARDING.md`](docs/AGENT_ONBOARDING.md)
|
|
143
|
+
for per-agent copy-paste configs and [`docs/mcp-usage.md`](docs/mcp-usage.md)
|
|
144
|
+
for the raw protocol specification.
|
|
145
|
+
|
|
146
|
+
### Cloud-hosted alternative (no local install)
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
curl https://compass.nautilus.social/.well-known/agent.json
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Returns the standard A2A discovery descriptor. Sign up at
|
|
153
|
+
`compass.nautilus.social/signup` for a hosted gateway with multi-user
|
|
154
|
+
sync, audit log, and managed BGE-m3 deployment.
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## What's exposed (7 MCP tools)
|
|
159
|
+
|
|
160
|
+
| Tool | Purpose | Latency |
|
|
161
|
+
|---|---|---|
|
|
162
|
+
| `ingest_obs(name, body, agent_id?)` | Write observation with auto-anchor + drift signal | ~150 ms |
|
|
163
|
+
| `recall(query, project?, top_k?)` | BGE-m3 semantic + keyword search | ~200 ms |
|
|
164
|
+
| `session_search(query, since?)` | Time-bucketed session-log search | ~80 ms |
|
|
165
|
+
| `profile(user_id?)` | Work-profile aggregate (topics, agents, drift trend) | ~100 ms |
|
|
166
|
+
| `drift_check(prompt, project?)` | Black-box drift score against anchors | <50 ms |
|
|
167
|
+
| `drift_history(since?, agent_id?)` | Drift score timeline for trend audit | ~30 ms |
|
|
168
|
+
| `feedback_log(direction, reason)` | Log positive/negative anchor signal | <20 ms |
|
|
169
|
+
|
|
170
|
+
The MCP server speaks JSON-RPC 2.0 over stdio / TCP / TLS / mTLS.
|
|
171
|
+
Per-token RBAC, per-token rate limiting, `notifications/{progress,
|
|
172
|
+
cancelled, message}`, `logging/setLevel`, and `resources/*` for session-log
|
|
173
|
+
streaming are all spec-complete.
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Comparison
|
|
178
|
+
|
|
179
|
+
| Capability | this | mem0 | Letta | Zep | claude-mem | MemOS | Smriti |
|
|
180
|
+
|---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
181
|
+
| Cross-agent memory | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | archive-only |
|
|
182
|
+
| Native MCP/A2A protocol support | ✅ TLS+mTLS+RBAC | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
|
183
|
+
| Drift detection | ✅ AUC 0.83 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
|
184
|
+
| Merkle integrity audit log | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
|
185
|
+
| LongMemEval-S verified | ✅ 56.6% (locked) | n/r | n/r | n/r | ❌ | n/r | ❌ |
|
|
186
|
+
| EverMemBench verified | ✅ 44.4-47.3% | 37.09 | n/r | 39.97 | n/r | 42.55 | ❌ |
|
|
187
|
+
| Self-hosted and cloud-hosted options | ✅ | ☁ only | ✅ | ☁ only | ✅ | OSS only | OSS only |
|
|
188
|
+
| License | MIT | Apache | Apache | proprietary | MIT | Apache | MIT |
|
|
189
|
+
|
|
190
|
+
`n/r` = not reported in their published evaluations. Smriti is a team
|
|
191
|
+
conversation archive with git-based sharing — different scope from a
|
|
192
|
+
runtime memory layer, so most rows are intentionally out-of-scope rather
|
|
193
|
+
than missing features.
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## Documentation
|
|
198
|
+
|
|
199
|
+
- [`docs/AGENT_ONBOARDING.md`](docs/AGENT_ONBOARDING.md) — per-agent install configs (6 platforms + 3 frameworks)
|
|
200
|
+
- [`docs/mcp-usage.md`](docs/mcp-usage.md) — raw MCP protocol guide, TLS setup, RBAC
|
|
201
|
+
- [`docs/PLATFORM_HANDSHAKE.md`](docs/PLATFORM_HANDSHAKE.md) — OSS↔SaaS coordination contract
|
|
202
|
+
- [`paper/`](paper/) — two papers (drift detection + memory pipeline) and supporting eval scripts
|
|
203
|
+
- [`CHANGELOG.md`](CHANGELOG.md) — versioned release notes
|
|
204
|
+
- [`CONTRIBUTING.md`](CONTRIBUTING.md) — adding new domain anchors / running benchmarks
|
|
205
|
+
|
|
206
|
+
---
|
|
207
|
+
|
|
208
|
+
## Citation
|
|
209
|
+
|
|
210
|
+
If you use this work, please cite:
|
|
211
|
+
|
|
212
|
+
**Paper 1 · drift detection**:
|
|
213
|
+
|
|
214
|
+
```bibtex
|
|
215
|
+
@misc{nautiluscompass-drift-2026,
|
|
216
|
+
title = {Nautilus Compass: Black-box Persona Drift Detection
|
|
217
|
+
for Production LLM Agents},
|
|
218
|
+
author = {Chunxiao Wang},
|
|
219
|
+
year = {2026},
|
|
220
|
+
note = {Yiluo Technology Co., Ltd.},
|
|
221
|
+
howpublished = {\url{https://github.com/chunxiaoxx/nautilus-compass}}
|
|
222
|
+
}
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
**Paper 2 · memory pipeline + EverMemBench cross-bench**:
|
|
226
|
+
|
|
227
|
+
```bibtex
|
|
228
|
+
@misc{nautiluscompass-memrecall-2026,
|
|
229
|
+
title = {Closing the Memory Recall Gap with Chinese LLMs:
|
|
230
|
+
A Multi-Stage Retrieval Pipeline Achieving Zep-SOTA Performance
|
|
231
|
+
on LongMemEval-S at 1/15 Cost},
|
|
232
|
+
author = {Chunxiao Wang},
|
|
233
|
+
year = {2026},
|
|
234
|
+
note = {Yiluo Technology Co., Ltd.},
|
|
235
|
+
howpublished = {\url{https://github.com/chunxiaoxx/nautilus-compass}}
|
|
236
|
+
}
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
The `howpublished` field will be updated to the arXiv identifier once
|
|
240
|
+
the preprints are live.
|
|
241
|
+
|
|
242
|
+
We also build on prior work — please cite as appropriate:
|
|
243
|
+
|
|
244
|
+
- BGE-m3 / BGE-Reranker (Chen et al., BAAI 2024)
|
|
245
|
+
- **Persona Vectors** (Chen et al., Anthropic, [arXiv:2507.21509](https://arxiv.org/abs/2507.21509)) — *complementary white-box approach, not the same as ours*
|
|
246
|
+
- DPT-Agent strategy distillation ([arXiv:2502.11882](https://arxiv.org/abs/2502.11882))
|
|
247
|
+
- A-MEM dynamic links ([arXiv:2502.12110](https://arxiv.org/abs/2502.12110))
|
|
248
|
+
- LongMemEval (Wu et al., NeurIPS 2024)
|
|
249
|
+
- EverMemBench (Hu et al., 2026)
|
|
250
|
+
|
|
251
|
+
---
|
|
252
|
+
|
|
253
|
+
## License
|
|
254
|
+
|
|
255
|
+
- **Code, plugin, MCP wrapper, papers, scripts** — MIT (see [`LICENSE`](LICENSE))
|
|
256
|
+
- **Behavioral anchor files** (`anchors*.json`) — CC0 1.0 Universal (see [`LICENSE-ANCHORS`](LICENSE-ANCHORS))
|
|
257
|
+
|
|
258
|
+
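You may use this in any project, commercial or otherwise; the MIT-licensed parts require keeping the copyright and license notice, while the CC0 anchor files carry no attribution requirement.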
|
|
259
|
+
|
|
260
|
+
---
|
|
261
|
+
|
|
262
|
+
## Star history
|
|
263
|
+
|
|
264
|
+
[Star history chart](https://star-history.com/#chunxiaoxx/nautilus-compass&Date)
|
|
265
|
+
|
|
266
|
+
## Contributors
|
|
267
|
+
|
|
268
|
+
<a href="https://github.com/chunxiaoxx/nautilus-compass/graphs/contributors">
|
|
269
|
+
<img src="https://contrib.rocks/image?repo=chunxiaoxx/nautilus-compass" alt="Contributors" />
|
|
270
|
+
</a>
|
|
271
|
+
|
|
272
|
+
PRs welcome — see [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
273
|
+
|
|
274
|
+
## Contact
|
|
275
|
+
|
|
276
|
+
- **Author**: Chunxiao Wang · Yiluo Technology Co., Ltd. · `chunxiaoxx@gmail.com`
|
|
277
|
+
- **Issues**: [github.com/chunxiaoxx/nautilus-compass/issues](https://github.com/chunxiaoxx/nautilus-compass/issues)
|
|
278
|
+
- **Hosted gateway**: [compass.nautilus.social](https://compass.nautilus.social)
|
|
279
|
+
- **中文文档**: [README.zh-CN.md](README.zh-CN.md)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""nautilus_compass · cross-agent memory layer for the Nautilus platform.
|
|
2
|
+
|
|
3
|
+
Provides:
|
|
4
|
+
· BGE-m3 dense recall + bge-reranker-v2-m3 cross-encoder reranking
|
|
5
|
+
· MCP server (Claude Desktop / Cline / Cursor compatible)
|
|
6
|
+
· A2A protocol adapter for cross-agent message routing
|
|
7
|
+
· Drift detector (anchor-based · AUC=0.92)
|
|
8
|
+
· Session writer / search / drift-history utilities
|
|
9
|
+
|
|
10
|
+
Submodules are not imported by this package initializer; import only what you need to keep startup fast.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
__version__ = "0.9.5"
|