nautilus-compass 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. nautilus_compass-1.0.0/LICENSE +21 -0
  2. nautilus_compass-1.0.0/LICENSE-ANCHORS +17 -0
  3. nautilus_compass-1.0.0/PKG-INFO +319 -0
  4. nautilus_compass-1.0.0/README.md +279 -0
  5. nautilus_compass-1.0.0/__init__.py +13 -0
  6. nautilus_compass-1.0.0/anchor_generator.py +186 -0
  7. nautilus_compass-1.0.0/anchors.json +67 -0
  8. nautilus_compass-1.0.0/anchors_adapted.json +403 -0
  9. nautilus_compass-1.0.0/anchors_finance.json +57 -0
  10. nautilus_compass-1.0.0/anchors_legal.json +57 -0
  11. nautilus_compass-1.0.0/anchors_medical.json +57 -0
  12. nautilus_compass-1.0.0/anchors_platform_base.json +57 -0
  13. nautilus_compass-1.0.0/anchors_vc.json +57 -0
  14. nautilus_compass-1.0.0/anchors_zenmind.json +57 -0
  15. nautilus_compass-1.0.0/audit_kpi.py +79 -0
  16. nautilus_compass-1.0.0/compass_http.py +578 -0
  17. nautilus_compass-1.0.0/compass_http_v09.py +1018 -0
  18. nautilus_compass-1.0.0/compass_raid.py +249 -0
  19. nautilus_compass-1.0.0/compass_verify.py +115 -0
  20. nautilus_compass-1.0.0/daemon.py +533 -0
  21. nautilus_compass-1.0.0/daemon_anchor_apply.py +100 -0
  22. nautilus_compass-1.0.0/daemon_anchor_loader.py +112 -0
  23. nautilus_compass-1.0.0/daemon_start.sh +38 -0
  24. nautilus_compass-1.0.0/daemon_stop.sh +12 -0
  25. nautilus_compass-1.0.0/deeptest.py +188 -0
  26. nautilus_compass-1.0.0/drift_history.py +191 -0
  27. nautilus_compass-1.0.0/feedback.py +300 -0
  28. nautilus_compass-1.0.0/hook.sh +25 -0
  29. nautilus_compass-1.0.0/install.sh +46 -0
  30. nautilus_compass-1.0.0/install_bge.sh +54 -0
  31. nautilus_compass-1.0.0/links_finder.py +154 -0
  32. nautilus_compass-1.0.0/llm_distill.py +133 -0
  33. nautilus_compass-1.0.0/mcp_client.py +343 -0
  34. nautilus_compass-1.0.0/mcp_server.py +1242 -0
  35. nautilus_compass-1.0.0/merkle_chain.py +362 -0
  36. nautilus_compass-1.0.0/mid_session_hook.py +196 -0
  37. nautilus_compass-1.0.0/nautilus_compass.egg-info/PKG-INFO +319 -0
  38. nautilus_compass-1.0.0/nautilus_compass.egg-info/SOURCES.txt +117 -0
  39. nautilus_compass-1.0.0/nautilus_compass.egg-info/dependency_links.txt +1 -0
  40. nautilus_compass-1.0.0/nautilus_compass.egg-info/entry_points.txt +7 -0
  41. nautilus_compass-1.0.0/nautilus_compass.egg-info/requires.txt +21 -0
  42. nautilus_compass-1.0.0/nautilus_compass.egg-info/top_level.txt +1 -0
  43. nautilus_compass-1.0.0/output_drift.py +144 -0
  44. nautilus_compass-1.0.0/pyproject.toml +90 -0
  45. nautilus_compass-1.0.0/recall.py +745 -0
  46. nautilus_compass-1.0.0/selftest.py +103 -0
  47. nautilus_compass-1.0.0/session_search.py +142 -0
  48. nautilus_compass-1.0.0/session_writer.py +380 -0
  49. nautilus_compass-1.0.0/setup.cfg +4 -0
  50. nautilus_compass-1.0.0/stake_publisher.py +190 -0
  51. nautilus_compass-1.0.0/stop_hook.py +152 -0
  52. nautilus_compass-1.0.0/strategy_store.py +274 -0
  53. nautilus_compass-1.0.0/tests/test_a2a_adapter.py +153 -0
  54. nautilus_compass-1.0.0/tests/test_a2a_peer_demo.py +210 -0
  55. nautilus_compass-1.0.0/tests/test_a2a_tls_demo.py +50 -0
  56. nautilus_compass-1.0.0/tests/test_auth_v091.py +167 -0
  57. nautilus_compass-1.0.0/tests/test_compass_v09.py +203 -0
  58. nautilus_compass-1.0.0/tests/test_crypto.py +212 -0
  59. nautilus_compass-1.0.0/tests/test_e2e_encryption.py +230 -0
  60. nautilus_compass-1.0.0/tests/test_full_capability_demo.py +55 -0
  61. nautilus_compass-1.0.0/tests/test_http_server_e2e.py +219 -0
  62. nautilus_compass-1.0.0/tests/test_mcp_client.py +157 -0
  63. nautilus_compass-1.0.0/tests/test_mcp_client_backoff.py +212 -0
  64. nautilus_compass-1.0.0/tests/test_mcp_logging.py +251 -0
  65. nautilus_compass-1.0.0/tests/test_mcp_progress.py +216 -0
  66. nautilus_compass-1.0.0/tests/test_mcp_rate_limit.py +313 -0
  67. nautilus_compass-1.0.0/tests/test_mcp_rbac.py +242 -0
  68. nautilus_compass-1.0.0/tests/test_mcp_resources.py +221 -0
  69. nautilus_compass-1.0.0/tests/test_mcp_server.py +409 -0
  70. nautilus_compass-1.0.0/tests/test_mcp_tls.py +263 -0
  71. nautilus_compass-1.0.0/tests/test_merkle_chain.py +245 -0
  72. nautilus_compass-1.0.0/tests/test_plugin_manifest.py +117 -0
  73. nautilus_compass-1.0.0/tests/test_release_notes_extract.py +79 -0
  74. nautilus_compass-1.0.0/tests/test_sqlite_migration.py +233 -0
  75. nautilus_compass-1.0.0/tests/test_stake_publisher.py +204 -0
  76. nautilus_compass-1.0.0/tests/test_third_party_client.py +73 -0
  77. nautilus_compass-1.0.0/tests/test_version_sync.py +44 -0
  78. nautilus_compass-1.0.0/time_anchor.py +144 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Chunxiao Wang (Yiluo Technology Co., Ltd.)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,17 @@
1
+ CC0 1.0 Universal (Public Domain Dedication)
2
+
3
+ The behavioral anchor files in this repository (`anchors*.json`,
4
+ `positive_anchors.json`, `negative_anchors.json`, and any file matching
5
+ `anchors_*.json`) are released into the public domain under the Creative
6
+ Commons CC0 1.0 Universal Public Domain Dedication.
7
+
8
+ You are encouraged to replace these defaults with your own behavioral
9
+ anchors tuned to your specific deployment context. No attribution is
10
+ required when reusing or modifying the anchor files; attribution is
11
+ welcome but not necessary.
12
+
13
+ Full text: https://creativecommons.org/publicdomain/zero/1.0/
14
+
15
+ The rest of the codebase (Python source, Node MCP wrapper, documentation,
16
+ LaTeX papers under `paper/`, scripts, tests) remains MIT-licensed; see
17
+ `LICENSE` for those terms.
@@ -0,0 +1,319 @@
1
+ Metadata-Version: 2.4
2
+ Name: nautilus-compass
3
+ Version: 1.0.0
4
+ Summary: Cross-agent memory layer for the Nautilus platform · MCP/A2A protocol · drift-aware writer · LongMemEval-S benchmark · E2EE-ready
5
+ Author: chunxiaoxx
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/chunxiaoxx/nautilus-compass
8
+ Project-URL: Issues, https://github.com/chunxiaoxx/nautilus-compass/issues
9
+ Keywords: claude-code,memory,persona-vectors,drift-detection,rag,embedding,bge,agent-memory,mcp,a2a,nautilus-platform,cross-agent,longmemeval
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ License-File: LICENSE-ANCHORS
24
+ Requires-Dist: sentence-transformers>=2.7
25
+ Provides-Extra: modelscope
26
+ Requires-Dist: modelscope>=1.10; extra == "modelscope"
27
+ Provides-Extra: rerank
28
+ Requires-Dist: sentence-transformers>=2.7; extra == "rerank"
29
+ Provides-Extra: fast-download
30
+ Requires-Dist: hf_transfer>=0.1; extra == "fast-download"
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=7; extra == "dev"
33
+ Requires-Dist: pytest-cov; extra == "dev"
34
+ Requires-Dist: ruff; extra == "dev"
35
+ Provides-Extra: e2ee
36
+ Requires-Dist: pynacl>=1.5; extra == "e2ee"
37
+ Provides-Extra: nautilus
38
+ Requires-Dist: nautilus-agent>=0.5; extra == "nautilus"
39
+ Dynamic: license-file
40
+
41
+ # nautilus-compass
42
+
43
+ > **Cross-agent memory layer with drift detection** for LLM agents.
44
+ > Memory plugin for Claude Code/Desktop · Cline · Cursor · Continue.dev · Zed —
45
+ > it stops your AI from repeating mistakes you've already flagged.
46
+
47
+ 🇬🇧 English (this file) · [🇨🇳 中文](README.zh-CN.md)
48
+
49
+ [![CI](https://github.com/chunxiaoxx/nautilus-compass/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/chunxiaoxx/nautilus-compass/actions/workflows/ci.yml)
50
+ [![arXiv build](https://github.com/chunxiaoxx/nautilus-compass/actions/workflows/build-paper.yml/badge.svg?branch=main)](https://github.com/chunxiaoxx/nautilus-compass/actions/workflows/build-paper.yml)
51
+ [![LongMemEval-S](https://img.shields.io/badge/LongMemEval--S-56.6%25-brightgreen)](paper/RESULTS_v0.8.md)
52
+ [![EverMemBench](https://img.shields.io/badge/EverMemBench-44.4%E2%80%9347.3%25-brightgreen)](paper/sections/paper2_06_5_evermembench.tex)
53
+ [![drift-AUC](https://img.shields.io/badge/drift_AUC-0.83_held--out-brightgreen)](#how-it-works)
54
+ [![version](https://img.shields.io/badge/version-1.0.0_stable-blue)](CHANGELOG.md)
55
+ [![MCP](https://img.shields.io/badge/MCP-7%20tools%20%C2%B7%20TLS%20%C2%B7%20RBAC-blue)](docs/mcp-usage.md)
56
+ [![A2A](https://img.shields.io/badge/A2A-mTLS%20%C2%B7%20scoped%20peers-blue)](examples/a2a_tls_demo.py)
57
+ [![license](https://img.shields.io/badge/license-MIT-blue)](LICENSE)
58
+
59
+ ---
60
+
61
+ ## 30-second pitch
62
+
63
+ ```
64
+ Traditional memory systems (mem0 / Letta / claude-mem / Zep):
65
+ "I can recall the right past memory more accurately."
66
+
67
+ nautilus-compass adds one more step:
68
+ "Memory recalled + detect if the AI is about to repeat a known mistake
69
+ + remind it of what worked last time."
70
+ ```
71
+
72
+ **In one line**: when the AI is about to forget a rule you set, take a
73
+ shortcut you flagged, or fabricate a prior agreement, it gets stopped
74
+ by its own history of failure patterns.
75
+
76
+ ---
77
+
78
+ ## What problem does this solve
79
+
80
+ ### A. Long sessions drift
81
+
82
+ You told Claude at session start: *"never claim deployment success
83
+ without verification."* Fifty prompts later Claude says *"deployed
84
+ successfully ✅"* — without verifying. The memory rule was there; the
85
+ AI forgot it under context pressure.
86
+
87
+ ### B. White-box drift detection isn't reachable
88
+
89
+ [Persona Vectors (Anthropic, 2025)](https://arxiv.org/abs/2507.21509)
90
+ proved that LLM activations contain directions for sycophancy and
91
+ hallucination. But that requires model weights — closed APIs (Claude,
92
+ GPT-4) don't expose them. There has been no production black-box
93
+ equivalent that runs in a Claude Code hook.
94
+
95
+ ### C. Memory plugins solve only half the problem
96
+
97
+ Mem0, Letta, claude-mem, Zep all compete on *"recall the most relevant
98
+ past memory."* But memory recalled doesn't stop the AI from breaking
99
+ the rule **this time** — that other half has been unsolved.
100
+
101
+ ---
102
+
103
+ ## How it works
104
+
105
+ ```
106
+ User prompt: "Fix bug X for me"
107
+
108
+
109
+ ┌─────────────────────────────────────┐
110
+ │ UserPromptSubmit Hook (this plugin)│
111
+ └─────────────────────────────────────┘
112
+
113
+ ┌────────────┼────────────┐
114
+ ▼ ▼ ▼
115
+ ┌────────┐ ┌─────────┐ ┌──────────┐
116
+ │ recall │ │ drift │ │ profile │
117
+ │ memory │ │ check │ │ aggregate│
118
+ └────────┘ └─────────┘ └──────────┘
119
+
120
+
121
+ Hooks inject results into Claude's system prompt:
122
+ - Time-bucketed past memory (BGE-m3 semantic recall)
123
+ - Drift score + nearest negative anchor (if score < threshold)
124
+ - Profile facts ("you have 3 unfinished tasks in this repo")
125
+
126
+
127
+ Claude answers — with full context loaded
128
+ ```
129
+
130
+ The drift detector compares each prompt against an anchor set
131
+ (25 positive + 35 negative behavioral patterns drawn from real failure
132
+ transcripts) using BGE-m3 cosine similarity. AUC is 0.83 on the held-out set,
133
+ with p95 hook latency under 50 ms.
134
+
135
+ ---
136
+
137
+ ## Headline numbers
138
+
139
+ | Benchmark | Score | Compare against |
140
+ |---|---|---|
141
+ | **LongMemEval-S** (n=500) | **56.6%** (locked at v0.8) | ties Zep SOTA band, +12 pts vs Gemini-2.5-pro baseline |
142
+ | **EverMemBench-Dynamic** (n=500) | **44.4% (Run 1) / 47.3% (Run 2)** | tops every reported Table 4 baseline (Mem0 37.09, Zep 39.97, MemOS 42.55) |
143
+ | **Drift detector AUC** | **0.83 held-out / 0.92 in-set** | first black-box drift score that runs in a Claude Code hook |
144
+ | **Reproduction cost** | **~$3.50** for 500 LongMemEval questions | under 1/15 of GPT-4o-judged stacks |
145
+ | **p95 hook latency** | **<50 ms** | safe for every-prompt invocation |
146
+
147
+ We deliberately report Run 1 (44.4%) as the abstract headline for
148
+ EverMemBench to avoid cherry-picking; the cross-run mean (45.84%) clears
149
+ MemOS by +3.3 pts. See `paper/sections/paper2_06_5_evermembench.tex`
150
+ for honest dual-run + Gemini cross-judge sensitivity analysis.
151
+
152
+ ---
153
+
154
+ ## Quickstart
155
+
156
+ ### Install in Claude Code
157
+
158
+ ```bash
159
+ git clone https://github.com/chunxiaoxx/nautilus-compass ~/.claude/plugins/nautilus-compass
160
+ bash ~/.claude/plugins/nautilus-compass/install.sh
161
+
162
+ # Start the BGE-m3 daemon (one-time per boot)
163
+ bash ~/.claude/plugins/nautilus-compass/daemon_start.sh
164
+ ```
165
+
166
+ The installer wires three hooks into `~/.claude/settings.json`:
167
+ - `UserPromptSubmit` → injects time-bucketed memory recall + drift
168
+ - `PostToolUse` → mid-session writer
169
+ - `Stop` → end-of-session summary writer
170
+
171
+ Five user-facing slash commands appear in Claude Code:
172
+ `/compass-verify` · `/compass-drift` · `/compass-recall` ·
173
+ `/compass-search` · `/compass-status`.
174
+
175
+ ### Install in any other MCP client
176
+
177
+ ```bash
178
+ python ~/.claude/plugins/nautilus-compass/scripts/install_to_agent.py
179
+ ```
180
+
181
+ Auto-detects Claude Desktop, Cursor, Cline, Continue.dev, Zed Editor and
182
+ patches their MCP config. See [`docs/AGENT_ONBOARDING.md`](docs/AGENT_ONBOARDING.md)
183
+ for per-agent copy-paste configs and [`docs/mcp-usage.md`](docs/mcp-usage.md)
184
+ for the raw protocol specification.
185
+
186
+ ### Cloud-hosted alternative (no local install)
187
+
188
+ ```bash
189
+ curl https://compass.nautilus.social/.well-known/agent.json
190
+ ```
191
+
192
+ Returns the standard A2A discovery descriptor. Sign up at
193
+ `compass.nautilus.social/signup` for a hosted gateway with multi-user
194
+ sync, audit log, and managed BGE-m3 deployment.
195
+
196
+ ---
197
+
198
+ ## What's exposed (7 MCP tools)
199
+
200
+ | Tool | Purpose | Latency |
201
+ |---|---|---|
202
+ | `ingest_obs(name, body, agent_id?)` | Write observation with auto-anchor + drift signal | ~150 ms |
203
+ | `recall(query, project?, top_k?)` | BGE-m3 semantic + keyword search | ~200 ms |
204
+ | `session_search(query, since?)` | Time-bucketed session-log search | ~80 ms |
205
+ | `profile(user_id?)` | Work-profile aggregate (topics, agents, drift trend) | ~100 ms |
206
+ | `drift_check(prompt, project?)` | Black-box drift score against anchors | <50 ms |
207
+ | `drift_history(since?, agent_id?)` | Drift score timeline for trend audit | ~30 ms |
208
+ | `feedback_log(direction, reason)` | Log positive/negative anchor signal | <20 ms |
209
+
210
+ The MCP server speaks JSON-RPC 2.0 over stdio / TCP / TLS / mTLS.
211
+ Per-token RBAC, per-token rate limiting, `notifications/{progress,
212
+ cancelled, message}`, `logging/setLevel`, and `resources/*` for session-log
213
+ streaming are all spec-complete.
214
+
215
+ ---
216
+
217
+ ## Comparison
218
+
219
+ | Capability | this | mem0 | Letta | Zep | claude-mem | MemOS | Smriti |
220
+ |---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
221
+ | Cross-agent memory | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | archive-only |
222
+ | MCP A2A protocol native | ✅ TLS+mTLS+RBAC | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
223
+ | Drift detection | ✅ AUC 0.83 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
224
+ | Merkle integrity audit log | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
225
+ | LongMemEval-S verified | ✅ 56.6% (locked) | n/r | n/r | n/r | ❌ | n/r | ❌ |
226
+ | EverMemBench verified | ✅ 44.4-47.3% | 37.09 | n/r | 39.97 | n/r | 42.55 | ❌ |
227
+ | Self-host + hosted both | ✅ | ☁ only | ✅ | ☁ only | ✅ | OSS only | OSS only |
228
+ | License | MIT | Apache | Apache | proprietary | MIT | Apache | MIT |
229
+
230
+ `n/r` = not reported in their published evaluations. Smriti is a team
231
+ conversation archive with git-based sharing — different scope from a
232
+ runtime memory layer, so most rows are intentionally out-of-scope rather
233
+ than missing features.
234
+
235
+ ---
236
+
237
+ ## Documentation
238
+
239
+ - [`docs/AGENT_ONBOARDING.md`](docs/AGENT_ONBOARDING.md) — per-agent install configs (6 platforms + 3 frameworks)
240
+ - [`docs/mcp-usage.md`](docs/mcp-usage.md) — raw MCP protocol guide, TLS setup, RBAC
241
+ - [`docs/PLATFORM_HANDSHAKE.md`](docs/PLATFORM_HANDSHAKE.md) — OSS↔SaaS coordination contract
242
+ - [`paper/`](paper/) — two papers (drift detection + memory pipeline) and supporting eval scripts
243
+ - [`CHANGELOG.md`](CHANGELOG.md) — versioned release notes
244
+ - [`CONTRIBUTING.md`](CONTRIBUTING.md) — adding new domain anchors / running benchmarks
245
+
246
+ ---
247
+
248
+ ## Citation
249
+
250
+ If you use this work, please cite:
251
+
252
+ **Paper 1 · drift detection**:
253
+
254
+ ```bibtex
255
+ @misc{nautiluscompass-drift-2026,
256
+ title = {Nautilus Compass: Black-box Persona Drift Detection
257
+ for Production LLM Agents},
258
+ author = {Chunxiao Wang},
259
+ year = {2026},
260
+ note = {Yiluo Technology Co., Ltd.},
261
+ howpublished = {\url{https://github.com/chunxiaoxx/nautilus-compass}}
262
+ }
263
+ ```
264
+
265
+ **Paper 2 · memory pipeline + EverMemBench cross-bench**:
266
+
267
+ ```bibtex
268
+ @misc{nautiluscompass-memrecall-2026,
269
+ title = {Closing the Memory Recall Gap with Chinese LLMs:
270
+ A Multi-Stage Retrieval Pipeline Achieving Zep-SOTA Performance
271
+ on LongMemEval-S at 1/15 Cost},
272
+ author = {Chunxiao Wang},
273
+ year = {2026},
274
+ note = {Yiluo Technology Co., Ltd.},
275
+ howpublished = {\url{https://github.com/chunxiaoxx/nautilus-compass}}
276
+ }
277
+ ```
278
+
279
+ The `howpublished` field will be updated to the arXiv identifier once
280
+ the preprints are live.
281
+
282
+ We also build on prior work — please cite as appropriate:
283
+
284
+ - BGE-m3 / BGE-Reranker (Chen et al., BAAI 2024)
285
+ - **Persona Vectors** (Chen et al., Anthropic, [arXiv:2507.21509](https://arxiv.org/abs/2507.21509)) — *complementary white-box approach, not the same as ours*
286
+ - DPT-Agent strategy distillation ([arXiv:2502.11882](https://arxiv.org/abs/2502.11882))
287
+ - A-MEM dynamic links ([arXiv:2502.12110](https://arxiv.org/abs/2502.12110))
288
+ - LongMemEval (Wu et al., NeurIPS 2024)
289
+ - EverMemBench (Hu et al., 2026)
290
+
291
+ ---
292
+
293
+ ## License
294
+
295
+ - **Code, plugin, MCP wrapper, papers, scripts** — MIT (see [`LICENSE`](LICENSE))
296
+ - **Behavioral anchor files** (`anchors*.json`) — CC0 1.0 Universal (see [`LICENSE-ANCHORS`](LICENSE-ANCHORS))
297
+
298
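+ You may use this in any project, commercial or otherwise. The MIT-licensed code requires keeping the copyright and permission notice in copies; the CC0 anchor files require no attribution.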
299
+
300
+ ---
301
+
302
+ ## Star history
303
+
304
+ [![Star History Chart](https://api.star-history.com/svg?repos=chunxiaoxx/nautilus-compass&type=Date)](https://star-history.com/#chunxiaoxx/nautilus-compass&Date)
305
+
306
+ ## Contributors
307
+
308
+ <a href="https://github.com/chunxiaoxx/nautilus-compass/graphs/contributors">
309
+ <img src="https://contrib.rocks/image?repo=chunxiaoxx/nautilus-compass" alt="Contributors" />
310
+ </a>
311
+
312
+ PRs welcome — see [CONTRIBUTING.md](CONTRIBUTING.md).
313
+
314
+ ## Contact
315
+
316
+ - **Author**: Chunxiao Wang · Yiluo Technology Co., Ltd. · `chunxiaoxx@gmail.com`
317
+ - **Issues**: [github.com/chunxiaoxx/nautilus-compass/issues](https://github.com/chunxiaoxx/nautilus-compass/issues)
318
+ - **Hosted gateway**: [compass.nautilus.social](https://compass.nautilus.social)
319
+ - **中文文档**: [README.zh-CN.md](README.zh-CN.md)
@@ -0,0 +1,279 @@
1
+ # nautilus-compass
2
+
3
+ > **Cross-agent memory layer with drift detection** for LLM agents.
4
+ > Memory plugin for Claude Code/Desktop · Cline · Cursor · Continue.dev · Zed —
5
+ > it stops your AI from repeating mistakes you've already flagged.
6
+
7
+ 🇬🇧 English (this file) · [🇨🇳 中文](README.zh-CN.md)
8
+
9
+ [![CI](https://github.com/chunxiaoxx/nautilus-compass/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/chunxiaoxx/nautilus-compass/actions/workflows/ci.yml)
10
+ [![arXiv build](https://github.com/chunxiaoxx/nautilus-compass/actions/workflows/build-paper.yml/badge.svg?branch=main)](https://github.com/chunxiaoxx/nautilus-compass/actions/workflows/build-paper.yml)
11
+ [![LongMemEval-S](https://img.shields.io/badge/LongMemEval--S-56.6%25-brightgreen)](paper/RESULTS_v0.8.md)
12
+ [![EverMemBench](https://img.shields.io/badge/EverMemBench-44.4%E2%80%9347.3%25-brightgreen)](paper/sections/paper2_06_5_evermembench.tex)
13
+ [![drift-AUC](https://img.shields.io/badge/drift_AUC-0.83_held--out-brightgreen)](#how-it-works)
14
+ [![version](https://img.shields.io/badge/version-1.0.0_stable-blue)](CHANGELOG.md)
15
+ [![MCP](https://img.shields.io/badge/MCP-7%20tools%20%C2%B7%20TLS%20%C2%B7%20RBAC-blue)](docs/mcp-usage.md)
16
+ [![A2A](https://img.shields.io/badge/A2A-mTLS%20%C2%B7%20scoped%20peers-blue)](examples/a2a_tls_demo.py)
17
+ [![license](https://img.shields.io/badge/license-MIT-blue)](LICENSE)
18
+
19
+ ---
20
+
21
+ ## 30-second pitch
22
+
23
+ ```
24
+ Traditional memory systems (mem0 / Letta / claude-mem / Zep):
25
+ "I can recall the right past memory more accurately."
26
+
27
+ nautilus-compass adds one more step:
28
+ "Memory recalled + detect if the AI is about to repeat a known mistake
29
+ + remind it of what worked last time."
30
+ ```
31
+
32
+ **In one line**: when the AI is about to forget a rule you set, take a
33
+ shortcut you flagged, or fabricate a prior agreement, it gets stopped
34
+ by its own history of failure patterns.
35
+
36
+ ---
37
+
38
+ ## What problem does this solve
39
+
40
+ ### A. Long sessions drift
41
+
42
+ You told Claude at session start: *"never claim deployment success
43
+ without verification."* Fifty prompts later Claude says *"deployed
44
+ successfully ✅"* — without verifying. The memory rule was there; the
45
+ AI forgot it under context pressure.
46
+
47
+ ### B. White-box drift detection isn't reachable
48
+
49
+ [Persona Vectors (Anthropic, 2025)](https://arxiv.org/abs/2507.21509)
50
+ proved that LLM activations contain directions for sycophancy and
51
+ hallucination. But that requires model weights — closed APIs (Claude,
52
+ GPT-4) don't expose them. There has been no production black-box
53
+ equivalent that runs in a Claude Code hook.
54
+
55
+ ### C. Memory plugins solve only half the problem
56
+
57
+ Mem0, Letta, claude-mem, Zep all compete on *"recall the most relevant
58
+ past memory."* But memory recalled doesn't stop the AI from breaking
59
+ the rule **this time** — that other half has been unsolved.
60
+
61
+ ---
62
+
63
+ ## How it works
64
+
65
+ ```
66
+ User prompt: "Fix bug X for me"
67
+
68
+
69
+ ┌─────────────────────────────────────┐
70
+ │ UserPromptSubmit Hook (this plugin)│
71
+ └─────────────────────────────────────┘
72
+
73
+ ┌────────────┼────────────┐
74
+ ▼ ▼ ▼
75
+ ┌────────┐ ┌─────────┐ ┌──────────┐
76
+ │ recall │ │ drift │ │ profile │
77
+ │ memory │ │ check │ │ aggregate│
78
+ └────────┘ └─────────┘ └──────────┘
79
+
80
+
81
+ Hooks inject results into Claude's system prompt:
82
+ - Time-bucketed past memory (BGE-m3 semantic recall)
83
+ - Drift score + nearest negative anchor (if score < threshold)
84
+ - Profile facts ("you have 3 unfinished tasks in this repo")
85
+
86
+
87
+ Claude answers — with full context loaded
88
+ ```
89
+
90
+ The drift detector compares each prompt against an anchor set
91
+ (25 positive + 35 negative behavioral patterns drawn from real failure
92
+ transcripts) using BGE-m3 cosine similarity. AUC is 0.83 on the held-out set,
93
+ with p95 hook latency under 50 ms.
94
+
95
+ ---
96
+
97
+ ## Headline numbers
98
+
99
+ | Benchmark | Score | Compare against |
100
+ |---|---|---|
101
+ | **LongMemEval-S** (n=500) | **56.6%** (locked at v0.8) | ties Zep SOTA band, +12 pts vs Gemini-2.5-pro baseline |
102
+ | **EverMemBench-Dynamic** (n=500) | **44.4% (Run 1) / 47.3% (Run 2)** | tops every reported Table 4 baseline (Mem0 37.09, Zep 39.97, MemOS 42.55) |
103
+ | **Drift detector AUC** | **0.83 held-out / 0.92 in-set** | first black-box drift score that runs in a Claude Code hook |
104
+ | **Reproduction cost** | **~$3.50** for 500 LongMemEval questions | under 1/15 of GPT-4o-judged stacks |
105
+ | **p95 hook latency** | **<50 ms** | safe for every-prompt invocation |
106
+
107
+ We deliberately report Run 1 (44.4%) as the abstract headline for
108
+ EverMemBench to avoid cherry-picking; the cross-run mean (45.84%) clears
109
+ MemOS by +3.3 pts. See `paper/sections/paper2_06_5_evermembench.tex`
110
+ for honest dual-run + Gemini cross-judge sensitivity analysis.
111
+
112
+ ---
113
+
114
+ ## Quickstart
115
+
116
+ ### Install in Claude Code
117
+
118
+ ```bash
119
+ git clone https://github.com/chunxiaoxx/nautilus-compass ~/.claude/plugins/nautilus-compass
120
+ bash ~/.claude/plugins/nautilus-compass/install.sh
121
+
122
+ # Start the BGE-m3 daemon (one-time per boot)
123
+ bash ~/.claude/plugins/nautilus-compass/daemon_start.sh
124
+ ```
125
+
126
+ The installer wires three hooks into `~/.claude/settings.json`:
127
+ - `UserPromptSubmit` → injects time-bucketed memory recall + drift
128
+ - `PostToolUse` → mid-session writer
129
+ - `Stop` → end-of-session summary writer
130
+
131
+ Five user-facing slash commands appear in Claude Code:
132
+ `/compass-verify` · `/compass-drift` · `/compass-recall` ·
133
+ `/compass-search` · `/compass-status`.
134
+
135
+ ### Install in any other MCP client
136
+
137
+ ```bash
138
+ python ~/.claude/plugins/nautilus-compass/scripts/install_to_agent.py
139
+ ```
140
+
141
+ Auto-detects Claude Desktop, Cursor, Cline, Continue.dev, Zed Editor and
142
+ patches their MCP config. See [`docs/AGENT_ONBOARDING.md`](docs/AGENT_ONBOARDING.md)
143
+ for per-agent copy-paste configs and [`docs/mcp-usage.md`](docs/mcp-usage.md)
144
+ for the raw protocol specification.
145
+
146
+ ### Cloud-hosted alternative (no local install)
147
+
148
+ ```bash
149
+ curl https://compass.nautilus.social/.well-known/agent.json
150
+ ```
151
+
152
+ Returns the standard A2A discovery descriptor. Sign up at
153
+ `compass.nautilus.social/signup` for a hosted gateway with multi-user
154
+ sync, audit log, and managed BGE-m3 deployment.
155
+
156
+ ---
157
+
158
+ ## What's exposed (7 MCP tools)
159
+
160
+ | Tool | Purpose | Latency |
161
+ |---|---|---|
162
+ | `ingest_obs(name, body, agent_id?)` | Write observation with auto-anchor + drift signal | ~150 ms |
163
+ | `recall(query, project?, top_k?)` | BGE-m3 semantic + keyword search | ~200 ms |
164
+ | `session_search(query, since?)` | Time-bucketed session-log search | ~80 ms |
165
+ | `profile(user_id?)` | Work-profile aggregate (topics, agents, drift trend) | ~100 ms |
166
+ | `drift_check(prompt, project?)` | Black-box drift score against anchors | <50 ms |
167
+ | `drift_history(since?, agent_id?)` | Drift score timeline for trend audit | ~30 ms |
168
+ | `feedback_log(direction, reason)` | Log positive/negative anchor signal | <20 ms |
169
+
170
+ The MCP server speaks JSON-RPC 2.0 over stdio / TCP / TLS / mTLS.
171
+ Per-token RBAC, per-token rate limiting, `notifications/{progress,
172
+ cancelled, message}`, `logging/setLevel`, and `resources/*` for session-log
173
+ streaming are all spec-complete.
174
+
175
+ ---
176
+
177
+ ## Comparison
178
+
179
+ | Capability | this | mem0 | Letta | Zep | claude-mem | MemOS | Smriti |
180
+ |---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
181
+ | Cross-agent memory | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | archive-only |
182
+ | MCP A2A protocol native | ✅ TLS+mTLS+RBAC | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
183
+ | Drift detection | ✅ AUC 0.83 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
184
+ | Merkle integrity audit log | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
185
+ | LongMemEval-S verified | ✅ 56.6% (locked) | n/r | n/r | n/r | ❌ | n/r | ❌ |
186
+ | EverMemBench verified | ✅ 44.4-47.3% | 37.09 | n/r | 39.97 | n/r | 42.55 | ❌ |
187
+ | Self-host + hosted both | ✅ | ☁ only | ✅ | ☁ only | ✅ | OSS only | OSS only |
188
+ | License | MIT | Apache | Apache | proprietary | MIT | Apache | MIT |
189
+
190
+ `n/r` = not reported in their published evaluations. Smriti is a team
191
+ conversation archive with git-based sharing — different scope from a
192
+ runtime memory layer, so most rows are intentionally out-of-scope rather
193
+ than missing features.
194
+
195
+ ---
196
+
197
+ ## Documentation
198
+
199
+ - [`docs/AGENT_ONBOARDING.md`](docs/AGENT_ONBOARDING.md) — per-agent install configs (6 platforms + 3 frameworks)
200
+ - [`docs/mcp-usage.md`](docs/mcp-usage.md) — raw MCP protocol guide, TLS setup, RBAC
201
+ - [`docs/PLATFORM_HANDSHAKE.md`](docs/PLATFORM_HANDSHAKE.md) — OSS↔SaaS coordination contract
202
+ - [`paper/`](paper/) — two papers (drift detection + memory pipeline) and supporting eval scripts
203
+ - [`CHANGELOG.md`](CHANGELOG.md) — versioned release notes
204
+ - [`CONTRIBUTING.md`](CONTRIBUTING.md) — adding new domain anchors / running benchmarks
205
+
206
+ ---
207
+
208
+ ## Citation
209
+
210
+ If you use this work, please cite:
211
+
212
+ **Paper 1 · drift detection**:
213
+
214
+ ```bibtex
215
+ @misc{nautiluscompass-drift-2026,
216
+ title = {Nautilus Compass: Black-box Persona Drift Detection
217
+ for Production LLM Agents},
218
+ author = {Chunxiao Wang},
219
+ year = {2026},
220
+ note = {Yiluo Technology Co., Ltd.},
221
+ howpublished = {\url{https://github.com/chunxiaoxx/nautilus-compass}}
222
+ }
223
+ ```
224
+
225
+ **Paper 2 · memory pipeline + EverMemBench cross-bench**:
226
+
227
+ ```bibtex
228
+ @misc{nautiluscompass-memrecall-2026,
229
+ title = {Closing the Memory Recall Gap with Chinese LLMs:
230
+ A Multi-Stage Retrieval Pipeline Achieving Zep-SOTA Performance
231
+ on LongMemEval-S at 1/15 Cost},
232
+ author = {Chunxiao Wang},
233
+ year = {2026},
234
+ note = {Yiluo Technology Co., Ltd.},
235
+ howpublished = {\url{https://github.com/chunxiaoxx/nautilus-compass}}
236
+ }
237
+ ```
238
+
239
+ The `howpublished` field will be updated to the arXiv identifier once
240
+ the preprints are live.
241
+
242
+ We also build on prior work — please cite as appropriate:
243
+
244
+ - BGE-m3 / BGE-Reranker (Chen et al., BAAI 2024)
245
+ - **Persona Vectors** (Chen et al., Anthropic, [arXiv:2507.21509](https://arxiv.org/abs/2507.21509)) — *complementary white-box approach, not the same as ours*
246
+ - DPT-Agent strategy distillation ([arXiv:2502.11882](https://arxiv.org/abs/2502.11882))
247
+ - A-MEM dynamic links ([arXiv:2502.12110](https://arxiv.org/abs/2502.12110))
248
+ - LongMemEval (Wu et al., NeurIPS 2024)
249
+ - EverMemBench (Hu et al., 2026)
250
+
251
+ ---
252
+
253
+ ## License
254
+
255
+ - **Code, plugin, MCP wrapper, papers, scripts** — MIT (see [`LICENSE`](LICENSE))
256
+ - **Behavioral anchor files** (`anchors*.json`) — CC0 1.0 Universal (see [`LICENSE-ANCHORS`](LICENSE-ANCHORS))
257
+
258
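+ You may use this in any project, commercial or otherwise. The MIT-licensed code requires keeping the copyright and permission notice in copies; the CC0 anchor files require no attribution.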
259
+
260
+ ---
261
+
262
+ ## Star history
263
+
264
+ [![Star History Chart](https://api.star-history.com/svg?repos=chunxiaoxx/nautilus-compass&type=Date)](https://star-history.com/#chunxiaoxx/nautilus-compass&Date)
265
+
266
+ ## Contributors
267
+
268
+ <a href="https://github.com/chunxiaoxx/nautilus-compass/graphs/contributors">
269
+ <img src="https://contrib.rocks/image?repo=chunxiaoxx/nautilus-compass" alt="Contributors" />
270
+ </a>
271
+
272
+ PRs welcome — see [CONTRIBUTING.md](CONTRIBUTING.md).
273
+
274
+ ## Contact
275
+
276
+ - **Author**: Chunxiao Wang · Yiluo Technology Co., Ltd. · `chunxiaoxx@gmail.com`
277
+ - **Issues**: [github.com/chunxiaoxx/nautilus-compass/issues](https://github.com/chunxiaoxx/nautilus-compass/issues)
278
+ - **Hosted gateway**: [compass.nautilus.social](https://compass.nautilus.social)
279
+ - **中文文档**: [README.zh-CN.md](README.zh-CN.md)
@@ -0,0 +1,13 @@
1
+ """nautilus_compass · cross-agent memory layer for the Nautilus platform.
2
+
3
+ Provides:
4
+ · BGE-m3 dense recall + bge-reranker-v2-m3 cross-encoder reranking
5
+ · MCP server (Claude Desktop / Cline / Cursor compatible)
6
+ · A2A protocol adapter for cross-agent message routing
7
+ · Drift detector (anchor-based · AUC 0.92 in-set / 0.83 held-out)
8
+ · Session writer / search / drift-history utilities
9
+
10
+ Submodules are exposed lazily; import only what you need to keep startup fast.
11
+ """
12
+
13
+ __version__ = "0.9.5"