codeatrium 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. codeatrium-0.1.0/.gitignore +26 -0
  2. codeatrium-0.1.0/CHANGELOG.md +18 -0
  3. codeatrium-0.1.0/LICENSE +21 -0
  4. codeatrium-0.1.0/Makefile +22 -0
  5. codeatrium-0.1.0/PKG-INFO +180 -0
  6. codeatrium-0.1.0/README.ja.md +147 -0
  7. codeatrium-0.1.0/README.md +147 -0
  8. codeatrium-0.1.0/assets/architecture.svg +79 -0
  9. codeatrium-0.1.0/assets/interface.en.svg +65 -0
  10. codeatrium-0.1.0/assets/interface.ja.svg +65 -0
  11. codeatrium-0.1.0/pyproject.toml +97 -0
  12. codeatrium-0.1.0/src/codeatrium/__init__.py +3 -0
  13. codeatrium-0.1.0/src/codeatrium/__main__.py +5 -0
  14. codeatrium-0.1.0/src/codeatrium/cli/__init__.py +295 -0
  15. codeatrium-0.1.0/src/codeatrium/cli/distill_cmd.py +76 -0
  16. codeatrium-0.1.0/src/codeatrium/cli/hook_cmd.py +24 -0
  17. codeatrium-0.1.0/src/codeatrium/cli/index_cmd.py +62 -0
  18. codeatrium-0.1.0/src/codeatrium/cli/prime_cmd.py +90 -0
  19. codeatrium-0.1.0/src/codeatrium/cli/search_cmd.py +128 -0
  20. codeatrium-0.1.0/src/codeatrium/cli/server_cmd.py +122 -0
  21. codeatrium-0.1.0/src/codeatrium/cli/show_cmd.py +151 -0
  22. codeatrium-0.1.0/src/codeatrium/cli/status_cmd.py +59 -0
  23. codeatrium-0.1.0/src/codeatrium/config.py +96 -0
  24. codeatrium-0.1.0/src/codeatrium/db.py +135 -0
  25. codeatrium-0.1.0/src/codeatrium/distiller.py +290 -0
  26. codeatrium-0.1.0/src/codeatrium/embedder.py +168 -0
  27. codeatrium-0.1.0/src/codeatrium/embedder_server.py +172 -0
  28. codeatrium-0.1.0/src/codeatrium/hooks.py +156 -0
  29. codeatrium-0.1.0/src/codeatrium/indexer.py +237 -0
  30. codeatrium-0.1.0/src/codeatrium/llm.py +148 -0
  31. codeatrium-0.1.0/src/codeatrium/models.py +53 -0
  32. codeatrium-0.1.0/src/codeatrium/paths.py +74 -0
  33. codeatrium-0.1.0/src/codeatrium/py.typed +0 -0
  34. codeatrium-0.1.0/src/codeatrium/resolver.py +301 -0
  35. codeatrium-0.1.0/src/codeatrium/search.py +273 -0
  36. codeatrium-0.1.0/tests/__init__.py +0 -0
  37. codeatrium-0.1.0/tests/test_config.py +117 -0
  38. codeatrium-0.1.0/tests/test_db.py +69 -0
  39. codeatrium-0.1.0/tests/test_distiller.py +329 -0
  40. codeatrium-0.1.0/tests/test_embedder.py +47 -0
  41. codeatrium-0.1.0/tests/test_index_cmd.py +35 -0
  42. codeatrium-0.1.0/tests/test_indexer.py +303 -0
  43. codeatrium-0.1.0/tests/test_init.py +361 -0
  44. codeatrium-0.1.0/tests/test_resolver.py +139 -0
  45. codeatrium-0.1.0/tests/test_search_phase2.py +290 -0
  46. codeatrium-0.1.0/tests/test_security.py +184 -0
  47. codeatrium-0.1.0/tests/test_server_cmd.py +21 -0
  48. codeatrium-0.1.0/tests/test_show_dump.py +191 -0
  49. codeatrium-0.1.0/tests/test_status_hook.py +190 -0
  50. codeatrium-0.1.0/uv.lock +1349 -0
@@ -0,0 +1,26 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+
8
+ # Virtual environment
9
+ .venv/
10
+
11
+ # Test & lint caches
12
+ .pytest_cache/
13
+ .ruff_cache/
14
+
15
+ # Tool data (local only)
16
+ .codeatrium/
17
+ .logosyncx/
18
+
19
+ # Claude Code local settings
20
+ .claude/
21
+
22
+ # Internal docs (local only)
23
+ docs/internal/
24
+
25
+ # Paper reference materials (local only)
26
+ mem/
@@ -0,0 +1,18 @@
1
+ # Changelog
2
+
3
+ ## [0.1.0] - 2026-03-31
4
+
5
+ ### Added
6
+
7
+ - `loci init` — initialize `.codeatrium/` in project root
8
+ - `loci index` — parse `.jsonl` session logs, split into exchanges, embed with multilingual-e5-small
9
+ - `loci distill` — distill exchanges via `claude --print` into palace objects (exchange_core, specific_context, room_assignments)
10
+ - `loci search` — cross-layer RRF fusion search (BM25 verbatim + HNSW distilled)
11
+ - `loci context` — reverse lookup: code symbol → past conversations
12
+ - `loci show` — fetch verbatim exchange by ref
13
+ - `loci status` — show index state
14
+ - `loci server start/stop/status` — Unix socket embedding server for <0.2s search
15
+ - `loci hook install` — register Claude Code SessionStart/Stop hooks
16
+ - `config.toml` support for distill model and batch limit
17
+ - tree-sitter symbol resolution (Python, TypeScript, Go)
18
+ - Bilingual support (Japanese + English) via multilingual-e5-small
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 senna-lang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,22 @@
1
+ VENV := .venv/bin
2
+
3
+ .PHONY: test lint fmt typecheck check hooks
4
+
5
+ test:
6
+ $(VENV)/pytest tests/ -v
7
+
8
+ lint:
9
+ $(VENV)/ruff check src/ tests/
10
+
11
+ fmt:
12
+ $(VENV)/ruff format src/ tests/
13
+
14
+ typecheck:
15
+ $(VENV)/pyright src/
16
+
17
+ check: lint typecheck test
18
+ # hooks: install a git pre-commit hook that runs "make check"; printf is used because echo's backslash handling differs between shells
19
+ hooks:
20
+ @printf '#!/bin/sh\nmake check\n' > .git/hooks/pre-commit
21
+ @chmod +x .git/hooks/pre-commit
22
+ @echo "pre-commit hook installed: runs make check before every commit"
@@ -0,0 +1,180 @@
1
+ Metadata-Version: 2.4
2
+ Name: codeatrium
3
+ Version: 0.1.0
4
+ Summary: Memory palace for AI coding agents — index sessions, recall code context in <0.2s
5
+ Project-URL: Homepage, https://github.com/senna-lang/codeatrium
6
+ Project-URL: Repository, https://github.com/senna-lang/codeatrium
7
+ Project-URL: Issues, https://github.com/senna-lang/codeatrium/issues
8
+ Author-email: senna-lang <senna-lang@users.noreply.github.com>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: ai,claude,cli,code-context,coding-agent,conversation-history,developer-tools,memory,semantic-search
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.11
21
+ Requires-Dist: sentence-transformers>=3.0.0
22
+ Requires-Dist: sqlite-vec>=0.1.0
23
+ Requires-Dist: tree-sitter-go<0.24.0,>=0.23.0
24
+ Requires-Dist: tree-sitter-python<0.24.0,>=0.23.0
25
+ Requires-Dist: tree-sitter-typescript<0.24.0,>=0.23.0
26
+ Requires-Dist: tree-sitter<0.24.0,>=0.23.0
27
+ Requires-Dist: typer[all]>=0.12.0
28
+ Provides-Extra: dev
29
+ Requires-Dist: pyright>=1.1.0; extra == 'dev'
30
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
31
+ Requires-Dist: ruff>=0.4.0; extra == 'dev'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # Codeatrium
35
+
36
+ [![CI](https://github.com/senna-lang/Codeatrium/actions/workflows/ci.yml/badge.svg)](https://github.com/senna-lang/Codeatrium/actions/workflows/ci.yml)
37
+ [![PyPI](https://img.shields.io/pypi/v/codeatrium)](https://pypi.org/project/codeatrium/)
38
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
39
+
40
+ A **memory palace** for AI coding agents.
41
+
42
+ English · [日本語](README.ja.md)
43
+
44
+ Codeatrium distills past conversations into *palace objects* and stores them in a searchable index, giving agents long-term memory. Past decisions, implementations, and code locations can be recalled in under 0.2 seconds.
45
+
46
+ The CLI command `loci` (from [Method of Loci](https://en.wikipedia.org/wiki/Method_of_loci)) is designed to be **called by the agent itself** — running `loci search "..." --json` from within a prompt.
47
+
48
+ The architecture extends the conversational memory model from [arXiv:2603.13017](https://arxiv.org/abs/2603.13017) for coding agents.
49
+
50
+ > **Note:** Currently [Claude Code](https://docs.anthropic.com/en/docs/claude-code) only. Session log format (`.jsonl`) and distillation (`claude --print`) depend on Claude Code.
51
+
52
+ ## Simple Interface
53
+
54
+ Agents use two core commands:
55
+
56
+ - **Semantic search** — `loci search "query"` retrieves past conversations by semantic similarity
57
+ - **Reverse lookup from code** — `loci context --symbol "name"` recalls past conversations about a specific code symbol
58
+ - tree-sitter symbol resolution (Python / TypeScript / Go) lets agents understand implementation intent before editing
59
+
60
+ <img src="assets/interface.en.svg" alt="Simple Interface" width="800">
61
+
62
+ ## How It Works
63
+
64
+ <img src="assets/architecture.svg" alt="Codeatrium Architecture" width="800">
65
+
66
+ 1. **Index** — Splits agent session logs into exchanges (user utterance + agent response pairs) and indexes them with FTS5 for keyword search
67
+ 2. **Distill** — An LLM (`claude --print`, default `claude-haiku-4-5`) summarizes each exchange into a palace object: `exchange_core` (what was done), `specific_context` (concrete details), `room_assignments` (topic tags). tree-sitter resolves touched files to symbol level (function/class/method + file + line + signature)
68
+ 3. **Search** — Cross-layer search fusing BM25 on verbatim text with HNSW on distilled embeddings via RRF
69
+
70
+ Raw conversations are not embedded — only the condensed distilled text is embedded with `multilingual-e5-small` (384-dim), balancing semantic search quality with embedding cost. The embedding model runs as a **Unix socket server**, keeping search latency **under 0.2 seconds** after the first load.
71
+
72
+ ## Installation
73
+
74
+ ```bash
75
+ pipx install codeatrium
76
+ ```
77
+
78
+ Requires Python 3.11+.
79
+
80
+ ## Quick Start
81
+
82
+ ```bash
83
+ # Initialize in project root
84
+ loci init
85
+
86
+ # Install hooks for automatic indexing
87
+ loci hook install
88
+ ```
89
+
90
+ When running `loci init`, if past session logs are detected, you'll be prompted with:
91
+
92
+ > [!IMPORTANT]
93
+ > When adopting this tool mid-project, a large number of exchanges may already exist. Distilling all of them will consume significant `claude --print` (Haiku) tokens. We recommend starting with `Skip all` or `Distill last 50`.
94
+
95
+ 1. **Min chars threshold** — Minimum character filter for exchanges (default: 50). This controls how many exchanges become distillation candidates. Higher values exclude short conversations and reduce token usage; lower values include nearly everything.
96
+ 2. **Handling existing exchanges** — Choose how much past history to distill:
97
+ - Skip all (no past session distillation)
98
+ - Distill last 50 (recent history only)
99
+ - Distill all (everything — high token cost)
100
+ - Custom (specify a number)
101
+ 3. **Run distillation now?** — Choose No to defer to the next session start
102
+
103
+ ## Agent Instructions
104
+
105
+ Agent instructions are injected automatically — no manual setup required:
106
+
107
+ - **`loci init`** — Inserts a marker section (`<!-- BEGIN CODEATRIUM -->...<!-- END CODEATRIUM -->`) into `CLAUDE.md`
108
+ - **`loci prime`** — Dynamically injects command usage into the context window at every session start via SessionStart Hook
109
+
110
+ ## CLI Commands
111
+
112
+ | Command | Description |
113
+ |---------|-------------|
114
+ | `loci init` | Initialize `.codeatrium/` in project root |
115
+ | `loci index` | Index new session logs |
116
+ | `loci distill [--limit N]` | Distill undistilled exchanges via LLM |
117
+ | `loci search "query" --json` | Semantic search (agent-facing) |
118
+ | `loci context --symbol "name" --json` | Code symbol → past conversations |
119
+ | `loci show "<ref>" --json` | Retrieve verbatim conversation |
120
+ | `loci status` | Show index state |
121
+ | `loci server start/stop/status` | Embedding server management |
122
+ | `loci hook install` | Register hooks in Claude Code settings |
123
+
124
+ ## Automation (Claude Code Hooks)
125
+
126
+ After `loci hook install`, everything runs automatically:
127
+
128
+ | Hook | Trigger | Command |
129
+ |------|---------|---------|
130
+ | Stop (async) | After every turn | `loci index` |
131
+ | SessionStart | startup / `/clear` / `/resume` / `compact` | `loci prime` |
132
+ | SessionStart | startup / `/clear` / `/resume` / `compact` | `loci server start` |
133
+ | SessionStart | startup / `/clear` / `/resume` / `compact` | `loci distill` |
134
+
135
+ - **`loci index`** — Runs asynchronously after every turn. Indexes only new exchanges, so it's fast even mid-session
136
+ - **`loci distill`** — Distills undistilled exchanges at session start via `claude --print`. Calls Haiku through the user's Claude Code (default: `claude-haiku-4-5`)
137
+ - **`loci server start`** — Keeps the embedding model (~500MB) resident in memory for sub-0.2s search latency
138
+
139
+ ## Search Output
140
+
141
+ ```json
142
+ [
143
+ {
144
+ "exchange_core": "Added connection pool with pool_size=5",
145
+ "specific_context": "pool_size=5, max_overflow=10",
146
+ "rooms": [
147
+ { "room_type": "concept", "room_key": "db-pool", "room_label": "DB connection pooling" }
148
+ ],
149
+ "symbols": [
150
+ { "name": "create_pool", "file": "src/db.py", "line": 42, "signature": "def create_pool(...)" }
151
+ ],
152
+ "verbatim_ref": "~/.claude/projects/.../session.jsonl:ply=42"
153
+ }
154
+ ]
155
+ ```
156
+
157
+ ## Configuration
158
+
159
+ `.codeatrium/config.toml` (generated by `loci init`):
160
+
161
+ ```toml
162
+ [distill]
163
+ model = "claude-haiku-4-5-20251001" # Model for distillation (default)
164
+ batch_limit = 20 # Max distillations per hook run
165
+
166
+ [index]
167
+ min_chars = 50 # Skip exchanges shorter than this
168
+ ```
169
+
170
+ ## Acknowledgments
171
+
172
+ The palace object model, room-based topic grouping, and BM25+HNSW fusion search are based on:
173
+
174
+ > *Structured Distillation for Personalized Agent Memory*
175
+ > (arXiv:2603.13017)
176
+
177
+
178
+ ## License
179
+
180
+ MIT
@@ -0,0 +1,147 @@
1
+ # Codeatrium
2
+
3
+ [![CI](https://github.com/senna-lang/Codeatrium/actions/workflows/ci.yml/badge.svg)](https://github.com/senna-lang/Codeatrium/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/codeatrium)](https://pypi.org/project/codeatrium/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
6
+
7
+ [English](README.md) · 日本語
8
+
9
+ AI コーディングエージェントに**記憶の宮殿**を。
10
+
11
+ Codeatrium は過去の会話を *palace object* に蒸留し、検索可能なインデックスに保存することで、エージェントに長期記憶を与えます。過去の意思決定・実装・コード位置を 0.2 秒以内に想起できます。
12
+
13
+ CLI コマンド `loci`([Method of Loci=記憶の宮殿](https://ja.wikipedia.org/wiki/%E5%A0%B4%E6%89%80%E6%B3%95)に由来)は**エージェント自身が呼び出す**ことを想定しています。`loci search "..." --json` をプロンプト内から実行します。
14
+
15
+ アーキテクチャは [arXiv:2603.13017](https://arxiv.org/abs/2603.13017) の会話記憶モデルを、コーディングエージェント向けに拡張したものです。
16
+
17
+ > **Note:** 現在は [Claude Code](https://docs.anthropic.com/en/docs/claude-code) 専用です。セッションログ形式(`.jsonl`)と蒸留(`claude --print`)が Claude Code に依存しています。
18
+
19
+ ## シンプルなインターフェース
20
+
21
+ エージェントが使用するコマンドは基本2つ。
22
+
23
+ - **セマンティック検索** — `loci search "クエリ"` でセマンティック類似度から過去の会話を検索
24
+ - **コードから逆引き** — `loci context --symbol "名前"` で特定のコードシンボルに関する過去の会話を想起
25
+ - tree-sitter(Python / TypeScript / Go)のシンボル解決により、エージェントは実装意図・背景を把握できる
26
+
27
+ <img src="assets/interface.ja.svg" alt="Simple Interface" width="800">
28
+
29
+ ## 仕組み
30
+
31
+ <img src="assets/architecture.svg" alt="Codeatrium Architecture" width="800">
32
+
33
+ 1. **Index** — エージェントのセッションログを exchange(ユーザー発話 + エージェント応答のペア)に分割し、FTS5 でキーワード検索可能にする
34
+ 2. **Distill** — LLM(`claude --print`、デフォルトは `claude-haiku-4-5`)が各 exchange を palace object に要約: `exchange_core`(何をしたか)、`specific_context`(具体的な詳細)、`room_assignments`(トピックタグ)。tree-sitter で触れたファイルをシンボルレベル(関数・クラス・メソッド + ファイル + 行 + シグネチャ)に解決
35
+ 3. **Search** — 会話原文の BM25 と蒸留済み埋め込みの HNSW を RRF で融合するクロスレイヤー検索
36
+
37
+ 会話原文は埋め込まず、蒸留で濃縮されたテキストのみを `multilingual-e5-small`(384次元)で埋め込むことで、セマンティック検索の精度と埋め込みコストを両立しています。埋め込みモデルは **Unix ソケットサーバー**で常駐し、初回以降の検索は **0.2 秒以内**で返ります。
38
+
39
+ ## インストール
40
+
41
+ ```bash
42
+ pipx install codeatrium
43
+ ```
44
+
45
+ Python 3.11 以上が必要です。
46
+
47
+ ## クイックスタート
48
+
49
+ ```bash
50
+ # プロジェクトルートで初期化
51
+ loci init
52
+
53
+ # 自動インデックスのフックをインストール
54
+ loci hook install
55
+ ```
56
+
57
+ `loci init` を実行すると、過去のセッションログが検出された場合に以下の質問が表示されます:
58
+
59
+ > [!IMPORTANT]
60
+ > 途中からこのツールを導入する場合、すでに大量の exchange が蓄積されています。全件蒸留すると `claude --print` (Haiku) のトークンが大量に消費されるため、まずは `Skip all` か `Distill last 50` で始めることを推奨します。
61
+
62
+ 1. **Min chars threshold** — exchange の最小文字数フィルタ(デフォルト: 50文字)。この閾値で蒸留の母数(対象 exchange 数)が決まります。値を大きくすると短い会話が除外され蒸留対象が減り、小さくするとほぼ全ての会話が蒸留対象になりトークン消費が増えます。
63
+ 2. **既存 exchange の扱い** — 過去のセッションをどこまで蒸留するか選択:
64
+ - Skip all(過去のセッション蒸留なし)
65
+ - Distill last 50(直近の履歴のみ)
66
+ - Distill all(全件、トークン消費大)
67
+ - Custom(件数を指定)
68
+ 3. **蒸留を今すぐ実行するか** — No を選ぶと次回セッション開始時に自動実行されます
69
+
70
+ ## エージェント向けインストラクション
71
+
72
+ エージェントへのインストラクションは自動挿入されるので手動で書く必要はありません:
73
+
74
+ - **`loci init`** — `CLAUDE.md` にマーカー付きセクション(`<!-- BEGIN CODEATRIUM -->...<!-- END CODEATRIUM -->`)を挿入。
75
+ - **`loci prime`** — SessionStart Hook で毎セッション開始時にコマンドの使い方をコンテキストウィンドウに動的注入
76
+
77
+ ## CLI コマンド
78
+
79
+ | コマンド | 説明 |
80
+ |---------|------|
81
+ | `loci init` | プロジェクトルートに `.codeatrium/` を初期化 |
82
+ | `loci index` | 新しいセッションログをインデックス |
83
+ | `loci distill [--limit N]` | 未蒸留の exchange を LLM で蒸留 |
84
+ | `loci search "クエリ" --json` | セマンティック検索(エージェント向け) |
85
+ | `loci context --symbol "名前" --json` | コードシンボル → 過去の会話 |
86
+ | `loci show "<ref>" --json` | 会話原文を取得 |
87
+ | `loci status` | インデックス状態を表示 |
88
+ | `loci server start/stop/status` | 埋め込みサーバー管理 |
89
+ | `loci hook install` | Claude Code の設定にフックを登録 |
90
+
91
+ ## 自動化(Claude Code フック)
92
+
93
+ `loci hook install` 後、すべて自動で動作します:
94
+
95
+ | フック | トリガー | コマンド |
96
+ |--------|---------|---------|
97
+ | Stop (async) | 毎ラリー後 | `loci index` |
98
+ | SessionStart | 起動時 / `/clear` / `/resume` / `compact` | `loci prime` |
99
+ | SessionStart | 起動時 / `/clear` / `/resume` / `compact` | `loci server start` |
100
+ | SessionStart | 起動時 / `/clear` / `/resume` / `compact` | `loci distill` |
101
+
102
+ - **`loci index`** — 毎ラリー後に非同期で実行。セッション途中でも差分のみインデックスするので高速
103
+ - **`loci distill`** — セッション開始時に未蒸留の exchange を `claude --print` で蒸留。ユーザーの Claude Code で Haiku を呼び出します(デフォルト: `claude-haiku-4-5`)
104
+ - **`loci server start`** — 埋め込みモデル(約500MB)をメモリに常駐させ、以降の検索を 0.2 秒以内に
105
+
106
+ ## 検索出力
107
+
108
+ ```json
109
+ [
110
+ {
111
+ "exchange_core": "pool_size=5 でコネクションプールを追加した",
112
+ "specific_context": "pool_size=5, max_overflow=10",
113
+ "rooms": [
114
+ { "room_type": "concept", "room_key": "db-pool", "room_label": "DB コネクションプーリング" }
115
+ ],
116
+ "symbols": [
117
+ { "name": "create_pool", "file": "src/db.py", "line": 42, "signature": "def create_pool(...)" }
118
+ ],
119
+ "verbatim_ref": "~/.claude/projects/.../session.jsonl:ply=42"
120
+ }
121
+ ]
122
+ ```
123
+
124
+ ## 設定
125
+
126
+ `.codeatrium/config.toml`(`loci init` で生成):
127
+
128
+ ```toml
129
+ [distill]
130
+ model = "claude-haiku-4-5-20251001" # 蒸留に使うモデル(デフォルト)
131
+ batch_limit = 20 # 1回あたりの蒸留上限
132
+
133
+ [index]
134
+ min_chars = 50 # この文字数未満の exchange をスキップ
135
+ ```
136
+
137
+ ## Acknowledgments
138
+
139
+ Palace object モデル、room ベースのトピックグルーピング、BM25+HNSW 融合検索は以下の論文に基づいています:
140
+
141
+ > *Structured Distillation for Personalized Agent Memory*
142
+ > (arXiv:2603.13017)
143
+
144
+
145
+ ## ライセンス
146
+
147
+ MIT
@@ -0,0 +1,147 @@
1
+ # Codeatrium
2
+
3
+ [![CI](https://github.com/senna-lang/Codeatrium/actions/workflows/ci.yml/badge.svg)](https://github.com/senna-lang/Codeatrium/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/codeatrium)](https://pypi.org/project/codeatrium/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
6
+
7
+ A **memory palace** for AI coding agents.
8
+
9
+ English · [日本語](README.ja.md)
10
+
11
+ Codeatrium distills past conversations into *palace objects* and stores them in a searchable index, giving agents long-term memory. Past decisions, implementations, and code locations can be recalled in under 0.2 seconds.
12
+
13
+ The CLI command `loci` (from [Method of Loci](https://en.wikipedia.org/wiki/Method_of_loci)) is designed to be **called by the agent itself** — running `loci search "..." --json` from within a prompt.
14
+
15
+ The architecture extends the conversational memory model from [arXiv:2603.13017](https://arxiv.org/abs/2603.13017) for coding agents.
16
+
17
+ > **Note:** Currently [Claude Code](https://docs.anthropic.com/en/docs/claude-code) only. Session log format (`.jsonl`) and distillation (`claude --print`) depend on Claude Code.
18
+
19
+ ## Simple Interface
20
+
21
+ Agents use two core commands:
22
+
23
+ - **Semantic search** — `loci search "query"` retrieves past conversations by semantic similarity
24
+ - **Reverse lookup from code** — `loci context --symbol "name"` recalls past conversations about a specific code symbol
25
+ - tree-sitter symbol resolution (Python / TypeScript / Go) lets agents understand implementation intent before editing
26
+
27
+ <img src="assets/interface.en.svg" alt="Simple Interface" width="800">
28
+
29
+ ## How It Works
30
+
31
+ <img src="assets/architecture.svg" alt="Codeatrium Architecture" width="800">
32
+
33
+ 1. **Index** — Splits agent session logs into exchanges (user utterance + agent response pairs) and indexes them with FTS5 for keyword search
34
+ 2. **Distill** — An LLM (`claude --print`, default `claude-haiku-4-5`) summarizes each exchange into a palace object: `exchange_core` (what was done), `specific_context` (concrete details), `room_assignments` (topic tags). tree-sitter resolves touched files to symbol level (function/class/method + file + line + signature)
35
+ 3. **Search** — Cross-layer search fusing BM25 on verbatim text with HNSW on distilled embeddings via RRF
36
+
37
+ Raw conversations are not embedded — only the condensed distilled text is embedded with `multilingual-e5-small` (384-dim), balancing semantic search quality with embedding cost. The embedding model runs as a **Unix socket server**, keeping search latency **under 0.2 seconds** after the first load.
38
+
39
+ ## Installation
40
+
41
+ ```bash
42
+ pipx install codeatrium
43
+ ```
44
+
45
+ Requires Python 3.11+.
46
+
47
+ ## Quick Start
48
+
49
+ ```bash
50
+ # Initialize in project root
51
+ loci init
52
+
53
+ # Install hooks for automatic indexing
54
+ loci hook install
55
+ ```
56
+
57
+ When running `loci init`, if past session logs are detected, you'll be prompted with:
58
+
59
+ > [!IMPORTANT]
60
+ > When adopting this tool mid-project, a large number of exchanges may already exist. Distilling all of them will consume significant `claude --print` (Haiku) tokens. We recommend starting with `Skip all` or `Distill last 50`.
61
+
62
+ 1. **Min chars threshold** — Minimum character filter for exchanges (default: 50). This controls how many exchanges become distillation candidates. Higher values exclude short conversations and reduce token usage; lower values include nearly everything.
63
+ 2. **Handling existing exchanges** — Choose how much past history to distill:
64
+ - Skip all (no past session distillation)
65
+ - Distill last 50 (recent history only)
66
+ - Distill all (everything — high token cost)
67
+ - Custom (specify a number)
68
+ 3. **Run distillation now?** — Choose No to defer to the next session start
69
+
70
+ ## Agent Instructions
71
+
72
+ Agent instructions are injected automatically — no manual setup required:
73
+
74
+ - **`loci init`** — Inserts a marker section (`<!-- BEGIN CODEATRIUM -->...<!-- END CODEATRIUM -->`) into `CLAUDE.md`
75
+ - **`loci prime`** — Dynamically injects command usage into the context window at every session start via SessionStart Hook
76
+
77
+ ## CLI Commands
78
+
79
+ | Command | Description |
80
+ |---------|-------------|
81
+ | `loci init` | Initialize `.codeatrium/` in project root |
82
+ | `loci index` | Index new session logs |
83
+ | `loci distill [--limit N]` | Distill undistilled exchanges via LLM |
84
+ | `loci search "query" --json` | Semantic search (agent-facing) |
85
+ | `loci context --symbol "name" --json` | Code symbol → past conversations |
86
+ | `loci show "<ref>" --json` | Retrieve verbatim conversation |
87
+ | `loci status` | Show index state |
88
+ | `loci server start/stop/status` | Embedding server management |
89
+ | `loci hook install` | Register hooks in Claude Code settings |
90
+
91
+ ## Automation (Claude Code Hooks)
92
+
93
+ After `loci hook install`, everything runs automatically:
94
+
95
+ | Hook | Trigger | Command |
96
+ |------|---------|---------|
97
+ | Stop (async) | After every turn | `loci index` |
98
+ | SessionStart | startup / `/clear` / `/resume` / `compact` | `loci prime` |
99
+ | SessionStart | startup / `/clear` / `/resume` / `compact` | `loci server start` |
100
+ | SessionStart | startup / `/clear` / `/resume` / `compact` | `loci distill` |
101
+
102
+ - **`loci index`** — Runs asynchronously after every turn. Indexes only new exchanges, so it's fast even mid-session
103
+ - **`loci distill`** — Distills undistilled exchanges at session start via `claude --print`. Calls Haiku through the user's Claude Code (default: `claude-haiku-4-5`)
104
+ - **`loci server start`** — Keeps the embedding model (~500MB) resident in memory for sub-0.2s search latency
105
+
106
+ ## Search Output
107
+
108
+ ```json
109
+ [
110
+ {
111
+ "exchange_core": "Added connection pool with pool_size=5",
112
+ "specific_context": "pool_size=5, max_overflow=10",
113
+ "rooms": [
114
+ { "room_type": "concept", "room_key": "db-pool", "room_label": "DB connection pooling" }
115
+ ],
116
+ "symbols": [
117
+ { "name": "create_pool", "file": "src/db.py", "line": 42, "signature": "def create_pool(...)" }
118
+ ],
119
+ "verbatim_ref": "~/.claude/projects/.../session.jsonl:ply=42"
120
+ }
121
+ ]
122
+ ```
123
+
124
+ ## Configuration
125
+
126
+ `.codeatrium/config.toml` (generated by `loci init`):
127
+
128
+ ```toml
129
+ [distill]
130
+ model = "claude-haiku-4-5-20251001" # Model for distillation (default)
131
+ batch_limit = 20 # Max distillations per hook run
132
+
133
+ [index]
134
+ min_chars = 50 # Skip exchanges shorter than this
135
+ ```
136
+
137
+ ## Acknowledgments
138
+
139
+ The palace object model, room-based topic grouping, and BM25+HNSW fusion search are based on:
140
+
141
+ > *Structured Distillation for Personalized Agent Memory*
142
+ > (arXiv:2603.13017)
143
+
144
+
145
+ ## License
146
+
147
+ MIT