dejasearch 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dejasearch-0.3.0/.editorconfig +15 -0
- dejasearch-0.3.0/.github/workflows/ci.yml +38 -0
- dejasearch-0.3.0/.gitignore +12 -0
- dejasearch-0.3.0/CHANGELOG.md +55 -0
- dejasearch-0.3.0/CONTRIBUTING.md +36 -0
- dejasearch-0.3.0/LICENSE +21 -0
- dejasearch-0.3.0/PKG-INFO +172 -0
- dejasearch-0.3.0/README.md +143 -0
- dejasearch-0.3.0/pyproject.toml +51 -0
- dejasearch-0.3.0/src/deja/__init__.py +1 -0
- dejasearch-0.3.0/src/deja/chunker.py +47 -0
- dejasearch-0.3.0/src/deja/cli.py +265 -0
- dejasearch-0.3.0/src/deja/config.py +26 -0
- dejasearch-0.3.0/src/deja/db.py +92 -0
- dejasearch-0.3.0/src/deja/eval.py +83 -0
- dejasearch-0.3.0/src/deja/indexer.py +194 -0
- dejasearch-0.3.0/src/deja/parser.py +104 -0
- dejasearch-0.3.0/src/deja/search.py +134 -0
- dejasearch-0.3.0/src/deja/secrets.py +48 -0
- dejasearch-0.3.0/src/deja/server.py +133 -0
- dejasearch-0.3.0/tests/__init__.py +0 -0
- dejasearch-0.3.0/tests/test_chunker.py +56 -0
- dejasearch-0.3.0/tests/test_cli.py +115 -0
- dejasearch-0.3.0/tests/test_db.py +35 -0
- dejasearch-0.3.0/tests/test_indexer.py +186 -0
- dejasearch-0.3.0/tests/test_integration.py +42 -0
- dejasearch-0.3.0/tests/test_parser.py +101 -0
- dejasearch-0.3.0/tests/test_search.py +49 -0
- dejasearch-0.3.0/tests/test_secrets.py +49 -0
- dejasearch-0.3.0/tests/test_server.py +75 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ${{ matrix.os }}
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
os: [ubuntu-latest, windows-latest]
|
|
15
|
+
python-version: ["3.10", "3.13"]
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v5
|
|
19
|
+
|
|
20
|
+
- uses: actions/setup-python@v6
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Cache embedding model
|
|
25
|
+
uses: actions/cache@v5
|
|
26
|
+
with:
|
|
27
|
+
path: |
|
|
28
|
+
~/.cache/fastembed_cache
|
|
29
|
+
~/AppData/Local/Temp/fastembed_cache
|
|
30
|
+
key: fastembed-multilingual-e5-small
|
|
31
|
+
|
|
32
|
+
- name: Install
|
|
33
|
+
run: pip install -e ".[dev]"
|
|
34
|
+
|
|
35
|
+
- name: Run tests
|
|
36
|
+
run: pytest -v
|
|
37
|
+
env:
|
|
38
|
+
PYTHONIOENCODING: utf-8
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.3.0 (2026-04-02)
|
|
4
|
+
|
|
5
|
+
### Features
|
|
6
|
+
- **`get_context` MCP tool** — retrieve a chunk with surrounding turns (±window) from the same session (#4)
|
|
7
|
+
- **Secret filtering** — passwords, API keys, tokens, and private keys are redacted during indexing (#5)
|
|
8
|
+
- **`deja redact`** — redact secrets in an existing index without re-embedding (takes seconds instead of a full reindex)
|
|
9
|
+
- **`platformdirs`** — index stored in OS-standard location; auto-detects legacy `~/.claude/deja/` (#7)
|
|
10
|
+
|
|
11
|
+
### Tests
|
|
12
|
+
- Tests for get_context, secret redaction, redact command, stats health check (41 total)
|
|
13
|
+
|
|
14
|
+
### CI
|
|
15
|
+
- Upgraded to Node24 actions (checkout@v5, setup-python@v6, cache@v5)
|
|
16
|
+
|
|
17
|
+
## 0.2.0 (2026-04-02)
|
|
18
|
+
|
|
19
|
+
### Breaking Changes
|
|
20
|
+
- `get_session` MCP tool renamed to `get_session_chunks` (honest about returning chunks, not raw messages)
|
|
21
|
+
- Requires FastMCP >= 3.0.0 (upgraded from 2.x)
|
|
22
|
+
|
|
23
|
+
### Bug Fixes
|
|
24
|
+
- **Incremental indexing correctness (P0)** — `message_index` no longer collides on incremental runs; a dangling user message at the offset boundary is no longer lost; stable upsert preserves rowid (#16)
|
|
25
|
+
- **Streaming indexer** — process turns in batches (TURNS_PER_BATCH=50) instead of loading entire file into memory (#9). Note: fastembed + ONNX runtime still uses ~3GB RAM for the model itself
|
|
26
|
+
- **FTS query** — token-wise AND instead of exact phrase match; `nginx proxy` now finds results with both words in any order (#18)
|
|
27
|
+
- **Search filters** — overfetch candidates (k=100) when project/date filters are active (#19)
|
|
28
|
+
- **FastMCP upgrade** — use public `ctx.lifespan_context` API instead of private `_lifespan_result` (#10)
|
|
29
|
+
- **SQLite threading** — `check_same_thread=False` for async MCP server
|
|
30
|
+
- **Windows UTF-8** — force UTF-8 stdout/stderr on Windows to prevent cp1252 crashes on Cyrillic
|
|
31
|
+
- **Hardcoded paths** — index location changed to `~/.claude/deja/` (no user-specific paths in code)
|
|
32
|
+
|
|
33
|
+
### Features
|
|
34
|
+
- `deja eval` command with MRR@5 scoring for search quality benchmarking
|
|
35
|
+
- Time decay scoring (disabled by default, `time_decay=True` to enable)
|
|
36
|
+
- Auto-indexing docs in README (Claude Code Stop hook)
|
|
37
|
+
- SQLite indexes on `session_id` and `(project_path, timestamp)`
|
|
38
|
+
|
|
39
|
+
### Repo
|
|
40
|
+
- GitHub Actions CI (ubuntu + windows, Python 3.10 + 3.13)
|
|
41
|
+
- README with badges, architecture diagram, usage docs
|
|
42
|
+
- CONTRIBUTING.md, .editorconfig, LICENSE (MIT)
|
|
43
|
+
- Social preview image
|
|
44
|
+
|
|
45
|
+
## 0.1.0 (2026-03-31)
|
|
46
|
+
|
|
47
|
+
Initial MVP release.
|
|
48
|
+
|
|
49
|
+
- JSONL parser with offset support and tool_result separation
|
|
50
|
+
- Chunker with sentence-boundary splitting (1500 chars, 200 overlap)
|
|
51
|
+
- Indexer with incremental/safe reindex, GC, batch embedding
|
|
52
|
+
- Hybrid search: vector KNN + FTS5 + Reciprocal Rank Fusion
|
|
53
|
+
- MCP server (FastMCP, stdio transport) with `search` and `get_session` tools
|
|
54
|
+
- CLI: `deja index`, `deja serve`
|
|
55
|
+
- fastembed (multilingual-e5-small, 384-dim ONNX) + sqlite-vec + SQLite FTS5
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
## Setup
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
git clone https://github.com/CynepMyx/deja.git
|
|
7
|
+
cd deja
|
|
8
|
+
python -m venv .venv
|
|
9
|
+
.venv/Scripts/activate # Windows
|
|
10
|
+
# source .venv/bin/activate # Linux/macOS
|
|
11
|
+
pip install -e ".[dev]"
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
First test run downloads the embedding model (~117 MB).
|
|
15
|
+
|
|
16
|
+
## Tests
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pytest -v
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Tests use temporary directories and in-memory databases — no external services required.
|
|
23
|
+
|
|
24
|
+
## Code style
|
|
25
|
+
|
|
26
|
+
- Python 3.10+ (no walrus operator abuse, keep it readable)
|
|
27
|
+
- No docstrings on obvious functions
|
|
28
|
+
- `print(..., file=sys.stderr)` for logging — stdout is reserved for MCP JSON-RPC
|
|
29
|
+
- Follow existing patterns in the codebase
|
|
30
|
+
|
|
31
|
+
## Pull requests
|
|
32
|
+
|
|
33
|
+
1. Create a branch from `main`
|
|
34
|
+
2. Make your changes
|
|
35
|
+
3. Ensure all tests pass
|
|
36
|
+
4. Open a PR with a clear description of what and why
|
dejasearch-0.3.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Oleg Usoltsev
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dejasearch
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Semantic search MCP server for Claude Code sessions
|
|
5
|
+
Project-URL: Repository, https://github.com/CynepMyx/deja
|
|
6
|
+
Project-URL: Issues, https://github.com/CynepMyx/deja/issues
|
|
7
|
+
Author: Oleg Usoltsev
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: claude-code,embeddings,mcp,semantic-search,sqlite
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: fastembed<1.0,>=0.6.0
|
|
22
|
+
Requires-Dist: fastmcp<4.0,>=3.0.0
|
|
23
|
+
Requires-Dist: platformdirs>=4.0
|
|
24
|
+
Requires-Dist: sqlite-vec==0.1.8
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
27
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# deja
|
|
31
|
+
|
|
32
|
+
[CI](https://github.com/CynepMyx/deja/actions)
|
|
33
|
+
[Python 3.10+](https://python.org)
|
|
34
|
+
[License: MIT](LICENSE)
|
|
35
|
+
[MCP](https://modelcontextprotocol.io/)
|
|
36
|
+
|
|
37
|
+
> Semantic search over your Claude Code session history. Ask questions about past conversations by meaning, not just keywords.
|
|
38
|
+
|
|
39
|
+
**deja** is an [MCP server](https://modelcontextprotocol.io/) that indexes Claude Code JSONL sessions and provides hybrid search (vector + full-text) directly from Claude Code.
|
|
40
|
+
|
|
41
|
+
## How it works
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
~/.claude/projects/*/ deja index index.db
|
|
45
|
+
*.jsonl ──────────────► (SQLite + vec + FTS5)
|
|
46
|
+
embeddings
|
|
47
|
+
│
|
|
48
|
+
│ deja serve (MCP stdio)
|
|
49
|
+
▼
|
|
50
|
+
Claude Code
|
|
51
|
+
"search past sessions"
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
1. **Index** — parses JSONL session files, extracts conversation turns, embeds with `multilingual-e5-small`, stores in SQLite
|
|
55
|
+
2. **Serve** — MCP server opens the index and answers search queries via stdio transport
|
|
56
|
+
|
|
57
|
+
Search combines vector KNN (semantic similarity) and FTS5 (keyword matching) via Reciprocal Rank Fusion.
|
|
58
|
+
|
|
59
|
+
## Install
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
git clone https://github.com/CynepMyx/deja.git
|
|
63
|
+
cd deja
|
|
64
|
+
python -m venv .venv
|
|
65
|
+
.venv/Scripts/pip install -e . # Windows
|
|
66
|
+
# .venv/bin/pip install -e . # Linux/macOS
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
First run downloads the embedding model (~117 MB).
|
|
70
|
+
|
|
71
|
+
## Usage
|
|
72
|
+
|
|
73
|
+
### Build the index
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
deja index # incremental — only new/changed files
|
|
77
|
+
deja index --reindex # full rebuild
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Scans all `~/.claude/projects/*/*.jsonl` files.
|
|
81
|
+
|
|
82
|
+
### Add to Claude Code
|
|
83
|
+
|
|
84
|
+
Add to `~/.claude.json` under `mcpServers`:
|
|
85
|
+
|
|
86
|
+
```json
|
|
87
|
+
"deja": {
|
|
88
|
+
"type": "stdio",
|
|
89
|
+
"command": "/path/to/deja/.venv/Scripts/deja.exe",
|
|
90
|
+
"args": ["serve"],
|
|
91
|
+
"env": {
|
|
92
|
+
"PYTHONUNBUFFERED": "1"
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Restart Claude Code — deja will appear as a connected MCP server.
|
|
98
|
+
|
|
99
|
+
### MCP Tools
|
|
100
|
+
|
|
101
|
+
| Tool | Description |
|
|
102
|
+
|------|-------------|
|
|
103
|
+
| `search` | Hybrid semantic + keyword search across all sessions |
|
|
104
|
+
| `get_session_chunks` | Get indexed chunks for a session (not raw messages) |
|
|
105
|
+
|
|
106
|
+
**search** parameters:
|
|
107
|
+
- `query` (string) — what to search for
|
|
108
|
+
- `limit` (int, default 10) — max results
|
|
109
|
+
- `project` (string, optional) — filter by project
|
|
110
|
+
- `date_from` / `date_to` (string, optional) — ISO date range
|
|
111
|
+
|
|
112
|
+
### Auto-indexing (optional)
|
|
113
|
+
|
|
114
|
+
Index automatically when a Claude Code session ends. Add a Stop hook to `~/.claude/settings.json`:
|
|
115
|
+
|
|
116
|
+
```json
|
|
117
|
+
"hooks": {
|
|
118
|
+
"Stop": [
|
|
119
|
+
{
|
|
120
|
+
"matcher": "",
|
|
121
|
+
"hooks": [
|
|
122
|
+
{
|
|
123
|
+
"type": "command",
|
|
124
|
+
"command": "/path/to/deja/.venv/bin/deja index"
|
|
125
|
+
}
|
|
126
|
+
]
|
|
127
|
+
}
|
|
128
|
+
]
|
|
129
|
+
}
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
On Windows with Git Bash, wrap in a shell script:
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
#!/bin/bash
|
|
136
|
+
DEJA="/path/to/deja/.venv/Scripts/deja.exe"
|
|
137
|
+
[ -f "$DEJA" ] && "$DEJA" index >/dev/null 2>&1 &
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
```json
|
|
141
|
+
"command": "bash /path/to/deja-index.sh"
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
PID lock prevents concurrent indexers — safe with multiple sessions.
|
|
145
|
+
|
|
146
|
+
## Stack
|
|
147
|
+
|
|
148
|
+
- **[fastembed](https://github.com/qdrant/fastembed)** — ONNX embeddings (`intfloat/multilingual-e5-small`, 384-dim)
|
|
149
|
+
- **[sqlite-vec](https://github.com/asg017/sqlite-vec)** — vector KNN search in SQLite
|
|
150
|
+
- **SQLite FTS5** — full-text keyword search
|
|
151
|
+
- **[FastMCP](https://github.com/jlowin/fastmcp)** — MCP server framework
|
|
152
|
+
|
|
153
|
+
## Performance
|
|
154
|
+
|
|
155
|
+
| Metric | Value |
|
|
156
|
+
|--------|-------|
|
|
157
|
+
| Incremental index | < 30 sec |
|
|
158
|
+
| Search latency (warm) | < 500 ms |
|
|
159
|
+
| First search (cold start) | < 5 sec |
|
|
160
|
+
| RAM (search) | ~150 MB |
|
|
161
|
+
| RAM (indexing) | ~300 MB |
|
|
162
|
+
|
|
163
|
+
## Development
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
pip install -e ".[dev]"
|
|
167
|
+
pytest
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## License
|
|
171
|
+
|
|
172
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# deja
|
|
2
|
+
|
|
3
|
+
[CI](https://github.com/CynepMyx/deja/actions)
|
|
4
|
+
[Python 3.10+](https://python.org)
|
|
5
|
+
[License: MIT](LICENSE)
|
|
6
|
+
[MCP](https://modelcontextprotocol.io/)
|
|
7
|
+
|
|
8
|
+
> Semantic search over your Claude Code session history. Ask questions about past conversations by meaning, not just keywords.
|
|
9
|
+
|
|
10
|
+
**deja** is an [MCP server](https://modelcontextprotocol.io/) that indexes Claude Code JSONL sessions and provides hybrid search (vector + full-text) directly from Claude Code.
|
|
11
|
+
|
|
12
|
+
## How it works
|
|
13
|
+
|
|
14
|
+
```
|
|
15
|
+
~/.claude/projects/*/ deja index index.db
|
|
16
|
+
*.jsonl ──────────────► (SQLite + vec + FTS5)
|
|
17
|
+
embeddings
|
|
18
|
+
│
|
|
19
|
+
│ deja serve (MCP stdio)
|
|
20
|
+
▼
|
|
21
|
+
Claude Code
|
|
22
|
+
"search past sessions"
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
1. **Index** — parses JSONL session files, extracts conversation turns, embeds with `multilingual-e5-small`, stores in SQLite
|
|
26
|
+
2. **Serve** — MCP server opens the index and answers search queries via stdio transport
|
|
27
|
+
|
|
28
|
+
Search combines vector KNN (semantic similarity) and FTS5 (keyword matching) via Reciprocal Rank Fusion.
|
|
29
|
+
|
|
30
|
+
## Install
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
git clone https://github.com/CynepMyx/deja.git
|
|
34
|
+
cd deja
|
|
35
|
+
python -m venv .venv
|
|
36
|
+
.venv/Scripts/pip install -e . # Windows
|
|
37
|
+
# .venv/bin/pip install -e . # Linux/macOS
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
First run downloads the embedding model (~117 MB).
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
|
|
44
|
+
### Build the index
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
deja index # incremental — only new/changed files
|
|
48
|
+
deja index --reindex # full rebuild
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Scans all `~/.claude/projects/*/*.jsonl` files.
|
|
52
|
+
|
|
53
|
+
### Add to Claude Code
|
|
54
|
+
|
|
55
|
+
Add to `~/.claude.json` under `mcpServers`:
|
|
56
|
+
|
|
57
|
+
```json
|
|
58
|
+
"deja": {
|
|
59
|
+
"type": "stdio",
|
|
60
|
+
"command": "/path/to/deja/.venv/Scripts/deja.exe",
|
|
61
|
+
"args": ["serve"],
|
|
62
|
+
"env": {
|
|
63
|
+
"PYTHONUNBUFFERED": "1"
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Restart Claude Code — deja will appear as a connected MCP server.
|
|
69
|
+
|
|
70
|
+
### MCP Tools
|
|
71
|
+
|
|
72
|
+
| Tool | Description |
|
|
73
|
+
|------|-------------|
|
|
74
|
+
| `search` | Hybrid semantic + keyword search across all sessions |
|
|
75
|
+
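| `get_session_chunks` | Get indexed chunks for a session (not raw messages) |
| `get_context` | Get a chunk together with its surrounding turns (±window) from the same session |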
|
|
76
|
+
|
|
77
|
+
**search** parameters:
|
|
78
|
+
- `query` (string) — what to search for
|
|
79
|
+
- `limit` (int, default 10) — max results
|
|
80
|
+
- `project` (string, optional) — filter by project
|
|
81
|
+
- `date_from` / `date_to` (string, optional) — ISO date range
|
|
82
|
+
|
|
83
|
+
### Auto-indexing (optional)
|
|
84
|
+
|
|
85
|
+
Index automatically when a Claude Code session ends. Add a Stop hook to `~/.claude/settings.json`:
|
|
86
|
+
|
|
87
|
+
```json
|
|
88
|
+
"hooks": {
|
|
89
|
+
"Stop": [
|
|
90
|
+
{
|
|
91
|
+
"matcher": "",
|
|
92
|
+
"hooks": [
|
|
93
|
+
{
|
|
94
|
+
"type": "command",
|
|
95
|
+
"command": "/path/to/deja/.venv/bin/deja index"
|
|
96
|
+
}
|
|
97
|
+
]
|
|
98
|
+
}
|
|
99
|
+
]
|
|
100
|
+
}
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
On Windows with Git Bash, wrap in a shell script:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
#!/bin/bash
|
|
107
|
+
DEJA="/path/to/deja/.venv/Scripts/deja.exe"
|
|
108
|
+
[ -f "$DEJA" ] && "$DEJA" index >/dev/null 2>&1 &
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
```json
|
|
112
|
+
"command": "bash /path/to/deja-index.sh"
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
PID lock prevents concurrent indexers — safe with multiple sessions.
|
|
116
|
+
|
|
117
|
+
## Stack
|
|
118
|
+
|
|
119
|
+
- **[fastembed](https://github.com/qdrant/fastembed)** — ONNX embeddings (`intfloat/multilingual-e5-small`, 384-dim)
|
|
120
|
+
- **[sqlite-vec](https://github.com/asg017/sqlite-vec)** — vector KNN search in SQLite
|
|
121
|
+
- **SQLite FTS5** — full-text keyword search
|
|
122
|
+
- **[FastMCP](https://github.com/jlowin/fastmcp)** — MCP server framework
|
|
123
|
+
|
|
124
|
+
## Performance
|
|
125
|
+
|
|
126
|
+
| Metric | Value |
|
|
127
|
+
|--------|-------|
|
|
128
|
+
| Incremental index | < 30 sec |
|
|
129
|
+
| Search latency (warm) | < 500 ms |
|
|
130
|
+
| First search (cold start) | < 5 sec |
|
|
131
|
+
| RAM (search) | ~150 MB |
|
|
132
|
+
| RAM (indexing) | ~300 MB |
|
|
133
|
+
|
|
134
|
+
## Development
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
pip install -e ".[dev]"
|
|
138
|
+
pytest
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## License
|
|
142
|
+
|
|
143
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "dejasearch"
|
|
7
|
+
version = "0.3.0"
|
|
8
|
+
description = "Semantic search MCP server for Claude Code sessions"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Oleg Usoltsev" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["claude-code", "mcp", "semantic-search", "embeddings", "sqlite"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Programming Language :: Python :: 3.13",
|
|
25
|
+
"Topic :: Software Development :: Libraries",
|
|
26
|
+
]
|
|
27
|
+
dependencies = [
|
|
28
|
+
"fastembed>=0.6.0,<1.0",
|
|
29
|
+
"sqlite-vec==0.1.8",
|
|
30
|
+
"fastmcp>=3.0.0,<4.0",
|
|
31
|
+
"platformdirs>=4.0",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
dev = [
|
|
36
|
+
"pytest>=8.0",
|
|
37
|
+
"pytest-asyncio>=0.23",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Repository = "https://github.com/CynepMyx/deja"
|
|
42
|
+
Issues = "https://github.com/CynepMyx/deja/issues"
|
|
43
|
+
|
|
44
|
+
[project.scripts]
|
|
45
|
+
deja = "deja.cli:main"
|
|
46
|
+
|
|
47
|
+
[tool.hatch.build.targets.wheel]
|
|
48
|
+
packages = ["src/deja"]
|
|
49
|
+
|
|
50
|
+
[tool.pytest.ini_options]
|
|
51
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.3.0"
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Upper bound, in characters, on the length of a single chunk.
MAX_CHUNK_SIZE = 1500
# Number of characters shared between the end of one chunk and the start of
# the next; also the width of the window searched for a natural break point.
OVERLAP = 200


def _split_text(text: str) -> list[str]:
    """Split *text* into overlapping chunks of at most MAX_CHUNK_SIZE characters.

    Text that already fits is returned unchanged as a single-element list
    (this includes the empty string). Otherwise each chunk is cut at the last
    natural break found in the final OVERLAP characters of its window, and the
    next chunk starts OVERLAP characters before that cut.

    Returns:
        The chunks in document order; never an empty list.
    """
    if len(text) <= MAX_CHUNK_SIZE:
        return [text]

    chunks = []
    start = 0
    while start < len(text):
        end = start + MAX_CHUNK_SIZE
        if end >= len(text):
            # The remainder fits in one window: emit it and stop.
            chunks.append(text[start:])
            break

        # Only the tail of the window is searched for a break. Clamp to
        # `start` so a large OVERLAP can never produce a negative slice index
        # (which would silently search the wrong part of the text).
        region_start = max(end - OVERLAP, start)
        search_region = text[region_start:end]
        # Separators are tried in priority order (paragraph break, sentence
        # end, line break); the first kind that occurs wins, cut after its
        # last occurrence.
        for sep in ("\n\n", ". ", ".\n", "\n"):
            pos = search_region.rfind(sep)
            if pos != -1:
                end = region_start + pos + len(sep)
                break

        chunks.append(text[start:end])
        # Step back by OVERLAP for context, but always move forward by at
        # least one character so the loop terminates even if the constants
        # are retuned so that OVERLAP approaches MAX_CHUNK_SIZE.
        start = max(end - OVERLAP, start + 1)

    return chunks
|
|
29
|
+
|
|
30
|
+
def make_chunks(
    turn: dict, session_id: str, project_path: str
) -> list[dict]:
    """Build the chunk records for one conversation turn.

    The turn's ``user_text`` and ``assistant_text`` are joined with a blank
    line and passed through ``_split_text``; one record is produced per
    resulting piece, numbered by ``split_index``.

    Raises:
        KeyError: if *turn* lacks ``user_text``, ``assistant_text`` or
            ``message_index``.
    """
    user_part = turn["user_text"]
    assistant_part = turn["assistant_text"]
    pieces = _split_text(f"{user_part}\n\n{assistant_part}")

    records: list[dict] = []
    for split_no, piece in enumerate(pieces):
        # Tool output is attached to the first piece only; later pieces of
        # the same turn carry an empty string.
        if split_no == 0:
            tool_text = turn.get("tool_result_text", "")
        else:
            tool_text = ""

        record = {
            "chunk_text": piece,
            "tool_result_text": tool_text,
            "session_id": session_id,
            "message_index": turn["message_index"],
            "split_index": split_no,
            "timestamp": turn.get("timestamp", ""),
            "project_path": project_path,
        }
        records.append(record)

    return records
|