dejasearch 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ root = true
2
+
3
+ [*]
4
+ charset = utf-8
5
+ end_of_line = lf
6
+ insert_final_newline = true
7
+ trim_trailing_whitespace = true
8
+ indent_style = space
9
+ indent_size = 4
10
+
11
+ [*.{yml,yaml,toml,json}]
12
+ indent_size = 2
13
+
14
+ [Makefile]
15
+ indent_style = tab
@@ -0,0 +1,38 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ${{ matrix.os }}
12
+ strategy:
13
+ matrix:
14
+ os: [ubuntu-latest, windows-latest]
15
+ python-version: ["3.10", "3.13"]
16
+
17
+ steps:
18
+ - uses: actions/checkout@v5
19
+
20
+ - uses: actions/setup-python@v6
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Cache embedding model
25
+ uses: actions/cache@v5
26
+ with:
27
+ path: |
28
+ ~/.cache/fastembed_cache
29
+ ~/AppData/Local/Temp/fastembed_cache
30
+ key: fastembed-multilingual-e5-small
31
+
32
+ - name: Install
33
+ run: pip install -e ".[dev]"
34
+
35
+ - name: Run tests
36
+ run: pytest -v
37
+ env:
38
+ PYTHONIOENCODING: utf-8
@@ -0,0 +1,12 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.egg-info/
6
+ dist/
7
+ build/
8
+ .eggs/
9
+ .pytest_cache/
10
+ *.db
11
+ *.lock
12
+ tests/golden_pairs.json
@@ -0,0 +1,55 @@
1
+ # Changelog
2
+
3
+ ## 0.3.0 (2026-04-02)
4
+
5
+ ### Features
6
+ - **`get_context` MCP tool** — retrieve a chunk with surrounding turns (±window) from the same session (#4)
7
+ - **Secret filtering** — passwords, API keys, tokens, and private keys are redacted during indexing (#5)
8
+ - **`deja redact`** — update secrets in existing index without re-embedding (seconds vs full reindex)
9
+ - **`platformdirs`** — index stored in OS-standard location; auto-detects legacy `~/.claude/deja/` (#7)
10
+
11
+ ### Tests
12
+ - Tests for get_context, secret redaction, redact command, stats health check (41 total)
13
+
14
+ ### CI
15
+ - Upgraded to Node24 actions (checkout@v5, setup-python@v6, cache@v5)
16
+
17
+ ## 0.2.0 (2026-04-02)
18
+
19
+ ### Breaking Changes
20
+ - `get_session` MCP tool renamed to `get_session_chunks` (honest about returning chunks, not raw messages)
21
+ - Requires FastMCP >= 3.0.0 (upgraded from 2.x)
22
+
23
+ ### Bug Fixes
24
+ - **Incremental indexing correctness (P0)** — message_index no longer collides on incremental runs; dangling user at offset boundary no longer lost; stable upsert preserves rowid (#16)
25
+ - **Streaming indexer** — process turns in batches (TURNS_PER_BATCH=50) instead of loading entire file into memory (#9). Note: fastembed + ONNX runtime still uses ~3GB RAM for the model itself
26
+ - **FTS query** — token-wise AND instead of exact phrase match; `nginx proxy` now finds results with both words in any order (#18)
27
+ - **Search filters** — overfetch candidates (k=100) when project/date filters are active (#19)
28
+ - **FastMCP upgrade** — use public `ctx.lifespan_context` API instead of private `_lifespan_result` (#10)
29
+ - **SQLite threading** — `check_same_thread=False` for async MCP server
30
+ - **Windows UTF-8** — force UTF-8 stdout/stderr on Windows to prevent cp1252 crashes on Cyrillic
31
+ - **Hardcoded paths** — index location changed to `~/.claude/deja/` (no user-specific paths in code)
32
+
33
+ ### Features
34
+ - `deja eval` command with MRR@5 scoring for search quality benchmarking
35
+ - Time decay scoring (disabled by default, `time_decay=True` to enable)
36
+ - Auto-indexing docs in README (Claude Code Stop hook)
37
+ - SQLite indexes on `session_id` and `(project_path, timestamp)`
38
+
39
+ ### Repo
40
+ - GitHub Actions CI (ubuntu + windows, Python 3.10 + 3.13)
41
+ - README with badges, architecture diagram, usage docs
42
+ - CONTRIBUTING.md, .editorconfig, LICENSE (MIT)
43
+ - Social preview image
44
+
45
+ ## 0.1.0 (2026-03-31)
46
+
47
+ Initial MVP release.
48
+
49
+ - JSONL parser with offset support and tool_result separation
50
+ - Chunker with sentence-boundary splitting (1500 chars, 200 overlap)
51
+ - Indexer with incremental/safe reindex, GC, batch embedding
52
+ - Hybrid search: vector KNN + FTS5 + Reciprocal Rank Fusion
53
+ - MCP server (FastMCP, stdio transport) with `search` and `get_session` tools
54
+ - CLI: `deja index`, `deja serve`
55
+ - fastembed (multilingual-e5-small, 384-dim ONNX) + sqlite-vec + SQLite FTS5
@@ -0,0 +1,36 @@
1
+ # Contributing
2
+
3
+ ## Setup
4
+
5
+ ```bash
6
+ git clone https://github.com/CynepMyx/deja.git
7
+ cd deja
8
+ python -m venv .venv
9
+ source .venv/Scripts/activate # Windows (Git Bash)
10
+ # source .venv/bin/activate # Linux/macOS
11
+ pip install -e ".[dev]"
12
+ ```
13
+
14
+ First test run downloads the embedding model (~117 MB).
15
+
16
+ ## Tests
17
+
18
+ ```bash
19
+ pytest -v
20
+ ```
21
+
22
+ Tests use temporary directories and in-memory databases — no external services required.
23
+
24
+ ## Code style
25
+
26
+ - Python 3.10+ (no walrus operator abuse, keep it readable)
27
+ - No docstrings on obvious functions
28
+ - `print(..., file=sys.stderr)` for logging — stdout is reserved for MCP JSON-RPC
29
+ - Follow existing patterns in the codebase
30
+
31
+ ## Pull requests
32
+
33
+ 1. Create a branch from `main`
34
+ 2. Make your changes
35
+ 3. Ensure all tests pass
36
+ 4. Open a PR with a clear description of what and why
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Oleg Usoltsev
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,172 @@
1
+ Metadata-Version: 2.4
2
+ Name: dejasearch
3
+ Version: 0.3.0
4
+ Summary: Semantic search MCP server for Claude Code sessions
5
+ Project-URL: Repository, https://github.com/CynepMyx/deja
6
+ Project-URL: Issues, https://github.com/CynepMyx/deja/issues
7
+ Author: Oleg Usoltsev
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Keywords: claude-code,embeddings,mcp,semantic-search,sqlite
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Software Development :: Libraries
20
+ Requires-Python: >=3.10
21
+ Requires-Dist: fastembed<1.0,>=0.6.0
22
+ Requires-Dist: fastmcp<4.0,>=3.0.0
23
+ Requires-Dist: platformdirs>=4.0
24
+ Requires-Dist: sqlite-vec==0.1.8
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
27
+ Requires-Dist: pytest>=8.0; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # deja
31
+
32
+ [![CI](https://github.com/CynepMyx/deja/actions/workflows/ci.yml/badge.svg)](https://github.com/CynepMyx/deja/actions)
33
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue?logo=python&logoColor=white)](https://python.org)
34
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
35
+ [![MCP](https://img.shields.io/badge/MCP-compatible-8A2BE2?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMTYiIGhlaWdodD0iMTYiIHZpZXdCb3g9IjAgMCAxNiAxNiIgZmlsbD0id2hpdGUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PGNpcmNsZSBjeD0iOCIgY3k9IjgiIHI9IjYiIGZpbGw9Im5vbmUiIHN0cm9rZT0id2hpdGUiIHN0cm9rZS13aWR0aD0iMS41Ii8+PGNpcmNsZSBjeD0iOCIgY3k9IjgiIHI9IjIiLz48L3N2Zz4=)](https://modelcontextprotocol.io/)
36
+
37
+ > Semantic search over your Claude Code session history. Ask questions about past conversations by meaning, not just keywords.
38
+
39
+ **deja** is an [MCP server](https://modelcontextprotocol.io/) that indexes Claude Code JSONL sessions and provides hybrid search (vector + full-text) directly from Claude Code.
40
+
41
+ ## How it works
42
+
43
+ ```
44
+ ~/.claude/projects/*/ deja index index.db
45
+ *.jsonl ──────────────► (SQLite + vec + FTS5)
46
+ embeddings
47
+
48
+ │ deja serve (MCP stdio)
49
+
50
+ Claude Code
51
+ "search past sessions"
52
+ ```
53
+
54
+ 1. **Index** — parses JSONL session files, extracts conversation turns, embeds with `multilingual-e5-small`, stores in SQLite
55
+ 2. **Serve** — MCP server opens the index and answers search queries via stdio transport
56
+
57
+ Search combines vector KNN (semantic similarity) and FTS5 (keyword matching) via Reciprocal Rank Fusion.
58
+
59
+ ## Install
60
+
61
+ ```bash
62
+ git clone https://github.com/CynepMyx/deja.git
63
+ cd deja
64
+ python -m venv .venv
65
+ .venv/Scripts/pip install -e . # Windows
66
+ # .venv/bin/pip install -e . # Linux/macOS
67
+ ```
68
+
69
+ First run downloads the embedding model (~117 MB).
70
+
71
+ ## Usage
72
+
73
+ ### Build the index
74
+
75
+ ```bash
76
+ deja index # incremental — only new/changed files
77
+ deja index --reindex # full rebuild
78
+ ```
79
+
80
+ Scans all `~/.claude/projects/*/*.jsonl` files.
81
+
82
+ ### Add to Claude Code
83
+
84
+ Add to `~/.claude.json` under `mcpServers`:
85
+
86
+ ```json
87
+ "deja": {
88
+ "type": "stdio",
89
+ "command": "/path/to/deja/.venv/Scripts/deja.exe",
90
+ "args": ["serve"],
91
+ "env": {
92
+ "PYTHONUNBUFFERED": "1"
93
+ }
94
+ }
95
+ ```
96
+
97
+ Restart Claude Code — deja will appear as a connected MCP server.
98
+
99
+ ### MCP Tools
100
+
101
+ | Tool | Description |
102
+ |------|-------------|
103
+ | `search` | Hybrid semantic + keyword search across all sessions |
104
+ | `get_session_chunks` | Get indexed chunks for a session (not raw messages) |
+ | `get_context` | Retrieve a chunk with surrounding turns (±window) from the same session |
105
+
106
+ **search** parameters:
107
+ - `query` (string) — what to search for
108
+ - `limit` (int, default 10) — max results
109
+ - `project` (string, optional) — filter by project
110
+ - `date_from` / `date_to` (string, optional) — ISO date range
111
+
112
+ ### Auto-indexing (optional)
113
+
114
+ Index automatically when a Claude Code session ends. Add a Stop hook to `~/.claude/settings.json`:
115
+
116
+ ```json
117
+ "hooks": {
118
+ "Stop": [
119
+ {
120
+ "matcher": "",
121
+ "hooks": [
122
+ {
123
+ "type": "command",
124
+ "command": "/path/to/deja/.venv/bin/deja index"
125
+ }
126
+ ]
127
+ }
128
+ ]
129
+ }
130
+ ```
131
+
132
+ On Windows with Git Bash, wrap in a shell script:
133
+
134
+ ```bash
135
+ #!/bin/bash
136
+ DEJA="/path/to/deja/.venv/Scripts/deja.exe"
137
+ [ -f "$DEJA" ] && "$DEJA" index >/dev/null 2>&1 &
138
+ ```
139
+
140
+ ```json
141
+ "command": "bash /path/to/deja-index.sh"
142
+ ```
143
+
144
+ PID lock prevents concurrent indexers — safe with multiple sessions.
145
+
146
+ ## Stack
147
+
148
+ - **[fastembed](https://github.com/qdrant/fastembed)** — ONNX embeddings (`intfloat/multilingual-e5-small`, 384-dim)
149
+ - **[sqlite-vec](https://github.com/asg017/sqlite-vec)** — vector KNN search in SQLite
150
+ - **SQLite FTS5** — full-text keyword search
151
+ - **[FastMCP](https://github.com/jlowin/fastmcp)** — MCP server framework
152
+
153
+ ## Performance
154
+
155
+ | Metric | Value |
156
+ |--------|-------|
157
+ | Incremental index | < 30 sec |
158
+ | Search latency (warm) | < 500 ms |
159
+ | First search (cold start) | < 5 sec |
160
+ | RAM (search) | ~150 MB |
161
+ | RAM (indexing) | ~300 MB |
162
+
163
+ ## Development
164
+
165
+ ```bash
166
+ pip install -e ".[dev]"
167
+ pytest
168
+ ```
169
+
170
+ ## License
171
+
172
+ [MIT](LICENSE)
@@ -0,0 +1,143 @@
1
+ # deja
2
+
3
+ [![CI](https://github.com/CynepMyx/deja/actions/workflows/ci.yml/badge.svg)](https://github.com/CynepMyx/deja/actions)
4
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue?logo=python&logoColor=white)](https://python.org)
5
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
6
+ [![MCP](https://img.shields.io/badge/MCP-compatible-8A2BE2?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMTYiIGhlaWdodD0iMTYiIHZpZXdCb3g9IjAgMCAxNiAxNiIgZmlsbD0id2hpdGUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PGNpcmNsZSBjeD0iOCIgY3k9IjgiIHI9IjYiIGZpbGw9Im5vbmUiIHN0cm9rZT0id2hpdGUiIHN0cm9rZS13aWR0aD0iMS41Ii8+PGNpcmNsZSBjeD0iOCIgY3k9IjgiIHI9IjIiLz48L3N2Zz4=)](https://modelcontextprotocol.io/)
7
+
8
+ > Semantic search over your Claude Code session history. Ask questions about past conversations by meaning, not just keywords.
9
+
10
+ **deja** is an [MCP server](https://modelcontextprotocol.io/) that indexes Claude Code JSONL sessions and provides hybrid search (vector + full-text) directly from Claude Code.
11
+
12
+ ## How it works
13
+
14
+ ```
15
+ ~/.claude/projects/*/ deja index index.db
16
+ *.jsonl ──────────────► (SQLite + vec + FTS5)
17
+ embeddings
18
+
19
+ │ deja serve (MCP stdio)
20
+
21
+ Claude Code
22
+ "search past sessions"
23
+ ```
24
+
25
+ 1. **Index** — parses JSONL session files, extracts conversation turns, embeds with `multilingual-e5-small`, stores in SQLite
26
+ 2. **Serve** — MCP server opens the index and answers search queries via stdio transport
27
+
28
+ Search combines vector KNN (semantic similarity) and FTS5 (keyword matching) via Reciprocal Rank Fusion.
29
+
30
+ ## Install
31
+
32
+ ```bash
33
+ git clone https://github.com/CynepMyx/deja.git
34
+ cd deja
35
+ python -m venv .venv
36
+ .venv/Scripts/pip install -e . # Windows
37
+ # .venv/bin/pip install -e . # Linux/macOS
38
+ ```
39
+
40
+ First run downloads the embedding model (~117 MB).
41
+
42
+ ## Usage
43
+
44
+ ### Build the index
45
+
46
+ ```bash
47
+ deja index # incremental — only new/changed files
48
+ deja index --reindex # full rebuild
49
+ ```
50
+
51
+ Scans all `~/.claude/projects/*/*.jsonl` files.
52
+
53
+ ### Add to Claude Code
54
+
55
+ Add to `~/.claude.json` under `mcpServers`:
56
+
57
+ ```json
58
+ "deja": {
59
+ "type": "stdio",
60
+ "command": "/path/to/deja/.venv/Scripts/deja.exe",
61
+ "args": ["serve"],
62
+ "env": {
63
+ "PYTHONUNBUFFERED": "1"
64
+ }
65
+ }
66
+ ```
67
+
68
+ Restart Claude Code — deja will appear as a connected MCP server.
69
+
70
+ ### MCP Tools
71
+
72
+ | Tool | Description |
73
+ |------|-------------|
74
+ | `search` | Hybrid semantic + keyword search across all sessions |
75
+ | `get_session_chunks` | Get indexed chunks for a session (not raw messages) |
+ | `get_context` | Retrieve a chunk with surrounding turns (±window) from the same session |
76
+
77
+ **search** parameters:
78
+ - `query` (string) — what to search for
79
+ - `limit` (int, default 10) — max results
80
+ - `project` (string, optional) — filter by project
81
+ - `date_from` / `date_to` (string, optional) — ISO date range
82
+
83
+ ### Auto-indexing (optional)
84
+
85
+ Index automatically when a Claude Code session ends. Add a Stop hook to `~/.claude/settings.json`:
86
+
87
+ ```json
88
+ "hooks": {
89
+ "Stop": [
90
+ {
91
+ "matcher": "",
92
+ "hooks": [
93
+ {
94
+ "type": "command",
95
+ "command": "/path/to/deja/.venv/bin/deja index"
96
+ }
97
+ ]
98
+ }
99
+ ]
100
+ }
101
+ ```
102
+
103
+ On Windows with Git Bash, wrap in a shell script:
104
+
105
+ ```bash
106
+ #!/bin/bash
107
+ DEJA="/path/to/deja/.venv/Scripts/deja.exe"
108
+ [ -f "$DEJA" ] && "$DEJA" index >/dev/null 2>&1 &
109
+ ```
110
+
111
+ ```json
112
+ "command": "bash /path/to/deja-index.sh"
113
+ ```
114
+
115
+ PID lock prevents concurrent indexers — safe with multiple sessions.
116
+
117
+ ## Stack
118
+
119
+ - **[fastembed](https://github.com/qdrant/fastembed)** — ONNX embeddings (`intfloat/multilingual-e5-small`, 384-dim)
120
+ - **[sqlite-vec](https://github.com/asg017/sqlite-vec)** — vector KNN search in SQLite
121
+ - **SQLite FTS5** — full-text keyword search
122
+ - **[FastMCP](https://github.com/jlowin/fastmcp)** — MCP server framework
123
+
124
+ ## Performance
125
+
126
+ | Metric | Value |
127
+ |--------|-------|
128
+ | Incremental index | < 30 sec |
129
+ | Search latency (warm) | < 500 ms |
130
+ | First search (cold start) | < 5 sec |
131
+ | RAM (search) | ~150 MB |
132
+ | RAM (indexing) | ~300 MB |
133
+
134
+ ## Development
135
+
136
+ ```bash
137
+ pip install -e ".[dev]"
138
+ pytest
139
+ ```
140
+
141
+ ## License
142
+
143
+ [MIT](LICENSE)
@@ -0,0 +1,51 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "dejasearch"
7
+ version = "0.3.0"
8
+ description = "Semantic search MCP server for Claude Code sessions"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ authors = [
13
+ { name = "Oleg Usoltsev" },
14
+ ]
15
+ keywords = ["claude-code", "mcp", "semantic-search", "embeddings", "sqlite"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Developers",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
25
+ "Topic :: Software Development :: Libraries",
26
+ ]
27
+ dependencies = [
28
+ "fastembed>=0.6.0,<1.0",
29
+ "sqlite-vec==0.1.8",
30
+ "fastmcp>=3.0.0,<4.0",
31
+ "platformdirs>=4.0",
32
+ ]
33
+
34
+ [project.optional-dependencies]
35
+ dev = [
36
+ "pytest>=8.0",
37
+ "pytest-asyncio>=0.23",
38
+ ]
39
+
40
+ [project.urls]
41
+ Repository = "https://github.com/CynepMyx/deja"
42
+ Issues = "https://github.com/CynepMyx/deja/issues"
43
+
44
+ [project.scripts]
45
+ deja = "deja.cli:main"
46
+
47
+ [tool.hatch.build.targets.wheel]
48
+ packages = ["src/deja"]
49
+
50
+ [tool.pytest.ini_options]
51
+ testpaths = ["tests"]
@@ -0,0 +1 @@
1
+ __version__ = "0.3.0"
@@ -0,0 +1,47 @@
1
+ MAX_CHUNK_SIZE = 1500
2
+ OVERLAP = 200
3
+
4
+ def _split_text(text: str) -> list[str]:
5
+ if len(text) <= MAX_CHUNK_SIZE:
6
+ return [text]
7
+
8
+ chunks = []
9
+ start = 0
10
+ while start < len(text):
11
+ end = start + MAX_CHUNK_SIZE
12
+ if end >= len(text):
13
+ chunks.append(text[start:])
14
+ break
15
+
16
+ search_region = text[end - OVERLAP:end]
17
+ for sep in ["\n\n", ". ", ".\n", "\n"]:
18
+ pos = search_region.rfind(sep)
19
+ if pos != -1:
20
+ end = end - OVERLAP + pos + len(sep)
21
+ break
22
+
23
+ chunks.append(text[start:end])
24
+ start = end - OVERLAP
25
+ if start < 0:
26
+ start = 0
27
+
28
+ return chunks
29
+
30
def make_chunks(
    turn: dict, session_id: str, project_path: str
) -> list[dict]:
    """Build one chunk record per split piece of a conversation turn.

    The turn's ``user_text`` and ``assistant_text`` are joined with a blank
    line and passed through ``_split_text``. Every resulting piece becomes a
    dict carrying the piece text, the session id, the turn's
    ``message_index``, the piece's position (``split_index``), the turn's
    ``timestamp`` (``""`` if absent) and the project path. The turn's
    ``tool_result_text`` (``""`` if absent) is attached to the first piece
    only; later pieces get an empty string.

    Raises:
        KeyError: if ``turn`` lacks ``user_text``, ``assistant_text`` or
            ``message_index``.
    """
    combined = f"{turn['user_text']}\n\n{turn['assistant_text']}"

    records: list[dict] = []
    for idx, piece in enumerate(_split_text(combined)):
        # Tool output is stored once per turn, on the leading piece.
        if idx == 0:
            tool_text = turn.get("tool_result_text", "")
        else:
            tool_text = ""
        records.append(
            {
                "chunk_text": piece,
                "tool_result_text": tool_text,
                "session_id": session_id,
                "message_index": turn["message_index"],
                "split_index": idx,
                "timestamp": turn.get("timestamp", ""),
                "project_path": project_path,
            }
        )
    return records