stele-context 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. stele_context-0.7.0/LICENSE +21 -0
  2. stele_context-0.7.0/PKG-INFO +554 -0
  3. stele_context-0.7.0/README.md +481 -0
  4. stele_context-0.7.0/pyproject.toml +115 -0
  5. stele_context-0.7.0/setup.cfg +4 -0
  6. stele_context-0.7.0/stele/__init__.py +36 -0
  7. stele_context-0.7.0/stele/bm25.py +125 -0
  8. stele_context-0.7.0/stele/chunkers/__init__.py +67 -0
  9. stele_context-0.7.0/stele/chunkers/audio.py +198 -0
  10. stele_context-0.7.0/stele/chunkers/base.py +307 -0
  11. stele_context-0.7.0/stele/chunkers/code.py +613 -0
  12. stele_context-0.7.0/stele/chunkers/image.py +277 -0
  13. stele_context-0.7.0/stele/chunkers/numpy_compat.py +77 -0
  14. stele_context-0.7.0/stele/chunkers/pdf.py +173 -0
  15. stele_context-0.7.0/stele/chunkers/text.py +319 -0
  16. stele_context-0.7.0/stele/chunkers/video.py +254 -0
  17. stele_context-0.7.0/stele/cli.py +579 -0
  18. stele_context-0.7.0/stele/cli_metadata.py +140 -0
  19. stele_context-0.7.0/stele/config.py +195 -0
  20. stele_context-0.7.0/stele/coordination.py +695 -0
  21. stele_context-0.7.0/stele/core.py +18 -0
  22. stele_context-0.7.0/stele/document_lock_storage.py +440 -0
  23. stele_context-0.7.0/stele/engine.py +1735 -0
  24. stele_context-0.7.0/stele/env_checks.py +153 -0
  25. stele_context-0.7.0/stele/index.py +616 -0
  26. stele_context-0.7.0/stele/index_store.py +175 -0
  27. stele_context-0.7.0/stele/mcp_server.py +842 -0
  28. stele_context-0.7.0/stele/mcp_stdio.py +969 -0
  29. stele_context-0.7.0/stele/metadata_storage.py +233 -0
  30. stele_context-0.7.0/stele/py.typed +0 -0
  31. stele_context-0.7.0/stele/rwlock.py +52 -0
  32. stele_context-0.7.0/stele/session.py +224 -0
  33. stele_context-0.7.0/stele/session_storage.py +350 -0
  34. stele_context-0.7.0/stele/storage.py +1040 -0
  35. stele_context-0.7.0/stele/symbol_graph.py +327 -0
  36. stele_context-0.7.0/stele/symbol_storage.py +256 -0
  37. stele_context-0.7.0/stele/symbols.py +885 -0
  38. stele_context-0.7.0/stele_context.egg-info/PKG-INFO +554 -0
  39. stele_context-0.7.0/stele_context.egg-info/SOURCES.txt +60 -0
  40. stele_context-0.7.0/stele_context.egg-info/dependency_links.txt +1 -0
  41. stele_context-0.7.0/stele_context.egg-info/entry_points.txt +3 -0
  42. stele_context-0.7.0/stele_context.egg-info/requires.txt +58 -0
  43. stele_context-0.7.0/stele_context.egg-info/top_level.txt +1 -0
  44. stele_context-0.7.0/tests/test_agent_embeddings.py +182 -0
  45. stele_context-0.7.0/tests/test_bm25.py +137 -0
  46. stele_context-0.7.0/tests/test_chunk_history.py +119 -0
  47. stele_context-0.7.0/tests/test_chunkers.py +219 -0
  48. stele_context-0.7.0/tests/test_concurrency.py +366 -0
  49. stele_context-0.7.0/tests/test_config.py +181 -0
  50. stele_context-0.7.0/tests/test_conflicts.py +493 -0
  51. stele_context-0.7.0/tests/test_core.py +261 -0
  52. stele_context-0.7.0/tests/test_engine.py +293 -0
  53. stele_context-0.7.0/tests/test_index.py +205 -0
  54. stele_context-0.7.0/tests/test_index_store.py +209 -0
  55. stele_context-0.7.0/tests/test_mcp_server.py +389 -0
  56. stele_context-0.7.0/tests/test_mcp_stdio.py +236 -0
  57. stele_context-0.7.0/tests/test_metadata.py +370 -0
  58. stele_context-0.7.0/tests/test_session.py +124 -0
  59. stele_context-0.7.0/tests/test_storage_migration.py +208 -0
  60. stele_context-0.7.0/tests/test_symbols.py +874 -0
  61. stele_context-0.7.0/tests/test_tree_sitter.py +242 -0
  62. stele_context-0.7.0/tests/test_worktree_safety.py +838 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Stele Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,554 @@
1
+ Metadata-Version: 2.4
2
+ Name: stele-context
3
+ Version: 0.7.0
4
+ Summary: Local context cache for LLM agents with semantic chunking and vector search
5
+ Author: Stele Contributors
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/IronAdamant/Stele
8
+ Project-URL: Repository, https://github.com/IronAdamant/Stele
9
+ Project-URL: Documentation, https://github.com/IronAdamant/Stele#readme
10
+ Keywords: llm,context-cache,chunking,vector-search,semantic-search,offline,local
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Provides-Extra: performance
24
+ Requires-Dist: msgspec>=0.18.0; extra == "performance"
25
+ Requires-Dist: numpy>=1.24.0; extra == "performance"
26
+ Provides-Extra: image
27
+ Requires-Dist: Pillow>=10.0; extra == "image"
28
+ Provides-Extra: pdf
29
+ Requires-Dist: pymupdf>=1.23.0; extra == "pdf"
30
+ Provides-Extra: audio
31
+ Requires-Dist: librosa>=0.10.0; extra == "audio"
32
+ Requires-Dist: numpy>=1.24.0; extra == "audio"
33
+ Provides-Extra: video
34
+ Requires-Dist: opencv-python>=4.8.0; extra == "video"
35
+ Requires-Dist: numpy>=1.24.0; extra == "video"
36
+ Provides-Extra: tree-sitter
37
+ Requires-Dist: tree-sitter>=0.23.0; extra == "tree-sitter"
38
+ Requires-Dist: tree-sitter-javascript>=0.23.0; extra == "tree-sitter"
39
+ Requires-Dist: tree-sitter-typescript>=0.23.0; extra == "tree-sitter"
40
+ Requires-Dist: tree-sitter-java>=0.23.0; extra == "tree-sitter"
41
+ Requires-Dist: tree-sitter-c>=0.23.0; extra == "tree-sitter"
42
+ Requires-Dist: tree-sitter-cpp>=0.23.0; extra == "tree-sitter"
43
+ Requires-Dist: tree-sitter-go>=0.23.0; extra == "tree-sitter"
44
+ Requires-Dist: tree-sitter-rust>=0.23.0; extra == "tree-sitter"
45
+ Requires-Dist: tree-sitter-ruby>=0.23.0; extra == "tree-sitter"
46
+ Requires-Dist: tree-sitter-php>=0.23.0; extra == "tree-sitter"
47
+ Provides-Extra: mcp
48
+ Requires-Dist: mcp>=1.0.0; extra == "mcp"
49
+ Provides-Extra: all
50
+ Requires-Dist: msgspec>=0.18.0; extra == "all"
51
+ Requires-Dist: numpy>=1.24.0; extra == "all"
52
+ Requires-Dist: Pillow>=10.0; extra == "all"
53
+ Requires-Dist: pymupdf>=1.23.0; extra == "all"
54
+ Requires-Dist: librosa>=0.10.0; extra == "all"
55
+ Requires-Dist: opencv-python>=4.8.0; extra == "all"
56
+ Requires-Dist: tree-sitter>=0.23.0; extra == "all"
57
+ Requires-Dist: tree-sitter-javascript>=0.23.0; extra == "all"
58
+ Requires-Dist: tree-sitter-typescript>=0.23.0; extra == "all"
59
+ Requires-Dist: tree-sitter-java>=0.23.0; extra == "all"
60
+ Requires-Dist: tree-sitter-c>=0.23.0; extra == "all"
61
+ Requires-Dist: tree-sitter-cpp>=0.23.0; extra == "all"
62
+ Requires-Dist: tree-sitter-go>=0.23.0; extra == "all"
63
+ Requires-Dist: tree-sitter-rust>=0.23.0; extra == "all"
64
+ Requires-Dist: tree-sitter-ruby>=0.23.0; extra == "all"
65
+ Requires-Dist: tree-sitter-php>=0.23.0; extra == "all"
66
+ Requires-Dist: mcp>=1.0.0; extra == "all"
67
+ Provides-Extra: dev
68
+ Requires-Dist: pytest>=7.0; extra == "dev"
69
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
70
+ Requires-Dist: mypy>=1.0; extra == "dev"
71
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
72
+ Dynamic: license-file
73
+
74
+ # Stele
75
+
76
+ **Local context cache for LLM agents with semantic chunking and vector search.**
77
+
78
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
79
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
80
+ [![Zero Dependencies](https://img.shields.io/badge/dependencies-zero-green.svg)](https://github.com/IronAdamant/Stele)
81
+ [![Tests](https://img.shields.io/badge/tests-412%20passed-brightgreen.svg)](https://github.com/IronAdamant/Stele/actions)
82
+
83
+ Stele helps LLM agents avoid re-reading unchanged files by caching chunk data with semantic search. Documents are routed through modality-specific chunkers, chunk content is stored in SQLite, and an HNSW vector index enables fast O(log n) retrieval. Only modified chunks trigger reprocessing.
84
+
85
+ ## Key Features
86
+
87
+ - **100% Offline & Local-Only**: No internet access, no external API calls, no cloud components
88
+ - **Zero Required Dependencies**: Runs on Python stdlib alone — no supply chain risks
89
+ - **Multi-Modal Support**: Text, code, images, PDFs, audio, and video (optional dependencies)
90
+ - **HNSW Vector Index**: O(log n) semantic search across all indexed chunks
91
+ - **Hybrid Search**: HNSW cosine similarity + BM25 keyword matching, auto-tuned blending
92
+ - **Tree-Sitter Chunking**: AST-aware code chunking for 9 languages (optional, falls back to regex)
93
+ - **Symbol Graph**: Cross-file reference tracking — `find_references`, `find_definition`, `impact_radius`
94
+ - **Multi-Agent Safe**: Per-document locking, optimistic versioning, cross-worktree coordination
95
+ - **MCP Server**: JSON-RPC over stdio for Claude Desktop, HTTP REST for other agents
96
+ - **Project Config**: `.stele.toml` file for per-project settings
97
+ - **Session Management**: Sessions with rollback, pruning, and KV-cache persistence
98
+
99
+ ## Architecture
100
+
101
+ ```mermaid
102
+ graph TB
103
+ subgraph API["API Layer"]
104
+ CLI["CLI<br/>stele index / search / serve"]
105
+ HTTP["HTTP REST<br/>30 tools, threaded"]
106
+ MCP["MCP stdio<br/>32 tools, JSON-RPC"]
107
+ end
108
+
109
+ subgraph Engine["Engine (engine.py)"]
110
+ CFG["Config<br/>.stele.toml loader"]
111
+ SEARCH["Hybrid Search<br/>HNSW + BM25"]
112
+ IDX["index_documents()<br/>detect_changes()"]
113
+ SYM["Symbol Graph<br/>12 languages"]
114
+ SESS["Sessions<br/>rollback, pruning"]
115
+ LOCK["Document Locking<br/>ownership, versioning"]
116
+ end
117
+
118
+ subgraph Chunkers["Chunkers"]
119
+ TXT["TextChunker"]
120
+ CODE["CodeChunker<br/>Python AST<br/>tree-sitter (9 langs)<br/>regex fallback"]
121
+ IMG["ImageChunker<br/>(Pillow)"]
122
+ PDF["PDFChunker<br/>(pymupdf)"]
123
+ AUD["AudioChunker<br/>(librosa)"]
124
+ VID["VideoChunker<br/>(opencv)"]
125
+ end
126
+
127
+ subgraph Storage["Storage"]
128
+ SQLITE["SQLite<br/>chunks, symbols,<br/>sessions, history"]
129
+ HNSW["HNSW Index<br/>128-dim vectors"]
130
+ BM25["BM25 Index<br/>keyword scoring"]
131
+ KV["KV Cache<br/>JSON + zlib"]
132
+ COORD["Coordination DB<br/>cross-worktree locks"]
133
+ end
134
+
135
+ CLI --> Engine
136
+ HTTP --> Engine
137
+ MCP --> Engine
138
+ Engine --> Chunkers
139
+ Engine --> Storage
140
+ ```
141
+
142
+ ## Comparison
143
+
144
+ | Feature | Stele | LangChain | LlamaIndex | EverMemOS |
145
+ |---------|-------|-----------|------------|-----------|
146
+ | Zero dependencies | Yes | No (50+) | No (30+) | No (Mongo, Redis, Milvus) |
147
+ | 100% offline | Yes | No | No | No |
148
+ | No model downloads | Yes | No | No | No |
149
+ | Multi-modal | 6 modalities | Text-focused | Text-focused | Text only |
150
+ | Code-aware chunking | AST + tree-sitter | Basic splitting | Basic splitting | No |
151
+ | Symbol graph | 12 languages | No | No | No |
152
+ | Multi-agent safety | Locks + versioning | No | No | Yes |
153
+ | MCP server | Native | Plugin | Plugin | Planned |
154
+ | Storage | SQLite (embedded) | Vector DB (external) | Vector DB (external) | MongoDB + Milvus |
155
+
156
+ ## Installation
157
+
158
+ ```bash
159
+ # From source
160
+ git clone https://github.com/IronAdamant/Stele.git
161
+ cd Stele
162
+ pip install -e .
163
+
164
+ # With dev dependencies
165
+ pip install -e ".[dev]"
166
+ ```
167
+
168
+ ### Requirements
169
+
170
+ - Python 3.9+
171
+ - **Zero required dependencies**
172
+
173
+ ### Optional Extras (all 100% offline)
174
+
175
+ | Extra | Packages | Use Case |
176
+ |-------|----------|----------|
177
+ | `performance` | msgspec, numpy | Faster serialization & vector math |
178
+ | `image` | Pillow | Image indexing & similarity |
179
+ | `pdf` | pymupdf | PDF text extraction |
180
+ | `audio` | librosa, numpy | Audio segmentation & features |
181
+ | `video` | opencv-python, numpy | Video keyframe extraction |
182
+ | `tree-sitter` | tree-sitter + 9 grammar packages | AST-aware code chunking for JS/TS, Java, C/C++, Go, Rust, Ruby, PHP |
183
+ | `mcp` | mcp | MCP stdio server for Claude Desktop |
184
+ | `all` | All of the above | Everything |
185
+
186
+ ```bash
187
+ pip install "stele-context[tree-sitter]" # AST-aware code chunking
188
+ pip install "stele-context[image,pdf]" # Multi-modal
189
+ pip install "stele-context[all]" # Everything
190
+ ```
191
+
192
+ ## Quick Start
193
+
194
+ ### 1. Index Documents
195
+
196
+ ```bash
197
+ stele index src/*.py docs/*.md
198
+ stele index --force document.py # Force re-index
199
+ ```
200
+
201
+ ### 2. Semantic Search
202
+
203
+ ```bash
204
+ stele search "authentication logic" --top-k 5
205
+ stele search "error handling" --json
206
+ ```
207
+
208
+ ### 3. MCP Server (for Claude Code / Claude Desktop)
209
+
210
+ ```bash
211
+ pip install "stele-context[mcp]"
212
+ stele serve-mcp
213
+ ```
214
+
215
+ **Claude Code** (`~/.claude/settings.json`):
216
+ ```json
217
+ {
218
+ "mcpServers": {
219
+ "stele": {
220
+ "command": "stele",
221
+ "args": ["serve-mcp"]
222
+ }
223
+ }
224
+ }
225
+ ```
226
+
227
+ **Claude Desktop** (`~/.config/Claude/claude_desktop_config.json`):
228
+ ```json
229
+ {
230
+ "mcpServers": {
231
+ "stele": {
232
+ "command": "stele",
233
+ "args": ["serve-mcp"]
234
+ }
235
+ }
236
+ }
237
+ ```
238
+
239
+ > **Tip:** If installed in a virtualenv, use the full path to the `stele` binary.
240
+
241
+ ### 4. HTTP REST Server
242
+
243
+ ```bash
244
+ stele serve --port 9876
245
+ ```
246
+
247
+ ### 5. Project Configuration
248
+
249
+ Create `.stele.toml` in your project root:
250
+
251
+ ```toml
252
+ [stele]
253
+ chunk_size = 512
254
+ max_chunk_size = 8192
255
+ merge_threshold = 0.75
256
+ change_threshold = 0.90
257
+ search_alpha = 0.6
258
+ skip_dirs = [".git", "node_modules", "dist", "vendor"]
259
+ ```
260
+
261
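+ All values are optional. Constructor params override config file values, and config file values override the `STELE_STORAGE_DIR` env var (see the priority order under Configuration Reference).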
262
+
263
+ ## Python API
264
+
265
+ ```python
266
+ from stele import Stele
267
+
268
+ engine = Stele()
269
+
270
+ # Index documents (auto-detects modality, walks directories)
271
+ result = engine.index_documents(["src/", "README.md"])
272
+ print(f"Indexed {result['total_chunks']} chunks")
273
+
274
+ # Hybrid semantic search (HNSW + BM25)
275
+ results = engine.search("authentication logic", top_k=5)
276
+ for r in results:
277
+ print(f"[{r['relevance_score']:.3f}] {r['document_path']}")
278
+ print(f" {r['content'][:100]}...")
279
+
280
+ # Get cached context — unchanged chunks skip reprocessing
281
+ context = engine.get_context(["src/main.py", "src/utils.py"])
282
+ for doc in context["unchanged"]:
283
+ print(f"{doc['path']}: {len(doc['chunks'])} cached chunks")
284
+
285
+ # Symbol graph — cross-file reference tracking
286
+ refs = engine.find_references("Stele")
287
+ defn = engine.find_definition("StorageBackend")
288
+
289
+ # Impact analysis — what breaks if this changes?
290
+ impact = engine.impact_radius(chunk_id="abc123", depth=2)
291
+
292
+ # Staleness detection — find chunks with stale dependencies
293
+ stale = engine.stale_chunks(threshold=0.3)
294
+
295
+ # Chunk version history
296
+ history = engine.get_chunk_history(document_path="src/main.py")
297
+
298
+ # Session management
299
+ engine.save_kv_state("session-1", {"chunk_id": {"key": "value"}})
300
+ engine.rollback("session-1", target_turn=2)
301
+ engine.prune_chunks("session-1", max_tokens=100000)
302
+
303
+ # Multi-agent document locking
304
+ engine.acquire_document_lock("src/main.py", agent_id="agent-alpha")
305
+ engine.index_documents(["src/main.py"], agent_id="agent-alpha")
306
+ engine.release_document_lock("src/main.py", agent_id="agent-alpha")
307
+ ```
308
+
309
+ ### Configuration
310
+
311
+ ```python
312
+ engine = Stele(
313
+ chunk_size=256, # Target tokens per initial chunk
314
+ max_chunk_size=4096, # Maximum tokens per merged chunk
315
+ merge_threshold=0.7, # Similarity threshold for merging
316
+ change_threshold=0.85, # Similarity threshold for "unchanged"
317
+ search_alpha=0.7, # Blend: 1.0 = pure vector, 0.0 = pure keyword
318
+ )
319
+ ```
320
+
321
+ Or use `.stele.toml` (see above) — constructor params override config file values.
322
+
323
+ ### Agent-Supplied Semantic Embeddings
324
+
325
+ LLM agents already understand the semantics of every chunk they read. Instead of using a separate embedding model, Stele captures the agent's understanding directly:
326
+
327
+ ```python
328
+ # After indexing, the agent describes what each chunk does
329
+ engine.store_semantic_summary(
330
+ chunk_id="abc123",
331
+ summary="JWT authentication middleware that validates bearer tokens and attaches user identity to request context"
332
+ )
333
+
334
+ # Now searches like "token validation" match far better than
335
+ # statistical signatures on raw code would
336
+ results = engine.search("token validation middleware")
337
+ ```
338
+
339
+ The agent IS the embedding model. Stele just stores and indexes what the agent tells it — zero new dependencies, no model downloads, no API calls.
340
+
341
+ **How it works:**
342
+ - **Tier 1** (always): 128-dim statistical signatures — trigrams, bigrams, structural features. Used for change detection.
343
+ - **Tier 2** (optional): Agent-supplied semantic summaries. Stele computes a signature from the summary text and uses it for HNSW search. ~9% improvement on semantic queries.
344
+ - **Tier 2 alt**: `store_embedding(chunk_id, vector)` for agents with direct embedding API access.
345
+
346
+ ## MCP Tools
347
+
348
+ ### HTTP Server (30 tools)
349
+
350
+ | Category | Tools |
351
+ |----------|-------|
352
+ | **Indexing** | `index_documents`, `detect_changes_and_update`, `detect_modality`, `get_supported_formats` |
353
+ | **Search** | `search`, `get_context`, `get_relevant_kv` |
354
+ | **Sessions** | `save_kv_state`, `rollback`, `prune_chunks`, `list_sessions` |
355
+ | **Symbols** | `find_references`, `find_definition`, `impact_radius`, `rebuild_symbol_graph`, `stale_chunks` |
356
+ | **Locking** | `acquire_document_lock`, `release_document_lock`, `refresh_document_lock`, `get_document_lock_status`, `release_agent_locks`, `reap_expired_locks` |
357
+ | **History** | `get_conflicts`, `get_chunk_history`, `get_notifications` |
358
+ | **Embeddings** | `store_semantic_summary`, `store_embedding` |
359
+ | **Utilities** | `list_agents`, `environment_check`, `clean_bytecache` |
360
+
361
+ ### MCP stdio Server (32 tools)
362
+
363
+ All HTTP tools plus: `annotate`, `get_annotations`, `delete_annotation`, `update_annotation`, `search_annotations`, `bulk_annotate`, `prune_history`, `map`, `history`, `stats`, `remove`
364
+
365
+ ## How It Works
366
+
367
+ ### Change Detection
368
+
369
+ ```
370
+ For each chunk:
371
+ 1. SHA-256 hash → exact match → instant cache hit (0 tokens)
372
+ 2. Hash differs → compute 128-dim semantic signature
373
+ 3. Cosine similarity > threshold → semantically similar → restore KV
374
+ 4. Similarity ≤ threshold → significant change → reprocess
375
+ ```
376
+
377
+ ### Token Savings
378
+
379
+ | Scenario | Without Stele | With Stele | Savings |
380
+ |----------|---------------|------------|---------|
381
+ | Unchanged document | 10,000 tokens | 0 tokens | 100% |
382
+ | Minor edit (typo) | 10,000 tokens | ~100 tokens | 99% |
383
+ | Moderate edit | 10,000 tokens | ~1,000 tokens | 90% |
384
+ | Major rewrite | 10,000 tokens | 10,000 tokens | 0% |
385
+
386
+ ### Code Chunking Strategy
387
+
388
+ | Language | Parser | Fallback |
389
+ |----------|--------|----------|
390
+ | Python | stdlib `ast` (always) | regex |
391
+ | JS/TS, Java, C/C++, Go, Rust, Ruby, PHP | tree-sitter (optional) | regex patterns |
392
+ | Shell, Swift, SQL, config files | regex patterns | line-based |
393
+
394
+ Tree-sitter provides proper AST boundary detection for function/class definitions.
395
+ Install with `pip install "stele-context[tree-sitter]"`.
396
+
397
+ ### Storage Layout
398
+
399
+ ```
400
+ <project_root>/.stele/ # Per-worktree (default)
401
+ ├── stele.db # SQLite: chunks, symbols, sessions, history
402
+ ├── kv_cache/ # JSON + zlib compressed KV states
403
+ └── indices/ # HNSW + BM25 persistent indices
404
+
405
+ <git-common-dir>/stele/ # Shared across worktrees
406
+ └── coordination.db # Agent registry, shared locks, notifications
407
+ ```
408
+
409
+ ## Multi-Agent Support
410
+
411
+ Stele supports multiple LLM agents sharing one store on the same machine.
412
+
413
+ | Layer | Protection |
414
+ |-------|-----------|
415
+ | **Thread safety** | RWLock — concurrent reads, exclusive writes |
416
+ | **Process safety** | `fcntl.flock()` on index files |
417
+ | **Document ownership** | `acquire_document_lock()` with TTL expiry |
418
+ | **Optimistic locking** | `doc_version` compare-and-swap |
419
+ | **Cross-worktree** | Shared coordination DB for locks, agent registry, notifications |
420
+ | **Conflict log** | Full audit trail of ownership violations |
421
+
422
+ ## Performance
423
+
424
+ Run benchmarks:
425
+ ```bash
426
+ python benchmarks/run_all.py # Full suite
427
+ python benchmarks/run_all.py --quick # CI mode
428
+ ```
429
+
430
+ Representative results (quick mode):
431
+
432
+ | Operation | Size | Time | Throughput |
433
+ |-----------|------|------|------------|
434
+ | TextChunker | 10KB | 1.6ms | 6,100 KB/s |
435
+ | CodeChunker (AST) | 10KB | 5.7ms | 1,750 KB/s |
436
+ | store_chunk (batch) | 100 | 27ms | 3,700 ops/s |
437
+ | VectorIndex.search (k=10) | 500 nodes | 4.7ms | 212 qps |
438
+ | BM25.score_batch | 100 docs | 0.18ms | 556K docs/s |
439
+ | engine.search (hybrid) | 50 docs | 9.9ms | 101 qps |
440
+
441
+ ## Security & Supply Chain
442
+
443
+ - **Zero required dependencies** — no supply chain attack surface for core functionality
444
+ - **No model downloads** — semantic signatures use statistical features, not ML models
445
+ - **No API calls** — everything runs locally, no data leaves your machine
446
+ - **No pickle** — session data serialized with JSON+zlib
447
+ - **Minimal codebase** — ~10,000 lines of Python, easy to audit
448
+
449
+ ```bash
450
+ # Maximum security: install with zero dependencies
451
+ pip install stele-context --no-deps
452
+ ```
453
+
454
+ ## Supported Formats
455
+
456
+ ### Text & Code (Zero Dependencies)
457
+ `.txt`, `.md`, `.rst`, `.csv`, `.log`, `.py`, `.js`, `.ts`, `.jsx`, `.tsx`, `.java`, `.cpp`, `.c`, `.h`, `.go`, `.rs`, `.rb`, `.php`, `.swift`, `.sh`, `.json`, `.yaml`, `.toml`, `.html`, `.css`, `.sql`
458
+
459
+ ### Images (requires Pillow)
460
+ `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.ico`
461
+
462
+ ### PDFs (requires pymupdf)
463
+ `.pdf`
464
+
465
+ ### Audio (requires librosa)
466
+ `.mp3`, `.wav`, `.ogg`, `.flac`, `.m4a`, `.aac`, `.wma`
467
+
468
+ ### Video (requires opencv-python)
469
+ `.mp4`, `.avi`, `.mov`, `.mkv`, `.webm`, `.flv`, `.wmv`
470
+
471
+ ## Configuration Reference
472
+
473
+ ### Environment Variables
474
+
475
+ | Variable | Description |
476
+ |----------|-------------|
477
+ | `STELE_STORAGE_DIR` | Override default storage directory |
478
+ | `STELE_LOG_LEVEL` | Logging level (DEBUG, INFO, WARNING, ERROR) |
479
+
480
+ ### Config File (`.stele.toml`)
481
+
482
+ ```toml
483
+ [stele]
484
+ storage_dir = ".stele" # Storage directory (relative to project root)
485
+ chunk_size = 256 # Target tokens per initial chunk
486
+ max_chunk_size = 4096 # Maximum tokens per merged chunk
487
+ merge_threshold = 0.7 # Similarity threshold for merging chunks
488
+ change_threshold = 0.85 # Similarity threshold for "unchanged"
489
+ search_alpha = 0.7 # Hybrid search blend (1.0=vector, 0.0=keyword)
490
+ skip_dirs = [".git", "node_modules", "__pycache__"]
491
+ ```
492
+
493
+ Priority: constructor params > `.stele.toml` > `STELE_STORAGE_DIR` env var > defaults.
494
+
495
+ ## FAQ
496
+
497
+ **Q: Does Stele require an internet connection?**
498
+ No. Stele is 100% offline. No API calls, no model downloads, no telemetry. All operations run locally using Python stdlib.
499
+
500
+ **Q: How does Stele compare to RAG (Retrieval-Augmented Generation)?**
501
+ Stele is not RAG — it's a context cache. RAG retrieves chunks at query time from an external store. Stele caches chunk KV-states so the LLM skips re-reading unchanged content. It can be used alongside RAG, but its primary value is token savings through change detection.
502
+
503
+ **Q: What happens if tree-sitter isn't installed?**
504
+ Code chunking falls back to regex patterns for non-Python languages. Python always uses stdlib `ast`. Install tree-sitter for better accuracy on JS/TS, Java, C/C++, Go, Rust, Ruby, PHP: `pip install "stele-context[tree-sitter]"`.
505
+
506
+ **Q: Can multiple agents use Stele simultaneously?**
507
+ Yes. Stele provides per-document locking, optimistic versioning, and a cross-worktree coordination DB. Both HTTP and MCP servers auto-register agents and inject agent IDs into write operations.
508
+
509
+ **Q: How accurate are the semantic signatures?**
510
+ The 128-dim statistical signatures (trigrams, bigrams, structural features) are approximate. They're designed for change detection (same vs different), not for embedding-quality similarity. For typical code and documentation, they achieve ~95% accuracy on change detection.
511
+
512
+ **Q: Where is data stored?**
513
+ By default, `<project_root>/.stele/` (each git worktree gets its own). Override with `STELE_STORAGE_DIR` or `storage_dir` in `.stele.toml`. Cross-worktree coordination data lives in `<git-common-dir>/stele/coordination.db`.
514
+
515
+ ## Troubleshooting
516
+
517
+ **`ImportError: No module named 'stele'`**
518
+ Ensure Stele is installed: `pip install -e .` from the repo root. If using a virtualenv, make sure it's activated.
519
+
520
+ **MCP server not connecting in Claude Desktop**
521
+ Use the full path to the `stele` binary. Check with `which stele` and update your config. If installed in a virtualenv: `/path/to/.venv/bin/stele`.
522
+
523
+ **`PermissionError` when indexing**
524
+ Another agent holds a lock on the document. Inspect the lock with `get_document_lock_status()`, or call `reap_expired_locks()` to clean up stale locks.
525
+
526
+ **Slow search on large indices**
527
+ The HNSW index adapts search width automatically. For 10K+ chunks, search uses 4x `ef_search`. If still slow, reduce `top_k` or check that the BM25 index isn't being rebuilt on every query (it's lazy-loaded once).
528
+
529
+ **Tree-sitter not working for a language**
530
+ Verify the grammar package is installed: `pip install tree-sitter-javascript` (etc.). Check with: `python -c "from stele.chunkers.code import HAS_TREE_SITTER; print(HAS_TREE_SITTER)"`.
531
+
532
+ **Stale `.pyc` files causing issues**
533
+ Invoke the `environment_check` MCP tool, or call `engine.check_environment()`, to detect them. Use `engine.clean_bytecache()` to remove orphaned `.pyc` files.
534
+
535
+ ## Development
536
+
537
+ ```bash
538
+ pip install -e ".[dev]"
539
+ pytest # 412 tests
540
+ pytest --cov=stele # With coverage
541
+ python benchmarks/run_all.py # Performance benchmarks
542
+ mypy stele/ # Type checking
543
+ ruff check stele/ # Linting
544
+ ```
545
+
546
+ Entry points: `stele` (CLI), `stele-mcp` (MCP stdio server)
547
+
548
+ ## Contributing
549
+
550
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
551
+
552
+ ## License
553
+
554
+ MIT License — see [LICENSE](LICENSE) for details.