srclight 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- srclight-0.8.0/.gitignore +24 -0
- srclight-0.8.0/LICENSE +21 -0
- srclight-0.8.0/PKG-INFO +346 -0
- srclight-0.8.0/README.md +306 -0
- srclight-0.8.0/pyproject.toml +71 -0
- srclight-0.8.0/src/srclight/__init__.py +3 -0
- srclight-0.8.0/src/srclight/build.py +421 -0
- srclight-0.8.0/src/srclight/cli.py +747 -0
- srclight-0.8.0/src/srclight/db.py +1208 -0
- srclight-0.8.0/src/srclight/embeddings.py +417 -0
- srclight-0.8.0/src/srclight/git.py +309 -0
- srclight-0.8.0/src/srclight/indexer.py +973 -0
- srclight-0.8.0/src/srclight/languages.py +322 -0
- srclight-0.8.0/src/srclight/server.py +1442 -0
- srclight-0.8.0/src/srclight/vector_cache.py +231 -0
- srclight-0.8.0/src/srclight/vector_math.py +131 -0
- srclight-0.8.0/src/srclight/workspace.py +840 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*$py.class
|
|
4
|
+
*.egg-info/
|
|
5
|
+
dist/
|
|
6
|
+
build/
|
|
7
|
+
.eggs/
|
|
8
|
+
*.egg
|
|
9
|
+
.venv/
|
|
10
|
+
venv/
|
|
11
|
+
env/
|
|
12
|
+
.env
|
|
13
|
+
*.db
|
|
14
|
+
*.sqlite
|
|
15
|
+
*.sqlite3
|
|
16
|
+
.srclight/
|
|
17
|
+
.ruff_cache/
|
|
18
|
+
.pytest_cache/
|
|
19
|
+
.mypy_cache/
|
|
20
|
+
htmlcov/
|
|
21
|
+
.coverage
|
|
22
|
+
*.log
|
|
23
|
+
.DS_Store
|
|
24
|
+
Thumbs.db
|
srclight-0.8.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Gig8 LLC
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
srclight-0.8.0/PKG-INFO
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: srclight
|
|
3
|
+
Version: 0.8.0
|
|
4
|
+
Summary: Deep code indexing for AI agents. SQLite FTS5 + tree-sitter + embeddings + MCP.
|
|
5
|
+
Project-URL: Homepage, https://github.com/srclight/srclight
|
|
6
|
+
Project-URL: Repository, https://github.com/srclight/srclight
|
|
7
|
+
Project-URL: Issues, https://github.com/srclight/srclight/issues
|
|
8
|
+
Author: Gig8 LLC
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: ai,claude,code-indexing,embeddings,fts5,mcp,sqlite,tree-sitter
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
20
|
+
Classifier: Topic :: Text Processing :: Indexing
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Requires-Dist: click>=8.1.0
|
|
23
|
+
Requires-Dist: mcp>=1.26.0
|
|
24
|
+
Requires-Dist: numpy>=1.26.0
|
|
25
|
+
Requires-Dist: tree-sitter-c-sharp>=0.23.0
|
|
26
|
+
Requires-Dist: tree-sitter-c>=0.24.0
|
|
27
|
+
Requires-Dist: tree-sitter-cpp>=0.23.0
|
|
28
|
+
Requires-Dist: tree-sitter-javascript>=0.25.0
|
|
29
|
+
Requires-Dist: tree-sitter-python>=0.25.0
|
|
30
|
+
Requires-Dist: tree-sitter-rust>=0.24.0
|
|
31
|
+
Requires-Dist: tree-sitter-typescript>=0.23.0
|
|
32
|
+
Requires-Dist: tree-sitter>=0.25.0
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
|
|
35
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
36
|
+
Requires-Dist: ruff>=0.9.0; extra == 'dev'
|
|
37
|
+
Provides-Extra: gpu
|
|
38
|
+
Requires-Dist: cupy-cuda12x>=13.0; extra == 'gpu'
|
|
39
|
+
Description-Content-Type: text/markdown
|
|
40
|
+
|
|
41
|
+
# Srclight
|
|
42
|
+
|
|
43
|
+
**Deep code indexing for AI agents.** SQLite FTS5 + tree-sitter + embeddings + MCP.
|
|
44
|
+
|
|
45
|
+
Srclight builds a rich, searchable index of your codebase that AI coding agents can query instantly — replacing dozens of grep/glob calls with precise, structured lookups.
|
|
46
|
+
|
|
47
|
+
## Why?
|
|
48
|
+
|
|
49
|
+
AI coding agents (Claude Code, Cursor, etc.) spend **40-60% of their tokens on orientation** — searching for files, reading code to understand structure, hunting for callers and callees. Srclight eliminates this waste.
|
|
50
|
+
|
|
51
|
+
| Without Srclight | With Srclight |
|
|
52
|
+
|---|---|
|
|
53
|
+
| 8-12 grep rounds to find callers | `get_callers("lookup")` — one call |
|
|
54
|
+
| Read 5 files to understand module | `codebase_map()` — instant overview |
|
|
55
|
+
| "Find code that does X" → 20 greps | `semantic_search("dictionary lookup")` — one call |
|
|
56
|
+
| 15-25 tool calls per bug fix | 5-8 tool calls per bug fix |
|
|
57
|
+
|
|
58
|
+
## Features
|
|
59
|
+
|
|
60
|
+
- **Minimal dependencies** — single SQLite file per repo, no Docker/Redis/vector DB
|
|
61
|
+
- **Fully offline** — no API calls, works air-gapped (Ollama local embeddings)
|
|
62
|
+
- **Incremental** — only re-indexes changed files (content hash detection)
|
|
63
|
+
- **7 languages** — Python, C, C++, C#, JavaScript, TypeScript, Rust
|
|
64
|
+
- **4 search modes** — symbol names, source code (trigram), documentation (stemmed), semantic (embeddings)
|
|
65
|
+
- **Hybrid search** — RRF fusion of keyword + semantic results for best accuracy
|
|
66
|
+
- **Multi-repo workspaces** — search across all your repos simultaneously via SQLite ATTACH+UNION
|
|
67
|
+
- **MCP server** — works with Claude Code, Cursor, and any MCP client
|
|
68
|
+
- **CLI** — index, search, and inspect from the terminal
|
|
69
|
+
- **Auto-reindex** — git post-commit/post-checkout hooks keep indexes fresh
|
|
70
|
+
|
|
71
|
+
## Requirements
|
|
72
|
+
|
|
73
|
+
- **Python 3.11+**
|
|
74
|
+
- **Git** (for change intelligence and auto-reindex hooks)
|
|
75
|
+
- **Ollama** (optional, for semantic search / embeddings) — [ollama.com](https://ollama.com)
|
|
76
|
+
- **NVIDIA GPU + cupy** (optional, for GPU-accelerated vector search)
|
|
77
|
+
|
|
78
|
+
## Quick Start
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
# Install from PyPI (when published)
|
|
82
|
+
pip install srclight
|
|
83
|
+
|
|
84
|
+
# Install from source
|
|
85
|
+
git clone https://github.com/srclight/srclight.git
|
|
86
|
+
cd srclight
|
|
87
|
+
pip install -e .
|
|
88
|
+
|
|
89
|
+
# Optional: GPU-accelerated vector search (requires CUDA 12.x)
|
|
90
|
+
pip install 'srclight[gpu]'
|
|
91
|
+
|
|
92
|
+
# Index your project
|
|
93
|
+
cd /path/to/your/project
|
|
94
|
+
srclight index
|
|
95
|
+
|
|
96
|
+
# Index with embeddings (requires Ollama running)
|
|
97
|
+
srclight index --embed qwen3-embedding
|
|
98
|
+
|
|
99
|
+
# Search
|
|
100
|
+
srclight search "lookup"
|
|
101
|
+
srclight search --kind function "parse"
|
|
102
|
+
srclight symbols src/main.py
|
|
103
|
+
|
|
104
|
+
# Start MCP server (for Claude Code / Cursor)
|
|
105
|
+
srclight serve
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
> **Note:** `srclight index` automatically adds `.srclight/` to your `.gitignore`. Index databases and embedding files can be large and should never be committed.
|
|
109
|
+
|
|
110
|
+
## Semantic Search (Embeddings)
|
|
111
|
+
|
|
112
|
+
Srclight supports embedding-based semantic search for natural language queries like "find code that handles authentication" or "where is the database connection pool".
|
|
113
|
+
|
|
114
|
+
### Setup
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
# Install Ollama (https://ollama.com)
|
|
118
|
+
# Pull an embedding model
|
|
119
|
+
ollama pull qwen3-embedding # Best quality (8B params, needs ~6GB VRAM)
|
|
120
|
+
ollama pull nomic-embed-text # Lighter alternative (137M params)
|
|
121
|
+
|
|
122
|
+
# Index with embeddings
|
|
123
|
+
srclight index --embed qwen3-embedding
|
|
124
|
+
|
|
125
|
+
# Or index workspace with embeddings
|
|
126
|
+
srclight workspace index -w myworkspace --embed qwen3-embedding
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### How It Works
|
|
130
|
+
|
|
131
|
+
1. Each symbol's name + signature + docstring + content is embedded as a float vector
|
|
132
|
+
2. Vectors are stored as BLOBs in the `symbol_embeddings` table (SQLite)
|
|
133
|
+
3. After indexing, a `.npy` sidecar snapshot is built and loaded to **GPU VRAM** (cupy) or CPU RAM (numpy) for fast search
|
|
134
|
+
4. `semantic_search(query)` embeds the query and runs cosine similarity against the in-memory matrix (~3ms for 27K vectors when the matrix is GPU-resident on a modern GPU)
|
|
135
|
+
5. `hybrid_search(query)` combines FTS5 keyword results + embedding results via Reciprocal Rank Fusion (RRF)
|
|
136
|
+
|
|
137
|
+
### Embedding Providers
|
|
138
|
+
|
|
139
|
+
| Provider | Model | Quality | Local? | Notes |
|
|
140
|
+
|----------|-------|---------|--------|-------|
|
|
141
|
+
| **Ollama** (default) | `qwen3-embedding` | Best local | Yes | Needs ~6GB VRAM |
|
|
142
|
+
| Ollama | `nomic-embed-text` | Good | Yes | Lighter, works on 8GB VRAM |
|
|
143
|
+
| **Voyage AI** (API) | `voyage-code-3` | Best overall | No | Requires `VOYAGE_API_KEY` |
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
# Use Voyage Code 3 (API, highest quality)
|
|
147
|
+
VOYAGE_API_KEY=your-key srclight index --embed voyage-code-3
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Storage
|
|
151
|
+
|
|
152
|
+
Embeddings are stored in the `symbol_embeddings` table in `.srclight/index.db`. After indexing, a `.npy` sidecar snapshot is built for fast GPU loading:
|
|
153
|
+
|
|
154
|
+
| File | Purpose |
|
|
155
|
+
|------|---------|
|
|
156
|
+
| `index.db` | Write path — per-symbol CRUD during indexing |
|
|
157
|
+
| `embeddings.npy` | Read path — contiguous float32 matrix for GPU/CPU search |
|
|
158
|
+
| `embeddings_norms.npy` | Pre-computed row norms (avoids recomputation per query) |
|
|
159
|
+
| `embeddings_meta.json` | Symbol ID mapping, model info, version for cache invalidation |
|
|
160
|
+
|
|
161
|
+
For ~27K symbols at 4096 dims (qwen3-embedding), that's ~428 MB on disk, ~450 MB in VRAM. Embedding is incremental: only symbols whose content changed are re-embedded, and the sidecar is rebuilt after each indexing run.
|
|
162
|
+
|
|
163
|
+
## Multi-Repo Workspaces
|
|
164
|
+
|
|
165
|
+
Search across multiple repos simultaneously. Each repo keeps its own `.srclight/index.db`; at query time, srclight ATTACHes them all and UNIONs across schemas.
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
# Create a workspace
|
|
169
|
+
srclight workspace init myworkspace
|
|
170
|
+
|
|
171
|
+
# Add repos
|
|
172
|
+
srclight workspace add /path/to/repo1 -w myworkspace
|
|
173
|
+
srclight workspace add /path/to/repo2 -w myworkspace -n custom-name
|
|
174
|
+
|
|
175
|
+
# Index all repos (with optional embeddings)
|
|
176
|
+
srclight workspace index -w myworkspace
|
|
177
|
+
srclight workspace index -w myworkspace --embed qwen3-embedding
|
|
178
|
+
|
|
179
|
+
# Search across all repos
|
|
180
|
+
srclight workspace search "Dictionary" -w myworkspace
|
|
181
|
+
srclight workspace search "Dictionary" -w myworkspace --project repo1
|
|
182
|
+
|
|
183
|
+
# Status
|
|
184
|
+
srclight workspace status -w myworkspace
|
|
185
|
+
srclight workspace list
|
|
186
|
+
|
|
187
|
+
# Start MCP server in workspace mode
|
|
188
|
+
srclight serve --workspace myworkspace
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## MCP Integration
|
|
192
|
+
|
|
193
|
+
### Claude Code (single repo)
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
claude mcp add srclight -- srclight serve
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### Claude Code (workspace mode)
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
claude mcp add srclight -- srclight serve --workspace myworkspace
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### Claude Desktop (`claude_desktop_config.json`)
|
|
206
|
+
|
|
207
|
+
```json
|
|
208
|
+
{
|
|
209
|
+
"mcpServers": {
|
|
210
|
+
"srclight": {
|
|
211
|
+
"command": "srclight",
|
|
212
|
+
"args": ["serve", "--workspace", "myworkspace"]
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
## MCP Tools (25)
|
|
219
|
+
|
|
220
|
+
Srclight exposes 25 MCP tools organized in five tiers plus a small meta group. The MCP server includes built-in instructions that guide AI agents on which tool to use and when — agents receive a session protocol, tool selection guide, and `project` parameter documentation automatically on connection.
|
|
221
|
+
|
|
222
|
+
### Tier 1: Instant Orientation
|
|
223
|
+
| Tool | What it does |
|
|
224
|
+
|------|-------------|
|
|
225
|
+
| `codebase_map()` | Full project overview — call first every session |
|
|
226
|
+
| `search_symbols(query)` | Search across symbol names, code, and docs |
|
|
227
|
+
| `get_symbol(name)` | Full source code + metadata for a symbol |
|
|
228
|
+
| `get_signature(name)` | Just the signature (lightweight) |
|
|
229
|
+
| `symbols_in_file(path)` | Table of contents for a file |
|
|
230
|
+
| `list_projects()` | All projects in workspace with stats |
|
|
231
|
+
|
|
232
|
+
### Tier 2: Relationship Graph
|
|
233
|
+
| Tool | What it does |
|
|
234
|
+
|------|-------------|
|
|
235
|
+
| `get_callers(name)` | Who calls this symbol? |
|
|
236
|
+
| `get_callees(name)` | What does this symbol call? |
|
|
237
|
+
| `get_dependents(name, transitive)` | Blast radius — what breaks if I change this? |
|
|
238
|
+
| `get_implementors(interface)` | All classes implementing an interface |
|
|
239
|
+
| `get_tests_for(name)` | Test functions covering a symbol |
|
|
240
|
+
| `get_type_hierarchy(name)` | Inheritance tree (base classes + subclasses) |
|
|
241
|
+
|
|
242
|
+
### Tier 3: Git Change Intelligence
|
|
243
|
+
| Tool | What it does |
|
|
244
|
+
|------|-------------|
|
|
245
|
+
| `blame_symbol(name)` | Who changed this, when, and why |
|
|
246
|
+
| `recent_changes(n)` | Commit feed (cross-project in workspace) |
|
|
247
|
+
| `git_hotspots(n, since)` | Most frequently changed files (bug magnets) |
|
|
248
|
+
| `whats_changed()` | Uncommitted work in progress |
|
|
249
|
+
| `changes_to(name)` | Commit history for a symbol's file |
|
|
250
|
+
|
|
251
|
+
### Tier 4: Build & Config
|
|
252
|
+
| Tool | What it does |
|
|
253
|
+
|------|-------------|
|
|
254
|
+
| `get_build_targets()` | CMake/.csproj/npm targets with dependencies |
|
|
255
|
+
| `get_platform_variants(name)` | #ifdef platform guards around a symbol |
|
|
256
|
+
| `platform_conditionals()` | All platform-conditional code blocks |
|
|
257
|
+
|
|
258
|
+
### Tier 5: Semantic Search (Embeddings)
|
|
259
|
+
| Tool | What it does |
|
|
260
|
+
|------|-------------|
|
|
261
|
+
| `semantic_search(query)` | Find code by meaning (natural language) |
|
|
262
|
+
| `hybrid_search(query)` | Best of both: keyword + semantic with RRF fusion |
|
|
263
|
+
| `embedding_status()` | Embedding coverage and model info |
|
|
264
|
+
|
|
265
|
+
### Meta
|
|
266
|
+
| Tool | What it does |
|
|
267
|
+
|------|-------------|
|
|
268
|
+
| `index_status()` | Index freshness and stats |
|
|
269
|
+
| `reindex()` | Trigger incremental re-index |
|
|
270
|
+
|
|
271
|
+
In workspace mode, `search_symbols`, `get_symbol`, `codebase_map`, and `hybrid_search` accept an optional `project` filter. Graph/git/build tools require `project` in workspace mode.
|
|
272
|
+
|
|
273
|
+
## Deployment Guide
|
|
274
|
+
|
|
275
|
+
See **[docs/usage-guide.md](docs/usage-guide.md)** for the full deployment and usage guide, including:
|
|
276
|
+
- Setting up srclight as a global MCP server for Claude Code
|
|
277
|
+
- Adding/removing repos from workspaces
|
|
278
|
+
- What happens on commits and branch switches
|
|
279
|
+
- Re-embedding workflows
|
|
280
|
+
- Troubleshooting
|
|
281
|
+
|
|
282
|
+
## Auto-Reindex (Git Hook)
|
|
283
|
+
|
|
284
|
+
Keep indexes fresh automatically:
|
|
285
|
+
|
|
286
|
+
```bash
|
|
287
|
+
# Install post-commit + post-checkout hooks in current repo
|
|
288
|
+
srclight hook install
|
|
289
|
+
|
|
290
|
+
# Install across all repos in a workspace
|
|
291
|
+
srclight hook install --workspace myworkspace
|
|
292
|
+
|
|
293
|
+
# Remove hooks
|
|
294
|
+
srclight hook uninstall
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
The hooks run `srclight index` in the background after each commit and branch switch.
|
|
298
|
+
|
|
299
|
+
## How It Works
|
|
300
|
+
|
|
301
|
+
1. **tree-sitter** parses every source file into an AST
|
|
302
|
+
2. Symbols (functions, classes, methods, structs, etc.) are extracted with full metadata
|
|
303
|
+
3. Three **SQLite FTS5** indexes are built with different tokenization strategies:
|
|
304
|
+
- **Names**: code-aware tokenization (splits `camelCase`, handles `::`, `->`)
|
|
305
|
+
- **Content**: trigram index for substring matching
|
|
306
|
+
- **Docs**: Porter stemming for natural language in docstrings
|
|
307
|
+
4. Optional: **embedding vectors** are generated via Ollama or Voyage API and stored as BLOBs
|
|
308
|
+
5. A `.npy` **sidecar snapshot** is built and loaded to **GPU VRAM** (cupy) or CPU RAM (numpy) for fast search
|
|
309
|
+
6. The **MCP server** exposes structured query tools that AI agents call instead of grep
|
|
310
|
+
7. **Hybrid search** merges keyword (FTS5) and semantic (embedding) results via RRF
|
|
311
|
+
|
|
312
|
+
### Architecture (Workspace Mode)
|
|
313
|
+
|
|
314
|
+
```
|
|
315
|
+
repo1/.srclight/index.db ──┐
|
|
316
|
+
repo2/.srclight/index.db ──┼── ATTACH ──→ :memory: ──→ UNION ALL queries
|
|
317
|
+
repo3/.srclight/index.db ──┘
|
|
318
|
+
```
|
|
319
|
+
|
|
320
|
+
Each repo is indexed independently. At query time, SQLite's ATTACH mechanism joins them into a single searchable namespace. Workspaces with more than 10 repos are handled via automatic batching, because SQLite limits the number of attached databases per connection (10 by default).
|
|
321
|
+
|
|
322
|
+
## Roadmap
|
|
323
|
+
|
|
324
|
+
### Done
|
|
325
|
+
- [x] Symbol intelligence + 3x FTS5 search
|
|
326
|
+
- [x] Relationship graph: callers, callees, hierarchy
|
|
327
|
+
- [x] Blast radius, test discovery, implementors
|
|
328
|
+
- [x] Git change intelligence: blame, hotspots, recent changes
|
|
329
|
+
- [x] Build system awareness: CMake, .csproj, platform conditionals
|
|
330
|
+
- [x] Semantic search: embeddings via Ollama/Voyage, hybrid RRF
|
|
331
|
+
- [x] GPU-accelerated vector search: `.npy` sidecar, cupy/numpy vectorized math
|
|
332
|
+
- [x] Multi-repo workspaces (ATTACH+UNION)
|
|
333
|
+
- [x] Auto-reindex git hooks (post-commit + post-checkout)
|
|
334
|
+
- [x] MCP agent guidance: comprehensive instructions, tool selection guide, session protocol
|
|
335
|
+
- [x] Workspace config hot-reload (no server restart needed to add repos)
|
|
336
|
+
- [x] VectorCache sidecar re-discovery (no restart needed after embedding)
|
|
337
|
+
- [x] Project name suggestions in error messages
|
|
338
|
+
|
|
339
|
+
### Next
|
|
340
|
+
- [ ] Cross-language concept mapping (explicit edges between equivalent symbols across languages)
|
|
341
|
+
- [ ] Pattern intelligence (convention detection, coding pattern extraction)
|
|
342
|
+
- [ ] AI pre-computation (symbol summaries via cheap LLM)
|
|
343
|
+
|
|
344
|
+
## License
|
|
345
|
+
|
|
346
|
+
MIT — Gig8 LLC
|
srclight-0.8.0/README.md
ADDED
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
# Srclight
|
|
2
|
+
|
|
3
|
+
**Deep code indexing for AI agents.** SQLite FTS5 + tree-sitter + embeddings + MCP.
|
|
4
|
+
|
|
5
|
+
Srclight builds a rich, searchable index of your codebase that AI coding agents can query instantly — replacing dozens of grep/glob calls with precise, structured lookups.
|
|
6
|
+
|
|
7
|
+
## Why?
|
|
8
|
+
|
|
9
|
+
AI coding agents (Claude Code, Cursor, etc.) spend **40-60% of their tokens on orientation** — searching for files, reading code to understand structure, hunting for callers and callees. Srclight eliminates this waste.
|
|
10
|
+
|
|
11
|
+
| Without Srclight | With Srclight |
|
|
12
|
+
|---|---|
|
|
13
|
+
| 8-12 grep rounds to find callers | `get_callers("lookup")` — one call |
|
|
14
|
+
| Read 5 files to understand module | `codebase_map()` — instant overview |
|
|
15
|
+
| "Find code that does X" → 20 greps | `semantic_search("dictionary lookup")` — one call |
|
|
16
|
+
| 15-25 tool calls per bug fix | 5-8 tool calls per bug fix |
|
|
17
|
+
|
|
18
|
+
## Features
|
|
19
|
+
|
|
20
|
+
- **Minimal dependencies** — single SQLite file per repo, no Docker/Redis/vector DB
|
|
21
|
+
- **Fully offline** — no API calls, works air-gapped (Ollama local embeddings)
|
|
22
|
+
- **Incremental** — only re-indexes changed files (content hash detection)
|
|
23
|
+
- **7 languages** — Python, C, C++, C#, JavaScript, TypeScript, Rust
|
|
24
|
+
- **4 search modes** — symbol names, source code (trigram), documentation (stemmed), semantic (embeddings)
|
|
25
|
+
- **Hybrid search** — RRF fusion of keyword + semantic results for best accuracy
|
|
26
|
+
- **Multi-repo workspaces** — search across all your repos simultaneously via SQLite ATTACH+UNION
|
|
27
|
+
- **MCP server** — works with Claude Code, Cursor, and any MCP client
|
|
28
|
+
- **CLI** — index, search, and inspect from the terminal
|
|
29
|
+
- **Auto-reindex** — git post-commit/post-checkout hooks keep indexes fresh
|
|
30
|
+
|
|
31
|
+
## Requirements
|
|
32
|
+
|
|
33
|
+
- **Python 3.11+**
|
|
34
|
+
- **Git** (for change intelligence and auto-reindex hooks)
|
|
35
|
+
- **Ollama** (optional, for semantic search / embeddings) — [ollama.com](https://ollama.com)
|
|
36
|
+
- **NVIDIA GPU + cupy** (optional, for GPU-accelerated vector search)
|
|
37
|
+
|
|
38
|
+
## Quick Start
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# Install from PyPI (when published)
|
|
42
|
+
pip install srclight
|
|
43
|
+
|
|
44
|
+
# Install from source
|
|
45
|
+
git clone https://github.com/srclight/srclight.git
|
|
46
|
+
cd srclight
|
|
47
|
+
pip install -e .
|
|
48
|
+
|
|
49
|
+
# Optional: GPU-accelerated vector search (requires CUDA 12.x)
|
|
50
|
+
pip install 'srclight[gpu]'
|
|
51
|
+
|
|
52
|
+
# Index your project
|
|
53
|
+
cd /path/to/your/project
|
|
54
|
+
srclight index
|
|
55
|
+
|
|
56
|
+
# Index with embeddings (requires Ollama running)
|
|
57
|
+
srclight index --embed qwen3-embedding
|
|
58
|
+
|
|
59
|
+
# Search
|
|
60
|
+
srclight search "lookup"
|
|
61
|
+
srclight search --kind function "parse"
|
|
62
|
+
srclight symbols src/main.py
|
|
63
|
+
|
|
64
|
+
# Start MCP server (for Claude Code / Cursor)
|
|
65
|
+
srclight serve
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
> **Note:** `srclight index` automatically adds `.srclight/` to your `.gitignore`. Index databases and embedding files can be large and should never be committed.
|
|
69
|
+
|
|
70
|
+
## Semantic Search (Embeddings)
|
|
71
|
+
|
|
72
|
+
Srclight supports embedding-based semantic search for natural language queries like "find code that handles authentication" or "where is the database connection pool".
|
|
73
|
+
|
|
74
|
+
### Setup
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
# Install Ollama (https://ollama.com)
|
|
78
|
+
# Pull an embedding model
|
|
79
|
+
ollama pull qwen3-embedding # Best quality (8B params, needs ~6GB VRAM)
|
|
80
|
+
ollama pull nomic-embed-text # Lighter alternative (137M params)
|
|
81
|
+
|
|
82
|
+
# Index with embeddings
|
|
83
|
+
srclight index --embed qwen3-embedding
|
|
84
|
+
|
|
85
|
+
# Or index workspace with embeddings
|
|
86
|
+
srclight workspace index -w myworkspace --embed qwen3-embedding
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### How It Works
|
|
90
|
+
|
|
91
|
+
1. Each symbol's name + signature + docstring + content is embedded as a float vector
|
|
92
|
+
2. Vectors are stored as BLOBs in the `symbol_embeddings` table (SQLite)
|
|
93
|
+
3. After indexing, a `.npy` sidecar snapshot is built and loaded to **GPU VRAM** (cupy) or CPU RAM (numpy) for fast search
|
|
94
|
+
4. `semantic_search(query)` embeds the query and runs cosine similarity against the in-memory matrix (~3ms for 27K vectors when the matrix is GPU-resident on a modern GPU)
|
|
95
|
+
5. `hybrid_search(query)` combines FTS5 keyword results + embedding results via Reciprocal Rank Fusion (RRF)
|
|
96
|
+
|
|
97
|
+
### Embedding Providers
|
|
98
|
+
|
|
99
|
+
| Provider | Model | Quality | Local? | Notes |
|
|
100
|
+
|----------|-------|---------|--------|-------|
|
|
101
|
+
| **Ollama** (default) | `qwen3-embedding` | Best local | Yes | Needs ~6GB VRAM |
|
|
102
|
+
| Ollama | `nomic-embed-text` | Good | Yes | Lighter, works on 8GB VRAM |
|
|
103
|
+
| **Voyage AI** (API) | `voyage-code-3` | Best overall | No | Requires `VOYAGE_API_KEY` |
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
# Use Voyage Code 3 (API, highest quality)
|
|
107
|
+
VOYAGE_API_KEY=your-key srclight index --embed voyage-code-3
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Storage
|
|
111
|
+
|
|
112
|
+
Embeddings are stored in the `symbol_embeddings` table in `.srclight/index.db`. After indexing, a `.npy` sidecar snapshot is built for fast GPU loading:
|
|
113
|
+
|
|
114
|
+
| File | Purpose |
|
|
115
|
+
|------|---------|
|
|
116
|
+
| `index.db` | Write path — per-symbol CRUD during indexing |
|
|
117
|
+
| `embeddings.npy` | Read path — contiguous float32 matrix for GPU/CPU search |
|
|
118
|
+
| `embeddings_norms.npy` | Pre-computed row norms (avoids recomputation per query) |
|
|
119
|
+
| `embeddings_meta.json` | Symbol ID mapping, model info, version for cache invalidation |
|
|
120
|
+
|
|
121
|
+
For ~27K symbols at 4096 dims (qwen3-embedding), that's ~428 MB on disk, ~450 MB in VRAM. Embedding is incremental: only symbols whose content changed are re-embedded, and the sidecar is rebuilt after each indexing run.
|
|
122
|
+
|
|
123
|
+
## Multi-Repo Workspaces
|
|
124
|
+
|
|
125
|
+
Search across multiple repos simultaneously. Each repo keeps its own `.srclight/index.db`; at query time, srclight ATTACHes them all and UNIONs across schemas.
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
# Create a workspace
|
|
129
|
+
srclight workspace init myworkspace
|
|
130
|
+
|
|
131
|
+
# Add repos
|
|
132
|
+
srclight workspace add /path/to/repo1 -w myworkspace
|
|
133
|
+
srclight workspace add /path/to/repo2 -w myworkspace -n custom-name
|
|
134
|
+
|
|
135
|
+
# Index all repos (with optional embeddings)
|
|
136
|
+
srclight workspace index -w myworkspace
|
|
137
|
+
srclight workspace index -w myworkspace --embed qwen3-embedding
|
|
138
|
+
|
|
139
|
+
# Search across all repos
|
|
140
|
+
srclight workspace search "Dictionary" -w myworkspace
|
|
141
|
+
srclight workspace search "Dictionary" -w myworkspace --project repo1
|
|
142
|
+
|
|
143
|
+
# Status
|
|
144
|
+
srclight workspace status -w myworkspace
|
|
145
|
+
srclight workspace list
|
|
146
|
+
|
|
147
|
+
# Start MCP server in workspace mode
|
|
148
|
+
srclight serve --workspace myworkspace
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## MCP Integration
|
|
152
|
+
|
|
153
|
+
### Claude Code (single repo)
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
claude mcp add srclight -- srclight serve
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Claude Code (workspace mode)
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
claude mcp add srclight -- srclight serve --workspace myworkspace
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### Claude Desktop (`claude_desktop_config.json`)
|
|
166
|
+
|
|
167
|
+
```json
|
|
168
|
+
{
|
|
169
|
+
"mcpServers": {
|
|
170
|
+
"srclight": {
|
|
171
|
+
"command": "srclight",
|
|
172
|
+
"args": ["serve", "--workspace", "myworkspace"]
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## MCP Tools (25)
|
|
179
|
+
|
|
180
|
+
Srclight exposes 25 MCP tools organized in five tiers plus a small meta group. The MCP server includes built-in instructions that guide AI agents on which tool to use and when — agents receive a session protocol, tool selection guide, and `project` parameter documentation automatically on connection.
|
|
181
|
+
|
|
182
|
+
### Tier 1: Instant Orientation
|
|
183
|
+
| Tool | What it does |
|
|
184
|
+
|------|-------------|
|
|
185
|
+
| `codebase_map()` | Full project overview — call first every session |
|
|
186
|
+
| `search_symbols(query)` | Search across symbol names, code, and docs |
|
|
187
|
+
| `get_symbol(name)` | Full source code + metadata for a symbol |
|
|
188
|
+
| `get_signature(name)` | Just the signature (lightweight) |
|
|
189
|
+
| `symbols_in_file(path)` | Table of contents for a file |
|
|
190
|
+
| `list_projects()` | All projects in workspace with stats |
|
|
191
|
+
|
|
192
|
+
### Tier 2: Relationship Graph
|
|
193
|
+
| Tool | What it does |
|
|
194
|
+
|------|-------------|
|
|
195
|
+
| `get_callers(name)` | Who calls this symbol? |
|
|
196
|
+
| `get_callees(name)` | What does this symbol call? |
|
|
197
|
+
| `get_dependents(name, transitive)` | Blast radius — what breaks if I change this? |
|
|
198
|
+
| `get_implementors(interface)` | All classes implementing an interface |
|
|
199
|
+
| `get_tests_for(name)` | Test functions covering a symbol |
|
|
200
|
+
| `get_type_hierarchy(name)` | Inheritance tree (base classes + subclasses) |
|
|
201
|
+
|
|
202
|
+
### Tier 3: Git Change Intelligence
|
|
203
|
+
| Tool | What it does |
|
|
204
|
+
|------|-------------|
|
|
205
|
+
| `blame_symbol(name)` | Who changed this, when, and why |
|
|
206
|
+
| `recent_changes(n)` | Commit feed (cross-project in workspace) |
|
|
207
|
+
| `git_hotspots(n, since)` | Most frequently changed files (bug magnets) |
|
|
208
|
+
| `whats_changed()` | Uncommitted work in progress |
|
|
209
|
+
| `changes_to(name)` | Commit history for a symbol's file |
|
|
210
|
+
|
|
211
|
+
### Tier 4: Build & Config
|
|
212
|
+
| Tool | What it does |
|
|
213
|
+
|------|-------------|
|
|
214
|
+
| `get_build_targets()` | CMake/.csproj/npm targets with dependencies |
|
|
215
|
+
| `get_platform_variants(name)` | #ifdef platform guards around a symbol |
|
|
216
|
+
| `platform_conditionals()` | All platform-conditional code blocks |
|
|
217
|
+
|
|
218
|
+
### Tier 5: Semantic Search (Embeddings)
|
|
219
|
+
| Tool | What it does |
|
|
220
|
+
|------|-------------|
|
|
221
|
+
| `semantic_search(query)` | Find code by meaning (natural language) |
|
|
222
|
+
| `hybrid_search(query)` | Best of both: keyword + semantic with RRF fusion |
|
|
223
|
+
| `embedding_status()` | Embedding coverage and model info |
|
|
224
|
+
|
|
225
|
+
### Meta
|
|
226
|
+
| Tool | What it does |
|
|
227
|
+
|------|-------------|
|
|
228
|
+
| `index_status()` | Index freshness and stats |
|
|
229
|
+
| `reindex()` | Trigger incremental re-index |
|
|
230
|
+
|
|
231
|
+
In workspace mode, `search_symbols`, `get_symbol`, `codebase_map`, and `hybrid_search` accept an optional `project` filter. Graph/git/build tools require `project` in workspace mode.
|
|
232
|
+
|
|
233
|
+
## Deployment Guide
|
|
234
|
+
|
|
235
|
+
See **[docs/usage-guide.md](docs/usage-guide.md)** for the full deployment and usage guide, including:
|
|
236
|
+
- Setting up srclight as a global MCP server for Claude Code
|
|
237
|
+
- Adding/removing repos from workspaces
|
|
238
|
+
- What happens on commits and branch switches
|
|
239
|
+
- Re-embedding workflows
|
|
240
|
+
- Troubleshooting
|
|
241
|
+
|
|
242
|
+
## Auto-Reindex (Git Hook)
|
|
243
|
+
|
|
244
|
+
Keep indexes fresh automatically:
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
# Install post-commit + post-checkout hooks in current repo
|
|
248
|
+
srclight hook install
|
|
249
|
+
|
|
250
|
+
# Install across all repos in a workspace
|
|
251
|
+
srclight hook install --workspace myworkspace
|
|
252
|
+
|
|
253
|
+
# Remove hooks
|
|
254
|
+
srclight hook uninstall
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
The hooks run `srclight index` in the background after each commit and branch switch.
|
|
258
|
+
|
|
259
|
+
## How It Works
|
|
260
|
+
|
|
261
|
+
1. **tree-sitter** parses every source file into an AST
|
|
262
|
+
2. Symbols (functions, classes, methods, structs, etc.) are extracted with full metadata
|
|
263
|
+
3. Three **SQLite FTS5** indexes are built with different tokenization strategies:
|
|
264
|
+
- **Names**: code-aware tokenization (splits `camelCase`, handles `::`, `->`)
|
|
265
|
+
- **Content**: trigram index for substring matching
|
|
266
|
+
- **Docs**: Porter stemming for natural language in docstrings
|
|
267
|
+
4. Optional: **embedding vectors** are generated via Ollama or Voyage API and stored as BLOBs
|
|
268
|
+
5. A `.npy` **sidecar snapshot** is built and loaded to **GPU VRAM** (cupy) or CPU RAM (numpy) for fast search
|
|
269
|
+
6. The **MCP server** exposes structured query tools that AI agents call instead of grep
|
|
270
|
+
7. **Hybrid search** merges keyword (FTS5) and semantic (embedding) results via RRF
|
|
271
|
+
|
|
272
|
+
### Architecture (Workspace Mode)
|
|
273
|
+
|
|
274
|
+
```
|
|
275
|
+
repo1/.srclight/index.db ──┐
|
|
276
|
+
repo2/.srclight/index.db ──┼── ATTACH ──→ :memory: ──→ UNION ALL queries
|
|
277
|
+
repo3/.srclight/index.db ──┘
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
Each repo is indexed independently. At query time, SQLite's ATTACH mechanism joins them into a single searchable namespace. Workspaces with more than 10 repos are handled via automatic batching, because SQLite limits the number of attached databases per connection (10 by default).
|
|
281
|
+
|
|
282
|
+
## Roadmap
|
|
283
|
+
|
|
284
|
+
### Done
|
|
285
|
+
- [x] Symbol intelligence + 3x FTS5 search
|
|
286
|
+
- [x] Relationship graph: callers, callees, hierarchy
|
|
287
|
+
- [x] Blast radius, test discovery, implementors
|
|
288
|
+
- [x] Git change intelligence: blame, hotspots, recent changes
|
|
289
|
+
- [x] Build system awareness: CMake, .csproj, platform conditionals
|
|
290
|
+
- [x] Semantic search: embeddings via Ollama/Voyage, hybrid RRF
|
|
291
|
+
- [x] GPU-accelerated vector search: `.npy` sidecar, cupy/numpy vectorized math
|
|
292
|
+
- [x] Multi-repo workspaces (ATTACH+UNION)
|
|
293
|
+
- [x] Auto-reindex git hooks (post-commit + post-checkout)
|
|
294
|
+
- [x] MCP agent guidance: comprehensive instructions, tool selection guide, session protocol
|
|
295
|
+
- [x] Workspace config hot-reload (no server restart needed to add repos)
|
|
296
|
+
- [x] VectorCache sidecar re-discovery (no restart needed after embedding)
|
|
297
|
+
- [x] Project name suggestions in error messages
|
|
298
|
+
|
|
299
|
+
### Next
|
|
300
|
+
- [ ] Cross-language concept mapping (explicit edges between equivalent symbols across languages)
|
|
301
|
+
- [ ] Pattern intelligence (convention detection, coding pattern extraction)
|
|
302
|
+
- [ ] AI pre-computation (symbol summaries via cheap LLM)
|
|
303
|
+
|
|
304
|
+
## License
|
|
305
|
+
|
|
306
|
+
MIT — Gig8 LLC
|