codeseek 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codeseek-0.1.0/.gitignore +26 -0
- codeseek-0.1.0/CHANGELOG.md +13 -0
- codeseek-0.1.0/LICENSE +21 -0
- codeseek-0.1.0/PKG-INFO +155 -0
- codeseek-0.1.0/README.md +129 -0
- codeseek-0.1.0/pyproject.toml +52 -0
- codeseek-0.1.0/src/codeseek/__init__.py +5 -0
- codeseek-0.1.0/src/codeseek/__main__.py +4 -0
- codeseek-0.1.0/src/codeseek/chunking.py +34 -0
- codeseek-0.1.0/src/codeseek/cli.py +152 -0
- codeseek-0.1.0/src/codeseek/embeddings.py +65 -0
- codeseek-0.1.0/src/codeseek/index.py +71 -0
- codeseek-0.1.0/src/codeseek/mcp_server.py +128 -0
- codeseek-0.1.0/src/codeseek/search.py +33 -0
- codeseek-0.1.0/src/codeseek/sources.py +78 -0
- codeseek-0.1.0/src/codeseek/store.py +103 -0
- codeseek-0.1.0/tests/test_codeseek.py +211 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
.venv/
|
|
9
|
+
venv/
|
|
10
|
+
.env
|
|
11
|
+
|
|
12
|
+
# codeseek index databases
|
|
13
|
+
*.codeseek.db
|
|
14
|
+
.codeseek.db
|
|
15
|
+
|
|
16
|
+
# Tooling
|
|
17
|
+
.pytest_cache/
|
|
18
|
+
.ruff_cache/
|
|
19
|
+
.mypy_cache/
|
|
20
|
+
.coverage
|
|
21
|
+
htmlcov/
|
|
22
|
+
|
|
23
|
+
# OS
|
|
24
|
+
.DS_Store
|
|
25
|
+
Thumbs.db
|
|
26
|
+
desktop.ini
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## v0.1.0
|
|
4
|
+
|
|
5
|
+
### Features
|
|
6
|
+
- index a directory of source files into a local SQLite vector store
|
|
7
|
+
- semantic search over the index from the CLI
|
|
8
|
+
- run as an MCP server (stdio) exposing a `search_code` tool to any MCP client
|
|
9
|
+
- provider-agnostic embeddings: OpenAI, or a local server via `--base-url`
|
|
10
|
+
- pluggable document sources, so the same engine can index more than code
|
|
11
|
+
- zero runtime dependencies (standard library only)
|
|
12
|
+
|
|
13
|
+
Initial release.
|
codeseek-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Seven Of Nine
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
codeseek-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: codeseek
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Semantic code search for your repo, as a CLI and an MCP server. Bring any OpenAI-compatible embedding model. Zero dependencies.
|
|
5
|
+
Project-URL: Homepage, https://github.com/Sev7nOfNine/codeseek
|
|
6
|
+
Project-URL: Repository, https://github.com/Sev7nOfNine/codeseek
|
|
7
|
+
Project-URL: Issues, https://github.com/Sev7nOfNine/codeseek/issues
|
|
8
|
+
Author: Seven Of Nine
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: cli,code-search,developer-tools,embeddings,llm,mcp,model-context-protocol,rag,semantic-search
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
19
|
+
Classifier: Topic :: Software Development
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
21
|
+
Classifier: Topic :: Utilities
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Provides-Extra: test
|
|
24
|
+
Requires-Dist: pytest>=7; extra == 'test'
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# codeseek
|
|
28
|
+
|
|
29
|
+
[](https://pypi.org/project/codeseek/)
|
|
30
|
+
[](https://github.com/Sev7nOfNine/codeseek/actions/workflows/ci.yml)
|
|
31
|
+
[](LICENSE)
|
|
32
|
+
[](https://pypi.org/project/codeseek/)
|
|
33
|
+
|
|
34
|
+
**Semantic search over your codebase — as a CLI and an MCP server. Zero dependencies.**
|
|
35
|
+
|
|
36
|
+
`codeseek` indexes a repository into a local vector store and lets you search it
|
|
37
|
+
by meaning, not just by string match. Use it from the terminal, or run it as an
|
|
38
|
+
[MCP](https://modelcontextprotocol.io) server so an AI coding assistant or editor
|
|
39
|
+
can ask your codebase questions directly.
|
|
40
|
+
|
|
41
|
+
It brings no embedding model of its own: point it at the OpenAI API, or at any
|
|
42
|
+
OpenAI-compatible endpoint such as a local `llama.cpp` server, so private code
|
|
43
|
+
can be embedded without leaving your machine. Storage is plain SQLite; search is
|
|
44
|
+
brute-force cosine. The whole thing is the Python standard library and nothing
|
|
45
|
+
else.
|
|
46
|
+
|
|
47
|
+
## Install
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install codeseek
|
|
51
|
+
# or:
|
|
52
|
+
pipx install codeseek
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Requires Python 3.8+.
|
|
56
|
+
|
|
57
|
+
## Quick start
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
export OPENAI_API_KEY=sk-...
|
|
61
|
+
|
|
62
|
+
# 1. Index the current repository
|
|
63
|
+
codeseek index .
|
|
64
|
+
|
|
65
|
+
# 2. Search it
|
|
66
|
+
codeseek search "where do we validate the auth token?"
|
|
67
|
+
codeseek search "retry with backoff" -k 3
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Results come back as markdown, each with a file path, line range, and similarity
|
|
71
|
+
score:
|
|
72
|
+
|
|
73
|
+
````markdown
|
|
74
|
+
### src/auth/token.py:40-70 (score 0.812)
|
|
75
|
+
```
|
|
76
|
+
def verify_token(raw: str) -> Claims:
|
|
77
|
+
...
|
|
78
|
+
```
|
|
79
|
+
````
|
|
80
|
+
|
|
81
|
+
### Use a local or alternative provider
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
codeseek index . --base-url http://localhost:8080/v1 --model nomic-embed-text
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## As an MCP server
|
|
88
|
+
|
|
89
|
+
`codeseek serve` speaks MCP over stdio and exposes one tool, `search_code`.
|
|
90
|
+
|
|
91
|
+
After indexing a repo, register it with your MCP client. A typical `mcpServers`
|
|
92
|
+
configuration looks like this:
|
|
93
|
+
|
|
94
|
+
```json
|
|
95
|
+
{
|
|
96
|
+
"mcpServers": {
|
|
97
|
+
"codeseek": {
|
|
98
|
+
"command": "codeseek",
|
|
99
|
+
"args": ["serve", "--db", "/path/to/your/repo/.codeseek.db"],
|
|
100
|
+
"env": { "OPENAI_API_KEY": "sk-..." }
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
The assistant can then call `search_code` to pull relevant code into its context
|
|
107
|
+
on demand, instead of you pasting files by hand.
|
|
108
|
+
|
|
109
|
+
## Commands
|
|
110
|
+
|
|
111
|
+
| Command | What it does |
|
|
112
|
+
| --- | --- |
|
|
113
|
+
| `codeseek index [PATH]` | Index a directory (default `.`) into `--db`. |
|
|
114
|
+
| `codeseek search QUERY` | Search the index; `-k` sets result count. |
|
|
115
|
+
| `codeseek serve` | Run the MCP server over stdio. |
|
|
116
|
+
|
|
117
|
+
Shared options: `--db`, `--model`, `--base-url`, `--api-key` (all but `--db`
|
|
118
|
+
also have an environment-variable default).
|
|
119
|
+
|
|
120
|
+
## How it works
|
|
121
|
+
|
|
122
|
+
1. **Source** — files are walked and read (sensible code/text extensions, common
|
|
123
|
+
build and vendor directories skipped).
|
|
124
|
+
2. **Chunking** — each file is split into overlapping line windows.
|
|
125
|
+
3. **Embedding** — chunks are embedded in batches via your provider.
|
|
126
|
+
4. **Storage** — vectors land in a local SQLite database.
|
|
127
|
+
5. **Search** — your query is embedded and compared against every chunk by cosine
|
|
128
|
+
similarity; the top matches are returned.
|
|
129
|
+
|
|
130
|
+
The document source is pluggable: the engine only consumes `Document` objects, so
|
|
131
|
+
the same indexing and search machinery can be pointed at things other than code.
|
|
132
|
+
|
|
133
|
+
## Privacy note
|
|
134
|
+
|
|
135
|
+
Indexing sends file contents to whichever embeddings provider you configure. For
|
|
136
|
+
private code, prefer a self-hosted model via `--base-url`.
|
|
137
|
+
|
|
138
|
+
## Scope
|
|
139
|
+
|
|
140
|
+
Search is a linear scan, which is plenty fast for a single repository (a few
|
|
141
|
+
thousand chunks). Indexing very large monorepos would want a real approximate
|
|
142
|
+
vector index — a natural next step, not today's goal.
|
|
143
|
+
|
|
144
|
+
## Development
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
pip install -e ".[test]"
|
|
148
|
+
python -m pytest
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
All tests run offline; the embedding and HTTP layers accept injectable fakes.
|
|
152
|
+
|
|
153
|
+
## License
|
|
154
|
+
|
|
155
|
+
MIT — see [LICENSE](LICENSE).
|
codeseek-0.1.0/README.md
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# codeseek
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/codeseek/)
|
|
4
|
+
[](https://github.com/Sev7nOfNine/codeseek/actions/workflows/ci.yml)
|
|
5
|
+
[](LICENSE)
|
|
6
|
+
[](https://pypi.org/project/codeseek/)
|
|
7
|
+
|
|
8
|
+
**Semantic search over your codebase — as a CLI and an MCP server. Zero dependencies.**
|
|
9
|
+
|
|
10
|
+
`codeseek` indexes a repository into a local vector store and lets you search it
|
|
11
|
+
by meaning, not just by string match. Use it from the terminal, or run it as an
|
|
12
|
+
[MCP](https://modelcontextprotocol.io) server so an AI coding assistant or editor
|
|
13
|
+
can ask your codebase questions directly.
|
|
14
|
+
|
|
15
|
+
It brings no embedding model of its own: point it at the OpenAI API, or at any
|
|
16
|
+
OpenAI-compatible endpoint such as a local `llama.cpp` server, so private code
|
|
17
|
+
can be embedded without leaving your machine. Storage is plain SQLite; search is
|
|
18
|
+
brute-force cosine. The whole thing is the Python standard library and nothing
|
|
19
|
+
else.
|
|
20
|
+
|
|
21
|
+
## Install
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install codeseek
|
|
25
|
+
# or:
|
|
26
|
+
pipx install codeseek
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Requires Python 3.8+.
|
|
30
|
+
|
|
31
|
+
## Quick start
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
export OPENAI_API_KEY=sk-...
|
|
35
|
+
|
|
36
|
+
# 1. Index the current repository
|
|
37
|
+
codeseek index .
|
|
38
|
+
|
|
39
|
+
# 2. Search it
|
|
40
|
+
codeseek search "where do we validate the auth token?"
|
|
41
|
+
codeseek search "retry with backoff" -k 3
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Results come back as markdown, each with a file path, line range, and similarity
|
|
45
|
+
score:
|
|
46
|
+
|
|
47
|
+
````markdown
|
|
48
|
+
### src/auth/token.py:40-70 (score 0.812)
|
|
49
|
+
```
|
|
50
|
+
def verify_token(raw: str) -> Claims:
|
|
51
|
+
...
|
|
52
|
+
```
|
|
53
|
+
````
|
|
54
|
+
|
|
55
|
+
### Use a local or alternative provider
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
codeseek index . --base-url http://localhost:8080/v1 --model nomic-embed-text
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## As an MCP server
|
|
62
|
+
|
|
63
|
+
`codeseek serve` speaks MCP over stdio and exposes one tool, `search_code`.
|
|
64
|
+
|
|
65
|
+
After indexing a repo, register it with your MCP client. A typical `mcpServers`
|
|
66
|
+
configuration looks like this:
|
|
67
|
+
|
|
68
|
+
```json
|
|
69
|
+
{
|
|
70
|
+
"mcpServers": {
|
|
71
|
+
"codeseek": {
|
|
72
|
+
"command": "codeseek",
|
|
73
|
+
"args": ["serve", "--db", "/path/to/your/repo/.codeseek.db"],
|
|
74
|
+
"env": { "OPENAI_API_KEY": "sk-..." }
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
The assistant can then call `search_code` to pull relevant code into its context
|
|
81
|
+
on demand, instead of you pasting files by hand.
|
|
82
|
+
|
|
83
|
+
## Commands
|
|
84
|
+
|
|
85
|
+
| Command | What it does |
|
|
86
|
+
| --- | --- |
|
|
87
|
+
| `codeseek index [PATH]` | Index a directory (default `.`) into `--db`. |
|
|
88
|
+
| `codeseek search QUERY` | Search the index; `-k` sets result count. |
|
|
89
|
+
| `codeseek serve` | Run the MCP server over stdio. |
|
|
90
|
+
|
|
91
|
+
Shared options: `--db`, `--model`, `--base-url`, `--api-key` (all but `--db`
|
|
92
|
+
also have an environment-variable default).
|
|
93
|
+
|
|
94
|
+
## How it works
|
|
95
|
+
|
|
96
|
+
1. **Source** — files are walked and read (sensible code/text extensions, common
|
|
97
|
+
build and vendor directories skipped).
|
|
98
|
+
2. **Chunking** — each file is split into overlapping line windows.
|
|
99
|
+
3. **Embedding** — chunks are embedded in batches via your provider.
|
|
100
|
+
4. **Storage** — vectors land in a local SQLite database.
|
|
101
|
+
5. **Search** — your query is embedded and compared against every chunk by cosine
|
|
102
|
+
similarity; the top matches are returned.
|
|
103
|
+
|
|
104
|
+
The document source is pluggable: the engine only consumes `Document` objects, so
|
|
105
|
+
the same indexing and search machinery can be pointed at things other than code.
|
|
106
|
+
|
|
107
|
+
## Privacy note
|
|
108
|
+
|
|
109
|
+
Indexing sends file contents to whichever embeddings provider you configure. For
|
|
110
|
+
private code, prefer a self-hosted model via `--base-url`.
|
|
111
|
+
|
|
112
|
+
## Scope
|
|
113
|
+
|
|
114
|
+
Search is a linear scan, which is plenty fast for a single repository (a few
|
|
115
|
+
thousand chunks). Indexing very large monorepos would want a real approximate
|
|
116
|
+
vector index — a natural next step, not today's goal.
|
|
117
|
+
|
|
118
|
+
## Development
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
pip install -e ".[test]"
|
|
122
|
+
python -m pytest
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
All tests run offline; the embedding and HTTP layers accept injectable fakes.
|
|
126
|
+
|
|
127
|
+
## License
|
|
128
|
+
|
|
129
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "codeseek"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Semantic code search for your repo, as a CLI and an MCP server. Bring any OpenAI-compatible embedding model. Zero dependencies."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Seven Of Nine" }]
|
|
13
|
+
keywords = [
|
|
14
|
+
"semantic-search",
|
|
15
|
+
"code-search",
|
|
16
|
+
"embeddings",
|
|
17
|
+
"rag",
|
|
18
|
+
"mcp",
|
|
19
|
+
"model-context-protocol",
|
|
20
|
+
"llm",
|
|
21
|
+
"developer-tools",
|
|
22
|
+
"cli",
|
|
23
|
+
]
|
|
24
|
+
classifiers = [
|
|
25
|
+
"Development Status :: 4 - Beta",
|
|
26
|
+
"Environment :: Console",
|
|
27
|
+
"Intended Audience :: Developers",
|
|
28
|
+
"License :: OSI Approved :: MIT License",
|
|
29
|
+
"Operating System :: OS Independent",
|
|
30
|
+
"Programming Language :: Python :: 3",
|
|
31
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
32
|
+
"Topic :: Software Development",
|
|
33
|
+
"Topic :: Software Development :: Libraries",
|
|
34
|
+
"Topic :: Utilities",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.optional-dependencies]
|
|
38
|
+
test = ["pytest>=7"]
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Homepage = "https://github.com/Sev7nOfNine/codeseek"
|
|
42
|
+
Repository = "https://github.com/Sev7nOfNine/codeseek"
|
|
43
|
+
Issues = "https://github.com/Sev7nOfNine/codeseek/issues"
|
|
44
|
+
|
|
45
|
+
[project.scripts]
|
|
46
|
+
codeseek = "codeseek.cli:main"
|
|
47
|
+
|
|
48
|
+
[tool.hatch.build.targets.wheel]
|
|
49
|
+
packages = ["src/codeseek"]
|
|
50
|
+
|
|
51
|
+
[tool.hatch.build.targets.sdist]
|
|
52
|
+
include = ["src/codeseek", "README.md", "LICENSE", "CHANGELOG.md", "tests"]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Split text into overlapping line windows for embedding."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import List, Tuple
|
|
6
|
+
|
|
7
|
+
# A chunk is (start_line, end_line, text); line numbers are 1-based and inclusive.
Chunk = Tuple[int, int, str]


def chunk_text(text: str, *, max_lines: int = 60, overlap: int = 10) -> List[Chunk]:
    """Break ``text`` into overlapping windows of at most ``max_lines`` lines.

    Args:
        text: The text to split; lines are found with ``str.splitlines``.
        max_lines: Maximum number of lines per window. Must be positive.
        overlap: Number of lines shared between consecutive windows. A
            negative value is treated as 0, and a value of ``max_lines`` or
            more is replaced by ``max_lines // 2``, so the window always
            advances and never skips a line.

    Returns:
        A list of ``(start_line, end_line, text)`` tuples in file order, or an
        empty list when ``text`` contains no lines.

    Raises:
        ValueError: If ``max_lines`` is not positive.
    """
    if max_lines <= 0:
        raise ValueError("max_lines must be positive")
    if overlap < 0:
        # A negative overlap would make the step larger than the window and
        # silently drop the lines that fall between two windows.
        overlap = 0
    elif overlap >= max_lines:
        overlap = max_lines // 2

    lines = text.splitlines()
    if not lines:
        return []

    step = max_lines - overlap  # always >= 1 after the clamping above
    n = len(lines)
    chunks: List[Chunk] = []
    for i in range(0, n, step):
        end = min(i + max_lines, n)
        chunks.append((i + 1, end, "\n".join(lines[i:end])))
        if end == n:
            # This window already reaches the last line; a further window
            # would only repeat the tail.
            break
    return chunks
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Command-line interface for codeseek."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
from typing import Callable, List, Optional
|
|
9
|
+
|
|
10
|
+
from . import __version__
|
|
11
|
+
from .embeddings import EmbeddingError, embed
|
|
12
|
+
from .index import build_index
|
|
13
|
+
from .search import format_results, search_index
|
|
14
|
+
from .sources import FileSource
|
|
15
|
+
from .store import VectorStore
|
|
16
|
+
|
|
17
|
+
# Defaults used when neither a flag nor an environment variable is given.
DEFAULT_DB = ".codeseek.db"
DEFAULT_MODEL = "text-embedding-3-small"

# An embedder maps a batch of texts to one vector per text.
Embedder = Callable[[List[str]], List[List[float]]]


def _add_provider_args(sub: argparse.ArgumentParser) -> None:
    """Register the provider and database options on ``sub``.

    Adds ``--model``, ``--base-url``, ``--api-key`` and ``--db``. The first
    three take their default from an environment variable, read when this
    function is called; ``--db`` defaults to ``DEFAULT_DB``.
    """
    env = os.environ
    # (flag, default, metavar, help), in the order shown by --help.
    options = (
        (
            "--model",
            env.get("CODESEEK_MODEL", DEFAULT_MODEL),
            "NAME",
            "embedding model (default: {0}, or $CODESEEK_MODEL).".format(DEFAULT_MODEL),
        ),
        (
            "--base-url",
            env.get("OPENAI_BASE_URL", "https://api.openai.com/v1"),
            "URL",
            "OpenAI-compatible base URL (default: $OPENAI_BASE_URL or OpenAI).",
        ),
        (
            "--api-key",
            env.get("OPENAI_API_KEY"),
            "KEY",
            "embeddings API key (default: $OPENAI_API_KEY).",
        ),
        (
            "--db",
            DEFAULT_DB,
            "FILE",
            "index database path (default: {0}).".format(DEFAULT_DB),
        ),
    )
    for flag, default, metavar, help_text in options:
        sub.add_argument(flag, default=default, metavar=metavar, help=help_text)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def build_parser() -> argparse.ArgumentParser:
    """Assemble the ``codeseek`` parser with its ``index``, ``search`` and
    ``serve`` subcommands; a subcommand is required."""
    root = argparse.ArgumentParser(
        prog="codeseek",
        description="Semantic code search as a CLI and an MCP server.",
    )
    version_text = "%(prog)s {0}".format(__version__)
    root.add_argument("--version", action="version", version=version_text)
    commands = root.add_subparsers(dest="command", required=True)

    # index: optional positional path plus the two chunking knobs.
    index_cmd = commands.add_parser("index", help="index a directory of code.")
    index_cmd.add_argument(
        "path", nargs="?", default=".", help="directory to index (default: .)."
    )
    for flag, default, help_text in (
        ("--max-lines", 60, "lines per chunk (default: 60)."),
        ("--overlap", 10, "overlapping lines (default: 10)."),
    ):
        index_cmd.add_argument(flag, type=int, default=default, help=help_text)

    # search: the query and the result count.
    search_cmd = commands.add_parser("search", help="search the index.")
    search_cmd.add_argument("query", help="what to look for.")
    search_cmd.add_argument(
        "-k", type=int, default=5, help="number of results (default: 5)."
    )

    # serve: no options of its own.
    serve_cmd = commands.add_parser("serve", help="run the MCP server over stdio.")

    # Every subcommand takes the same provider/database options.
    for cmd in (index_cmd, search_cmd, serve_cmd):
        _add_provider_args(cmd)

    return root
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _make_embedder(args: argparse.Namespace) -> Embedder:
    """Return a one-argument embedder bound to the provider settings in ``args``."""

    def embedder(texts: List[str]) -> List[List[float]]:
        # Settings are read from ``args`` on each call, not captured up front.
        provider = {
            "model": args.model,
            "api_key": args.api_key,
            "base_url": args.base_url,
        }
        return embed(texts, **provider)

    return embedder
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Run the codeseek CLI.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Returns:
        0 when the selected command completes.

    Raises:
        SystemExit: Via ``parser.exit``/``parser.error`` - status 2 when no API
            key is configured or the command line is invalid, status 1 when an
            ``EmbeddingError`` is raised while indexing or searching.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    if not args.api_key:
        parser.exit(2, "codeseek: no API key (set $OPENAI_API_KEY or --api-key)\n")

    embedder = _make_embedder(args)

    if args.command == "index":
        source = FileSource(args.path)
        store = VectorStore(args.db)
        try:
            count = build_index(
                source,
                store,
                embedder,
                max_lines=args.max_lines,
                overlap=args.overlap,
                progress=lambda doc_id: sys.stderr.write(
                    "  indexed {0}\n".format(doc_id)
                ),
            )
        except EmbeddingError as exc:
            # parser.exit raises SystemExit, so the finally clause still runs.
            parser.exit(1, "codeseek: {0}\n".format(exc))
        finally:
            store.close()
        sys.stderr.write("codeseek: indexed {0} chunks into {1}\n".format(count, args.db))
        return 0

    if args.command == "search":
        store = VectorStore(args.db)
        try:
            results = search_index(store, embedder, args.query, k=args.k)
        except EmbeddingError as exc:
            parser.exit(1, "codeseek: {0}\n".format(exc))
        finally:
            store.close()
        sys.stdout.write(format_results(results) + "\n")
        return 0

    if args.command == "serve":
        # Imported here so the other commands do not load the MCP server module.
        from .mcp_server import MCPServer

        store = VectorStore(args.db)

        def search_fn(query: str, k: int):
            return search_index(store, embedder, query, k=k)

        try:
            MCPServer(search_fn, server_version=__version__).serve()
        finally:
            # Close the store even if serve() is interrupted or raises,
            # matching the index and search branches.
            store.close()
        return 0

    parser.error("unknown command")
    return 2


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Embeddings client for OpenAI-compatible APIs, using only the stdlib.
|
|
2
|
+
|
|
3
|
+
Works with the OpenAI platform, or any compatible ``/embeddings`` endpoint such
|
|
4
|
+
as a local llama.cpp server, so private code can be embedded without leaving the
|
|
5
|
+
machine.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import urllib.error
|
|
12
|
+
import urllib.request
|
|
13
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
14
|
+
|
|
15
|
+
# Anything callable like ``urllib.request.urlopen(req, timeout=...)``; tests
# can inject a fake.
Opener = Callable[..., Any]


class EmbeddingError(RuntimeError):
    """Raised when the embeddings request fails or returns an unexpected shape."""


def parse_embeddings(data: Dict[str, Any]) -> List[List[float]]:
    """Pull the list of embedding vectors out of an embeddings response.

    Args:
        data: The decoded JSON body; each entry of ``data["data"]`` is read
            for its ``"embedding"`` key.

    Returns:
        The vectors, in the order the response lists them.

    Raises:
        EmbeddingError: If the expected keys are missing or ``data`` has the
            wrong shape. The message carries up to 300 characters of the
            response for diagnosis.
    """
    try:
        return [item["embedding"] for item in data["data"]]
    except (KeyError, TypeError) as exc:
        raise EmbeddingError(
            "unexpected embeddings response: {0}".format(json.dumps(data)[:300])
        ) from exc


def embed(
    texts: List[str],
    *,
    model: str,
    api_key: str,
    base_url: str = "https://api.openai.com/v1",
    timeout: int = 120,
    opener: Optional[Opener] = None,
) -> List[List[float]]:
    """Embed a batch of texts and return one vector per input.

    Args:
        texts: Texts to embed. An empty list returns ``[]`` without a request.
        model: Model name sent in the request body.
        api_key: Sent as a ``Bearer`` token in the ``Authorization`` header.
        base_url: API root; ``/embeddings`` is appended to it.
        timeout: Passed to the opener as ``timeout``.
        opener: Replacement for ``urllib.request.urlopen``. It is called as
            ``opener(request, timeout=...)`` and must return a context manager
            whose ``read()`` yields the response bytes.

    Returns:
        The vectors extracted by ``parse_embeddings``.

    Raises:
        EmbeddingError: On an HTTP error status, a connection or other OS-level
            failure, a body that is not UTF-8 encoded JSON, or a JSON body
            without the expected shape.
    """
    if not texts:
        return []

    url = base_url.rstrip("/") + "/embeddings"
    payload = json.dumps({"model": model, "input": texts}).encode("utf-8")

    req = urllib.request.Request(url, data=payload, method="POST")
    req.add_header("Content-Type", "application/json")
    req.add_header("Authorization", "Bearer " + api_key)

    do_open = opener or urllib.request.urlopen
    try:
        with do_open(req, timeout=timeout) as resp:
            body = resp.read()
    except urllib.error.HTTPError as exc:
        # HTTPError subclasses URLError, so it has to be caught first.
        detail = exc.read().decode("utf-8", "replace")
        raise EmbeddingError(
            "embeddings request failed (HTTP {0}): {1}".format(exc.code, detail[:500])
        ) from exc
    except urllib.error.URLError as exc:
        raise EmbeddingError(
            "embeddings request failed: {0}".format(exc.reason)
        ) from exc
    except OSError as exc:
        # For example a timeout while reading the body, which can surface as
        # a bare OSError instead of a URLError.
        raise EmbeddingError("embeddings request failed: {0}".format(exc)) from exc

    try:
        data = json.loads(body.decode("utf-8"))
    except ValueError as exc:
        # Covers both UnicodeDecodeError and json.JSONDecodeError, e.g. an
        # HTML error page served with a 200 status.
        preview = body.decode("utf-8", "replace")[:300]
        raise EmbeddingError(
            "embeddings response is not valid JSON: {0}".format(preview)
        ) from exc

    return parse_embeddings(data)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Build a search index: source -> chunks -> embeddings -> store."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Callable, List, Optional
|
|
6
|
+
|
|
7
|
+
from .chunking import chunk_text
|
|
8
|
+
from .sources import Document
|
|
9
|
+
from .store import VectorStore
|
|
10
|
+
|
|
11
|
+
# An embedder turns a batch of texts into a batch of vectors.
Embedder = Callable[[List[str]], List[List[float]]]
# Anything with a documents() method yielding Documents (FileSource, or a
# future notes source). This is a descriptive placeholder string, not a type.
Source = "object with a documents() iterator"


def build_index(
    source,
    store: VectorStore,
    embedder: Embedder,
    *,
    batch_size: int = 64,
    max_lines: int = 60,
    overlap: int = 10,
    progress: Optional[Callable[[str], None]] = None,
) -> int:
    """Index every document from ``source`` into ``store``. Returns chunk count.

    ``store.clear()`` is called first, so the store ends up holding only the
    rows added by this run.

    Args:
        source: Object whose ``documents()`` yields items with ``id``,
            ``text`` and ``metadata`` attributes.
        store: Destination; its ``clear()`` and ``add(rows)`` methods are used.
        embedder: Called with a list of chunk texts; must return one vector
            per text, in the same order.
        batch_size: Number of pending chunks that triggers an embedder call.
        max_lines: Passed to ``chunk_text``.
        overlap: Passed to ``chunk_text``.
        progress: Optional callback invoked with each document id once its
            chunks are queued (they may not have been embedded yet).

    Returns:
        The number of rows handed to ``store.add``.

    Raises:
        EmbeddingError: If the embedder returns a different number of vectors
            than the texts it was given.
    """
    store.clear()

    pending_rows: List[dict] = []
    pending_text: List[str] = []
    total = 0

    def flush() -> None:
        """Embed the pending chunks, store them, and reset the buffers."""
        nonlocal total
        if not pending_text:
            return
        vectors = list(embedder(pending_text))
        if len(vectors) != len(pending_text):
            # zip() would silently drop the unmatched chunks and the run would
            # still look successful, so fail loudly instead.
            from .embeddings import EmbeddingError

            raise EmbeddingError(
                "embedder returned {0} vectors for {1} chunks".format(
                    len(vectors), len(pending_text)
                )
            )
        rows = [dict(row, embedding=vec) for row, vec in zip(pending_rows, vectors)]
        store.add(rows)
        total += len(rows)
        pending_rows.clear()
        pending_text.clear()

    for doc in source.documents():  # type: Document
        # Fall back to the document id when the source provides no path.
        path = doc.metadata.get("path", doc.id)
        for start, end, text in chunk_text(
            doc.text, max_lines=max_lines, overlap=overlap
        ):
            pending_rows.append(
                {
                    "id": "{0}:{1}-{2}".format(doc.id, start, end),
                    "doc_id": doc.id,
                    "path": path,
                    "start_line": start,
                    "end_line": end,
                    "text": text,
                }
            )
            pending_text.append(text)
            if len(pending_text) >= batch_size:
                flush()
        if progress is not None:
            progress(doc.id)

    # Embed whatever is left over from the last partial batch.
    flush()
    return total
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""A minimal Model Context Protocol (MCP) server over stdio.
|
|
2
|
+
|
|
3
|
+
Implements just enough of the protocol — ``initialize``, ``tools/list`` and
|
|
4
|
+
``tools/call`` exchanged as newline-delimited JSON-RPC 2.0 — to expose a single
|
|
5
|
+
``search_code`` tool to any MCP client, with no third-party dependencies.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import sys
|
|
12
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
from .search import format_results
|
|
15
|
+
from .store import ScoredChunk
|
|
16
|
+
|
|
17
|
+
# MCP protocol revision reported in the ``initialize`` response.
PROTOCOL_VERSION = "2024-11-05"

# Descriptor of the single tool this server lists in ``tools/list``.
SEARCH_TOOL: Dict[str, Any] = dict(
    name="search_code",
    description=(
        "Semantic search over the indexed codebase. "
        "Returns the most relevant code chunks "
        "with their file path and line range."
    ),
    inputSchema=dict(
        type="object",
        properties=dict(
            query=dict(
                type="string",
                description="A natural-language or code query.",
            ),
            k=dict(
                type="integer",
                description="How many results to return (default 5).",
            ),
        ),
        required=["query"],
    ),
)
|
|
40
|
+
|
|
41
|
+
# A search function takes (query, k) and returns scored chunks.
|
|
42
|
+
SearchFn = Callable[[str, int], List[ScoredChunk]]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class MCPServer:
    """Dispatch MCP JSON-RPC 2.0 messages to a search backend.

    Supported methods are ``initialize``, ``tools/list`` and ``tools/call``
    (for the single ``search_code`` tool). Messages without an ``id`` member,
    and any ``notifications/*`` method, are notifications and are never
    answered.
    """

    def __init__(
        self,
        search_fn: SearchFn,
        *,
        server_name: str = "codeseek",
        server_version: str = "0.1.0",
    ) -> None:
        """Store the backend and the identity reported by ``initialize``.

        Args:
            search_fn: Callable invoked as ``search_fn(query, k)``.
            server_name: Name placed in ``serverInfo``.
            server_version: Version placed in ``serverInfo``.
        """
        self.search_fn = search_fn
        self.server_name = server_name
        self.server_version = server_version

    def _error(self, req_id: Any, code: int, message: str) -> Dict[str, Any]:
        """Build a JSON-RPC error response for ``req_id``."""
        return {
            "jsonrpc": "2.0",
            "id": req_id,
            "error": {"code": code, "message": message},
        }

    @staticmethod
    def _coerce_k(raw: Any, default: int = 5) -> int:
        """Turn a client-supplied ``k`` into a positive int, else ``default``."""
        try:
            k = int(raw)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: JSON such as 1e999 parses to float("inf").
            return default
        # A negative k would make the backend slice from the wrong end.
        return k if k > 0 else default

    def _call_tool(self, req_id: Any, params: Any) -> Dict[str, Any]:
        """Run a ``tools/call`` request and return its response."""
        params = params or {}
        if not isinstance(params, dict):
            return self._error(req_id, -32602, "invalid params: expected an object")
        name = params.get("name")
        if name != "search_code":
            return self._error(req_id, -32602, "unknown tool: {0}".format(name))
        args = params.get("arguments") or {}
        if not isinstance(args, dict):
            return self._error(
                req_id, -32602, "invalid arguments: expected an object"
            )

        k = self._coerce_k(args.get("k"))
        try:
            results = self.search_fn(args.get("query", ""), k)
            payload: Dict[str, Any] = {
                "content": [{"type": "text", "text": format_results(results)}]
            }
        except Exception as exc:  # surface errors to the client, do not crash
            payload = {
                "content": [{"type": "text", "text": "error: {0}".format(exc)}],
                "isError": True,
            }
        return {"jsonrpc": "2.0", "id": req_id, "result": payload}

    def handle(self, request: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Handle one request. Returns a response, or None for notifications.

        Unknown methods yield error ``-32601``; malformed ``tools/call``
        parameters and unknown tool names yield ``-32602``. Failures raised by
        the search backend are reported as a tool result with ``isError``.
        """
        method = request.get("method")
        # JSON-RPC 2.0: a message without an "id" member is a notification
        # and must not be answered, even when the method is unknown.
        if "id" not in request:
            return None
        if isinstance(method, str) and method.startswith("notifications/"):
            return None
        req_id = request["id"]

        if method == "initialize":
            result: Dict[str, Any] = {
                "protocolVersion": PROTOCOL_VERSION,
                "capabilities": {"tools": {}},
                "serverInfo": {
                    "name": self.server_name,
                    "version": self.server_version,
                },
            }
        elif method == "tools/list":
            result = {"tools": [SEARCH_TOOL]}
        elif method == "tools/call":
            return self._call_tool(req_id, request.get("params"))
        else:
            return self._error(req_id, -32601, "method not found: {0}".format(method))

        return {"jsonrpc": "2.0", "id": req_id, "result": result}

    def serve(self, stdin: Any = None, stdout: Any = None) -> None:
        """Read newline-delimited JSON-RPC from stdin, write responses to stdout.

        Blank lines, lines that are not valid JSON, and JSON values that are
        not objects are skipped so one bad line cannot stop the server.
        """
        stdin = stdin if stdin is not None else sys.stdin
        stdout = stdout if stdout is not None else sys.stdout
        for line in stdin:
            line = line.strip()
            if not line:
                continue
            try:
                request = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(request, dict):
                # e.g. a bare array or number: nothing to dispatch on.
                continue
            response = self.handle(request)
            if response is not None:
                stdout.write(json.dumps(response) + "\n")
                stdout.flush()
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Query the index: embed the query, return the most similar chunks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Callable, List
|
|
6
|
+
|
|
7
|
+
from .store import ScoredChunk, VectorStore
|
|
8
|
+
|
|
9
|
+
# An embedder maps a batch of texts to one vector (list of floats) per text.
Embedder = Callable[[List[str]], List[List[float]]]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def search_index(
    store: VectorStore, embedder: Embedder, query: str, k: int = 5
) -> List[ScoredChunk]:
    """Return the top ``k`` chunks most similar to ``query``.

    Args:
        store: Vector store to search.
        embedder: Callable that embeds a list of texts.
        query: Query text; a blank query yields no results.
        k: Maximum number of results; a non-positive value yields no results.

    Returns:
        The scored chunks returned by ``store.search``.
    """
    # Bail out before embedding: the call may hit a paid API, and a negative
    # k would otherwise reach the store as a negative slice bound.
    if k <= 0 or not query.strip():
        return []
    query_vec = embedder([query])[0]
    return store.search(query_vec, k)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _longest_backtick_run(text: str) -> int:
    """Return the length of the longest run of consecutive backticks."""
    longest = 0
    run = 0
    for char in text:
        run = run + 1 if char == "`" else 0
        if run > longest:
            longest = run
    return longest


def format_results(results: List[ScoredChunk]) -> str:
    """Render scored chunks as readable markdown.

    Each result becomes a ``### path:start-end (score s)`` heading followed
    by the chunk text in a fenced code block; results are separated by a
    blank line. An empty list renders as ``"No matches."``.
    """
    if not results:
        return "No matches."
    parts = []
    for score, row in results:
        text = row["text"]
        # Indexed markdown may itself contain ``` fences; use a fence longer
        # than any backtick run in the chunk so the block cannot end early.
        fence = "`" * max(3, _longest_backtick_run(str(text)) + 1)
        parts.append(
            "### {0}:{1}-{2} (score {3:.3f})\n{5}\n{4}\n{5}".format(
                row["path"], row["start_line"], row["end_line"], score, text, fence
            )
        )
    return "\n\n".join(parts)
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Document sources for indexing.
|
|
2
|
+
|
|
3
|
+
A *source* is anything that yields :class:`Document` objects. The rest of
|
|
4
|
+
codeseek (chunking, embedding, storage, search) never cares where documents come
|
|
5
|
+
from, so a new source — code files, markdown notes, a wiki export — is all it
|
|
6
|
+
takes to point the same engine at different data.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Dict, Iterable, Iterator, Optional, Set
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class Document:
    """A unit of text to index."""

    # Identifier of the document; FileSource uses the root-relative path.
    id: str
    # Full text of the document.
    text: str
    # String metadata; FileSource stores the relative path under "path".
    metadata: Dict[str, str] = field(default_factory=dict)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# A pragmatic default set of source-code and text extensions. Entries are
# lowercase with a leading dot, matching how FileSource derives a file's suffix.
DEFAULT_EXTENSIONS: Set[str] = {
    ".py", ".pyi", ".js", ".jsx", ".ts", ".tsx", ".go", ".rs", ".java", ".kt",
    ".c", ".h", ".cc", ".cpp", ".hpp", ".cs", ".rb", ".php", ".swift", ".scala",
    ".sh", ".bash", ".ps1", ".lua", ".r", ".jl", ".sql", ".html", ".css", ".scss",
    ".md", ".rst", ".txt", ".toml", ".yaml", ".yml", ".json", ".ini", ".cfg",
}

# Directory names that FileSource prunes from the walk by default.
DEFAULT_EXCLUDES: Set[str] = {
    ".git", ".hg", ".svn", "__pycache__", "node_modules", ".venv", "venv",
    "dist", "build", ".mypy_cache", ".pytest_cache", ".ruff_cache", ".idea",
    ".tox", "target", "vendor",
}

# Skip files larger than this; they are usually generated or binary.
MAX_FILE_BYTES = 400_000
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class FileSource:
    """Yield documents from source files under a directory tree."""

    def __init__(
        self,
        root: str,
        *,
        extensions: Optional[Iterable[str]] = None,
        exclude_dirs: Optional[Iterable[str]] = None,
        max_file_bytes: int = MAX_FILE_BYTES,
    ) -> None:
        """Configure the walk.

        Args:
            root: Directory to walk.
            extensions: File suffixes to index. Matching is case-insensitive
                and a missing leading dot is added, so ``"py"``, ``".py"``
                and ``".PY"`` are equivalent. ``None`` or an empty iterable
                selects ``DEFAULT_EXTENSIONS``.
            exclude_dirs: Directory names that are never descended into.
                ``None`` or an empty iterable selects ``DEFAULT_EXCLUDES``.
            max_file_bytes: Files larger than this are skipped.
        """
        self.root = root
        chosen = extensions if extensions else DEFAULT_EXTENSIONS
        self.extensions = {self._normalize_extension(ext) for ext in chosen}
        self.exclude_dirs = set(exclude_dirs) if exclude_dirs else set(DEFAULT_EXCLUDES)
        self.max_file_bytes = max_file_bytes

    @staticmethod
    def _normalize_extension(ext: str) -> str:
        """Lowercase ``ext`` and give a non-empty value its leading dot."""
        ext = ext.strip().lower()
        # "" is kept as-is: it is the suffix of files without an extension.
        if ext and not ext.startswith("."):
            ext = "." + ext
        return ext

    def documents(self) -> Iterator[Document]:
        """Yield one ``Document`` per readable, non-blank matching file.

        Files are skipped when their suffix is not selected, they exceed
        ``max_file_bytes``, they cannot be read or decoded as UTF-8, or they
        contain only whitespace. Document ids are paths relative to ``root``
        using ``/`` separators. Directories and files are visited in sorted
        order so repeated runs produce the same sequence.
        """
        for dirpath, dirnames, filenames in os.walk(self.root):
            # Prune excluded directories in place so os.walk does not descend.
            dirnames[:] = sorted(d for d in dirnames if d not in self.exclude_dirs)
            for name in sorted(filenames):
                ext = os.path.splitext(name)[1].lower()
                if ext not in self.extensions:
                    continue
                full = os.path.join(dirpath, name)
                try:
                    if os.path.getsize(full) > self.max_file_bytes:
                        continue
                    with open(full, "r", encoding="utf-8") as handle:
                        text = handle.read()
                except (OSError, UnicodeDecodeError):
                    # Unreadable or non-UTF-8 files are skipped, not fatal.
                    continue
                if not text.strip():
                    continue
                rel = os.path.relpath(full, self.root).replace(os.sep, "/")
                yield Document(id=rel, text=text, metadata={"path": rel})
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""A small SQLite-backed vector store with brute-force cosine search.
|
|
2
|
+
|
|
3
|
+
Brute force is intentional: for a single repository (a few thousand chunks) a
|
|
4
|
+
linear scan in Python is fast enough and keeps the dependency count at zero. For
|
|
5
|
+
much larger corpora a real vector index would be the next step.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import math
|
|
12
|
+
import sqlite3
|
|
13
|
+
from typing import Dict, List, Optional, Sequence, Tuple
|
|
14
|
+
|
|
15
|
+
# A search hit: (similarity score, chunk row as a dict of column values).
ScoredChunk = Tuple[float, Dict[str, object]]
|
|
16
|
+
|
|
17
|
+
_SCHEMA = """
|
|
18
|
+
CREATE TABLE IF NOT EXISTS chunks (
|
|
19
|
+
id TEXT PRIMARY KEY,
|
|
20
|
+
doc_id TEXT,
|
|
21
|
+
path TEXT,
|
|
22
|
+
start_line INTEGER,
|
|
23
|
+
end_line INTEGER,
|
|
24
|
+
text TEXT,
|
|
25
|
+
embedding TEXT
|
|
26
|
+
)
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def cosine(a: Sequence[float], b: Sequence[float]) -> float:
    """Return the cosine similarity of two vectors of equal length.

    A zero-magnitude vector on either side gives ``0.0``.

    Raises:
        ValueError: If the two vectors differ in length.
    """
    if len(a) != len(b):
        raise ValueError("vectors have different lengths")

    magnitude_a = math.sqrt(sum(value * value for value in a))
    magnitude_b = math.sqrt(sum(value * value for value in b))
    # Checked separately (not as a product) so tiny magnitudes whose product
    # underflows are handled exactly as a plain division would.
    if magnitude_a == 0.0 or magnitude_b == 0.0:
        return 0.0

    inner = sum(left * right for left, right in zip(a, b))
    return inner / (magnitude_a * magnitude_b)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class VectorStore:
    """Persisted store of embedded text chunks.

    Embeddings are stored as JSON text in a SQLite ``chunks`` table and
    searched by scoring every row against the query vector. The store can be
    used as a context manager, which closes the connection on exit.
    """

    def __init__(self, path: str = ":memory:") -> None:
        """Open (or create) the database at ``path`` and ensure the schema."""
        self.conn = sqlite3.connect(path)
        try:
            with self.conn:
                self.conn.execute(_SCHEMA)
        except sqlite3.Error:
            # Do not leak the connection when the file is not a usable database.
            self.conn.close()
            raise

    def __enter__(self) -> "VectorStore":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def clear(self) -> None:
        """Delete every stored chunk."""
        with self.conn:
            self.conn.execute("DELETE FROM chunks")

    def add(self, rows: List[Dict[str, object]]) -> None:
        """Insert (or replace) chunk rows. Each row carries an ``embedding`` list.

        The batch is written in one transaction: if any row fails, the whole
        batch is rolled back instead of leaving a partial write pending.
        """
        params = [
            (
                r["id"],
                r["doc_id"],
                r["path"],
                r["start_line"],
                r["end_line"],
                r["text"],
                json.dumps(r["embedding"]),
            )
            for r in rows
        ]
        # The connection context manager commits on success, rolls back on error.
        with self.conn:
            self.conn.executemany(
                "INSERT OR REPLACE INTO chunks "
                "(id, doc_id, path, start_line, end_line, text, embedding) "
                "VALUES (?, ?, ?, ?, ?, ?, ?)",
                params,
            )

    def count(self) -> int:
        """Return the number of stored chunks."""
        return int(self.conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0])

    def search(self, query_vec: Sequence[float], k: int = 5) -> List[ScoredChunk]:
        """Return up to ``k`` chunks, highest cosine similarity first.

        A non-positive ``k`` returns an empty list. ``cosine`` raises
        ``ValueError`` when a stored embedding and ``query_vec`` differ in
        length.
        """
        if k <= 0:
            # Slicing with a negative bound would drop results from the end.
            return []
        cur = self.conn.execute(
            "SELECT id, doc_id, path, start_line, end_line, text, embedding FROM chunks"
        )
        scored: List[ScoredChunk] = []
        for row in cur:
            embedding = json.loads(row[6])
            score = cosine(query_vec, embedding)
            scored.append(
                (
                    score,
                    {
                        "id": row[0],
                        "doc_id": row[1],
                        "path": row[2],
                        "start_line": row[3],
                        "end_line": row[4],
                        "text": row[5],
                    },
                )
            )
        scored.sort(key=lambda item: item[0], reverse=True)
        return scored[:k]

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self.conn.close()
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""Offline tests for codeseek. No network, no real embeddings."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import tempfile
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
from codeseek import embeddings
|
|
10
|
+
from codeseek.chunking import chunk_text
|
|
11
|
+
from codeseek.cli import build_parser
|
|
12
|
+
from codeseek.index import build_index
|
|
13
|
+
from codeseek.mcp_server import MCPServer
|
|
14
|
+
from codeseek.search import format_results, search_index
|
|
15
|
+
from codeseek.sources import Document, FileSource
|
|
16
|
+
from codeseek.store import VectorStore, cosine
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# --- chunking -----------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_chunk_short_text_single_chunk():
    """Text shorter than ``max_lines`` is expected to come back as one chunk."""
    chunks = chunk_text("a\nb\nc", max_lines=60, overlap=10)
    assert len(chunks) == 1
    assert chunks[0] == (1, 3, "a\nb\nc")


def test_chunk_overlap_and_windows():
    """Windows of 10 lines with overlap 2 are expected to advance by 8 lines."""
    text = "\n".join(str(i) for i in range(1, 26))  # 25 lines
    chunks = chunk_text(text, max_lines=10, overlap=2)
    assert chunks[0][0] == 1 and chunks[0][1] == 10
    # step = 8, so second window starts at line 9
    assert chunks[1][0] == 9
    assert chunks[-1][1] == 25  # last window reaches the end


def test_chunk_empty():
    """Empty text is expected to produce no chunks."""
    assert chunk_text("", max_lines=10) == []
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# --- embeddings parsing -------------------------------------------------------
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def test_parse_embeddings_ok():
    """A ``{"data": [{"embedding": ...}]}`` payload parses to a list of vectors."""
    data = {"data": [{"embedding": [0.1, 0.2]}, {"embedding": [0.3, 0.4]}]}
    assert embeddings.parse_embeddings(data) == [[0.1, 0.2], [0.3, 0.4]]


def test_parse_embeddings_bad():
    """A payload without the expected shape raises ``EmbeddingError``."""
    with pytest.raises(embeddings.EmbeddingError):
        embeddings.parse_embeddings({"oops": 1})


def test_embed_empty_returns_empty():
    """Embedding an empty batch is expected to return an empty list."""
    assert embeddings.embed([], model="m", api_key="k") == []
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# --- store --------------------------------------------------------------------
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def test_cosine_basics():
    """Identical vectors score 1, orthogonal vectors 0, a zero vector 0."""
    assert cosine([1, 0], [1, 0]) == pytest.approx(1.0)
    assert cosine([1, 0], [0, 1]) == pytest.approx(0.0)
    assert cosine([0, 0], [1, 1]) == 0.0


def test_store_add_and_search():
    """Two stored chunks are counted and the nearest one is returned for k=1."""
    store = VectorStore(":memory:")
    store.add(
        [
            {"id": "a:1-1", "doc_id": "a", "path": "a.py", "start_line": 1,
             "end_line": 1, "text": "alpha", "embedding": [1.0, 0.0]},
            {"id": "b:1-1", "doc_id": "b", "path": "b.py", "start_line": 1,
             "end_line": 1, "text": "beta", "embedding": [0.0, 1.0]},
        ]
    )
    assert store.count() == 2
    # [0.9, 0.1] points mostly along the first axis, i.e. towards a.py.
    results = store.search([0.9, 0.1], k=1)
    assert len(results) == 1
    assert results[0][1]["path"] == "a.py"
    store.close()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# --- sources ------------------------------------------------------------------
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def test_file_source_walks_and_filters():
    """FileSource keeps ``.py`` files, skips ``.bin`` and excluded directories."""
    with tempfile.TemporaryDirectory() as root:
        with open(os.path.join(root, "keep.py"), "w", encoding="utf-8") as f:
            f.write("print('hi')\n")
        with open(os.path.join(root, "skip.bin"), "w", encoding="utf-8") as f:
            f.write("nope\n")
        os.makedirs(os.path.join(root, "node_modules"))
        with open(os.path.join(root, "node_modules", "x.py"), "w", encoding="utf-8") as f:
            f.write("should be excluded\n")

        # Consume the generator while the temporary tree still exists.
        docs = list(FileSource(root).documents())
        ids = {d.id for d in docs}
        assert "keep.py" in ids
        assert "skip.bin" not in ids
        assert "node_modules/x.py" not in ids
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# --- index (with a fake embedder) --------------------------------------------
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class _FakeSource:
    """Source stub that yields a single three-line document."""

    def documents(self):
        yield Document(id="f.py", text="line1\nline2\nline3", metadata={"path": "f.py"})


def _fake_embedder(texts):
    """Offline embedder: one two-dimensional vector per text, no network."""
    # deterministic vector: [len, number of lines]
    return [[float(len(t)), float(t.count("\n") + 1)] for t in texts]


def test_build_index_counts_chunks():
    """``build_index`` returns the number of chunks it stored."""
    store = VectorStore(":memory:")
    count = build_index(_FakeSource(), store, _fake_embedder, max_lines=2, overlap=0)
    assert count == store.count()
    assert count >= 2  # 3 lines, 2 per chunk -> at least 2 chunks
    store.close()


def test_search_index_roundtrip():
    """A query over a freshly built index returns a chunk from the source file."""
    store = VectorStore(":memory:")
    build_index(_FakeSource(), store, _fake_embedder, max_lines=2, overlap=0)
    results = search_index(store, _fake_embedder, "line1\nline2", k=1)
    assert len(results) == 1
    assert "f.py" in results[0][1]["path"]
    store.close()


def test_search_empty_query():
    """A whitespace-only query returns no results."""
    store = VectorStore(":memory:")
    assert search_index(store, _fake_embedder, "   ", k=3) == []
    store.close()


def test_format_results_empty():
    """An empty result list renders as ``"No matches."``."""
    assert format_results([]) == "No matches."
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# --- mcp server ---------------------------------------------------------------
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _server():
    """Build an MCPServer whose backend always returns one fixed hit."""
    def search_fn(query, k):
        return [(0.99, {"path": "x.py", "start_line": 1, "end_line": 2, "text": "code"})]

    return MCPServer(search_fn)


def test_mcp_initialize():
    """``initialize`` reports a protocol version and the server name."""
    resp = _server().handle({"jsonrpc": "2.0", "id": 1, "method": "initialize"})
    assert resp["result"]["protocolVersion"]
    assert resp["result"]["serverInfo"]["name"] == "codeseek"


def test_mcp_tools_list():
    """``tools/list`` exposes exactly the ``search_code`` tool."""
    resp = _server().handle({"jsonrpc": "2.0", "id": 2, "method": "tools/list"})
    names = [t["name"] for t in resp["result"]["tools"]]
    assert names == ["search_code"]


def test_mcp_tools_call():
    """``tools/call`` returns the formatted hit as text content."""
    resp = _server().handle(
        {
            "jsonrpc": "2.0",
            "id": 3,
            "method": "tools/call",
            "params": {"name": "search_code", "arguments": {"query": "code", "k": 1}},
        }
    )
    assert "x.py" in resp["result"]["content"][0]["text"]


def test_mcp_unknown_tool():
    """Calling an unknown tool yields error code -32602."""
    resp = _server().handle(
        {"jsonrpc": "2.0", "id": 4, "method": "tools/call", "params": {"name": "nope"}}
    )
    assert resp["error"]["code"] == -32602


def test_mcp_notification_returns_none():
    """Notifications produce no response."""
    assert _server().handle({"jsonrpc": "2.0", "method": "notifications/initialized"}) is None


def test_mcp_unknown_method():
    """An unknown method yields error code -32601."""
    resp = _server().handle({"jsonrpc": "2.0", "id": 5, "method": "bogus"})
    assert resp["error"]["code"] == -32601
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
# --- cli parser ---------------------------------------------------------------
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def test_parser_requires_subcommand():
    """Parsing with no arguments exits instead of returning a namespace."""
    with pytest.raises(SystemExit):
        build_parser().parse_args([])


def test_parser_index_defaults():
    """``index <path>`` sets the command and the positional path."""
    args = build_parser().parse_args(["index", "src"])
    assert args.command == "index"
    assert args.path == "src"


def test_parser_search():
    """``search <query> -k 3`` parses ``k`` as the integer 3."""
    args = build_parser().parse_args(["search", "how does auth work", "-k", "3"])
    assert args.command == "search"
    assert args.k == 3
|