codeseek 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ .venv/
9
+ venv/
10
+ .env
11
+
12
+ # codeseek index databases
13
+ *.codeseek.db
14
+ .codeseek.db
15
+
16
+ # Tooling
17
+ .pytest_cache/
18
+ .ruff_cache/
19
+ .mypy_cache/
20
+ .coverage
21
+ htmlcov/
22
+
23
+ # OS
24
+ .DS_Store
25
+ Thumbs.db
26
+ desktop.ini
@@ -0,0 +1,13 @@
1
+ # Changelog
2
+
3
+ ## v0.1.0
4
+
5
+ ### Features
6
+ - index a directory of source files into a local SQLite vector store
7
+ - semantic search over the index from the CLI
8
+ - run as an MCP server (stdio) exposing a `search_code` tool to any MCP client
9
+ - provider-agnostic embeddings: OpenAI, or a local server via `--base-url`
10
+ - pluggable document sources, so the same engine can index more than code
11
+ - zero runtime dependencies (standard library only)
12
+
13
+ Initial release.
codeseek-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Seven Of Nine
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,155 @@
1
+ Metadata-Version: 2.4
2
+ Name: codeseek
3
+ Version: 0.1.0
4
+ Summary: Semantic code search for your repo, as a CLI and an MCP server. Bring any OpenAI-compatible embedding model. Zero dependencies.
5
+ Project-URL: Homepage, https://github.com/Sev7nOfNine/codeseek
6
+ Project-URL: Repository, https://github.com/Sev7nOfNine/codeseek
7
+ Project-URL: Issues, https://github.com/Sev7nOfNine/codeseek/issues
8
+ Author: Seven Of Nine
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: cli,code-search,developer-tools,embeddings,llm,mcp,model-context-protocol,rag,semantic-search
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3 :: Only
19
+ Classifier: Topic :: Software Development
20
+ Classifier: Topic :: Software Development :: Libraries
21
+ Classifier: Topic :: Utilities
22
+ Requires-Python: >=3.8
23
+ Provides-Extra: test
24
+ Requires-Dist: pytest>=7; extra == 'test'
25
+ Description-Content-Type: text/markdown
26
+
27
+ # codeseek
28
+
29
+ [![PyPI](https://img.shields.io/pypi/v/codeseek.svg)](https://pypi.org/project/codeseek/)
30
+ [![CI](https://github.com/Sev7nOfNine/codeseek/actions/workflows/ci.yml/badge.svg)](https://github.com/Sev7nOfNine/codeseek/actions/workflows/ci.yml)
31
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
32
+ [![Python](https://img.shields.io/pypi/pyversions/codeseek.svg)](https://pypi.org/project/codeseek/)
33
+
34
+ **Semantic search over your codebase — as a CLI and an MCP server. Zero dependencies.**
35
+
36
+ `codeseek` indexes a repository into a local vector store and lets you search it
37
+ by meaning, not just by string match. Use it from the terminal, or run it as an
38
+ [MCP](https://modelcontextprotocol.io) server so an AI coding assistant or editor
39
+ can ask your codebase questions directly.
40
+
41
+ It brings no embedding model of its own: point it at the OpenAI API, or at any
42
+ OpenAI-compatible endpoint such as a local `llama.cpp` server, so private code
43
+ can be embedded without leaving your machine. Storage is plain SQLite; search is
44
+ brute-force cosine. The whole thing is the Python standard library and nothing
45
+ else.
46
+
47
+ ## Install
48
+
49
+ ```bash
50
+ pip install codeseek
51
+ # or:
52
+ pipx install codeseek
53
+ ```
54
+
55
+ Requires Python 3.8+.
56
+
57
+ ## Quick start
58
+
59
+ ```bash
60
+ export OPENAI_API_KEY=sk-...
61
+
62
+ # 1. Index the current repository
63
+ codeseek index .
64
+
65
+ # 2. Search it
66
+ codeseek search "where do we validate the auth token?"
67
+ codeseek search "retry with backoff" -k 3
68
+ ```
69
+
70
+ Results come back as markdown, each with a file path, line range, and similarity
71
+ score:
72
+
73
+ ```markdown
74
+ ### src/auth/token.py:40-70 (score 0.812)
75
+ ​```
76
+ def verify_token(raw: str) -> Claims:
77
+ ...
78
+ ​```
79
+ ```
80
+
81
+ ### Use a local or alternative provider
82
+
83
+ ```bash
84
+ codeseek index . --base-url http://localhost:8080/v1 --model nomic-embed-text
85
+ ```
86
+
87
+ ## As an MCP server
88
+
89
+ `codeseek serve` speaks MCP over stdio and exposes one tool, `search_code`.
90
+
91
+ After indexing a repo, register it with your MCP client. A typical `mcpServers`
92
+ configuration looks like this:
93
+
94
+ ```json
95
+ {
96
+ "mcpServers": {
97
+ "codeseek": {
98
+ "command": "codeseek",
99
+ "args": ["serve", "--db", "/path/to/your/repo/.codeseek.db"],
100
+ "env": { "OPENAI_API_KEY": "sk-..." }
101
+ }
102
+ }
103
+ }
104
+ ```
105
+
106
+ The assistant can then call `search_code` to pull relevant code into its context
107
+ on demand, instead of you pasting files by hand.
108
+
109
+ ## Commands
110
+
111
+ | Command | What it does |
112
+ | --- | --- |
113
+ | `codeseek index [PATH]` | Index a directory (default `.`) into `--db`. |
114
+ | `codeseek search QUERY` | Search the index; `-k` sets result count. |
115
+ | `codeseek serve` | Run the MCP server over stdio. |
116
+
117
+ Shared options: `--db` (default `.codeseek.db`), `--model` (`$CODESEEK_MODEL`),
118
+ `--base-url` (`$OPENAI_BASE_URL`), and `--api-key` (`$OPENAI_API_KEY`).
119
+
120
+ ## How it works
121
+
122
+ 1. **Source** — files are walked and read (sensible code/text extensions, common
123
+ build and vendor directories skipped).
124
+ 2. **Chunking** — each file is split into overlapping line windows.
125
+ 3. **Embedding** — chunks are embedded in batches via your provider.
126
+ 4. **Storage** — vectors land in a local SQLite database.
127
+ 5. **Search** — your query is embedded and compared against every chunk by cosine
128
+ similarity; the top matches are returned.
129
+
130
+ The document source is pluggable: the engine only consumes `Document` objects, so
131
+ the same indexing and search machinery can be pointed at things other than code.
132
+
133
+ ## Privacy note
134
+
135
+ Indexing sends file contents to whichever embeddings provider you configure. For
136
+ private code, prefer a self-hosted model via `--base-url`.
137
+
138
+ ## Scope
139
+
140
+ Search is a linear scan, which is plenty fast for a single repository (a few
141
+ thousand chunks). Indexing very large monorepos would want a real approximate
142
+ vector index — a natural next step, not today's goal.
143
+
144
+ ## Development
145
+
146
+ ```bash
147
+ pip install -e ".[test]"
148
+ python -m pytest
149
+ ```
150
+
151
+ All tests run offline; the embedding and HTTP layers accept injectable fakes.
152
+
153
+ ## License
154
+
155
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,129 @@
1
+ # codeseek
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/codeseek.svg)](https://pypi.org/project/codeseek/)
4
+ [![CI](https://github.com/Sev7nOfNine/codeseek/actions/workflows/ci.yml/badge.svg)](https://github.com/Sev7nOfNine/codeseek/actions/workflows/ci.yml)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
6
+ [![Python](https://img.shields.io/pypi/pyversions/codeseek.svg)](https://pypi.org/project/codeseek/)
7
+
8
+ **Semantic search over your codebase — as a CLI and an MCP server. Zero dependencies.**
9
+
10
+ `codeseek` indexes a repository into a local vector store and lets you search it
11
+ by meaning, not just by string match. Use it from the terminal, or run it as an
12
+ [MCP](https://modelcontextprotocol.io) server so an AI coding assistant or editor
13
+ can ask your codebase questions directly.
14
+
15
+ It brings no embedding model of its own: point it at the OpenAI API, or at any
16
+ OpenAI-compatible endpoint such as a local `llama.cpp` server, so private code
17
+ can be embedded without leaving your machine. Storage is plain SQLite; search is
18
+ brute-force cosine. The whole thing is the Python standard library and nothing
19
+ else.
20
+
21
+ ## Install
22
+
23
+ ```bash
24
+ pip install codeseek
25
+ # or:
26
+ pipx install codeseek
27
+ ```
28
+
29
+ Requires Python 3.8+.
30
+
31
+ ## Quick start
32
+
33
+ ```bash
34
+ export OPENAI_API_KEY=sk-...
35
+
36
+ # 1. Index the current repository
37
+ codeseek index .
38
+
39
+ # 2. Search it
40
+ codeseek search "where do we validate the auth token?"
41
+ codeseek search "retry with backoff" -k 3
42
+ ```
43
+
44
+ Results come back as markdown, each with a file path, line range, and similarity
45
+ score:
46
+
47
+ ```markdown
48
+ ### src/auth/token.py:40-70 (score 0.812)
49
+ ​```
50
+ def verify_token(raw: str) -> Claims:
51
+ ...
52
+ ​```
53
+ ```
54
+
55
+ ### Use a local or alternative provider
56
+
57
+ ```bash
58
+ codeseek index . --base-url http://localhost:8080/v1 --model nomic-embed-text
59
+ ```
60
+
61
+ ## As an MCP server
62
+
63
+ `codeseek serve` speaks MCP over stdio and exposes one tool, `search_code`.
64
+
65
+ After indexing a repo, register it with your MCP client. A typical `mcpServers`
66
+ configuration looks like this:
67
+
68
+ ```json
69
+ {
70
+ "mcpServers": {
71
+ "codeseek": {
72
+ "command": "codeseek",
73
+ "args": ["serve", "--db", "/path/to/your/repo/.codeseek.db"],
74
+ "env": { "OPENAI_API_KEY": "sk-..." }
75
+ }
76
+ }
77
+ }
78
+ ```
79
+
80
+ The assistant can then call `search_code` to pull relevant code into its context
81
+ on demand, instead of you pasting files by hand.
82
+
83
+ ## Commands
84
+
85
+ | Command | What it does |
86
+ | --- | --- |
87
+ | `codeseek index [PATH]` | Index a directory (default `.`) into `--db`. |
88
+ | `codeseek search QUERY` | Search the index; `-k` sets result count. |
89
+ | `codeseek serve` | Run the MCP server over stdio. |
90
+
91
+ Shared options: `--db` (default `.codeseek.db`), `--model` (`$CODESEEK_MODEL`),
92
+ `--base-url` (`$OPENAI_BASE_URL`), and `--api-key` (`$OPENAI_API_KEY`).
93
+
94
+ ## How it works
95
+
96
+ 1. **Source** — files are walked and read (sensible code/text extensions, common
97
+ build and vendor directories skipped).
98
+ 2. **Chunking** — each file is split into overlapping line windows.
99
+ 3. **Embedding** — chunks are embedded in batches via your provider.
100
+ 4. **Storage** — vectors land in a local SQLite database.
101
+ 5. **Search** — your query is embedded and compared against every chunk by cosine
102
+ similarity; the top matches are returned.
103
+
104
+ The document source is pluggable: the engine only consumes `Document` objects, so
105
+ the same indexing and search machinery can be pointed at things other than code.
106
+
107
+ ## Privacy note
108
+
109
+ Indexing sends file contents to whichever embeddings provider you configure. For
110
+ private code, prefer a self-hosted model via `--base-url`.
111
+
112
+ ## Scope
113
+
114
+ Search is a linear scan, which is plenty fast for a single repository (a few
115
+ thousand chunks). Indexing very large monorepos would want a real approximate
116
+ vector index — a natural next step, not today's goal.
117
+
118
+ ## Development
119
+
120
+ ```bash
121
+ pip install -e ".[test]"
122
+ python -m pytest
123
+ ```
124
+
125
+ All tests run offline; the embedding and HTTP layers accept injectable fakes.
126
+
127
+ ## License
128
+
129
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,52 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "codeseek"
7
+ version = "0.1.0"
8
+ description = "Semantic code search for your repo, as a CLI and an MCP server. Bring any OpenAI-compatible embedding model. Zero dependencies."
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Seven Of Nine" }]
13
+ keywords = [
14
+ "semantic-search",
15
+ "code-search",
16
+ "embeddings",
17
+ "rag",
18
+ "mcp",
19
+ "model-context-protocol",
20
+ "llm",
21
+ "developer-tools",
22
+ "cli",
23
+ ]
24
+ classifiers = [
25
+ "Development Status :: 4 - Beta",
26
+ "Environment :: Console",
27
+ "Intended Audience :: Developers",
28
+ "License :: OSI Approved :: MIT License",
29
+ "Operating System :: OS Independent",
30
+ "Programming Language :: Python :: 3",
31
+ "Programming Language :: Python :: 3 :: Only",
32
+ "Topic :: Software Development",
33
+ "Topic :: Software Development :: Libraries",
34
+ "Topic :: Utilities",
35
+ ]
36
+
37
+ [project.optional-dependencies]
38
+ test = ["pytest>=7"]
39
+
40
+ [project.urls]
41
+ Homepage = "https://github.com/Sev7nOfNine/codeseek"
42
+ Repository = "https://github.com/Sev7nOfNine/codeseek"
43
+ Issues = "https://github.com/Sev7nOfNine/codeseek/issues"
44
+
45
+ [project.scripts]
46
+ codeseek = "codeseek.cli:main"
47
+
48
+ [tool.hatch.build.targets.wheel]
49
+ packages = ["src/codeseek"]
50
+
51
+ [tool.hatch.build.targets.sdist]
52
+ include = ["src/codeseek", "README.md", "LICENSE", "CHANGELOG.md", "tests"]
@@ -0,0 +1,5 @@
1
+ """codeseek — semantic code search as a CLI and an MCP server."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ __all__ = ["__version__"]
@@ -0,0 +1,4 @@
1
+ from .cli import main
2
+
3
+ if __name__ == "__main__":
4
+ raise SystemExit(main())
@@ -0,0 +1,34 @@
1
+ """Split text into overlapping line windows for embedding."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import List, Tuple
6
+
7
# (start_line, end_line, text), with 1-based inclusive line numbers.
Chunk = Tuple[int, int, str]


def chunk_text(text: str, *, max_lines: int = 60, overlap: int = 10) -> List[Chunk]:
    """Break ``text`` into overlapping windows of at most ``max_lines`` lines.

    Args:
        text: The document to split; lines are found with ``str.splitlines``.
        max_lines: Maximum number of lines per window. Must be positive.
        overlap: Number of lines shared by consecutive windows. Values that
            are too large (``>= max_lines``) fall back to ``max_lines // 2``;
            negative values are treated as ``0``.

    Returns:
        A list of ``(start_line, end_line, text)`` tuples with 1-based,
        inclusive line numbers. Empty input yields an empty list.

    Raises:
        ValueError: If ``max_lines`` is not positive.
    """
    if max_lines <= 0:
        raise ValueError("max_lines must be positive")
    if overlap >= max_lines:
        overlap = max_lines // 2
    # A negative overlap would make the step larger than the window, so the
    # lines between two windows would silently never be indexed.
    overlap = max(overlap, 0)

    lines = text.splitlines()
    total = len(lines)
    step = max_lines - overlap  # always >= 1 after the clamping above

    chunks: List[Chunk] = []
    for first in range(0, total, step):
        last = min(first + max_lines, total)
        chunks.append((first + 1, last, "\n".join(lines[first:last])))
        if last >= total:
            # This window already reaches the end; a further window would
            # only repeat the tail.
            break
    return chunks
@@ -0,0 +1,152 @@
1
+ """Command-line interface for codeseek."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import os
7
+ import sys
8
+ from typing import Callable, List, Optional
9
+
10
+ from . import __version__
11
+ from .embeddings import EmbeddingError, embed
12
+ from .index import build_index
13
+ from .search import format_results, search_index
14
+ from .sources import FileSource
15
+ from .store import VectorStore
16
+
17
+ DEFAULT_DB = ".codeseek.db"
18
+ DEFAULT_MODEL = "text-embedding-3-small"
19
+
20
+ Embedder = Callable[[List[str]], List[List[float]]]
21
+
22
+
23
def _add_provider_args(sub: argparse.ArgumentParser) -> None:
    """Register the provider and database options shared by every subcommand.

    Defaults for ``--model``, ``--base-url`` and ``--api-key`` are read from
    the environment at the time this function runs; ``--db`` defaults to
    ``DEFAULT_DB``.
    """
    env = os.environ
    # (flag, default, metavar, help) in the order they appear in ``--help``.
    shared_options = (
        (
            "--model",
            env.get("CODESEEK_MODEL", DEFAULT_MODEL),
            "NAME",
            "embedding model (default: {0}, or $CODESEEK_MODEL).".format(DEFAULT_MODEL),
        ),
        (
            "--base-url",
            env.get("OPENAI_BASE_URL", "https://api.openai.com/v1"),
            "URL",
            "OpenAI-compatible base URL (default: $OPENAI_BASE_URL or OpenAI).",
        ),
        (
            "--api-key",
            env.get("OPENAI_API_KEY"),
            "KEY",
            "embeddings API key (default: $OPENAI_API_KEY).",
        ),
        (
            "--db",
            DEFAULT_DB,
            "FILE",
            "index database path (default: {0}).".format(DEFAULT_DB),
        ),
    )
    for flag, default, metavar, help_text in shared_options:
        sub.add_argument(flag, default=default, metavar=metavar, help=help_text)
+
49
+
50
def build_parser() -> argparse.ArgumentParser:
    """Create the ``codeseek`` argument parser with its three subcommands.

    The subcommands are ``index``, ``search`` and ``serve``; each also
    receives the shared provider/database options.
    """
    root = argparse.ArgumentParser(
        prog="codeseek",
        description="Semantic code search as a CLI and an MCP server.",
    )
    version_text = "%(prog)s {0}".format(__version__)
    root.add_argument("--version", action="version", version=version_text)
    commands = root.add_subparsers(dest="command", required=True)

    # index: positional path plus chunking knobs.
    index_cmd = commands.add_parser("index", help="index a directory of code.")
    index_cmd.add_argument(
        "path", nargs="?", default=".", help="directory to index (default: .)."
    )
    chunk_options = (
        ("--max-lines", 60, "lines per chunk (default: 60)."),
        ("--overlap", 10, "overlapping lines (default: 10)."),
    )
    for flag, default, help_text in chunk_options:
        index_cmd.add_argument(flag, type=int, default=default, help=help_text)
    _add_provider_args(index_cmd)

    # search: the query string and the result count.
    search_cmd = commands.add_parser("search", help="search the index.")
    search_cmd.add_argument("query", help="what to look for.")
    search_cmd.add_argument(
        "-k", type=int, default=5, help="number of results (default: 5)."
    )
    _add_provider_args(search_cmd)

    # serve: only the shared options.
    serve_cmd = commands.add_parser("serve", help="run the MCP server over stdio.")
    _add_provider_args(serve_cmd)

    return root
83
+
84
+
85
+ def _make_embedder(args: argparse.Namespace) -> Embedder:
86
+ def embedder(texts: List[str]) -> List[List[float]]:
87
+ return embed(
88
+ texts, model=args.model, api_key=args.api_key, base_url=args.base_url
89
+ )
90
+
91
+ return embedder
92
+
93
+
94
def main(argv: Optional[List[str]] = None) -> int:
    """Run the ``codeseek`` command line.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``0`` on success. Failures leave through ``parser.exit`` /
        ``parser.error`` (``SystemExit``): status 2 for a missing API key,
        status 1 for an embedding or indexing/search value error.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    if not args.api_key:
        parser.exit(2, "codeseek: no API key (set $OPENAI_API_KEY or --api-key)\n")

    embedder = _make_embedder(args)

    if args.command == "index":
        source = FileSource(args.path)
        store = VectorStore(args.db)
        try:
            count = build_index(
                source,
                store,
                embedder,
                max_lines=args.max_lines,
                overlap=args.overlap,
                progress=lambda doc_id: sys.stderr.write(
                    "  indexed {0}\n".format(doc_id)
                ),
            )
        except (EmbeddingError, ValueError) as exc:
            # ValueError covers invalid chunking options such as
            # ``--max-lines 0``; report it like any other failure instead of
            # dumping a traceback.
            parser.exit(1, "codeseek: {0}\n".format(exc))
        finally:
            store.close()
        sys.stderr.write("codeseek: indexed {0} chunks into {1}\n".format(count, args.db))
        return 0

    if args.command == "search":
        store = VectorStore(args.db)
        try:
            results = search_index(store, embedder, args.query, k=args.k)
        except (EmbeddingError, ValueError) as exc:
            # NOTE(review): ValueError is presumably what the store raises
            # when the query vector and stored vectors differ in length
            # (index built with another model) - confirm against store.search.
            parser.exit(1, "codeseek: {0}\n".format(exc))
        finally:
            store.close()
        sys.stdout.write(format_results(results) + "\n")
        return 0

    if args.command == "serve":
        from .mcp_server import MCPServer

        store = VectorStore(args.db)

        def search_fn(query: str, k: int):
            return search_index(store, embedder, query, k=k)

        try:
            MCPServer(search_fn, server_version=__version__).serve()
        finally:
            # Close the database even when the serve loop is interrupted or
            # fails (Ctrl-C, broken pipe, ...).
            store.close()
        return 0

    parser.error("unknown command")
    return 2
149
+
150
+
151
+ if __name__ == "__main__":
152
+ raise SystemExit(main())
@@ -0,0 +1,65 @@
1
+ """Embeddings client for OpenAI-compatible APIs, using only the stdlib.
2
+
3
+ Works with the OpenAI platform, or any compatible ``/embeddings`` endpoint such
4
+ as a local llama.cpp server, so private code can be embedded without leaving the
5
+ machine.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import urllib.error
12
+ import urllib.request
13
+ from typing import Any, Callable, Dict, List, Optional
14
+
15
# Anything callable like ``urllib.request.urlopen(req, timeout=...)`` that
# returns a context manager with a ``read()`` method; injectable for tests.
Opener = Callable[..., Any]


class EmbeddingError(RuntimeError):
    """Raised when the embeddings request fails or returns an unexpected shape."""


def parse_embeddings(data: Dict[str, Any]) -> List[List[float]]:
    """Pull the list of embedding vectors out of an embeddings response.

    Args:
        data: The decoded JSON body; expected to hold a ``"data"`` list whose
            items each carry an ``"embedding"``.

    Returns:
        The embedding vectors. When every item carries an ``"index"`` field
        the vectors are returned in index order, so they line up with the
        request inputs even if the server replied out of order.

    Raises:
        EmbeddingError: If the response does not have the expected shape.
    """
    try:
        items = data["data"]
        if all(isinstance(item, dict) and "index" in item for item in items):
            items = sorted(items, key=lambda item: item["index"])
        return [item["embedding"] for item in items]
    except (KeyError, TypeError) as exc:
        raise EmbeddingError(
            "unexpected embeddings response: {0}".format(json.dumps(data)[:300])
        ) from exc


def embed(
    texts: List[str],
    *,
    model: str,
    api_key: str,
    base_url: str = "https://api.openai.com/v1",
    timeout: int = 120,
    opener: Optional[Opener] = None,
) -> List[List[float]]:
    """Embed a batch of texts and return one vector per input.

    Args:
        texts: Inputs to embed. An empty list returns ``[]`` without making a
            request.
        model: Embedding model name sent in the request body.
        api_key: Sent as a ``Bearer`` token in the ``Authorization`` header.
        base_url: API root; ``/embeddings`` is appended to it.
        timeout: Passed to the opener as ``timeout``.
        opener: Replacement for ``urllib.request.urlopen`` (used by tests).

    Returns:
        One embedding vector per element of ``texts``.

    Raises:
        EmbeddingError: On HTTP or network failure, a body that is not valid
            UTF-8 JSON, an unexpected response shape, or a vector count that
            differs from the number of inputs.
    """
    if not texts:
        return []

    url = base_url.rstrip("/") + "/embeddings"
    payload = json.dumps({"model": model, "input": texts}).encode("utf-8")

    req = urllib.request.Request(url, data=payload, method="POST")
    req.add_header("Content-Type", "application/json")
    req.add_header("Authorization", "Bearer " + api_key)

    do_open = opener or urllib.request.urlopen
    try:
        with do_open(req, timeout=timeout) as resp:
            raw = resp.read().decode("utf-8")
    except urllib.error.HTTPError as exc:
        body = exc.read().decode("utf-8", "replace")
        raise EmbeddingError(
            "embeddings request failed (HTTP {0}): {1}".format(exc.code, body[:500])
        ) from exc
    except urllib.error.URLError as exc:
        raise EmbeddingError(
            "embeddings request failed: {0}".format(exc.reason)
        ) from exc
    except (OSError, UnicodeDecodeError) as exc:
        # Socket timeouts/resets while reading, or a body that is not UTF-8,
        # are not URLError instances but are still request failures.
        raise EmbeddingError("embeddings request failed: {0}".format(exc)) from exc

    try:
        data = json.loads(raw)
    except ValueError as exc:
        # E.g. an HTML error page returned by a proxy with status 200.
        raise EmbeddingError(
            "embeddings response was not valid JSON: {0}".format(raw[:300])
        ) from exc

    vectors = parse_embeddings(data)
    if len(vectors) != len(texts):
        # Callers pair vectors with inputs positionally; a short response
        # would otherwise silently drop inputs.
        raise EmbeddingError(
            "embeddings response has {0} vectors for {1} inputs".format(
                len(vectors), len(texts)
            )
        )
    return vectors
@@ -0,0 +1,71 @@
1
+ """Build a search index: source -> chunks -> embeddings -> store."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Callable, List, Optional
6
+
7
+ from .chunking import chunk_text
8
+ from .sources import Document
9
+ from .store import VectorStore
10
+
11
+ # An embedder turns a batch of texts into a batch of vectors.
12
+ Embedder = Callable[[List[str]], List[List[float]]]
13
+ # Anything iterable of Documents (FileSource, or a future notes source).
14
+ Source = "object with a documents() iterator"
15
+
16
+
17
def build_index(
    source,
    store: VectorStore,
    embedder: Embedder,
    *,
    batch_size: int = 64,
    max_lines: int = 60,
    overlap: int = 10,
    progress: Optional[Callable[[str], None]] = None,
) -> int:
    """Index every document from ``source`` into ``store``.

    The store is cleared first, then each document is chunked, the chunks are
    embedded in batches and the resulting rows are added to the store.

    Args:
        source: Any object with a ``documents()`` method yielding documents
            that expose ``id``, ``text`` and ``metadata``.
        store: Destination store; ``clear()`` and ``add(rows)`` are called.
        embedder: Turns a list of texts into a list of vectors.
        batch_size: Number of chunks embedded per ``embedder`` call.
        max_lines: Forwarded to ``chunk_text``.
        overlap: Forwarded to ``chunk_text``.
        progress: Optional callback invoked with each document id once its
            chunks have been queued.

    Returns:
        The number of chunks written to the store.

    Raises:
        ValueError: If the embedder returns a different number of vectors
            than the number of texts it was given.
    """
    store.clear()

    pending_rows: List[dict] = []
    total = 0

    def flush() -> None:
        """Embed the queued chunks and write them to the store."""
        nonlocal total
        if not pending_rows:
            return
        texts = [row["text"] for row in pending_rows]
        vectors = list(embedder(texts))
        if len(vectors) != len(pending_rows):
            # zip() would silently truncate and drop chunks from the index.
            raise ValueError(
                "embedder returned {0} vectors for {1} chunks".format(
                    len(vectors), len(pending_rows)
                )
            )
        store.add(
            [dict(row, embedding=vec) for row, vec in zip(pending_rows, vectors)]
        )
        total += len(pending_rows)
        pending_rows.clear()

    for doc in source.documents():  # type: Document
        path = doc.metadata.get("path", doc.id)
        for start, end, text in chunk_text(
            doc.text, max_lines=max_lines, overlap=overlap
        ):
            pending_rows.append(
                {
                    "id": "{0}:{1}-{2}".format(doc.id, start, end),
                    "doc_id": doc.id,
                    "path": path,
                    "start_line": start,
                    "end_line": end,
                    "text": text,
                }
            )
            if len(pending_rows) >= batch_size:
                flush()
        if progress is not None:
            progress(doc.id)

    flush()  # whatever is left from the last, partial batch
    return total
@@ -0,0 +1,128 @@
1
+ """A minimal Model Context Protocol (MCP) server over stdio.
2
+
3
+ Implements just enough of the protocol — ``initialize``, ``tools/list`` and
4
+ ``tools/call`` exchanged as newline-delimited JSON-RPC 2.0 — to expose a single
5
+ ``search_code`` tool to any MCP client, with no third-party dependencies.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import sys
12
+ from typing import Any, Callable, Dict, List, Optional
13
+
14
+ from .search import format_results
15
+ from .store import ScoredChunk
16
+
17
+ PROTOCOL_VERSION = "2024-11-05"
18
+
19
+ SEARCH_TOOL: Dict[str, Any] = {
20
+ "name": "search_code",
21
+ "description": (
22
+ "Semantic search over the indexed codebase. Returns the most relevant "
23
+ "code chunks with their file path and line range."
24
+ ),
25
+ "inputSchema": {
26
+ "type": "object",
27
+ "properties": {
28
+ "query": {
29
+ "type": "string",
30
+ "description": "A natural-language or code query.",
31
+ },
32
+ "k": {
33
+ "type": "integer",
34
+ "description": "How many results to return (default 5).",
35
+ },
36
+ },
37
+ "required": ["query"],
38
+ },
39
+ }
40
+
41
+ # A search function takes (query, k) and returns scored chunks.
42
+ SearchFn = Callable[[str, int], List[ScoredChunk]]
43
+
44
+
45
class MCPServer:
    """Dispatch MCP JSON-RPC requests to a search backend.

    Supported methods are ``initialize``, ``ping``, ``tools/list`` and
    ``tools/call`` (for the single ``search_code`` tool). Messages whose
    method starts with ``notifications/`` are accepted and ignored.
    """

    def __init__(
        self,
        search_fn: SearchFn,
        *,
        server_name: str = "codeseek",
        server_version: str = "0.1.0",
    ) -> None:
        """Store the search callback and the identity reported by ``initialize``."""
        self.search_fn = search_fn
        self.server_name = server_name
        self.server_version = server_version

    def _error(self, req_id: Any, code: int, message: str) -> Dict[str, Any]:
        """Build a JSON-RPC error response."""
        return {
            "jsonrpc": "2.0",
            "id": req_id,
            "error": {"code": code, "message": message},
        }

    def _call_tool(self, req_id: Any, params: Any) -> Dict[str, Any]:
        """Run a ``tools/call`` request and build its response."""
        # Malformed (non-object) params are treated like missing params
        # rather than crashing the server loop.
        params = params if isinstance(params, dict) else {}
        if params.get("name") != "search_code":
            return self._error(
                req_id, -32602, "unknown tool: {0}".format(params.get("name"))
            )
        args = params.get("arguments")
        args = args if isinstance(args, dict) else {}
        query = args.get("query", "")
        try:
            k = int(args.get("k", 5) or 5)
        except (TypeError, ValueError, OverflowError):
            k = 5
        k = max(k, 1)  # a negative count makes no sense for a top-k search
        try:
            results = self.search_fn(query, k)
            payload: Dict[str, Any] = {
                "content": [{"type": "text", "text": format_results(results)}]
            }
        except Exception as exc:  # surface errors to the client, do not crash
            payload = {
                "content": [{"type": "text", "text": "error: {0}".format(exc)}],
                "isError": True,
            }
        return {"jsonrpc": "2.0", "id": req_id, "result": payload}

    def handle(self, request: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Handle one request. Returns a response, or None for notifications."""
        method = request.get("method")
        req_id = request.get("id")

        if isinstance(method, str) and method.startswith("notifications/"):
            return None

        if method == "initialize":
            result: Dict[str, Any] = {
                "protocolVersion": PROTOCOL_VERSION,
                "capabilities": {"tools": {}},
                "serverInfo": {
                    "name": self.server_name,
                    "version": self.server_version,
                },
            }
        elif method == "ping":
            # Liveness check: reply with an empty result.
            result = {}
        elif method == "tools/list":
            result = {"tools": [SEARCH_TOOL]}
        elif method == "tools/call":
            return self._call_tool(req_id, request.get("params"))
        else:
            if "id" not in request:
                # A message without an id is a JSON-RPC notification, and
                # notifications never get a reply, not even an error.
                return None
            return self._error(req_id, -32601, "method not found: {0}".format(method))

        return {"jsonrpc": "2.0", "id": req_id, "result": result}

    def serve(self, stdin: Any = None, stdout: Any = None) -> None:
        """Read newline-delimited JSON-RPC from stdin, write responses to stdout.

        Blank lines are skipped. A line that is not valid JSON is answered
        with a ``-32700`` error and a JSON value that is not an object with a
        ``-32600`` error, both with a null id.
        """
        stdin = stdin if stdin is not None else sys.stdin
        stdout = stdout if stdout is not None else sys.stdout
        for line in stdin:
            line = line.strip()
            if not line:
                continue
            try:
                request = json.loads(line)
            except json.JSONDecodeError:
                response: Optional[Dict[str, Any]] = self._error(
                    None, -32700, "parse error"
                )
            else:
                if isinstance(request, dict):
                    response = self.handle(request)
                else:
                    # Batches and bare scalars are not supported; handle()
                    # would crash on anything that is not a dict.
                    response = self._error(None, -32600, "invalid request")
            if response is not None:
                stdout.write(json.dumps(response) + "\n")
                stdout.flush()
@@ -0,0 +1,33 @@
1
+ """Query the index: embed the query, return the most similar chunks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Callable, List
6
+
7
+ from .store import ScoredChunk, VectorStore
8
+
9
+ Embedder = Callable[[List[str]], List[List[float]]]
10
+
11
+
12
def search_index(
    store: VectorStore, embedder: Embedder, query: str, k: int = 5
) -> List[ScoredChunk]:
    """Embed ``query`` and return the store's top ``k`` matches for it.

    A blank (empty or whitespace-only) query returns an empty list without
    calling the embedder or the store.
    """
    if query.strip():
        vectors = embedder([query])
        return store.search(vectors[0], k)
    return []
20
+
21
+
22
def _fence_for(text: str) -> str:
    """Return a backtick fence longer than any backtick run inside ``text``."""
    longest = run = 0
    for char in text:
        run = run + 1 if char == "`" else 0
        if run > longest:
            longest = run
    # Three backticks is the usual fence; grow it only when the chunk itself
    # contains a run that would otherwise close the block early.
    return "`" * max(3, longest + 1)


def format_results(results: List[ScoredChunk]) -> str:
    """Render scored chunks as readable markdown.

    Each result becomes a ``### path:start-end (score x.xxx)`` heading
    followed by the chunk text in a fenced code block; results are separated
    by a blank line. The fence is lengthened when the chunk text itself
    contains backtick fences (for example indexed markdown files), so the
    block is not closed early.

    Returns:
        The markdown text, or ``"No matches."`` when ``results`` is empty.
    """
    if not results:
        return "No matches."
    parts = []
    for score, row in results:
        text = row["text"]
        fence = _fence_for(str(text))
        parts.append(
            "### {0}:{1}-{2} (score {3:.3f})\n{4}\n{5}\n{4}".format(
                row["path"], row["start_line"], row["end_line"], score, fence, text
            )
        )
    return "\n\n".join(parts)
@@ -0,0 +1,78 @@
1
+ """Document sources for indexing.
2
+
3
+ A *source* is anything that yields :class:`Document` objects. The rest of
4
+ codeseek (chunking, embedding, storage, search) never cares where documents come
5
+ from, so a new source — code files, markdown notes, a wiki export — is all it
6
+ takes to point the same engine at different data.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import os
12
+ from dataclasses import dataclass, field
13
+ from typing import Dict, Iterable, Iterator, Optional, Set
14
+
15
+
16
@dataclass
class Document:
    """A unit of text to index."""

    # Identifier of the document within its source.
    id: str
    # The full text to be chunked and embedded.
    text: str
    # Free-form string metadata describing the document.
    metadata: Dict[str, str] = field(default_factory=dict)


# A pragmatic default set of source-code and text extensions.
DEFAULT_EXTENSIONS: Set[str] = {
    ".py", ".pyi", ".js", ".jsx", ".ts", ".tsx", ".go", ".rs", ".java", ".kt",
    ".c", ".h", ".cc", ".cpp", ".hpp", ".cs", ".rb", ".php", ".swift", ".scala",
    ".sh", ".bash", ".ps1", ".lua", ".r", ".jl", ".sql", ".html", ".css", ".scss",
    ".md", ".rst", ".txt", ".toml", ".yaml", ".yml", ".json", ".ini", ".cfg",
}

DEFAULT_EXCLUDES: Set[str] = {
    ".git", ".hg", ".svn", "__pycache__", "node_modules", ".venv", "venv",
    "dist", "build", ".mypy_cache", ".pytest_cache", ".ruff_cache", ".idea",
    ".tox", "target", "vendor",
}

# Skip files larger than this; they are usually generated or binary.
MAX_FILE_BYTES = 400_000


def _normalize_extension(ext: str) -> str:
    """Lower-case ``ext`` and give it a leading dot (``"PY"`` -> ``".py"``)."""
    ext = ext.strip().lower()
    if ext and not ext.startswith("."):
        ext = "." + ext
    return ext


class FileSource:
    """Yield documents from source files under a directory tree."""

    def __init__(
        self,
        root: str,
        *,
        extensions: Optional[Iterable[str]] = None,
        exclude_dirs: Optional[Iterable[str]] = None,
        max_file_bytes: int = MAX_FILE_BYTES,
    ) -> None:
        """Configure the walk.

        Args:
            root: Directory to walk.
            extensions: File extensions to include, with or without the
                leading dot and in any case. ``None`` selects
                ``DEFAULT_EXTENSIONS``.
            exclude_dirs: Directory names that are never descended into.
                ``None`` selects ``DEFAULT_EXCLUDES``; an empty iterable
                excludes nothing.
            max_file_bytes: Files larger than this are skipped.
        """
        self.root = root
        # Compare against None, not truthiness, so an explicitly empty
        # collection is honoured instead of silently replaced by defaults.
        if extensions is None:
            self.extensions = set(DEFAULT_EXTENSIONS)
        else:
            self.extensions = {_normalize_extension(ext) for ext in extensions}
        if exclude_dirs is None:
            self.exclude_dirs = set(DEFAULT_EXCLUDES)
        else:
            self.exclude_dirs = set(exclude_dirs)
        self.max_file_bytes = max_file_bytes

    def documents(self) -> Iterator[Document]:
        """Yield one ``Document`` per readable, non-blank matching file.

        Files are skipped when their extension is not selected, they exceed
        ``max_file_bytes``, they cannot be read as UTF-8, or they contain
        only whitespace. Document ids are paths relative to ``root`` using
        ``/`` separators. Directories and files are visited in sorted order
        so repeated runs yield documents in the same order.
        """
        for dirpath, dirnames, filenames in os.walk(self.root):
            # Prune excluded directories in place so os.walk does not descend.
            dirnames[:] = sorted(d for d in dirnames if d not in self.exclude_dirs)
            for name in sorted(filenames):
                ext = os.path.splitext(name)[1].lower()
                if ext not in self.extensions:
                    continue
                full = os.path.join(dirpath, name)
                try:
                    if os.path.getsize(full) > self.max_file_bytes:
                        continue
                    with open(full, "r", encoding="utf-8") as handle:
                        text = handle.read()
                except (OSError, UnicodeDecodeError):
                    # Unreadable or non-UTF-8 files are skipped on purpose.
                    continue
                if not text.strip():
                    continue
                rel = os.path.relpath(full, self.root).replace(os.sep, "/")
                yield Document(id=rel, text=text, metadata={"path": rel})
@@ -0,0 +1,103 @@
1
+ """A small SQLite-backed vector store with brute-force cosine search.
2
+
3
+ Brute force is intentional: for a single repository (a few thousand chunks) a
4
+ linear scan in Python is fast enough and keeps the dependency count at zero. For
5
+ much larger corpora a real vector index would be the next step.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import math
12
+ import sqlite3
13
+ from typing import Dict, List, Optional, Sequence, Tuple
14
+
15
# A search hit as returned by VectorStore.search: (cosine score, chunk fields).
# The chunk dict carries every column except the embedding itself.
ScoredChunk = Tuple[float, Dict[str, object]]

# Single-table schema. ``embedding`` holds the vector serialized as JSON text
# (written with json.dumps in VectorStore.add, read back with json.loads in
# VectorStore.search).
_SCHEMA = """
CREATE TABLE IF NOT EXISTS chunks (
    id TEXT PRIMARY KEY,
    doc_id TEXT,
    path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    text TEXT,
    embedding TEXT
)
"""
28
+
29
+
30
def cosine(a: Sequence[float], b: Sequence[float]) -> float:
    """Return the cosine similarity of two vectors of the same length.

    A vector with zero magnitude has no direction, so the similarity is
    defined as ``0.0`` whenever either input has zero norm.

    Raises:
        ValueError: If the two vectors differ in length.
    """
    if len(a) != len(b):
        raise ValueError("vectors have different lengths")

    inner = sum(p * q for p, q in zip(a, b))
    magnitude_a = math.sqrt(sum(p * p for p in a))
    magnitude_b = math.sqrt(sum(q * q for q in b))

    # Guard the division: either magnitude being zero short-circuits to 0.0.
    if not (magnitude_a and magnitude_b):
        return 0.0
    return inner / (magnitude_a * magnitude_b)
40
+
41
+
42
class VectorStore:
    """Persisted store of embedded text chunks.

    Backed by a single SQLite table. Embeddings are serialized as JSON text
    and compared against the query with a linear cosine scan. Instances can
    be used as context managers; leaving the ``with`` block closes the
    connection.
    """

    def __init__(self, path: str = ":memory:") -> None:
        """Open (or create) the database at *path* and ensure the table exists."""
        self.conn = sqlite3.connect(path)
        self.conn.execute(_SCHEMA)
        self.conn.commit()

    def __enter__(self) -> "VectorStore":
        return self

    def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
        self.close()

    def clear(self) -> None:
        """Delete every stored chunk."""
        # The connection context manager commits on success and rolls back
        # if the statement raises, so no half-applied transaction is left open.
        with self.conn:
            self.conn.execute("DELETE FROM chunks")

    def add(self, rows: List[Dict[str, object]]) -> None:
        """Insert (or replace) chunk rows. Each row carries an ``embedding`` list.

        Every row must provide the keys ``id``, ``doc_id``, ``path``,
        ``start_line``, ``end_line``, ``text`` and ``embedding``. A row whose
        ``id`` already exists replaces the stored one. The batch is applied
        atomically: if any row fails, none of the batch is committed.

        Raises:
            KeyError: If a row is missing one of the required keys.
        """
        with self.conn:
            self.conn.executemany(
                "INSERT OR REPLACE INTO chunks "
                "(id, doc_id, path, start_line, end_line, text, embedding) "
                "VALUES (?, ?, ?, ?, ?, ?, ?)",
                [
                    (
                        r["id"],
                        r["doc_id"],
                        r["path"],
                        r["start_line"],
                        r["end_line"],
                        r["text"],
                        json.dumps(r["embedding"]),
                    )
                    for r in rows
                ],
            )

    def count(self) -> int:
        """Return the number of stored chunks."""
        return int(self.conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0])

    def search(self, query_vec: Sequence[float], k: int = 5) -> List[ScoredChunk]:
        """Return the *k* chunks most similar to *query_vec*, best first.

        Each hit is ``(score, chunk)`` where ``chunk`` holds the stored columns
        except the embedding. A non-positive *k* yields an empty list.

        Raises:
            ValueError: If a stored embedding's length differs from that of
                *query_vec* (raised by ``cosine``).
        """
        if k <= 0:
            # A negative slice bound would drop hits from the tail rather
            # than return none, so handle it explicitly.
            return []
        cur = self.conn.execute(
            "SELECT id, doc_id, path, start_line, end_line, text, embedding FROM chunks"
        )
        scored: List[ScoredChunk] = []
        for chunk_id, doc_id, path, start_line, end_line, text, raw in cur:
            score = cosine(query_vec, json.loads(raw))
            scored.append(
                (
                    score,
                    {
                        "id": chunk_id,
                        "doc_id": doc_id,
                        "path": path,
                        "start_line": start_line,
                        "end_line": end_line,
                        "text": text,
                    },
                )
            )
        scored.sort(key=lambda item: item[0], reverse=True)
        return scored[:k]

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self.conn.close()
@@ -0,0 +1,211 @@
1
+ """Offline tests for codeseek. No network, no real embeddings."""
2
+
3
+ import json
4
+ import os
5
+ import tempfile
6
+
7
+ import pytest
8
+
9
+ from codeseek import embeddings
10
+ from codeseek.chunking import chunk_text
11
+ from codeseek.cli import build_parser
12
+ from codeseek.index import build_index
13
+ from codeseek.mcp_server import MCPServer
14
+ from codeseek.search import format_results, search_index
15
+ from codeseek.sources import Document, FileSource
16
+ from codeseek.store import VectorStore, cosine
17
+
18
+
19
+ # --- chunking -----------------------------------------------------------------
20
+
21
+
22
def test_chunk_short_text_single_chunk():
    """Text shorter than one window comes back as a single chunk."""
    source = "a\nb\nc"
    windows = chunk_text(source, max_lines=60, overlap=10)
    assert len(windows) == 1
    # The chunk is (first line, last line, text) with 1-based line numbers.
    assert windows[0] == (1, 3, source)
26
+
27
+
28
def test_chunk_overlap_and_windows():
    """Windows of 10 lines with an overlap of 2 advance 8 lines at a time."""
    total_lines = 25
    text = "\n".join(map(str, range(1, total_lines + 1)))
    windows = chunk_text(text, max_lines=10, overlap=2)

    first = windows[0]
    assert (first[0], first[1]) == (1, 10)
    # step = max_lines - overlap = 8, so the second window starts at line 9
    assert windows[1][0] == 9
    # the last window reaches the end of the text
    assert windows[-1][1] == total_lines
35
+
36
+
37
def test_chunk_empty():
    """Empty input produces no chunks."""
    windows = chunk_text("", max_lines=10)
    assert windows == []
39
+
40
+
41
+ # --- embeddings parsing -------------------------------------------------------
42
+
43
+
44
def test_parse_embeddings_ok():
    """Vectors are extracted from the ``data`` list in their original order."""
    expected = [[0.1, 0.2], [0.3, 0.4]]
    # Copy each vector so the payload does not alias the expected value.
    payload = {"data": [{"embedding": list(vector)} for vector in expected]}
    assert embeddings.parse_embeddings(payload) == expected
47
+
48
+
49
def test_parse_embeddings_bad():
    """A payload without the expected structure raises EmbeddingError."""
    malformed = {"oops": 1}
    with pytest.raises(embeddings.EmbeddingError):
        embeddings.parse_embeddings(malformed)
52
+
53
+
54
def test_embed_empty_returns_empty():
    """Embedding an empty batch returns an empty list."""
    vectors = embeddings.embed([], model="m", api_key="k")
    assert vectors == []
56
+
57
+
58
+ # --- store --------------------------------------------------------------------
59
+
60
+
61
def test_cosine_basics():
    """Parallel, orthogonal and zero vectors score 1, 0 and exactly 0.0."""
    x_axis, y_axis = [1, 0], [0, 1]
    assert cosine(x_axis, x_axis) == pytest.approx(1.0)
    assert cosine(x_axis, y_axis) == pytest.approx(0.0)
    # A zero vector has no direction; cosine() returns 0.0 instead of dividing.
    assert cosine([0, 0], [1, 1]) == 0.0
65
+
66
+
67
def test_store_add_and_search():
    """Two orthogonal chunks are stored; a query near ``a.py`` ranks it first."""
    store = VectorStore(":memory:")
    try:
        store.add(
            [
                {"id": "a:1-1", "doc_id": "a", "path": "a.py", "start_line": 1,
                 "end_line": 1, "text": "alpha", "embedding": [1.0, 0.0]},
                {"id": "b:1-1", "doc_id": "b", "path": "b.py", "start_line": 1,
                 "end_line": 1, "text": "beta", "embedding": [0.0, 1.0]},
            ]
        )
        assert store.count() == 2
        results = store.search([0.9, 0.1], k=1)
        assert len(results) == 1
        assert results[0][1]["path"] == "a.py"
    finally:
        # Close even when an assertion fails so the connection is not leaked.
        store.close()
82
+
83
+
84
+ # --- sources ------------------------------------------------------------------
85
+
86
+
87
def test_file_source_walks_and_filters():
    """Only files with an accepted extension outside excluded dirs are yielded."""
    with tempfile.TemporaryDirectory() as root:

        def write(relative, content):
            # ``relative`` uses "/" separators; join it onto the temp root.
            target = os.path.join(root, *relative.split("/"))
            with open(target, "w", encoding="utf-8") as f:
                f.write(content)

        write("keep.py", "print('hi')\n")
        write("skip.bin", "nope\n")
        os.makedirs(os.path.join(root, "node_modules"))
        write("node_modules/x.py", "should be excluded\n")

        found = {doc.id for doc in FileSource(root).documents()}
        assert "keep.py" in found
        assert "skip.bin" not in found
        assert "node_modules/x.py" not in found
102
+
103
+
104
+ # --- index (with a fake embedder) --------------------------------------------
105
+
106
+
107
class _FakeSource:
    """Stand-in document source that yields a single three-line file."""

    def documents(self):
        path = "f.py"
        body = "line1\nline2\nline3"
        yield Document(id=path, text=body, metadata={"path": path})
110
+
111
+
112
def _fake_embedder(texts):
    """Embed each text deterministically as ``[character count, line count]``."""
    vectors = []
    for text in texts:
        line_count = text.count("\n") + 1
        vectors.append([float(len(text)), float(line_count)])
    return vectors
115
+
116
+
117
def test_build_index_counts_chunks():
    """build_index reports as many chunks as the store ends up holding."""
    store = VectorStore(":memory:")
    try:
        count = build_index(_FakeSource(), store, _fake_embedder, max_lines=2, overlap=0)
        assert count == store.count()
        assert count >= 2  # 3 lines, 2 per chunk -> at least 2 chunks
    finally:
        # Release the connection even when an assertion above fails.
        store.close()
123
+
124
+
125
def test_search_index_roundtrip():
    """A query against a freshly built index returns a hit from ``f.py``."""
    store = VectorStore(":memory:")
    try:
        build_index(_FakeSource(), store, _fake_embedder, max_lines=2, overlap=0)
        results = search_index(store, _fake_embedder, "line1\nline2", k=1)
        assert len(results) == 1
        assert "f.py" in results[0][1]["path"]
    finally:
        # Release the connection even when an assertion above fails.
        store.close()
132
+
133
+
134
def test_search_empty_query():
    """A whitespace-only query returns no results."""
    store = VectorStore(":memory:")
    try:
        assert search_index(store, _fake_embedder, "   ", k=3) == []
    finally:
        # Release the connection even when the assertion fails.
        store.close()
138
+
139
+
140
def test_format_results_empty():
    """An empty result list renders as the "No matches." placeholder."""
    rendered = format_results([])
    assert rendered == "No matches."
142
+
143
+
144
+ # --- mcp server ---------------------------------------------------------------
145
+
146
+
147
def _server():
    """Build an MCPServer whose search callback always returns one canned hit."""
    return MCPServer(
        lambda query, k: [
            (0.99, {"path": "x.py", "start_line": 1, "end_line": 2, "text": "code"})
        ]
    )
152
+
153
+
154
def test_mcp_initialize():
    """``initialize`` reports a protocol version and the server name."""
    request = {"jsonrpc": "2.0", "id": 1, "method": "initialize"}
    result = _server().handle(request)["result"]
    assert result["protocolVersion"]
    assert result["serverInfo"]["name"] == "codeseek"
158
+
159
+
160
def test_mcp_tools_list():
    """``tools/list`` advertises exactly one tool, ``search_code``."""
    request = {"jsonrpc": "2.0", "id": 2, "method": "tools/list"}
    tools = _server().handle(request)["result"]["tools"]
    assert [tool["name"] for tool in tools] == ["search_code"]
164
+
165
+
166
def test_mcp_tools_call():
    """Calling ``search_code`` returns text content that mentions the hit's path."""
    params = {"name": "search_code", "arguments": {"query": "code", "k": 1}}
    request = {"jsonrpc": "2.0", "id": 3, "method": "tools/call", "params": params}
    response = _server().handle(request)
    first_item = response["result"]["content"][0]
    assert "x.py" in first_item["text"]
176
+
177
+
178
def test_mcp_unknown_tool():
    """Calling a tool the server does not expose yields error code -32602."""
    request = {
        "jsonrpc": "2.0",
        "id": 4,
        "method": "tools/call",
        "params": {"name": "nope"},
    }
    error = _server().handle(request)["error"]
    # -32602 is the JSON-RPC 2.0 "Invalid params" code.
    assert error["code"] == -32602
183
+
184
+
185
def test_mcp_notification_returns_none():
    """A message without an ``id`` (a notification) gets no response."""
    notification = {"jsonrpc": "2.0", "method": "notifications/initialized"}
    assert _server().handle(notification) is None
187
+
188
+
189
def test_mcp_unknown_method():
    """An unrecognized method yields error code -32601."""
    request = {"jsonrpc": "2.0", "id": 5, "method": "bogus"}
    error = _server().handle(request)["error"]
    # -32601 is the JSON-RPC 2.0 "Method not found" code.
    assert error["code"] == -32601
192
+
193
+
194
+ # --- cli parser ---------------------------------------------------------------
195
+
196
+
197
def test_parser_requires_subcommand():
    """Parsing an empty argument list exits instead of returning a namespace."""
    no_arguments = []
    with pytest.raises(SystemExit):
        build_parser().parse_args(no_arguments)
200
+
201
+
202
def test_parser_index_defaults():
    """``index <path>`` records the subcommand and the positional path."""
    argv = ["index", "src"]
    args = build_parser().parse_args(argv)
    assert (args.command, args.path) == ("index", "src")
206
+
207
+
208
def test_parser_search():
    """``search <query> -k N`` records the subcommand and parses N as an int."""
    argv = ["search", "how does auth work", "-k", "3"]
    args = build_parser().parse_args(argv)
    assert args.command == "search"
    assert args.k == 3