codexlr8 0.0.1__tar.gz → 0.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codexlr8-0.0.1 → codexlr8-0.0.3}/PKG-INFO +51 -4
- {codexlr8-0.0.1 → codexlr8-0.0.3}/README.md +48 -3
- {codexlr8-0.0.1 → codexlr8-0.0.3}/pyproject.toml +4 -1
- {codexlr8-0.0.1 → codexlr8-0.0.3}/src/codexlr8/__init__.py +1 -1
- {codexlr8-0.0.1 → codexlr8-0.0.3}/src/codexlr8/cli.py +336 -6
- {codexlr8-0.0.1 → codexlr8-0.0.3}/src/codexlr8/config.py +10 -0
- codexlr8-0.0.3/src/codexlr8/embeddings.py +147 -0
- codexlr8-0.0.3/src/codexlr8/eval.py +246 -0
- {codexlr8-0.0.1 → codexlr8-0.0.3}/src/codexlr8/mcp_server.py +10 -1
- codexlr8-0.0.3/src/codexlr8/search.py +690 -0
- codexlr8-0.0.3/src/codexlr8/train.py +175 -0
- {codexlr8-0.0.1 → codexlr8-0.0.3}/src/codexlr8.egg-info/PKG-INFO +51 -4
- {codexlr8-0.0.1 → codexlr8-0.0.3}/src/codexlr8.egg-info/SOURCES.txt +4 -0
- {codexlr8-0.0.1 → codexlr8-0.0.3}/src/codexlr8.egg-info/requires.txt +3 -0
- codexlr8-0.0.3/tests/test_eval.py +209 -0
- {codexlr8-0.0.1 → codexlr8-0.0.3}/tests/test_mcp_server.py +26 -0
- {codexlr8-0.0.1 → codexlr8-0.0.3}/tests/test_search.py +144 -5
- codexlr8-0.0.1/src/codexlr8/search.py +0 -405
- {codexlr8-0.0.1 → codexlr8-0.0.3}/LICENSE +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.3}/setup.cfg +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.3}/src/codexlr8/meta.py +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.3}/src/codexlr8/scanner.py +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.3}/src/codexlr8.egg-info/dependency_links.txt +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.3}/src/codexlr8.egg-info/entry_points.txt +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.3}/src/codexlr8.egg-info/top_level.txt +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.3}/tests/test_meta.py +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.3}/tests/test_scanner.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codexlr8
|
|
3
|
-
Version: 0.0.1
|
|
3
|
+
Version: 0.0.3
|
|
4
4
|
Summary: A codebase search engine for LLM coding agents
|
|
5
5
|
Author-email: Sadig Akhund <sadigaxund@gmail.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -25,6 +25,8 @@ Requires-Dist: mcp>=1.0
|
|
|
25
25
|
Provides-Extra: dev
|
|
26
26
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
27
27
|
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
28
|
+
Provides-Extra: embeddings
|
|
29
|
+
Requires-Dist: sentence-transformers>=3.0; extra == "embeddings"
|
|
28
30
|
Dynamic: license-file
|
|
29
31
|
|
|
30
32
|
# CodeXLR8
|
|
@@ -64,11 +66,56 @@ CodeXLR8 indexes your codebase into an SQLite FTS5 database alongside optional `
|
|
|
64
66
|
|
|
65
67
|
| Layer | Source | Boost |
|
|
66
68
|
|---|---|---|
|
|
67
|
-
| 1 | Raw file content
|
|
68
|
-
|
|
|
69
|
+
| 1 | Raw file content | 0.3× per token |
|
|
70
|
+
| 2a | File path (filename, directory) | 0.5× – 0.8× |
|
|
71
|
+
| 2b | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
|
|
69
72
|
| 3 | `.meta.yaml` `public_api` | 1.0× (strongest) |
|
|
70
73
|
|
|
71
|
-
Search uses
|
|
74
|
+
Search uses OR semantics with token-coverage scoring: more matching tokens = higher score. A ≥50% post-filter eliminates single-token noise for multi-word queries. Path weighting (Layer 2a) provides differentiation even without metadata — a file whose name IS the query token ranks above one that merely mentions it.
|
|
75
|
+
|
|
76
|
+
### Scoped search and clustering
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
# Narrow to a specific directory (like grep -rn "pattern" dir/)
|
|
80
|
+
codexlr8 search . "get_visible" --scope lib/mpl_toolkits/
|
|
81
|
+
|
|
82
|
+
# Cluster results by directory to see where matches concentrate
|
|
83
|
+
codexlr8 search . "get_visible" --grouped
|
|
84
|
+
# 12 results in 3 directories (8 files) across project:
|
|
85
|
+
# lib/mpl_toolkits/mplot3d/ (5 files)
|
|
86
|
+
# ─ axes3d.py:388 [score: 0.90]
|
|
87
|
+
# ...
|
|
88
|
+
|
|
89
|
+
# Diagnose your query — see which terms hit, which don't
|
|
90
|
+
codexlr8 search . "axes not hiding" --explain
|
|
91
|
+
# Query analysis:
|
|
92
|
+
# "axes" 212 matches — broad term (212/212 results)
|
|
93
|
+
# "not" 77 matches
|
|
94
|
+
# "hiding" 0 matches — consider dropping or replacing
|
|
95
|
+
# Top score: 1.20 (strong match)
|
|
96
|
+
|
|
97
|
+
# Combine both — group, then scope to drill down
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Search Quality & Fine-Tuning
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
# Measure search accuracy against known queries
|
|
104
|
+
codexlr8 eval . --queries queries.json
|
|
105
|
+
# Precision@1: 67%, MRR: 0.83, Recall@5: 67%
|
|
106
|
+
|
|
107
|
+
# Typos are auto-corrected (fuzzy fallback on zero results)
|
|
108
|
+
codexlr8 search . "funtion" # → corrects to "function"
|
|
109
|
+
|
|
110
|
+
# Opt-in embeddings: hybrid BM25 + semantic search
|
|
111
|
+
# pip install codexlr8[embeddings]
|
|
112
|
+
# set embeddings.enabled: true in .codexlr8.yaml
|
|
113
|
+
|
|
114
|
+
# Fine-tune a model on YOUR codebase vocabulary
|
|
115
|
+
codexlr8 recommend-model . # picks the right model for your size
|
|
116
|
+
codexlr8 train . # TSDAE training, 5-45min on CPU
|
|
117
|
+
codexlr8 eval . # measure improvement
|
|
118
|
+
```
|
|
72
119
|
|
|
73
120
|
## .meta.yaml Sidecars
|
|
74
121
|
|
|
@@ -35,11 +35,56 @@ CodeXLR8 indexes your codebase into an SQLite FTS5 database alongside optional `
|
|
|
35
35
|
|
|
36
36
|
| Layer | Source | Boost |
|
|
37
37
|
|---|---|---|
|
|
38
|
-
| 1 | Raw file content
|
|
39
|
-
|
|
|
38
|
+
| 1 | Raw file content | 0.3× per token |
|
|
39
|
+
| 2a | File path (filename, directory) | 0.5× – 0.8× |
|
|
40
|
+
| 2b | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
|
|
40
41
|
| 3 | `.meta.yaml` `public_api` | 1.0× (strongest) |
|
|
41
42
|
|
|
42
|
-
Search uses
|
|
43
|
+
Search uses OR semantics with token-coverage scoring: more matching tokens = higher score. A ≥50% post-filter eliminates single-token noise for multi-word queries. Path weighting (Layer 2a) provides differentiation even without metadata — a file whose name IS the query token ranks above one that merely mentions it.
|
|
44
|
+
|
|
45
|
+
### Scoped search and clustering
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# Narrow to a specific directory (like grep -rn "pattern" dir/)
|
|
49
|
+
codexlr8 search . "get_visible" --scope lib/mpl_toolkits/
|
|
50
|
+
|
|
51
|
+
# Cluster results by directory to see where matches concentrate
|
|
52
|
+
codexlr8 search . "get_visible" --grouped
|
|
53
|
+
# 12 results in 3 directories (8 files) across project:
|
|
54
|
+
# lib/mpl_toolkits/mplot3d/ (5 files)
|
|
55
|
+
# ─ axes3d.py:388 [score: 0.90]
|
|
56
|
+
# ...
|
|
57
|
+
|
|
58
|
+
# Diagnose your query — see which terms hit, which don't
|
|
59
|
+
codexlr8 search . "axes not hiding" --explain
|
|
60
|
+
# Query analysis:
|
|
61
|
+
# "axes" 212 matches — broad term (212/212 results)
|
|
62
|
+
# "not" 77 matches
|
|
63
|
+
# "hiding" 0 matches — consider dropping or replacing
|
|
64
|
+
# Top score: 1.20 (strong match)
|
|
65
|
+
|
|
66
|
+
# Combine both — group, then scope to drill down
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Search Quality & Fine-Tuning
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
# Measure search accuracy against known queries
|
|
73
|
+
codexlr8 eval . --queries queries.json
|
|
74
|
+
# Precision@1: 67%, MRR: 0.83, Recall@5: 67%
|
|
75
|
+
|
|
76
|
+
# Typos are auto-corrected (fuzzy fallback on zero results)
|
|
77
|
+
codexlr8 search . "funtion" # → corrects to "function"
|
|
78
|
+
|
|
79
|
+
# Opt-in embeddings: hybrid BM25 + semantic search
|
|
80
|
+
# pip install codexlr8[embeddings]
|
|
81
|
+
# set embeddings.enabled: true in .codexlr8.yaml
|
|
82
|
+
|
|
83
|
+
# Fine-tune a model on YOUR codebase vocabulary
|
|
84
|
+
codexlr8 recommend-model . # picks the right model for your size
|
|
85
|
+
codexlr8 train . # TSDAE training, 5-45min on CPU
|
|
86
|
+
codexlr8 eval . # measure improvement
|
|
87
|
+
```
|
|
43
88
|
|
|
44
89
|
## .meta.yaml Sidecars
|
|
45
90
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "codexlr8"
|
|
7
|
-
version = "0.0.1"
|
|
7
|
+
version = "0.0.3"
|
|
8
8
|
description = "A codebase search engine for LLM coding agents"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -37,6 +37,9 @@ dev = [
|
|
|
37
37
|
"pytest>=7.0",
|
|
38
38
|
"pytest-cov>=4.0",
|
|
39
39
|
]
|
|
40
|
+
embeddings = [
|
|
41
|
+
"sentence-transformers>=3.0",
|
|
42
|
+
]
|
|
40
43
|
|
|
41
44
|
[project.scripts]
|
|
42
45
|
codexlr8 = "codexlr8.cli:main"
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
"""CodeXLR8 CLI — search-first codebase navigation for agents."""
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
|
+
import os
|
|
4
5
|
import click
|
|
5
6
|
|
|
6
7
|
from .config import load_config
|
|
7
8
|
from .scanner import scan_project
|
|
8
9
|
from .meta import generate_missing_sidecars
|
|
9
|
-
from .search import SearchEngine
|
|
10
|
+
from .search import SearchEngine, _group_results, _explain_query, _tokenize
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
EXCLUDE_HELP = (
|
|
@@ -62,10 +63,19 @@ def scan(project_path: str, output: str | None):
|
|
|
62
63
|
@click.argument("query")
|
|
63
64
|
@click.option("--exclude", "-x", "exclude_patterns", multiple=True,
|
|
64
65
|
callback=_parse_excludes, help=EXCLUDE_HELP)
|
|
66
|
+
@click.option("--scope", "-s", default=None,
|
|
67
|
+
help="Restrict search to files under a path prefix (e.g. src/ or lib/mpl_toolkits/)")
|
|
68
|
+
@click.option("--grouped", "-g", is_flag=True, default=False,
|
|
69
|
+
help="Cluster results by directory before listing files")
|
|
70
|
+
@click.option("--explain", "-e", is_flag=True, default=False,
|
|
71
|
+
help="Show token breakdown and query diagnostics")
|
|
72
|
+
@click.option("--group-depth", default=3,
|
|
73
|
+
help="Max directory depth for grouping (default: 3)")
|
|
65
74
|
@click.option("--format", "-f", "output_format",
|
|
66
75
|
type=click.Choice(["text", "json"]), default="text")
|
|
67
76
|
@click.option("--limit", "-n", default=10, help="Maximum number of results")
|
|
68
77
|
def search(project_path: str, query: str, exclude_patterns: list[str],
|
|
78
|
+
scope: str | None, grouped: bool, explain: bool, group_depth: int,
|
|
69
79
|
output_format: str, limit: int):
|
|
70
80
|
"""Search the codebase for code matching QUERY.
|
|
71
81
|
|
|
@@ -74,19 +84,52 @@ def search(project_path: str, query: str, exclude_patterns: list[str],
|
|
|
74
84
|
\b
|
|
75
85
|
Examples:
|
|
76
86
|
codexlr8 search . "login auth"
|
|
87
|
+
codexlr8 search . "login auth" --grouped
|
|
88
|
+
codexlr8 search . "login auth" --explain
|
|
77
89
|
codexlr8 search . "login auth" --exclude "tests/*"
|
|
78
90
|
codexlr8 search . "login auth" -x "tests/*" -x "vendor/*"
|
|
91
|
+
codexlr8 search . "get_visible" --scope lib/mpl_toolkits/
|
|
79
92
|
"""
|
|
80
93
|
engine = SearchEngine(project_path)
|
|
81
|
-
results = engine.search(query, limit=limit, exclude=exclude_patterns)
|
|
94
|
+
results = engine.search(query, limit=limit, exclude=exclude_patterns, scope=scope)
|
|
82
95
|
|
|
83
96
|
if output_format == "json":
|
|
84
97
|
import json
|
|
85
|
-
|
|
98
|
+
output = {"results": results}
|
|
99
|
+
if explain:
|
|
100
|
+
output["explain"] = _explain_query(query, _tokenize(query), results)
|
|
101
|
+
if grouped:
|
|
102
|
+
groups_data = _group_results(results, group_depth)
|
|
103
|
+
output["grouped"] = True
|
|
104
|
+
output["groups"] = groups_data["groups"]
|
|
105
|
+
output["summary"] = {
|
|
106
|
+
"total_results": groups_data["total_results"],
|
|
107
|
+
"total_files": groups_data["total_files"],
|
|
108
|
+
"total_groups": len(groups_data["groups"]),
|
|
109
|
+
}
|
|
110
|
+
click.echo(json.dumps(output, indent=2))
|
|
86
111
|
return
|
|
87
112
|
|
|
88
113
|
if not results:
|
|
89
114
|
click.echo("No results found.")
|
|
115
|
+
if explain:
|
|
116
|
+
tokens = _tokenize(query)
|
|
117
|
+
click.echo()
|
|
118
|
+
click.echo("Query analysis:")
|
|
119
|
+
for t in tokens:
|
|
120
|
+
click.echo(f" \"{t}\" \u2717 no matches")
|
|
121
|
+
click.echo()
|
|
122
|
+
click.echo("0 tokens matched. All terms are absent from the codebase.")
|
|
123
|
+
return
|
|
124
|
+
|
|
125
|
+
if explain:
|
|
126
|
+
tokens = _tokenize(query)
|
|
127
|
+
explain_data = _explain_query(query, tokens, results)
|
|
128
|
+
_print_explain(explain_data)
|
|
129
|
+
click.echo()
|
|
130
|
+
|
|
131
|
+
if grouped:
|
|
132
|
+
_print_grouped(results, group_depth, scope)
|
|
90
133
|
return
|
|
91
134
|
|
|
92
135
|
for i, r in enumerate(results, 1):
|
|
@@ -96,6 +139,8 @@ def search(project_path: str, query: str, exclude_patterns: list[str],
|
|
|
96
139
|
click.echo(f" meta: {r['summary']}")
|
|
97
140
|
if r.get("tags"):
|
|
98
141
|
click.echo(f" tags: {', '.join(r['tags'])}")
|
|
142
|
+
if r.get("matched_tokens"):
|
|
143
|
+
click.echo(f" matched: {', '.join(r['matched_tokens'])}")
|
|
99
144
|
if r.get("preview"):
|
|
100
145
|
click.echo(" preview: |")
|
|
101
146
|
for line in r["preview"].strip().splitlines()[:6]:
|
|
@@ -151,6 +196,144 @@ def status(project_path: str):
|
|
|
151
196
|
click.echo(f"Files without .meta.yaml: {state['files_without_meta']}")
|
|
152
197
|
click.echo(f"Total lines indexed: {state['total_lines']}")
|
|
153
198
|
click.echo(f"Index age: {state.get('index_age', 'N/A')}")
|
|
199
|
+
click.echo(f"Coverage: {state.get('coverage_pct', 0)}%")
|
|
200
|
+
if state.get("warning"):
|
|
201
|
+
click.echo()
|
|
202
|
+
click.secho(f" Warning: {state['warning']}", fg="yellow")
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
@main.command()
|
|
206
|
+
@click.argument("project_path", type=click.Path(exists=True, file_okay=False))
|
|
207
|
+
@click.option("--queries", "-q", required=True,
|
|
208
|
+
type=click.Path(exists=True, dir_okay=False),
|
|
209
|
+
help="Path to JSON file with query definitions")
|
|
210
|
+
@click.option("--limit", "-n", default=10,
|
|
211
|
+
help="Max results per query (default: 10)")
|
|
212
|
+
def eval_cmd(project_path: str, queries: str, limit: int):
|
|
213
|
+
"""Evaluate search quality against a query set.
|
|
214
|
+
|
|
215
|
+
QUERIES is a JSON file with an array of query objects:
|
|
216
|
+
[{"query": "...", "expected": "path/to/file.py", "min_rank": 1}]
|
|
217
|
+
|
|
218
|
+
Outputs a per-query pass/fail table and aggregate metrics:
|
|
219
|
+
Precision@1, Mean Reciprocal Rank (MRR), Recall@5.
|
|
220
|
+
"""
|
|
221
|
+
from .eval import load_queries, run_eval
|
|
222
|
+
import json
|
|
223
|
+
|
|
224
|
+
try:
|
|
225
|
+
query_defs = load_queries(queries)
|
|
226
|
+
except (json.JSONDecodeError, ValueError) as e:
|
|
227
|
+
raise click.ClickException(f"Invalid queries file: {e}")
|
|
228
|
+
|
|
229
|
+
if not query_defs:
|
|
230
|
+
raise click.ClickException("Queries file contains no queries.")
|
|
231
|
+
|
|
232
|
+
metrics = run_eval(project_path, query_defs, limit=limit)
|
|
233
|
+
|
|
234
|
+
# Per-query table
|
|
235
|
+
click.secho(" Query Expected Mode Lines Rank Score Status", fg="cyan", bold=True)
|
|
236
|
+
click.secho(" " + "─" * 105, fg="cyan")
|
|
237
|
+
|
|
238
|
+
for r in metrics["results"]:
|
|
239
|
+
query_str = f'"{r["query"]}"'.ljust(34)
|
|
240
|
+
expected_str = r["expected"].ljust(20)
|
|
241
|
+
mode_str = r.get("assert", "file").ljust(7)
|
|
242
|
+
lines_str = ""
|
|
243
|
+
if r.get("line_start"):
|
|
244
|
+
lines_str = f"{r['line_start']}-{r['line_end']}".ljust(8)
|
|
245
|
+
else:
|
|
246
|
+
lines_str = "—".ljust(8)
|
|
247
|
+
rank_str = str(r["rank"]).ljust(6) if r["rank"] else "— "
|
|
248
|
+
score_str = f'{r["score"]:.2f}'.ljust(8) if r["score"] else "— "
|
|
249
|
+
status = r["status"]
|
|
250
|
+
|
|
251
|
+
if status.startswith("pass"):
|
|
252
|
+
status_style = {"fg": "green"}
|
|
253
|
+
elif "found" in status:
|
|
254
|
+
status_style = {"fg": "yellow"}
|
|
255
|
+
else:
|
|
256
|
+
status_style = {"fg": "red"}
|
|
257
|
+
|
|
258
|
+
click.echo(f" {query_str} {expected_str} {mode_str} {lines_str} {rank_str} {score_str} {click.style(status, **status_style)}")
|
|
259
|
+
|
|
260
|
+
# Aggregate metrics
|
|
261
|
+
click.echo()
|
|
262
|
+
click.echo(click.style(" " + "─" * 40, fg="cyan"))
|
|
263
|
+
click.secho(f" Precision@1: {metrics['precision_at_1']:.2%} "
|
|
264
|
+
f"({metrics['passed']}/{metrics['num_queries']} passed)", fg="green")
|
|
265
|
+
click.secho(f" MRR: {metrics['mrr']:.4f}", fg="green")
|
|
266
|
+
click.secho(f" Recall@5: {metrics['recall_at_5']:.2%}", fg="green")
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
@main.command()
|
|
270
|
+
@click.argument("project_path", type=click.Path(exists=True, file_okay=False), default=".")
|
|
271
|
+
@click.option("--model", "-m", default="all-MiniLM-L6-v2",
|
|
272
|
+
help="Embedding model to fine-tune")
|
|
273
|
+
@click.option("--epochs", "-e", default=3,
|
|
274
|
+
help="Training epochs (default: 3)")
|
|
275
|
+
@click.option("--incremental", "-i", is_flag=True, default=False,
|
|
276
|
+
help="Fine-tune only on changed files")
|
|
277
|
+
def train(project_path: str, model: str, epochs: int, incremental: bool):
|
|
278
|
+
"""Fine-tune an embedding model on this codebase for better search accuracy.
|
|
279
|
+
|
|
280
|
+
Uses TSDAE (denoising auto-encoder) to adapt a pretrained model to
|
|
281
|
+
your codebase's vocabulary. The fine-tuned model is saved to
|
|
282
|
+
.codexlr8_model/ and referenced in .codexlr8.yaml.
|
|
283
|
+
|
|
284
|
+
Requirements: pip install codexlr8[embeddings]
|
|
285
|
+
"""
|
|
286
|
+
try:
|
|
287
|
+
from .train import train_model
|
|
288
|
+
except ImportError as e:
|
|
289
|
+
raise click.ClickException(
|
|
290
|
+
"Training requires 'pip install codexlr8[embeddings]'"
|
|
291
|
+
) from e
|
|
292
|
+
|
|
293
|
+
click.echo()
|
|
294
|
+
click.secho(" Training embedding model on this codebase...", fg="cyan", bold=True)
|
|
295
|
+
click.echo(f" Model: {model}")
|
|
296
|
+
click.echo(f" Epochs: {epochs}")
|
|
297
|
+
click.echo()
|
|
298
|
+
|
|
299
|
+
try:
|
|
300
|
+
result = train_model(project_path, model_name=model,
|
|
301
|
+
epochs=epochs, incremental=incremental)
|
|
302
|
+
except ValueError as e:
|
|
303
|
+
raise click.ClickException(str(e))
|
|
304
|
+
|
|
305
|
+
dur = result["duration_sec"]
|
|
306
|
+
dur_str = f"{dur}s" if dur < 60 else f"{dur // 60}m{dur % 60}s"
|
|
307
|
+
|
|
308
|
+
click.echo()
|
|
309
|
+
click.secho(f" Trained on {result['num_examples']} files in {dur_str}", fg="green")
|
|
310
|
+
click.secho(f" Model saved to {result['model_path']}", fg="green")
|
|
311
|
+
click.secho(f" Embeddings enabled in .codexlr8.yaml", fg="green")
|
|
312
|
+
click.echo()
|
|
313
|
+
click.secho(" Run 'codexlr8 eval .' to measure improvement.", dim=True)
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
@main.command()
|
|
317
|
+
@click.argument("project_path", type=click.Path(exists=True, file_okay=False), default=".")
|
|
318
|
+
def recommend_model_cmd(project_path: str):
|
|
319
|
+
"""Suggest the best embedding model for this codebase size."""
|
|
320
|
+
try:
|
|
321
|
+
from .train import recommend_model
|
|
322
|
+
except ImportError as e:
|
|
323
|
+
raise click.ClickException(
|
|
324
|
+
"Requires 'pip install codexlr8[embeddings]'"
|
|
325
|
+
) from e
|
|
326
|
+
|
|
327
|
+
rec = recommend_model(project_path)
|
|
328
|
+
|
|
329
|
+
click.echo()
|
|
330
|
+
click.secho(f" Codebase: {rec['num_files']} files, ~{rec['est_tokens']:,} tokens", fg="cyan")
|
|
331
|
+
click.echo()
|
|
332
|
+
click.secho(f" Recommended: {rec['model']} ({rec['param_count']})", fg="green", bold=True)
|
|
333
|
+
click.echo(f" Est. training time: {rec['est_training_time']}")
|
|
334
|
+
click.echo(f" Expected quality gain: {rec.get('quality_gain', '+5-12% MRR')}")
|
|
335
|
+
click.echo()
|
|
336
|
+
click.secho(" Run 'codexlr8 train .' to start training.", dim=True)
|
|
154
337
|
|
|
155
338
|
|
|
156
339
|
@main.command()
|
|
@@ -240,7 +423,8 @@ def setup(project_path: str):
|
|
|
240
423
|
include = [p.strip() for p in custom_include.split(",") if p.strip()]
|
|
241
424
|
click.echo()
|
|
242
425
|
|
|
243
|
-
defaults = ["tests/*", "test/*", "spec/*", "__tests__/*", "test_*", "*_test.*"]
|
|
426
|
+
defaults = ["tests/*", "test/*", "spec/*", "__tests__/*", "test_*", "*_test.*",
|
|
427
|
+
"examples/*", "docs/*", "tutorials/*", "benchmarks/*"]
|
|
244
428
|
custom_exclude = click.prompt(
|
|
245
429
|
click.style(" Exclude (comma-separated)", fg="bright_white"),
|
|
246
430
|
default=", ".join(defaults),
|
|
@@ -306,6 +490,102 @@ def setup(project_path: str):
|
|
|
306
490
|
click.secho(" Run 'codexlr8 index .' to build your first search index.", dim=True)
|
|
307
491
|
|
|
308
492
|
|
|
493
|
+
def _print_explain(data: dict):
|
|
494
|
+
"""Print query diagnostic breakdown."""
|
|
495
|
+
click.secho("Query analysis:", fg="cyan", bold=True)
|
|
496
|
+
click.echo(f" Original: \"{data['query']}\"")
|
|
497
|
+
click.echo(f" Tokens: {', '.join(data['tokens'])}")
|
|
498
|
+
click.echo()
|
|
499
|
+
|
|
500
|
+
for token in data["tokens"]:
|
|
501
|
+
hits = data["token_hits"].get(token, 0)
|
|
502
|
+
total = data["total_results"]
|
|
503
|
+
if hits == 0:
|
|
504
|
+
status = click.style(f"{hits} matches", fg="red")
|
|
505
|
+
hint = " — consider dropping or replacing"
|
|
506
|
+
elif hits <= 3:
|
|
507
|
+
status = click.style(f"{hits} matches", fg="yellow")
|
|
508
|
+
hint = " — very specific"
|
|
509
|
+
elif hits <= total * 0.1:
|
|
510
|
+
status = click.style(f"{hits} matches", fg="green")
|
|
511
|
+
hint = ""
|
|
512
|
+
else:
|
|
513
|
+
status = click.style(f"{hits} matches", fg="yellow")
|
|
514
|
+
hint = f" — broad term ({hits}/{total} results)"
|
|
515
|
+
|
|
516
|
+
click.echo(f" \"{token}\" {status}{hint}")
|
|
517
|
+
|
|
518
|
+
for fw in data["filtered"]:
|
|
519
|
+
click.echo(f" \"{fw}\" {click.style('filtered', fg='yellow')} — single letter, ignored")
|
|
520
|
+
|
|
521
|
+
click.echo()
|
|
522
|
+
top = data["top_score"]
|
|
523
|
+
if top < 0.60:
|
|
524
|
+
quality = click.style("weak", fg="red")
|
|
525
|
+
elif top < 1.20:
|
|
526
|
+
quality = click.style("moderate", fg="yellow")
|
|
527
|
+
else:
|
|
528
|
+
quality = click.style("strong", fg="green")
|
|
529
|
+
click.echo(f" Top score: {top} ({quality} match)")
|
|
530
|
+
|
|
531
|
+
if data["filtered"]:
|
|
532
|
+
click.echo(click.style(" Tip:", dim=True) + " single-letter words are ignored. Use full terms.")
|
|
533
|
+
zero_match = [t for t in data["tokens"] if data["token_hits"].get(t, 0) == 0]
|
|
534
|
+
if zero_match:
|
|
535
|
+
click.echo(click.style(" Tip:", dim=True) + f" \"{zero_match[0]}\" doesn't exist — try a synonym or drop it.")
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
def _print_grouped(results: list[dict], group_depth: int, scope: str | None):
|
|
539
|
+
"""Print search results clustered by directory."""
|
|
540
|
+
groups_data = _group_results(results, group_depth)
|
|
541
|
+
groups = groups_data["groups"]
|
|
542
|
+
total = groups_data["total_results"]
|
|
543
|
+
files = groups_data["total_files"]
|
|
544
|
+
|
|
545
|
+
scope_label = f"in {scope}" if scope else "across project"
|
|
546
|
+
click.echo(f"{total} results in {len(groups)} directories ({files} files) {scope_label}:")
|
|
547
|
+
click.echo()
|
|
548
|
+
|
|
549
|
+
top_groups = groups[:5]
|
|
550
|
+
for g in top_groups:
|
|
551
|
+
# Directory header with match count
|
|
552
|
+
label = g["prefix"].rstrip(os.sep)
|
|
553
|
+
click.echo(f"{label}/ ({g['count']} files)")
|
|
554
|
+
|
|
555
|
+
for f in g["files"]:
|
|
556
|
+
line_info = f"{f['path']}:{f['line_start']}-{f['line_end']}"
|
|
557
|
+
score_info = f"{f['score']:.2f}"
|
|
558
|
+
click.echo(f" {click.style(line_info, fg='cyan')} "
|
|
559
|
+
f"[score: {score_info}]")
|
|
560
|
+
|
|
561
|
+
# Summary line from preview or metadata
|
|
562
|
+
if f.get("summary"):
|
|
563
|
+
click.echo(f" {f['summary']}")
|
|
564
|
+
elif f.get("preview"):
|
|
565
|
+
first_line = f["preview"].strip().splitlines()[0].strip() if f["preview"].strip() else ""
|
|
566
|
+
if first_line:
|
|
567
|
+
click.echo(f" {first_line[:100]}")
|
|
568
|
+
|
|
569
|
+
if g["has_more"]:
|
|
570
|
+
click.echo(f" ... and {g['remaining']} more files")
|
|
571
|
+
click.echo()
|
|
572
|
+
|
|
573
|
+
if len(groups) > 5:
|
|
574
|
+
click.echo(f"... and {len(groups) - 5} more directories")
|
|
575
|
+
|
|
576
|
+
# Scope hint
|
|
577
|
+
click.echo()
|
|
578
|
+
if scope:
|
|
579
|
+
click.echo(click.style("Already scoped. Remove --scope to broaden.", dim=True))
|
|
580
|
+
else:
|
|
581
|
+
click.echo(
|
|
582
|
+
click.style(
|
|
583
|
+
f"Use --scope <dir> to narrow results (e.g. --scope {top_groups[0]['prefix']})",
|
|
584
|
+
dim=True
|
|
585
|
+
)
|
|
586
|
+
)
|
|
587
|
+
|
|
588
|
+
|
|
309
589
|
def _inject_mcp_config(config_path: str, mcp_json: str) -> None:
|
|
310
590
|
"""Inject the CodeXLR8 MCP config into an existing client config file.
|
|
311
591
|
|
|
@@ -406,7 +686,51 @@ codebase_search(query="stripe charge customer refund")
|
|
|
406
686
|
codebase_search(query="shopping cart checkout payment")
|
|
407
687
|
```
|
|
408
688
|
|
|
409
|
-
|
|
689
|
+
### Query strategy
|
|
690
|
+
|
|
691
|
+
Describe what you're looking for in natural language. The engine uses OR semantics with a scoring layer — more terms increase precision through token-coverage ranking, not a hard AND requirement.
|
|
692
|
+
|
|
693
|
+
**Good queries use distinct, discriminating terms:**
|
|
694
|
+
|
|
695
|
+
| Task | Good query | Why |
|
|
696
|
+
|---|---|---|
|
|
697
|
+
| Fix login bug | `"login auth session token"` | Covers auth module, session, tokens — distinct terms, not synonyms |
|
|
698
|
+
| Payment refund | `"stripe refund charge customer"` | Each term narrows to a different aspect of the feature |
|
|
699
|
+
| 3D plot visibility | `"axes3d draw visible renderer"` | Domain term + method + symptom — different dimensions of the bug |
|
|
700
|
+
| Checkout flow | `"checkout cart payment order"` | Covers all stages of the flow |
|
|
701
|
+
|
|
702
|
+
**What to avoid:**
|
|
703
|
+
- Single-word queries (`"login"`) — too broad, returns everything mentioning login
|
|
704
|
+
- Synonyms (`"login authenticate signin"`) — redundant, wastes tokens without improving coverage
|
|
705
|
+
- Full sentences (`"I need to find where user login happens"`) — stop words like `"I"`, `"need"`, `"to"` are filtered out
|
|
706
|
+
|
|
707
|
+
### Using scope and grouping
|
|
708
|
+
|
|
709
|
+
When you know which directory the code lives in, scope the search:
|
|
710
|
+
|
|
711
|
+
```
|
|
712
|
+
codebase_search(query="get_visible", scope="lib/mpl_toolkits/")
|
|
713
|
+
```
|
|
714
|
+
|
|
715
|
+
When you don't know, run a shell command to see where results cluster:
|
|
716
|
+
|
|
717
|
+
```bash
|
|
718
|
+
codexlr8 search . "get_visible" --grouped
|
|
719
|
+
```
|
|
720
|
+
|
|
721
|
+
This prints directories ranked by their highest-scoring file, with a `--scope` hint to copy into your next MCP call.
|
|
722
|
+
|
|
723
|
+
### When results don't look right
|
|
724
|
+
|
|
725
|
+
Check the `matched` field on each result. If a file you expected isn't showing, the missing token tells you what to adjust. If all results only match 1 of 4 tokens, your terms are too scattered — try removing one.
|
|
726
|
+
|
|
727
|
+
For deeper diagnostics, run:
|
|
728
|
+
|
|
729
|
+
```bash
|
|
730
|
+
codexlr8 search . "your query" --explain
|
|
731
|
+
```
|
|
732
|
+
|
|
733
|
+
This shows per-token hit counts and flags zero-match terms so you can refine before calling `codebase_search` again.
|
|
410
734
|
|
|
411
735
|
## Interpreting results
|
|
412
736
|
|
|
@@ -418,9 +742,10 @@ Results include:
|
|
|
418
742
|
| `score` | Relevance (higher = better) |
|
|
419
743
|
| `summary` | Human-written description of the file's purpose |
|
|
420
744
|
| `tags` | Curated keywords (auth, payment, cart, etc.) |
|
|
745
|
+
| `matched` | Which query tokens the file matched — use this to debug failed searches |
|
|
421
746
|
| `preview` | First ~10 lines around the best match |
|
|
422
747
|
|
|
423
|
-
**Ranking:** Files with curated `.meta.yaml` (summary + tags) rank highest. Raw content matches rank
|
|
748
|
+
**Ranking:** Files with curated `.meta.yaml` (summary + tags) rank highest, followed by filename matches, then path directory matches. Raw content matches rank lowest. `__init__.py` re-exports are penalized.
|
|
424
749
|
|
|
425
750
|
## Maintaining the index
|
|
426
751
|
|
|
@@ -508,6 +833,11 @@ Exclude patterns are globs that match file paths. Use `*` for wildcards.
|
|
|
508
833
|
| Task | Tool call |
|
|
509
834
|
|---|---|
|
|
510
835
|
| Find code for a feature | `codebase_search(query="...")` |
|
|
836
|
+
| Search within a directory | `codebase_search(query="...", scope="src/")` |
|
|
837
|
+
| Cluster results by directory | Shell: `codexlr8 search . "query" --grouped` |
|
|
838
|
+
| Diagnose query terms | Shell: `codexlr8 search . "query" --explain` |
|
|
839
|
+
| Measure search accuracy | Shell: `codexlr8 eval . --queries q.json` |
|
|
840
|
+
| Fine-tune embeddings | Shell: `codexlr8 train .` (needs `[embeddings]` extra) |
|
|
511
841
|
| Build/update index | `codebase_index(incremental=true)` |
|
|
512
842
|
| Check metadata coverage | Shell: `codexlr8 status .` |
|
|
513
843
|
| Bootstrap missing sidecars | Shell: `codexlr8 init .` |
|
|
@@ -24,6 +24,12 @@ def load_config(project_path: str) -> dict:
|
|
|
24
24
|
def _defaults() -> dict:
|
|
25
25
|
return {
|
|
26
26
|
"root": ".",
|
|
27
|
+
"fuzzy": True,
|
|
28
|
+
"embeddings": {
|
|
29
|
+
"enabled": False,
|
|
30
|
+
"model": "all-MiniLM-L6-v2",
|
|
31
|
+
"bm25_weight": 0.6,
|
|
32
|
+
},
|
|
27
33
|
"include": [],
|
|
28
34
|
"exclude": [
|
|
29
35
|
"tests/*",
|
|
@@ -32,6 +38,10 @@ def _defaults() -> dict:
|
|
|
32
38
|
"__tests__/*",
|
|
33
39
|
"test_*",
|
|
34
40
|
"*_test.*",
|
|
41
|
+
"examples/*",
|
|
42
|
+
"docs/*",
|
|
43
|
+
"tutorials/*",
|
|
44
|
+
"benchmarks/*",
|
|
35
45
|
],
|
|
36
46
|
"extensions": [
|
|
37
47
|
".py", ".js", ".ts", ".jsx", ".tsx", ".go", ".rs", ".rb",
|