codexlr8 0.0.1__tar.gz → 0.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codexlr8-0.0.1 → codexlr8-0.0.2}/PKG-INFO +29 -4
- {codexlr8-0.0.1 → codexlr8-0.0.2}/README.md +28 -3
- {codexlr8-0.0.1 → codexlr8-0.0.2}/pyproject.toml +1 -1
- {codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8/__init__.py +1 -1
- {codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8/cli.py +200 -6
- {codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8/config.py +4 -0
- {codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8/mcp_server.py +10 -1
- {codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8/search.py +159 -27
- {codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8.egg-info/PKG-INFO +29 -4
- {codexlr8-0.0.1 → codexlr8-0.0.2}/tests/test_mcp_server.py +26 -0
- {codexlr8-0.0.1 → codexlr8-0.0.2}/tests/test_search.py +120 -5
- {codexlr8-0.0.1 → codexlr8-0.0.2}/LICENSE +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.2}/setup.cfg +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8/meta.py +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8/scanner.py +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8.egg-info/SOURCES.txt +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8.egg-info/dependency_links.txt +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8.egg-info/entry_points.txt +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8.egg-info/requires.txt +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8.egg-info/top_level.txt +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.2}/tests/test_meta.py +0 -0
- {codexlr8-0.0.1 → codexlr8-0.0.2}/tests/test_scanner.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codexlr8
|
|
3
|
-
Version: 0.0.1
|
|
3
|
+
Version: 0.0.2
|
|
4
4
|
Summary: A codebase search engine for LLM coding agents
|
|
5
5
|
Author-email: Sadig Akhund <sadigaxund@gmail.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -64,11 +64,36 @@ CodeXLR8 indexes your codebase into an SQLite FTS5 database alongside optional `
|
|
|
64
64
|
|
|
65
65
|
| Layer | Source | Boost |
|
|
66
66
|
|---|---|---|
|
|
67
|
-
| 1 | Raw file content
|
|
68
|
-
|
|
|
67
|
+
| 1 | Raw file content | 0.3× per token |
|
|
68
|
+
| 2a | File path (filename, directory) | 0.5× – 0.8× |
|
|
69
|
+
| 2b | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
|
|
69
70
|
| 3 | `.meta.yaml` `public_api` | 1.0× (strongest) |
|
|
70
71
|
|
|
71
|
-
Search uses
|
|
72
|
+
Search uses OR semantics with token-coverage scoring: more matching tokens = higher score. A ≥50% post-filter eliminates single-token noise for multi-word queries. Path weighting (Layer 2a) provides differentiation even without metadata — a file whose name IS the query token ranks above one that merely mentions it.
|
|
73
|
+
|
|
74
|
+
### Scoped search and clustering
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
# Narrow to a specific directory (like grep -rn "pattern" dir/)
|
|
78
|
+
codexlr8 search . "get_visible" --scope lib/mpl_toolkits/
|
|
79
|
+
|
|
80
|
+
# Cluster results by directory to see where matches concentrate
|
|
81
|
+
codexlr8 search . "get_visible" --grouped
|
|
82
|
+
# 12 results in 3 directories (8 files) across project:
|
|
83
|
+
# lib/mpl_toolkits/mplot3d/ (5 files)
|
|
84
|
+
# ─ axes3d.py:388 [score: 0.90]
|
|
85
|
+
# ...
|
|
86
|
+
|
|
87
|
+
# Diagnose your query — see which terms hit, which don't
|
|
88
|
+
codexlr8 search . "axes not hiding" --explain
|
|
89
|
+
# Query analysis:
|
|
90
|
+
# "axes" 212 matches — broad term (212/212 results)
|
|
91
|
+
# "not" 77 matches
|
|
92
|
+
# "hiding" 0 matches — consider dropping or replacing
|
|
93
|
+
# Top score: 1.20 (strong match)
|
|
94
|
+
|
|
95
|
+
# Combine both — group, then scope to drill down
|
|
96
|
+
```
|
|
72
97
|
|
|
73
98
|
## .meta.yaml Sidecars
|
|
74
99
|
|
|
@@ -35,11 +35,36 @@ CodeXLR8 indexes your codebase into an SQLite FTS5 database alongside optional `
|
|
|
35
35
|
|
|
36
36
|
| Layer | Source | Boost |
|
|
37
37
|
|---|---|---|
|
|
38
|
-
| 1 | Raw file content
|
|
39
|
-
|
|
|
38
|
+
| 1 | Raw file content | 0.3× per token |
|
|
39
|
+
| 2a | File path (filename, directory) | 0.5× – 0.8× |
|
|
40
|
+
| 2b | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
|
|
40
41
|
| 3 | `.meta.yaml` `public_api` | 1.0× (strongest) |
|
|
41
42
|
|
|
42
|
-
Search uses
|
|
43
|
+
Search uses OR semantics with token-coverage scoring: more matching tokens = higher score. A ≥50% post-filter eliminates single-token noise for multi-word queries. Path weighting (Layer 2a) provides differentiation even without metadata — a file whose name IS the query token ranks above one that merely mentions it.
|
|
44
|
+
|
|
45
|
+
### Scoped search and clustering
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# Narrow to a specific directory (like grep -rn "pattern" dir/)
|
|
49
|
+
codexlr8 search . "get_visible" --scope lib/mpl_toolkits/
|
|
50
|
+
|
|
51
|
+
# Cluster results by directory to see where matches concentrate
|
|
52
|
+
codexlr8 search . "get_visible" --grouped
|
|
53
|
+
# 12 results in 3 directories (8 files) across project:
|
|
54
|
+
# lib/mpl_toolkits/mplot3d/ (5 files)
|
|
55
|
+
# ─ axes3d.py:388 [score: 0.90]
|
|
56
|
+
# ...
|
|
57
|
+
|
|
58
|
+
# Diagnose your query — see which terms hit, which don't
|
|
59
|
+
codexlr8 search . "axes not hiding" --explain
|
|
60
|
+
# Query analysis:
|
|
61
|
+
# "axes" 212 matches — broad term (212/212 results)
|
|
62
|
+
# "not" 77 matches
|
|
63
|
+
# "hiding" 0 matches — consider dropping or replacing
|
|
64
|
+
# Top score: 1.20 (strong match)
|
|
65
|
+
|
|
66
|
+
# Combine both — group, then scope to drill down
|
|
67
|
+
```
|
|
43
68
|
|
|
44
69
|
## .meta.yaml Sidecars
|
|
45
70
|
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
"""CodeXLR8 CLI — search-first codebase navigation for agents."""
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
|
+
import os
|
|
4
5
|
import click
|
|
5
6
|
|
|
6
7
|
from .config import load_config
|
|
7
8
|
from .scanner import scan_project
|
|
8
9
|
from .meta import generate_missing_sidecars
|
|
9
|
-
from .search import SearchEngine
|
|
10
|
+
from .search import SearchEngine, _group_results, _explain_query, _tokenize
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
EXCLUDE_HELP = (
|
|
@@ -62,10 +63,19 @@ def scan(project_path: str, output: str | None):
|
|
|
62
63
|
@click.argument("query")
|
|
63
64
|
@click.option("--exclude", "-x", "exclude_patterns", multiple=True,
|
|
64
65
|
callback=_parse_excludes, help=EXCLUDE_HELP)
|
|
66
|
+
@click.option("--scope", "-s", default=None,
|
|
67
|
+
help="Restrict search to files under a path prefix (e.g. src/ or lib/mpl_toolkits/)")
|
|
68
|
+
@click.option("--grouped", "-g", is_flag=True, default=False,
|
|
69
|
+
help="Cluster results by directory before listing files")
|
|
70
|
+
@click.option("--explain", "-e", is_flag=True, default=False,
|
|
71
|
+
help="Show token breakdown and query diagnostics")
|
|
72
|
+
@click.option("--group-depth", default=3,
|
|
73
|
+
help="Max directory depth for grouping (default: 3)")
|
|
65
74
|
@click.option("--format", "-f", "output_format",
|
|
66
75
|
type=click.Choice(["text", "json"]), default="text")
|
|
67
76
|
@click.option("--limit", "-n", default=10, help="Maximum number of results")
|
|
68
77
|
def search(project_path: str, query: str, exclude_patterns: list[str],
|
|
78
|
+
scope: str | None, grouped: bool, explain: bool, group_depth: int,
|
|
69
79
|
output_format: str, limit: int):
|
|
70
80
|
"""Search the codebase for code matching QUERY.
|
|
71
81
|
|
|
@@ -74,19 +84,52 @@ def search(project_path: str, query: str, exclude_patterns: list[str],
|
|
|
74
84
|
\b
|
|
75
85
|
Examples:
|
|
76
86
|
codexlr8 search . "login auth"
|
|
87
|
+
codexlr8 search . "login auth" --grouped
|
|
88
|
+
codexlr8 search . "login auth" --explain
|
|
77
89
|
codexlr8 search . "login auth" --exclude "tests/*"
|
|
78
90
|
codexlr8 search . "login auth" -x "tests/*" -x "vendor/*"
|
|
91
|
+
codexlr8 search . "get_visible" --scope lib/mpl_toolkits/
|
|
79
92
|
"""
|
|
80
93
|
engine = SearchEngine(project_path)
|
|
81
|
-
results = engine.search(query, limit=limit, exclude=exclude_patterns)
|
|
94
|
+
results = engine.search(query, limit=limit, exclude=exclude_patterns, scope=scope)
|
|
82
95
|
|
|
83
96
|
if output_format == "json":
|
|
84
97
|
import json
|
|
85
|
-
|
|
98
|
+
output = {"results": results}
|
|
99
|
+
if explain:
|
|
100
|
+
output["explain"] = _explain_query(query, _tokenize(query), results)
|
|
101
|
+
if grouped:
|
|
102
|
+
groups_data = _group_results(results, group_depth)
|
|
103
|
+
output["grouped"] = True
|
|
104
|
+
output["groups"] = groups_data["groups"]
|
|
105
|
+
output["summary"] = {
|
|
106
|
+
"total_results": groups_data["total_results"],
|
|
107
|
+
"total_files": groups_data["total_files"],
|
|
108
|
+
"total_groups": len(groups_data["groups"]),
|
|
109
|
+
}
|
|
110
|
+
click.echo(json.dumps(output, indent=2))
|
|
86
111
|
return
|
|
87
112
|
|
|
88
113
|
if not results:
|
|
89
114
|
click.echo("No results found.")
|
|
115
|
+
if explain:
|
|
116
|
+
tokens = _tokenize(query)
|
|
117
|
+
click.echo()
|
|
118
|
+
click.echo("Query analysis:")
|
|
119
|
+
for t in tokens:
|
|
120
|
+
click.echo(f" \"{t}\" \u2717 no matches")
|
|
121
|
+
click.echo()
|
|
122
|
+
click.echo("0 tokens matched. All terms are absent from the codebase.")
|
|
123
|
+
return
|
|
124
|
+
|
|
125
|
+
if explain:
|
|
126
|
+
tokens = _tokenize(query)
|
|
127
|
+
explain_data = _explain_query(query, tokens, results)
|
|
128
|
+
_print_explain(explain_data)
|
|
129
|
+
click.echo()
|
|
130
|
+
|
|
131
|
+
if grouped:
|
|
132
|
+
_print_grouped(results, group_depth, scope)
|
|
90
133
|
return
|
|
91
134
|
|
|
92
135
|
for i, r in enumerate(results, 1):
|
|
@@ -96,6 +139,8 @@ def search(project_path: str, query: str, exclude_patterns: list[str],
|
|
|
96
139
|
click.echo(f" meta: {r['summary']}")
|
|
97
140
|
if r.get("tags"):
|
|
98
141
|
click.echo(f" tags: {', '.join(r['tags'])}")
|
|
142
|
+
if r.get("matched_tokens"):
|
|
143
|
+
click.echo(f" matched: {', '.join(r['matched_tokens'])}")
|
|
99
144
|
if r.get("preview"):
|
|
100
145
|
click.echo(" preview: |")
|
|
101
146
|
for line in r["preview"].strip().splitlines()[:6]:
|
|
@@ -151,6 +196,10 @@ def status(project_path: str):
|
|
|
151
196
|
click.echo(f"Files without .meta.yaml: {state['files_without_meta']}")
|
|
152
197
|
click.echo(f"Total lines indexed: {state['total_lines']}")
|
|
153
198
|
click.echo(f"Index age: {state.get('index_age', 'N/A')}")
|
|
199
|
+
click.echo(f"Coverage: {state.get('coverage_pct', 0)}%")
|
|
200
|
+
if state.get("warning"):
|
|
201
|
+
click.echo()
|
|
202
|
+
click.secho(f" Warning: {state['warning']}", fg="yellow")
|
|
154
203
|
|
|
155
204
|
|
|
156
205
|
@main.command()
|
|
@@ -240,7 +289,8 @@ def setup(project_path: str):
|
|
|
240
289
|
include = [p.strip() for p in custom_include.split(",") if p.strip()]
|
|
241
290
|
click.echo()
|
|
242
291
|
|
|
243
|
-
defaults = ["tests/*", "test/*", "spec/*", "__tests__/*", "test_*", "*_test.*"
|
|
292
|
+
defaults = ["tests/*", "test/*", "spec/*", "__tests__/*", "test_*", "*_test.*",
|
|
293
|
+
"examples/*", "docs/*", "tutorials/*", "benchmarks/*"]
|
|
244
294
|
custom_exclude = click.prompt(
|
|
245
295
|
click.style(" Exclude (comma-separated)", fg="bright_white"),
|
|
246
296
|
default=", ".join(defaults),
|
|
@@ -306,6 +356,102 @@ def setup(project_path: str):
|
|
|
306
356
|
click.secho(" Run 'codexlr8 index .' to build your first search index.", dim=True)
|
|
307
357
|
|
|
308
358
|
|
|
359
|
+
def _print_explain(data: dict):
    """Print the query diagnostic breakdown produced by ``_explain_query``.

    Args:
        data: Mapping with the keys read below: ``query``, ``tokens``,
            ``token_hits``, ``filtered``, ``total_results`` and ``top_score``.

    Output goes through ``click``; nothing is returned.
    """
    click.secho("Query analysis:", fg="cyan", bold=True)
    click.echo(f" Original: \"{data['query']}\"")
    click.echo(f" Tokens: {', '.join(data['tokens'])}")
    click.echo()

    # Loop-invariant: the result count is the same for every token.
    total = data["total_results"]
    token_hits = data["token_hits"]

    for token in data["tokens"]:
        hits = token_hits.get(token, 0)
        if hits == 0:
            status = click.style(f"{hits} matches", fg="red")
            hint = " — consider dropping or replacing"
        elif hits <= 3:
            status = click.style(f"{hits} matches", fg="yellow")
            hint = " — very specific"
        elif hits <= total * 0.1:
            status = click.style(f"{hits} matches", fg="green")
            hint = ""
        else:
            status = click.style(f"{hits} matches", fg="yellow")
            hint = f" — broad term ({hits}/{total} results)"

        click.echo(f" \"{token}\" {status}{hint}")

    for fw in data["filtered"]:
        click.echo(f" \"{fw}\" {click.style('filtered', fg='yellow')} — single letter, ignored")

    click.echo()
    top = data["top_score"]
    if top < 0.60:
        quality = click.style("weak", fg="red")
    elif top < 1.20:
        quality = click.style("moderate", fg="yellow")
    else:
        quality = click.style("strong", fg="green")
    # Fixed two-decimal formatting so the score reads "1.20" rather than
    # "1.2", matching the per-file scores printed by the grouped view.
    click.echo(f" Top score: {top:.2f} ({quality} match)")

    if data["filtered"]:
        click.echo(click.style(" Tip:", dim=True) + " single-letter words are ignored. Use full terms.")
    zero_match = [t for t in data["tokens"] if token_hits.get(t, 0) == 0]
    if zero_match:
        # Only the first zero-hit token is called out to keep the tip short.
        click.echo(click.style(" Tip:", dim=True) + f" \"{zero_match[0]}\" doesn't exist — try a synonym or drop it.")
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
def _print_grouped(results: list[dict], group_depth: int, scope: str | None):
    """Print search results clustered by directory.

    Args:
        results: Flat result dicts; each must carry ``path``, ``score``,
            ``line_start`` and ``line_end`` (``summary`` / ``preview`` optional).
        group_depth: Maximum directory depth passed to ``_group_results``.
        scope: The ``--scope`` prefix the search was restricted to, if any.

    At most five directory groups are listed, followed by a hint on how to
    narrow or broaden the search. Output goes through ``click``.
    """
    groups_data = _group_results(results, group_depth)
    groups = groups_data["groups"]
    total = groups_data["total_results"]
    files = groups_data["total_files"]

    scope_label = f"in {scope}" if scope else "across project"
    click.echo(f"{total} results in {len(groups)} directories ({files} files) {scope_label}:")
    click.echo()

    top_groups = groups[:5]
    for g in top_groups:
        # Directory header with match count
        label = g["prefix"].rstrip(os.sep)
        click.echo(f"{label}/ ({g['count']} files)")

        for entry in g["files"]:
            line_info = f"{entry['path']}:{entry['line_start']}-{entry['line_end']}"
            score_info = f"{entry['score']:.2f}"
            click.echo(f" {click.style(line_info, fg='cyan')} "
                       f"[score: {score_info}]")

            # Summary line from metadata, falling back to the preview's first line
            if entry.get("summary"):
                click.echo(f" {entry['summary']}")
            elif entry.get("preview"):
                stripped = entry["preview"].strip()
                first_line = stripped.splitlines()[0].strip() if stripped else ""
                if first_line:
                    click.echo(f" {first_line[:100]}")

        if g["has_more"]:
            click.echo(f" ... and {g['remaining']} more files")
        click.echo()

    if len(groups) > 5:
        click.echo(f"... and {len(groups) - 5} more directories")

    # Scope hint
    click.echo()
    if scope:
        click.echo(click.style("Already scoped. Remove --scope to broaden.", dim=True))
        return

    # Suggest the best-ranked real directory. The "." bucket holds files at
    # the project root and is not a useful --scope value, and with no groups
    # at all there is nothing to suggest (previously an IndexError).
    suggestion = next((g["prefix"] for g in groups if g["prefix"] != "."), None)
    if suggestion is not None:
        click.echo(
            click.style(
                f"Use --scope <dir> to narrow results (e.g. --scope {suggestion})",
                dim=True
            )
        )
|
|
453
|
+
|
|
454
|
+
|
|
309
455
|
def _inject_mcp_config(config_path: str, mcp_json: str) -> None:
|
|
310
456
|
"""Inject the CodeXLR8 MCP config into an existing client config file.
|
|
311
457
|
|
|
@@ -406,7 +552,51 @@ codebase_search(query="stripe charge customer refund")
|
|
|
406
552
|
codebase_search(query="shopping cart checkout payment")
|
|
407
553
|
```
|
|
408
554
|
|
|
409
|
-
|
|
555
|
+
### Query strategy
|
|
556
|
+
|
|
557
|
+
Describe what you're looking for in natural language. The engine uses OR semantics with a scoring layer — more terms increase precision through token-coverage ranking, not a hard AND requirement.
|
|
558
|
+
|
|
559
|
+
**Good queries use distinct, discriminating terms:**
|
|
560
|
+
|
|
561
|
+
| Task | Good query | Why |
|
|
562
|
+
|---|---|---|
|
|
563
|
+
| Fix login bug | `"login auth session token"` | Covers auth module, session, tokens — distinct terms, not synonyms |
|
|
564
|
+
| Payment refund | `"stripe refund charge customer"` | Each term narrows to a different aspect of the feature |
|
|
565
|
+
| 3D plot visibility | `"axes3d draw visible renderer"` | Domain term + method + symptom — different dimensions of the bug |
|
|
566
|
+
| Checkout flow | `"checkout cart payment order"` | Covers all stages of the flow |
|
|
567
|
+
|
|
568
|
+
**What to avoid:**
|
|
569
|
+
- Single-word queries (`"login"`) — too broad, returns everything mentioning login
|
|
570
|
+
- Synonyms (`"login authenticate signin"`) — redundant, wastes tokens without improving coverage
|
|
571
|
+
- Full sentences (`"I need to find where user login happens"`) — stop words like `"I"`, `"need"`, `"to"` are filtered out
|
|
572
|
+
|
|
573
|
+
### Using scope and grouping
|
|
574
|
+
|
|
575
|
+
When you know which directory the code lives in, scope the search:
|
|
576
|
+
|
|
577
|
+
```
|
|
578
|
+
codebase_search(query="get_visible", scope="lib/mpl_toolkits/")
|
|
579
|
+
```
|
|
580
|
+
|
|
581
|
+
When you don't know, run a shell command to see where results cluster:
|
|
582
|
+
|
|
583
|
+
```bash
|
|
584
|
+
codexlr8 search . "get_visible" --grouped
|
|
585
|
+
```
|
|
586
|
+
|
|
587
|
+
This prints directories ranked by their highest-scoring file, with a `--scope` hint to copy into your next MCP call.
|
|
588
|
+
|
|
589
|
+
### When results don't look right
|
|
590
|
+
|
|
591
|
+
Check the `matched` field on each result. If a file you expected isn't showing, the missing token tells you what to adjust. If all results only match 1 of 4 tokens, your terms are too scattered — try removing one.
|
|
592
|
+
|
|
593
|
+
For deeper diagnostics, run:
|
|
594
|
+
|
|
595
|
+
```bash
|
|
596
|
+
codexlr8 search . "your query" --explain
|
|
597
|
+
```
|
|
598
|
+
|
|
599
|
+
This shows per-token hit counts and flags zero-match terms so you can refine before calling `codebase_search` again.
|
|
410
600
|
|
|
411
601
|
## Interpreting results
|
|
412
602
|
|
|
@@ -418,9 +608,10 @@ Results include:
|
|
|
418
608
|
| `score` | Relevance (higher = better) |
|
|
419
609
|
| `summary` | Human-written description of the file's purpose |
|
|
420
610
|
| `tags` | Curated keywords (auth, payment, cart, etc.) |
|
|
611
|
+
| `matched` | Which query tokens the file matched — use this to debug failed searches |
|
|
421
612
|
| `preview` | First ~10 lines around the best match |
|
|
422
613
|
|
|
423
|
-
**Ranking:** Files with curated `.meta.yaml` (summary + tags) rank highest. Raw content matches rank
|
|
614
|
+
**Ranking:** Files with curated `.meta.yaml` (summary + tags) rank highest, followed by filename matches, then path directory matches. Raw content matches rank lowest. `__init__.py` re-exports are penalized.
|
|
424
615
|
|
|
425
616
|
## Maintaining the index
|
|
426
617
|
|
|
@@ -508,6 +699,9 @@ Exclude patterns are globs that match file paths. Use `*` for wildcards.
|
|
|
508
699
|
| Task | Tool call |
|
|
509
700
|
|---|---|
|
|
510
701
|
| Find code for a feature | `codebase_search(query="...")` |
|
|
702
|
+
| Search within a directory | `codebase_search(query="...", scope="src/")` |
|
|
703
|
+
| Cluster results by directory | Shell: `codexlr8 search . "query" --grouped` |
|
|
704
|
+
| Diagnose query terms | Shell: `codexlr8 search . "query" --explain` |
|
|
511
705
|
| Build/update index | `codebase_index(incremental=true)` |
|
|
512
706
|
| Check metadata coverage | Shell: `codexlr8 status .` |
|
|
513
707
|
| Bootstrap missing sidecars | Shell: `codexlr8 init .` |
|
|
@@ -61,6 +61,12 @@ async def list_tools() -> list[Tool]:
|
|
|
61
61
|
"description": "Glob patterns for files to exclude. "
|
|
62
62
|
"Uses .codexlr8.yaml defaults if not set.",
|
|
63
63
|
},
|
|
64
|
+
"scope": {
|
|
65
|
+
"type": "string",
|
|
66
|
+
"description": "Restrict search to files under a path prefix "
|
|
67
|
+
"(e.g. 'src/' or 'lib/mpl_toolkits/'). "
|
|
68
|
+
"Acts as grep -rn's directory filter.",
|
|
69
|
+
},
|
|
64
70
|
},
|
|
65
71
|
"required": ["query"],
|
|
66
72
|
},
|
|
@@ -111,9 +117,10 @@ async def _handle_search(args: dict) -> list[TextContent]:
|
|
|
111
117
|
query = args["query"]
|
|
112
118
|
limit = args.get("limit", 10)
|
|
113
119
|
exclude = args.get("exclude")
|
|
120
|
+
scope = args.get("scope")
|
|
114
121
|
|
|
115
122
|
engine = SearchEngine(project_path)
|
|
116
|
-
results = engine.search(query, limit=limit, exclude=exclude)
|
|
123
|
+
results = engine.search(query, limit=limit, exclude=exclude, scope=scope)
|
|
117
124
|
|
|
118
125
|
if not results:
|
|
119
126
|
return [TextContent(type="text", text="No results found.")]
|
|
@@ -128,6 +135,8 @@ async def _handle_search(args: dict) -> list[TextContent]:
|
|
|
128
135
|
lines.append(f" summary: {r['summary']}")
|
|
129
136
|
if r.get("tags"):
|
|
130
137
|
lines.append(f" tags: {', '.join(r['tags'])}")
|
|
138
|
+
if r.get("matched_tokens"):
|
|
139
|
+
lines.append(f" matched: {', '.join(r['matched_tokens'])}")
|
|
131
140
|
if r.get("preview"):
|
|
132
141
|
lines.append(" preview: |")
|
|
133
142
|
for pline in r["preview"].strip().splitlines()[:6]:
|
|
@@ -27,6 +27,54 @@ def _tokenize(text: str) -> list[str]:
|
|
|
27
27
|
return [t for t in tokens if len(t) > 1 or t.isdigit()] # skip single letters
|
|
28
28
|
|
|
29
29
|
|
|
30
|
+
def _explain_query(query: str, tokens: list[str], results: list[dict]) -> dict:
|
|
31
|
+
"""Generate query diagnostic breakdown for --explain.
|
|
32
|
+
|
|
33
|
+
Returns per-token hit counts, filtered words, top score — gives
|
|
34
|
+
the agent the data it needs to course-correct a search query.
|
|
35
|
+
"""
|
|
36
|
+
# Detect words in original query that were filtered by the tokenizer
|
|
37
|
+
raw_lower = query.lower()
|
|
38
|
+
raw_words = re.findall(r"[a-zA-Z_][a-zA-Z0-9_]*|\d+", raw_lower)
|
|
39
|
+
filtered = [w for w in raw_words if w not in tokens and len(w) == 1]
|
|
40
|
+
|
|
41
|
+
# Per-token hit counts across all results
|
|
42
|
+
token_hits: dict[str, int] = {}
|
|
43
|
+
for token in tokens:
|
|
44
|
+
count = 0
|
|
45
|
+
for r in results:
|
|
46
|
+
text = (
|
|
47
|
+
(r.get("summary") or "") + " " +
|
|
48
|
+
" ".join(r.get("tags", [])) + " " +
|
|
49
|
+
r.get("path", "")
|
|
50
|
+
).lower()
|
|
51
|
+
if token in text:
|
|
52
|
+
count += 1
|
|
53
|
+
token_hits[token] = count
|
|
54
|
+
|
|
55
|
+
top_score = max((r["score"] for r in results), default=0.0)
|
|
56
|
+
|
|
57
|
+
return {
|
|
58
|
+
"query": query,
|
|
59
|
+
"tokens": tokens,
|
|
60
|
+
"token_hits": token_hits,
|
|
61
|
+
"filtered": filtered,
|
|
62
|
+
"total_results": len(results),
|
|
63
|
+
"top_score": round(top_score, 2),
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _token_match_info(tokens: list[str], content: str, row) -> tuple[list[str], float]:
|
|
68
|
+
"""Return which query tokens matched and the match ratio."""
|
|
69
|
+
if not tokens:
|
|
70
|
+
return [], 0.0
|
|
71
|
+
summary = (row["summary"] or "") if row["summary"] else ""
|
|
72
|
+
tags = (row["tags"] or "") if row["tags"] else ""
|
|
73
|
+
text_lower = (content + " " + summary + " " + tags).lower()
|
|
74
|
+
matched = [t for t in tokens if t in text_lower]
|
|
75
|
+
return matched, len(matched) / len(tokens)
|
|
76
|
+
|
|
77
|
+
|
|
30
78
|
def _token_match_ratio(tokens: list[str], text: str) -> float:
|
|
31
79
|
"""What fraction of query tokens appear in the document text?"""
|
|
32
80
|
if not tokens:
|
|
@@ -47,6 +95,53 @@ def _matches_exclude(path: str, excludes: list[str]) -> bool:
|
|
|
47
95
|
return False
|
|
48
96
|
|
|
49
97
|
|
|
98
|
+
def _group_results(results: list[dict], group_depth: int = 3) -> dict:
|
|
99
|
+
"""Group flat search results by directory prefix for cluster display.
|
|
100
|
+
|
|
101
|
+
Returns a dict with 'groups', 'total_files', 'total_results'.
|
|
102
|
+
Each group has: prefix, count, max_score, files (top 3 per group).
|
|
103
|
+
"""
|
|
104
|
+
if not results:
|
|
105
|
+
return {"groups": [], "total_files": 0, "total_results": 0}
|
|
106
|
+
|
|
107
|
+
groups: dict[str, list[dict]] = {}
|
|
108
|
+
seen_paths: set[str] = set()
|
|
109
|
+
|
|
110
|
+
for r in results:
|
|
111
|
+
path = r["path"]
|
|
112
|
+
dir_parts = path.split(os.sep)[:-1] # exclude filename
|
|
113
|
+
if not dir_parts:
|
|
114
|
+
prefix = "."
|
|
115
|
+
else:
|
|
116
|
+
prefix = os.sep.join(dir_parts[:group_depth]) + os.sep
|
|
117
|
+
|
|
118
|
+
if prefix not in groups:
|
|
119
|
+
groups[prefix] = []
|
|
120
|
+
groups[prefix].append(r)
|
|
121
|
+
seen_paths.add(path)
|
|
122
|
+
|
|
123
|
+
group_list = []
|
|
124
|
+
for prefix, files in groups.items():
|
|
125
|
+
# Keep files sorted by score within group
|
|
126
|
+
files.sort(key=lambda f: f["score"], reverse=True)
|
|
127
|
+
group_list.append({
|
|
128
|
+
"prefix": prefix,
|
|
129
|
+
"count": len(files),
|
|
130
|
+
"max_score": files[0]["score"],
|
|
131
|
+
"files": files[:3], # top 3 per group for display
|
|
132
|
+
"has_more": len(files) > 3,
|
|
133
|
+
"remaining": len(files) - 3 if len(files) > 3 else 0,
|
|
134
|
+
})
|
|
135
|
+
|
|
136
|
+
group_list.sort(key=lambda g: g["max_score"], reverse=True)
|
|
137
|
+
|
|
138
|
+
return {
|
|
139
|
+
"groups": group_list,
|
|
140
|
+
"total_files": len(seen_paths),
|
|
141
|
+
"total_results": len(results),
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
|
|
50
145
|
class SearchEngine:
|
|
51
146
|
"""SQLite FTS5-backed search engine for a codebase."""
|
|
52
147
|
|
|
@@ -212,12 +307,18 @@ class SearchEngine:
|
|
|
212
307
|
)
|
|
213
308
|
|
|
214
309
|
def search(self, query: str, limit: int = 10,
|
|
215
|
-
exclude: list[str] | None = None
|
|
310
|
+
exclude: list[str] | None = None,
|
|
311
|
+
scope: str | None = None) -> list[dict]:
|
|
216
312
|
"""Search the codebase and return ranked results.
|
|
217
313
|
|
|
218
|
-
Uses
|
|
219
|
-
|
|
220
|
-
|
|
314
|
+
Uses OR semantics: any token can match. The custom scoring layer
|
|
315
|
+
(path weighting, metadata boosts, match ratio) naturally surfaces
|
|
316
|
+
files that match more tokens. A post-filter requires >=50% of query
|
|
317
|
+
tokens to match for multi-token queries.
|
|
318
|
+
|
|
319
|
+
This replaces the previous AND-then-OR fallback, which caused precise
|
|
320
|
+
multi-token queries to return zero results (AND too strict) or too
|
|
321
|
+
many flatly-scored results (OR fallback with no differentiation).
|
|
221
322
|
"""
|
|
222
323
|
if not os.path.exists(self.db_path):
|
|
223
324
|
return []
|
|
@@ -231,44 +332,42 @@ class SearchEngine:
|
|
|
231
332
|
|
|
232
333
|
conn = self._get_connection()
|
|
233
334
|
|
|
234
|
-
#
|
|
235
|
-
|
|
335
|
+
# Build scope clause for path-prefix filtering
|
|
336
|
+
scope_clause = ""
|
|
337
|
+
scope_params: list[str] = []
|
|
338
|
+
if scope:
|
|
339
|
+
scope_norm = scope.rstrip("/")
|
|
340
|
+
scope_clause = "AND f.path LIKE ?"
|
|
341
|
+
scope_params = [scope_norm + "/%"]
|
|
342
|
+
|
|
343
|
+
# Always use OR semantics. Multi-token matches naturally rank higher
|
|
344
|
+
# via _compute_score (match_ratio scales with token coverage).
|
|
345
|
+
or_query = " OR ".join(tokens)
|
|
346
|
+
# Fetch more than needed — scoring will filter to top limit
|
|
347
|
+
fetch_limit = max(limit * 20, 200)
|
|
236
348
|
cursor = conn.execute(
|
|
237
349
|
"SELECT f.path, f.summary, f.tags, f.public_api, f.content, "
|
|
238
350
|
" m.is_init, rank "
|
|
239
351
|
"FROM files f "
|
|
240
352
|
"JOIN file_meta m ON f.path = m.path "
|
|
241
353
|
"WHERE files MATCH ? "
|
|
354
|
+
+ scope_clause + " "
|
|
242
355
|
"ORDER BY rank "
|
|
243
356
|
"LIMIT ?",
|
|
244
|
-
|
|
357
|
+
[or_query] + scope_params + [fetch_limit],
|
|
245
358
|
)
|
|
246
359
|
rows = cursor.fetchall()
|
|
247
360
|
|
|
248
|
-
#
|
|
249
|
-
|
|
250
|
-
or_query = " OR ".join(tokens)
|
|
251
|
-
cursor = conn.execute(
|
|
252
|
-
"SELECT f.path, f.summary, f.tags, f.public_api, f.content, "
|
|
253
|
-
" m.is_init, rank "
|
|
254
|
-
"FROM files f "
|
|
255
|
-
"JOIN file_meta m ON f.path = m.path "
|
|
256
|
-
"WHERE files MATCH ? "
|
|
257
|
-
"ORDER BY rank "
|
|
258
|
-
"LIMIT ?",
|
|
259
|
-
(or_query, limit * 10),
|
|
260
|
-
)
|
|
261
|
-
rows = cursor.fetchall()
|
|
262
|
-
|
|
263
|
-
# Stage 3: post-filter by token coverage
|
|
264
|
-
min_ratio = 0.5 if len(tokens) >= 4 else 0.0
|
|
361
|
+
# Post-filter: for multi-token queries, require >=50% token match
|
|
362
|
+
min_ratio = 0.5 if len(tokens) >= 2 else 0.0
|
|
265
363
|
results = []
|
|
266
364
|
for row in rows:
|
|
267
365
|
if _matches_exclude(row["path"], exclude):
|
|
268
366
|
continue
|
|
269
367
|
|
|
270
368
|
content = row["content"] or ""
|
|
271
|
-
|
|
369
|
+
# Compute which tokens matched and the ratio
|
|
370
|
+
matched, ratio = _token_match_info(tokens, content, row)
|
|
272
371
|
if ratio < min_ratio:
|
|
273
372
|
continue
|
|
274
373
|
|
|
@@ -281,6 +380,7 @@ class SearchEngine:
|
|
|
281
380
|
"tags": (row["tags"] or "").split(),
|
|
282
381
|
"public_api": row["public_api"] or "",
|
|
283
382
|
"score": score,
|
|
383
|
+
"matched_tokens": matched,
|
|
284
384
|
})
|
|
285
385
|
|
|
286
386
|
conn.close()
|
|
@@ -305,17 +405,27 @@ class SearchEngine:
|
|
|
305
405
|
"""Compute relevance score.
|
|
306
406
|
|
|
307
407
|
Core ranking: BM25 from FTS5 (via 'rank') provides the base score.
|
|
308
|
-
On top of that:
|
|
408
|
+
On top of that, a weighted token-count:
|
|
309
409
|
- Metadata boost: public_api (1.0) > tags (0.8) > summary (0.6)
|
|
410
|
+
- Path boost: exact filename (0.8), filename component (0.7), dir (0.5)
|
|
411
|
+
- Content match: 0.3 (base weight, only if nothing above matched)
|
|
310
412
|
- Match ratio: fraction of query tokens found in the document
|
|
311
413
|
- init.py penalty: 0.6x (applied in search())
|
|
312
414
|
"""
|
|
313
415
|
score = 0.0
|
|
314
416
|
|
|
417
|
+
path = row.get("path", "")
|
|
315
418
|
public_api = (row.get("public_api") or "").lower()
|
|
316
419
|
summary = (row.get("summary") or "").lower()
|
|
317
420
|
tags = (row.get("tags") or "").lower()
|
|
318
421
|
|
|
422
|
+
filename_lower = os.path.splitext(os.path.basename(path))[0].lower()
|
|
423
|
+
filename_parts = set(re.split(r'[_\-.]+', filename_lower))
|
|
424
|
+
dir_path = os.path.dirname(path).lower()
|
|
425
|
+
dir_tokens = set(_tokenize(dir_path.replace(os.sep, " ").replace("_", " ").replace("-", " ")))
|
|
426
|
+
# Also add dir path segments directly (e.g., "mplot3d" from "mplot3d/axes3d.py")
|
|
427
|
+
dir_tokens.update(re.split(r'[_\-.]+', dir_path.replace(os.sep, " ")))
|
|
428
|
+
|
|
319
429
|
api_tokens = set(_tokenize(public_api))
|
|
320
430
|
tag_tokens = set(tags.split())
|
|
321
431
|
summary_tokens = set(_tokenize(summary))
|
|
@@ -325,10 +435,19 @@ class SearchEngine:
|
|
|
325
435
|
score += 1.0
|
|
326
436
|
elif token in tag_tokens:
|
|
327
437
|
score += 0.8
|
|
438
|
+
elif token == filename_lower:
|
|
439
|
+
# Exact filename match: token IS the filename (axes3d.py for "axes3d")
|
|
440
|
+
score += 0.8
|
|
441
|
+
elif token in filename_parts:
|
|
442
|
+
# Token appears as a component in the filename (e.g. "axes3d" in "rotate_axes3d_sgskip.py")
|
|
443
|
+
score += 0.7
|
|
328
444
|
elif token in summary_tokens:
|
|
329
445
|
score += 0.6
|
|
446
|
+
elif token in dir_tokens:
|
|
447
|
+
# Token appears in a directory name (e.g., "mplot3d" in path mplot3d/axes3d.py)
|
|
448
|
+
score += 0.5
|
|
330
449
|
else:
|
|
331
|
-
# Content match via
|
|
450
|
+
# Content match via FTS5 — base weight
|
|
332
451
|
score += 0.3
|
|
333
452
|
|
|
334
453
|
# Multiply by match ratio: files matching more query terms rank higher
|
|
@@ -373,6 +492,8 @@ class SearchEngine:
|
|
|
373
492
|
"files_without_meta": 0,
|
|
374
493
|
"total_lines": 0,
|
|
375
494
|
"index_age": "No index yet",
|
|
495
|
+
"coverage_pct": 0.0,
|
|
496
|
+
"warning": None,
|
|
376
497
|
}
|
|
377
498
|
|
|
378
499
|
if not os.path.exists(self.db_path):
|
|
@@ -391,6 +512,17 @@ class SearchEngine:
|
|
|
391
512
|
row = conn.execute("SELECT SUM(content_size) as total FROM file_meta").fetchone()
|
|
392
513
|
result["total_lines"] = row["total"] or 0
|
|
393
514
|
|
|
515
|
+
if result["files_indexed"] > 0:
|
|
516
|
+
result["coverage_pct"] = round(
|
|
517
|
+
(result["files_with_meta"] / result["files_indexed"]) * 100, 1
|
|
518
|
+
)
|
|
519
|
+
|
|
520
|
+
if result["files_indexed"] > 0 and result["coverage_pct"] < 10.0:
|
|
521
|
+
result["warning"] = (
|
|
522
|
+
f"Only {result['coverage_pct']}% of files have metadata. "
|
|
523
|
+
"Search quality will be degraded. Run 'codexlr8 init .' to bootstrap."
|
|
524
|
+
)
|
|
525
|
+
|
|
394
526
|
mtime = os.path.getmtime(self.db_path)
|
|
395
527
|
mtime_dt = datetime.fromtimestamp(mtime)
|
|
396
528
|
age = datetime.now() - mtime_dt
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codexlr8
|
|
3
|
-
Version: 0.0.1
|
|
3
|
+
Version: 0.0.2
|
|
4
4
|
Summary: A codebase search engine for LLM coding agents
|
|
5
5
|
Author-email: Sadig Akhund <sadigaxund@gmail.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -64,11 +64,36 @@ CodeXLR8 indexes your codebase into an SQLite FTS5 database alongside optional `
|
|
|
64
64
|
|
|
65
65
|
| Layer | Source | Boost |
|
|
66
66
|
|---|---|---|
|
|
67
|
-
| 1 | Raw file content
|
|
68
|
-
|
|
|
67
|
+
| 1 | Raw file content | 0.3× per token |
|
|
68
|
+
| 2a | File path (filename, directory) | 0.5× – 0.8× |
|
|
69
|
+
| 2b | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
|
|
69
70
|
| 3 | `.meta.yaml` `public_api` | 1.0× (strongest) |
|
|
70
71
|
|
|
71
|
-
Search uses
|
|
72
|
+
Search uses OR semantics with token-coverage scoring: more matching tokens = higher score. A ≥50% post-filter eliminates single-token noise for multi-word queries. Path weighting (Layer 2a) provides differentiation even without metadata — a file whose name IS the query token ranks above one that merely mentions it.
|
|
73
|
+
|
|
74
|
+
### Scoped search and clustering
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
# Narrow to a specific directory (like grep -rn "pattern" dir/)
|
|
78
|
+
codexlr8 search . "get_visible" --scope lib/mpl_toolkits/
|
|
79
|
+
|
|
80
|
+
# Cluster results by directory to see where matches concentrate
|
|
81
|
+
codexlr8 search . "get_visible" --grouped
|
|
82
|
+
# 12 results in 3 directories (8 files) across project:
|
|
83
|
+
# lib/mpl_toolkits/mplot3d/ (5 files)
|
|
84
|
+
# ─ axes3d.py:388 [score: 0.90]
|
|
85
|
+
# ...
|
|
86
|
+
|
|
87
|
+
# Diagnose your query — see which terms hit, which don't
|
|
88
|
+
codexlr8 search . "axes not hiding" --explain
|
|
89
|
+
# Query analysis:
|
|
90
|
+
# "axes" 212 matches — broad term (212/212 results)
|
|
91
|
+
# "not" 77 matches
|
|
92
|
+
# "hiding" 0 matches — consider dropping or replacing
|
|
93
|
+
# Top score: 1.20 (strong match)
|
|
94
|
+
|
|
95
|
+
# Combine both — group, then scope to drill down
|
|
96
|
+
```
|
|
72
97
|
|
|
73
98
|
## .meta.yaml Sidecars
|
|
74
99
|
|
|
@@ -70,3 +70,29 @@ class TestMCPServerLogic:
|
|
|
70
70
|
results = engine.search("login")
|
|
71
71
|
assert len(results) > 0
|
|
72
72
|
assert "main.py" in results[0]["path"]
|
|
73
|
+
|
|
74
|
+
def test_search_with_scope(self, tmp_path):
|
|
75
|
+
"""Scope parameter restricts search to a path prefix."""
|
|
76
|
+
project = tmp_path / "proj"
|
|
77
|
+
src_dir = project / "src"
|
|
78
|
+
lib_dir = project / "lib"
|
|
79
|
+
src_dir.mkdir(parents=True)
|
|
80
|
+
lib_dir.mkdir(parents=True)
|
|
81
|
+
|
|
82
|
+
(src_dir / "auth.py").write_text("def login(): pass\n")
|
|
83
|
+
(lib_dir / "auth.py").write_text("def login(): pass\n")
|
|
84
|
+
|
|
85
|
+
engine = SearchEngine(str(project))
|
|
86
|
+
engine.build_index()
|
|
87
|
+
|
|
88
|
+
# Without scope: both files match
|
|
89
|
+
results = engine.search("login")
|
|
90
|
+
paths = {r["path"] for r in results}
|
|
91
|
+
assert "src/auth.py" in paths
|
|
92
|
+
assert "lib/auth.py" in paths
|
|
93
|
+
|
|
94
|
+
# With scope: only src/ files
|
|
95
|
+
results = engine.search("login", scope="src")
|
|
96
|
+
paths = {r["path"] for r in results}
|
|
97
|
+
assert "src/auth.py" in paths
|
|
98
|
+
assert "lib/auth.py" not in paths
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
4
|
|
|
5
|
-
from codexlr8.search import SearchEngine, _is_init_file, _tokenize, _matches_exclude
|
|
5
|
+
from codexlr8.search import SearchEngine, _is_init_file, _tokenize, _matches_exclude, _group_results, _explain_query
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class TestHelpers:
|
|
@@ -27,6 +27,85 @@ class TestHelpers:
|
|
|
27
27
|
assert not _matches_exclude("auth/session.py", ["tests/*", "test_*"])
|
|
28
28
|
assert not _matches_exclude("models.py", ["tests/*"])
|
|
29
29
|
|
|
30
|
+
def test_group_results_empty(self):
|
|
31
|
+
assert _group_results([]) == {"groups": [], "total_files": 0, "total_results": 0}
|
|
32
|
+
|
|
33
|
+
def test_group_results_multi_dir(self):
|
|
34
|
+
results = [
|
|
35
|
+
{"path": "lib/foo/bar.py", "score": 0.9, "summary": "bar module"},
|
|
36
|
+
{"path": "lib/foo/baz.py", "score": 0.7, "summary": "baz module"},
|
|
37
|
+
{"path": "lib/other/qux.py", "score": 0.8, "summary": "qux module"},
|
|
38
|
+
{"path": "src/main.py", "score": 0.5, "summary": "entry point"},
|
|
39
|
+
]
|
|
40
|
+
grouped = _group_results(results, group_depth=3)
|
|
41
|
+
assert grouped["total_files"] == 4
|
|
42
|
+
assert grouped["total_results"] == 4
|
|
43
|
+
assert len(grouped["groups"]) == 3
|
|
44
|
+
|
|
45
|
+
# Sorted by max score: lib/foo/ (0.9), lib/other/ (0.8), src/ (0.5)
|
|
46
|
+
assert grouped["groups"][0]["prefix"] == "lib/foo/"
|
|
47
|
+
assert grouped["groups"][0]["count"] == 2
|
|
48
|
+
assert grouped["groups"][0]["max_score"] == 0.9
|
|
49
|
+
assert not grouped["groups"][0]["has_more"]
|
|
50
|
+
assert grouped["groups"][1]["prefix"] == "lib/other/"
|
|
51
|
+
assert grouped["groups"][2]["prefix"] == "src/"
|
|
52
|
+
|
|
53
|
+
def test_group_results_root_files(self):
|
|
54
|
+
results = [
|
|
55
|
+
{"path": "main.py", "score": 0.9},
|
|
56
|
+
{"path": "utils.py", "score": 0.7},
|
|
57
|
+
]
|
|
58
|
+
grouped = _group_results(results)
|
|
59
|
+
assert len(grouped["groups"]) == 1
|
|
60
|
+
assert grouped["groups"][0]["prefix"] == "."
|
|
61
|
+
|
|
62
|
+
def test_group_results_depth_capping(self):
|
|
63
|
+
results = [
|
|
64
|
+
{"path": "a/b/c/d/e/file.py", "score": 0.9},
|
|
65
|
+
]
|
|
66
|
+
grouped = _group_results(results, group_depth=2)
|
|
67
|
+
assert grouped["groups"][0]["prefix"] == "a/b/"
|
|
68
|
+
|
|
69
|
+
def test_group_results_truncates_per_group(self):
|
|
70
|
+
results = [
|
|
71
|
+
{"path": f"lib/many/file_{i}.py", "score": 0.9 - i * 0.01}
|
|
72
|
+
for i in range(10)
|
|
73
|
+
]
|
|
74
|
+
grouped = _group_results(results)
|
|
75
|
+
g = grouped["groups"][0]
|
|
76
|
+
assert g["count"] == 10
|
|
77
|
+
assert len(g["files"]) == 3
|
|
78
|
+
assert g["has_more"]
|
|
79
|
+
assert g["remaining"] == 7
|
|
80
|
+
|
|
81
|
+
def test_group_results_sorts_by_max_score(self):
|
|
82
|
+
results = [
|
|
83
|
+
{"path": "lib/low/file.py", "score": 0.3},
|
|
84
|
+
{"path": "src/high/main.py", "score": 0.9},
|
|
85
|
+
{"path": "lib/low/other.py", "score": 0.1},
|
|
86
|
+
]
|
|
87
|
+
grouped = _group_results(results)
|
|
88
|
+
assert grouped["groups"][0]["prefix"] == "src/high/"
|
|
89
|
+
assert grouped["groups"][1]["prefix"] == "lib/low/"
|
|
90
|
+
|
|
91
|
+
def test_explain_query(self):
|
|
92
|
+
results = [
|
|
93
|
+
{"path": "auth/session.py", "score": 0.9, "summary": "auth module", "tags": ["login"]},
|
|
94
|
+
{"path": "auth/__init__.py", "score": 0.6, "summary": "", "tags": []},
|
|
95
|
+
]
|
|
96
|
+
data = _explain_query("login auth x", ["login", "auth", "x"], results)
|
|
97
|
+
assert data["query"] == "login auth x"
|
|
98
|
+
assert data["token_hits"]["login"] == 1 # only session.py tags match
|
|
99
|
+
assert data["token_hits"]["auth"] == 2 # both files have "auth" in path
|
|
100
|
+
assert data["token_hits"]["x"] == 0 # zero matches
|
|
101
|
+
assert data["top_score"] == 0.9
|
|
102
|
+
assert data["filtered"] == []
|
|
103
|
+
|
|
104
|
+
def test_explain_query_filtered(self):
|
|
105
|
+
data = _explain_query("go API v2 a", ["go", "api", "v2"], [])
|
|
106
|
+
assert "a" in data["filtered"]
|
|
107
|
+
assert data["token_hits"] == {"go": 0, "api": 0, "v2": 0}
|
|
108
|
+
|
|
30
109
|
|
|
31
110
|
class TestSearchEngine:
|
|
32
111
|
def test_build_and_search(self, sample_project):
|
|
@@ -152,10 +231,13 @@ class TestSearchEngine:
|
|
|
152
231
|
)
|
|
153
232
|
assert result.exit_code == 0
|
|
154
233
|
data = json.loads(result.output)
|
|
155
|
-
assert isinstance(data, list)
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
234
|
+
assert isinstance(data, dict)
|
|
235
|
+
assert "results" in data
|
|
236
|
+
results_list = data["results"]
|
|
237
|
+
assert isinstance(results_list, list)
|
|
238
|
+
if results_list:
|
|
239
|
+
assert "path" in results_list[0]
|
|
240
|
+
assert "score" in results_list[0]
|
|
159
241
|
|
|
160
242
|
def test_search_cli_exclude_flag(self, sample_project):
|
|
161
243
|
from click.testing import CliRunner
|
|
@@ -174,6 +256,39 @@ class TestSearchEngine:
|
|
|
174
256
|
auth_lines = [l for l in lines if "auth/" in l]
|
|
175
257
|
assert not auth_lines
|
|
176
258
|
|
|
259
|
+
def test_search_cli_grouped(self, sample_project):
|
|
260
|
+
from click.testing import CliRunner
|
|
261
|
+
from codexlr8.cli import search
|
|
262
|
+
|
|
263
|
+
engine = SearchEngine(str(sample_project))
|
|
264
|
+
engine.build_index()
|
|
265
|
+
|
|
266
|
+
runner = CliRunner()
|
|
267
|
+
result = runner.invoke(
|
|
268
|
+
search, [str(sample_project), "login", "--grouped"]
|
|
269
|
+
)
|
|
270
|
+
assert result.exit_code == 0
|
|
271
|
+
# Should show directory groupings and the scope hint
|
|
272
|
+
assert "Use --scope" in result.output
|
|
273
|
+
assert "(" in result.output # file count per dir
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def test_search_cli_explain(self, sample_project):
|
|
277
|
+
from click.testing import CliRunner
|
|
278
|
+
from codexlr8.cli import search
|
|
279
|
+
|
|
280
|
+
engine = SearchEngine(str(sample_project))
|
|
281
|
+
engine.build_index()
|
|
282
|
+
|
|
283
|
+
runner = CliRunner()
|
|
284
|
+
result = runner.invoke(
|
|
285
|
+
search, [str(sample_project), "login", "--explain"]
|
|
286
|
+
)
|
|
287
|
+
assert result.exit_code == 0
|
|
288
|
+
assert "Query analysis" in result.output
|
|
289
|
+
assert '"login"' in result.output
|
|
290
|
+
assert "matches" in result.output
|
|
291
|
+
|
|
177
292
|
|
|
178
293
|
class TestCLIIndexAndStatus:
|
|
179
294
|
def test_index_command(self, sample_project):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|