codexlr8 0.0.1__tar.gz → 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codexlr8
3
- Version: 0.0.1
3
+ Version: 0.0.2
4
4
  Summary: A codebase search engine for LLM coding agents
5
5
  Author-email: Sadig Akhund <sadigaxund@gmail.com>
6
6
  License: Apache-2.0
@@ -64,11 +64,36 @@ CodeXLR8 indexes your codebase into an SQLite FTS5 database alongside optional `
64
64
 
65
65
  | Layer | Source | Boost |
66
66
  |---|---|---|
67
- | 1 | Raw file content (function names, variables, comments, docstrings) | FTS5 BM25 base |
68
- | 2 | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
67
+ | 1 | Raw file content | 0.3× per token |
68
+ | 2a | File path (filename, directory) | 0.5× – 0.8× |
69
+ | 2b | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
69
70
  | 3 | `.meta.yaml` `public_api` | 1.0× (strongest) |
70
71
 
71
- Search uses AND semantics (like Google): all query tokens must match. If no results, falls back to OR with a ≥50% token threshold.
72
+ Search uses OR semantics with token-coverage scoring: more matching tokens = higher score. A ≥50% post-filter eliminates single-token noise for multi-word queries. Path weighting (Layer 2a) provides differentiation even without metadata — a file whose name IS the query token ranks above one that merely mentions it.
73
+
74
+ ### Scoped search and clustering
75
+
76
+ ```bash
77
+ # Narrow to a specific directory (like grep -rn "pattern" dir/)
78
+ codexlr8 search . "get_visible" --scope lib/mpl_toolkits/
79
+
80
+ # Cluster results by directory to see where matches concentrate
81
+ codexlr8 search . "get_visible" --grouped
82
+ # 12 results in 3 directories (8 files) across project:
83
+ # lib/mpl_toolkits/mplot3d/ (5 files)
84
+ # ─ axes3d.py:388 [score: 0.90]
85
+ # ...
86
+
87
+ # Diagnose your query — see which terms hit, which don't
88
+ codexlr8 search . "axes not hiding" --explain
89
+ # Query analysis:
90
+ # "axes" 212 matches — broad term (212/212 results)
91
+ # "not" 77 matches
92
+ # "hiding" 0 matches — consider dropping or replacing
93
+ # Top score: 1.20 (strong match)
94
+
95
+ # Combine both — group, then scope to drill down
+ codexlr8 search . "get_visible" --grouped --scope lib/mpl_toolkits/
96
+ ```
72
97
 
73
98
  ## .meta.yaml Sidecars
74
99
 
@@ -35,11 +35,36 @@ CodeXLR8 indexes your codebase into an SQLite FTS5 database alongside optional `
35
35
 
36
36
  | Layer | Source | Boost |
37
37
  |---|---|---|
38
- | 1 | Raw file content (function names, variables, comments, docstrings) | FTS5 BM25 base |
39
- | 2 | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
38
+ | 1 | Raw file content | 0.3× per token |
39
+ | 2a | File path (filename, directory) | 0.5× – 0.8× |
40
+ | 2b | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
40
41
  | 3 | `.meta.yaml` `public_api` | 1.0× (strongest) |
41
42
 
42
- Search uses AND semantics (like Google): all query tokens must match. If no results, falls back to OR with a ≥50% token threshold.
43
+ Search uses OR semantics with token-coverage scoring: more matching tokens = higher score. A ≥50% post-filter eliminates single-token noise for multi-word queries. Path weighting (Layer 2a) provides differentiation even without metadata — a file whose name IS the query token ranks above one that merely mentions it.
44
+
45
+ ### Scoped search and clustering
46
+
47
+ ```bash
48
+ # Narrow to a specific directory (like grep -rn "pattern" dir/)
49
+ codexlr8 search . "get_visible" --scope lib/mpl_toolkits/
50
+
51
+ # Cluster results by directory to see where matches concentrate
52
+ codexlr8 search . "get_visible" --grouped
53
+ # 12 results in 3 directories (8 files) across project:
54
+ # lib/mpl_toolkits/mplot3d/ (5 files)
55
+ # ─ axes3d.py:388 [score: 0.90]
56
+ # ...
57
+
58
+ # Diagnose your query — see which terms hit, which don't
59
+ codexlr8 search . "axes not hiding" --explain
60
+ # Query analysis:
61
+ # "axes" 212 matches — broad term (212/212 results)
62
+ # "not" 77 matches
63
+ # "hiding" 0 matches — consider dropping or replacing
64
+ # Top score: 1.20 (strong match)
65
+
66
+ # Combine both — group, then scope to drill down
+ codexlr8 search . "get_visible" --grouped --scope lib/mpl_toolkits/
67
+ ```
43
68
 
44
69
  ## .meta.yaml Sidecars
45
70
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "codexlr8"
7
- version = "0.0.1"
7
+ version = "0.0.2"
8
8
  description = "A codebase search engine for LLM coding agents"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -1,3 +1,3 @@
1
1
  """CodeXLR8 — A codebase search engine for LLM coding agents."""
2
2
 
3
- __version__ = "0.0.1"
3
+ __version__ = "0.0.2"
@@ -1,12 +1,13 @@
1
1
  """CodeXLR8 CLI — search-first codebase navigation for agents."""
2
2
 
3
3
  import asyncio
4
+ import os
4
5
  import click
5
6
 
6
7
  from .config import load_config
7
8
  from .scanner import scan_project
8
9
  from .meta import generate_missing_sidecars
9
- from .search import SearchEngine
10
+ from .search import SearchEngine, _group_results, _explain_query, _tokenize
10
11
 
11
12
 
12
13
  EXCLUDE_HELP = (
@@ -62,10 +63,19 @@ def scan(project_path: str, output: str | None):
62
63
  @click.argument("query")
63
64
  @click.option("--exclude", "-x", "exclude_patterns", multiple=True,
64
65
  callback=_parse_excludes, help=EXCLUDE_HELP)
66
+ @click.option("--scope", "-s", default=None,
67
+ help="Restrict search to files under a path prefix (e.g. src/ or lib/mpl_toolkits/)")
68
+ @click.option("--grouped", "-g", is_flag=True, default=False,
69
+ help="Cluster results by directory before listing files")
70
+ @click.option("--explain", "-e", is_flag=True, default=False,
71
+ help="Show token breakdown and query diagnostics")
72
+ @click.option("--group-depth", default=3,
73
+ help="Max directory depth for grouping (default: 3)")
65
74
  @click.option("--format", "-f", "output_format",
66
75
  type=click.Choice(["text", "json"]), default="text")
67
76
  @click.option("--limit", "-n", default=10, help="Maximum number of results")
68
77
  def search(project_path: str, query: str, exclude_patterns: list[str],
78
+ scope: str | None, grouped: bool, explain: bool, group_depth: int,
69
79
  output_format: str, limit: int):
70
80
  """Search the codebase for code matching QUERY.
71
81
 
@@ -74,19 +84,52 @@ def search(project_path: str, query: str, exclude_patterns: list[str],
74
84
  \b
75
85
  Examples:
76
86
  codexlr8 search . "login auth"
87
+ codexlr8 search . "login auth" --grouped
88
+ codexlr8 search . "login auth" --explain
77
89
  codexlr8 search . "login auth" --exclude "tests/*"
78
90
  codexlr8 search . "login auth" -x "tests/*" -x "vendor/*"
91
+ codexlr8 search . "get_visible" --scope lib/mpl_toolkits/
79
92
  """
80
93
  engine = SearchEngine(project_path)
81
- results = engine.search(query, limit=limit, exclude=exclude_patterns)
94
+ results = engine.search(query, limit=limit, exclude=exclude_patterns, scope=scope)
82
95
 
83
96
  if output_format == "json":
84
97
  import json
85
- click.echo(json.dumps(results, indent=2))
98
+ output = {"results": results}
99
+ if explain:
100
+ output["explain"] = _explain_query(query, _tokenize(query), results)
101
+ if grouped:
102
+ groups_data = _group_results(results, group_depth)
103
+ output["grouped"] = True
104
+ output["groups"] = groups_data["groups"]
105
+ output["summary"] = {
106
+ "total_results": groups_data["total_results"],
107
+ "total_files": groups_data["total_files"],
108
+ "total_groups": len(groups_data["groups"]),
109
+ }
110
+ click.echo(json.dumps(output, indent=2))
86
111
  return
87
112
 
88
113
  if not results:
89
114
  click.echo("No results found.")
115
+ if explain:
116
+ tokens = _tokenize(query)
117
+ click.echo()
118
+ click.echo("Query analysis:")
119
+ for t in tokens:
120
+ click.echo(f" \"{t}\" \u2717 no matches")
121
+ click.echo()
122
+ click.echo("0 tokens matched. All terms are absent from the codebase.")
123
+ return
124
+
125
+ if explain:
126
+ tokens = _tokenize(query)
127
+ explain_data = _explain_query(query, tokens, results)
128
+ _print_explain(explain_data)
129
+ click.echo()
130
+
131
+ if grouped:
132
+ _print_grouped(results, group_depth, scope)
90
133
  return
91
134
 
92
135
  for i, r in enumerate(results, 1):
@@ -96,6 +139,8 @@ def search(project_path: str, query: str, exclude_patterns: list[str],
96
139
  click.echo(f" meta: {r['summary']}")
97
140
  if r.get("tags"):
98
141
  click.echo(f" tags: {', '.join(r['tags'])}")
142
+ if r.get("matched_tokens"):
143
+ click.echo(f" matched: {', '.join(r['matched_tokens'])}")
99
144
  if r.get("preview"):
100
145
  click.echo(" preview: |")
101
146
  for line in r["preview"].strip().splitlines()[:6]:
@@ -151,6 +196,10 @@ def status(project_path: str):
151
196
  click.echo(f"Files without .meta.yaml: {state['files_without_meta']}")
152
197
  click.echo(f"Total lines indexed: {state['total_lines']}")
153
198
  click.echo(f"Index age: {state.get('index_age', 'N/A')}")
199
+ click.echo(f"Coverage: {state.get('coverage_pct', 0)}%")
200
+ if state.get("warning"):
201
+ click.echo()
202
+ click.secho(f" Warning: {state['warning']}", fg="yellow")
154
203
 
155
204
 
156
205
  @main.command()
@@ -240,7 +289,8 @@ def setup(project_path: str):
240
289
  include = [p.strip() for p in custom_include.split(",") if p.strip()]
241
290
  click.echo()
242
291
 
243
- defaults = ["tests/*", "test/*", "spec/*", "__tests__/*", "test_*", "*_test.*"]
292
+ defaults = ["tests/*", "test/*", "spec/*", "__tests__/*", "test_*", "*_test.*",
293
+ "examples/*", "docs/*", "tutorials/*", "benchmarks/*"]
244
294
  custom_exclude = click.prompt(
245
295
  click.style(" Exclude (comma-separated)", fg="bright_white"),
246
296
  default=", ".join(defaults),
@@ -306,6 +356,102 @@ def setup(project_path: str):
306
356
  click.secho(" Run 'codexlr8 index .' to build your first search index.", dim=True)
307
357
 
308
358
 
359
def _print_explain(data: dict):
    """Write the ``--explain`` query diagnostics to the terminal.

    Reads the ``query``, ``tokens``, ``token_hits``, ``filtered``,
    ``total_results`` and ``top_score`` entries of *data* and prints, via
    click: a header, one colour-coded line per token, one line per filtered
    word, an overall match-quality line, and up to two tips.
    """
    click.secho("Query analysis:", fg="cyan", bold=True)
    click.echo(f" Original: \"{data['query']}\"")
    click.echo(f" Tokens: {', '.join(data['tokens'])}")
    click.echo()

    for token in data["tokens"]:
        hits = data["token_hits"].get(token, 0)
        total = data["total_results"]
        # Pick a colour and an optional hint from how many results hit.
        if hits == 0:
            colour, hint = "red", " — consider dropping or replacing"
        elif hits <= 3:
            colour, hint = "yellow", " — very specific"
        elif hits <= total * 0.1:
            colour, hint = "green", ""
        else:
            colour, hint = "yellow", f" — broad term ({hits}/{total} results)"
        badge = click.style(f"{hits} matches", fg=colour)
        click.echo(f" \"{token}\" {badge}{hint}")

    for word in data["filtered"]:
        click.echo(f" \"{word}\" {click.style('filtered', fg='yellow')} — single letter, ignored")

    click.echo()
    top = data["top_score"]
    # Same thresholds, evaluated in the same order: <0.60, then <1.20.
    label, colour = (
        ("weak", "red") if top < 0.60
        else ("moderate", "yellow") if top < 1.20
        else ("strong", "green")
    )
    quality = click.style(label, fg=colour)
    click.echo(f" Top score: {top} ({quality} match)")

    if data["filtered"]:
        click.echo(click.style(" Tip:", dim=True) + " single-letter words are ignored. Use full terms.")
    # Only the first zero-hit token is called out.
    first_missing = next(
        (t for t in data["tokens"] if data["token_hits"].get(t, 0) == 0),
        None,
    )
    if first_missing is not None:
        click.echo(click.style(" Tip:", dim=True) + f" \"{first_missing}\" doesn't exist — try a synonym or drop it.")
402
+
403
+
404
def _print_grouped(results: list[dict], group_depth: int, scope: str | None):
    """Print search results clustered by directory.

    Args:
        results: Flat search results; each displayed entry must provide
            ``path``, ``line_start``, ``line_end`` and ``score``, and may
            provide ``summary`` and ``preview``.
        group_depth: Maximum directory depth passed to ``_group_results``.
        scope: The ``--scope`` prefix in effect, or ``None``. It only changes
            the header wording and the closing hint.

    At most five directory groups are listed; the rest are summarised in a
    single "... more directories" line.
    """
    groups_data = _group_results(results, group_depth)
    groups = groups_data["groups"]
    total = groups_data["total_results"]
    files = groups_data["total_files"]

    scope_label = f"in {scope}" if scope else "across project"
    click.echo(f"{total} results in {len(groups)} directories ({files} files) {scope_label}:")
    click.echo()

    top_groups = groups[:5]
    for g in top_groups:
        # Directory header with match count
        label = g["prefix"].rstrip(os.sep)
        click.echo(f"{label}/ ({g['count']} files)")

        for f in g["files"]:
            line_info = f"{f['path']}:{f['line_start']}-{f['line_end']}"
            score_info = f"{f['score']:.2f}"
            click.echo(f" {click.style(line_info, fg='cyan')} "
                       f"[score: {score_info}]")

            # One-line description: curated summary first, else the first
            # non-blank line of the preview (truncated to 100 characters).
            if f.get("summary"):
                click.echo(f" {f['summary']}")
            elif f.get("preview"):
                preview_lines = f["preview"].strip().splitlines()
                first_line = preview_lines[0].strip() if preview_lines else ""
                if first_line:
                    click.echo(f" {first_line[:100]}")

        if g["has_more"]:
            click.echo(f" ... and {g['remaining']} more files")
        click.echo()

    if len(groups) > 5:
        click.echo(f"... and {len(groups) - 5} more directories")

    # Scope hint
    click.echo()
    if scope:
        click.echo(click.style("Already scoped. Remove --scope to broaden.", dim=True))
    elif top_groups:
        click.echo(
            click.style(
                f"Use --scope <dir> to narrow results (e.g. --scope {top_groups[0]['prefix']})",
                dim=True
            )
        )
    else:
        # No groups means there is no directory to suggest; indexing
        # top_groups[0] here would raise IndexError.
        click.echo(click.style("Use --scope <dir> to narrow results", dim=True))
453
+
454
+
309
455
  def _inject_mcp_config(config_path: str, mcp_json: str) -> None:
310
456
  """Inject the CodeXLR8 MCP config into an existing client config file.
311
457
 
@@ -406,7 +552,51 @@ codebase_search(query="stripe charge customer refund")
406
552
  codebase_search(query="shopping cart checkout payment")
407
553
  ```
408
554
 
409
- Describe what you're looking for in natural language. The engine uses AND semantics — more terms increase precision, not noise.
555
+ ### Query strategy
556
+
557
+ Describe what you're looking for in natural language. The engine uses OR semantics with a scoring layer — more terms increase precision through token-coverage ranking, not a hard AND requirement.
558
+
559
+ **Good queries use distinct, discriminating terms:**
560
+
561
+ | Task | Good query | Why |
562
+ |---|---|---|
563
+ | Fix login bug | `"login auth session token"` | Covers auth module, session, tokens — distinct terms, not synonyms |
564
+ | Payment refund | `"stripe refund charge customer"` | Each term narrows to a different aspect of the feature |
565
+ | 3D plot visibility | `"axes3d draw visible renderer"` | Domain term + method + symptom — different dimensions of the bug |
566
+ | Checkout flow | `"checkout cart payment order"` | Covers all stages of the flow |
567
+
568
+ **What to avoid:**
569
+ - Single-word queries (`"login"`) — too broad, returns everything mentioning login
570
+ - Synonyms (`"login authenticate signin"`) — redundant, wastes tokens without improving coverage
571
+ - Full sentences (`"I need to find where user login happens"`) — stop words like `"I"`, `"need"`, `"to"` are filtered out
572
+
573
+ ### Using scope and grouping
574
+
575
+ When you know which directory the code lives in, scope the search:
576
+
577
+ ```
578
+ codebase_search(query="get_visible", scope="lib/mpl_toolkits/")
579
+ ```
580
+
581
+ When you don't know, run a shell command to see where results cluster:
582
+
583
+ ```bash
584
+ codexlr8 search . "get_visible" --grouped
585
+ ```
586
+
587
+ This prints directories ranked by their highest-scoring file, with a `--scope` hint to copy into your next MCP call.
588
+
589
+ ### When results don't look right
590
+
591
+ Check the `matched` field on each result. If a file you expected isn't showing, the missing token tells you what to adjust. If all results only match 1 of 4 tokens, your terms are too scattered — try removing one.
592
+
593
+ For deeper diagnostics, run:
594
+
595
+ ```bash
596
+ codexlr8 search . "your query" --explain
597
+ ```
598
+
599
+ This shows per-token hit counts and flags zero-match terms so you can refine before calling `codebase_search` again.
410
600
 
411
601
  ## Interpreting results
412
602
 
@@ -418,9 +608,10 @@ Results include:
418
608
  | `score` | Relevance (higher = better) |
419
609
  | `summary` | Human-written description of the file's purpose |
420
610
  | `tags` | Curated keywords (auth, payment, cart, etc.) |
611
+ | `matched` | Which query tokens the file matched — use this to debug failed searches |
421
612
  | `preview` | First ~10 lines around the best match |
422
613
 
423
- **Ranking:** Files with curated `.meta.yaml` (summary + tags) rank highest. Raw content matches rank lower. `__init__.py` re-exports are penalized.
614
+ **Ranking:** Per-token weights decide the order: `.meta.yaml` `public_api` matches rank highest (1.0), then `tags` and exact filename matches (0.8), filename components (0.7), `summary` (0.6), and directory names (0.5). Raw content matches rank lowest (0.3). `__init__.py` re-exports are penalized.
424
615
 
425
616
  ## Maintaining the index
426
617
 
@@ -508,6 +699,9 @@ Exclude patterns are globs that match file paths. Use `*` for wildcards.
508
699
  | Task | Tool call |
509
700
  |---|---|
510
701
  | Find code for a feature | `codebase_search(query="...")` |
702
+ | Search within a directory | `codebase_search(query="...", scope="src/")` |
703
+ | Cluster results by directory | Shell: `codexlr8 search . "query" --grouped` |
704
+ | Diagnose query terms | Shell: `codexlr8 search . "query" --explain` |
511
705
  | Build/update index | `codebase_index(incremental=true)` |
512
706
  | Check metadata coverage | Shell: `codexlr8 status .` |
513
707
  | Bootstrap missing sidecars | Shell: `codexlr8 init .` |
@@ -32,6 +32,10 @@ def _defaults() -> dict:
32
32
  "__tests__/*",
33
33
  "test_*",
34
34
  "*_test.*",
35
+ "examples/*",
36
+ "docs/*",
37
+ "tutorials/*",
38
+ "benchmarks/*",
35
39
  ],
36
40
  "extensions": [
37
41
  ".py", ".js", ".ts", ".jsx", ".tsx", ".go", ".rs", ".rb",
@@ -61,6 +61,12 @@ async def list_tools() -> list[Tool]:
61
61
  "description": "Glob patterns for files to exclude. "
62
62
  "Uses .codexlr8.yaml defaults if not set.",
63
63
  },
64
+ "scope": {
65
+ "type": "string",
66
+ "description": "Restrict search to files under a path prefix "
67
+ "(e.g. 'src/' or 'lib/mpl_toolkits/'). "
68
+ "Acts as grep -rn's directory filter.",
69
+ },
64
70
  },
65
71
  "required": ["query"],
66
72
  },
@@ -111,9 +117,10 @@ async def _handle_search(args: dict) -> list[TextContent]:
111
117
  query = args["query"]
112
118
  limit = args.get("limit", 10)
113
119
  exclude = args.get("exclude")
120
+ scope = args.get("scope")
114
121
 
115
122
  engine = SearchEngine(project_path)
116
- results = engine.search(query, limit=limit, exclude=exclude)
123
+ results = engine.search(query, limit=limit, exclude=exclude, scope=scope)
117
124
 
118
125
  if not results:
119
126
  return [TextContent(type="text", text="No results found.")]
@@ -128,6 +135,8 @@ async def _handle_search(args: dict) -> list[TextContent]:
128
135
  lines.append(f" summary: {r['summary']}")
129
136
  if r.get("tags"):
130
137
  lines.append(f" tags: {', '.join(r['tags'])}")
138
+ if r.get("matched_tokens"):
139
+ lines.append(f" matched: {', '.join(r['matched_tokens'])}")
131
140
  if r.get("preview"):
132
141
  lines.append(" preview: |")
133
142
  for pline in r["preview"].strip().splitlines()[:6]:
@@ -27,6 +27,54 @@ def _tokenize(text: str) -> list[str]:
27
27
  return [t for t in tokens if len(t) > 1 or t.isdigit()] # skip single letters
28
28
 
29
29
 
30
def _explain_query(query: str, tokens: list[str], results: list[dict]) -> dict:
    """Generate query diagnostic breakdown for --explain.

    Args:
        query: The raw query string as typed by the user.
        tokens: The tokens the query was reduced to.
        results: Search results; ``score`` is required on each, while
            ``summary``, ``tags``, ``path`` and ``matched_tokens`` are
            optional.

    Returns:
        A dict with ``query``, ``tokens``, ``token_hits`` (token -> number of
        results it hit), ``filtered`` (single-character words of the query
        that are not among *tokens*), ``total_results`` and ``top_score``
        (rounded to two decimals, ``0.0`` when there are no results).
    """
    # Detect words in original query that were filtered by the tokenizer
    raw_words = re.findall(r"[a-zA-Z_][a-zA-Z0-9_]*|\d+", query.lower())
    filtered = [w for w in raw_words if len(w) == 1 and w not in tokens]

    # Build each result's searchable text once instead of once per token.
    # `or` guards keep a None summary/tags/path from raising.
    evidence = []
    for r in results:
        text = (
            (r.get("summary") or "") + " " +
            " ".join(r.get("tags") or []) + " " +
            (r.get("path") or "")
        ).lower()
        # A token may have matched only inside the file content, which is not
        # part of the result dict. Honouring "matched_tokens" keeps such a
        # token from being reported as a zero-match term.
        evidence.append((text, set(r.get("matched_tokens") or ())))

    # Per-token hit counts across all results
    token_hits: dict[str, int] = {
        token: sum(
            1 for text, matched in evidence
            if token in matched or token in text
        )
        for token in tokens
    }

    top_score = max((r["score"] for r in results), default=0.0)

    return {
        "query": query,
        "tokens": tokens,
        "token_hits": token_hits,
        "filtered": filtered,
        "total_results": len(results),
        "top_score": round(top_score, 2),
    }
65
+
66
+
67
def _token_match_info(tokens: list[str], content: str, row) -> tuple[list[str], float]:
    """Report which query tokens occur in a document and their coverage.

    The document text is *content* plus the row's ``summary`` and ``tags``
    (missing values count as empty), lowercased. A token matches when it is a
    substring of that text. Returns ``(matched_tokens, matched / total)``, or
    ``([], 0.0)`` when *tokens* is empty.
    """
    if not tokens:
        return [], 0.0
    haystack = " ".join(
        (content, row["summary"] or "", row["tags"] or "")
    ).lower()
    hits = [tok for tok in tokens if tok in haystack]
    return hits, len(hits) / len(tokens)
76
+
77
+
30
78
  def _token_match_ratio(tokens: list[str], text: str) -> float:
31
79
  """What fraction of query tokens appear in the document text?"""
32
80
  if not tokens:
@@ -47,6 +95,53 @@ def _matches_exclude(path: str, excludes: list[str]) -> bool:
47
95
  return False
48
96
 
49
97
 
98
def _group_results(results: list[dict], group_depth: int = 3) -> dict:
    """Cluster flat search results by their leading directory components.

    Each result's ``path`` is split on ``os.sep``; the first *group_depth*
    directory components (plus a trailing separator) form the group prefix,
    and files with no directory part go into the ``"."`` group.

    Returns a dict with ``groups`` (ordered by best score, highest first),
    ``total_files`` (distinct paths) and ``total_results``. Each group holds
    ``prefix``, ``count``, ``max_score``, ``files`` (its three best-scoring
    results), ``has_more`` and ``remaining``.
    """
    if not results:
        return {"groups": [], "total_files": 0, "total_results": 0}

    buckets: dict[str, list[dict]] = {}
    for hit in results:
        parents = hit["path"].split(os.sep)[:-1]  # drop the filename
        if parents:
            key = os.sep.join(parents[:group_depth]) + os.sep
        else:
            key = "."
        buckets.setdefault(key, []).append(hit)

    clusters = []
    for key, members in buckets.items():
        # Best-scoring files first; only the top three are kept for display.
        ranked = sorted(members, key=lambda m: m["score"], reverse=True)
        overflow = max(len(ranked) - 3, 0)
        clusters.append({
            "prefix": key,
            "count": len(ranked),
            "max_score": ranked[0]["score"],
            "files": ranked[:3],
            "has_more": overflow > 0,
            "remaining": overflow,
        })

    return {
        "groups": sorted(clusters, key=lambda c: c["max_score"], reverse=True),
        "total_files": len({hit["path"] for hit in results}),
        "total_results": len(results),
    }
143
+
144
+
50
145
  class SearchEngine:
51
146
  """SQLite FTS5-backed search engine for a codebase."""
52
147
 
@@ -212,12 +307,18 @@ class SearchEngine:
212
307
  )
213
308
 
214
309
  def search(self, query: str, limit: int = 10,
215
- exclude: list[str] | None = None) -> list[dict]:
310
+ exclude: list[str] | None = None,
311
+ scope: str | None = None) -> list[dict]:
216
312
  """Search the codebase and return ranked results.
217
313
 
218
- Uses AND semantics: all query tokens must match (like Google).
219
- Falls back to OR if AND returns nothing, with a post-filter
220
- requiring at least 50% of query tokens to match the document.
314
+ Uses OR semantics: any token can match. The custom scoring layer
315
+ (path weighting, metadata boosts, match ratio) naturally surfaces
316
+ files that match more tokens. A post-filter requires >=50% of query
317
+ tokens to match for multi-token queries.
318
+
319
+ This replaces the previous AND-then-OR fallback, which caused precise
320
+ multi-token queries to return zero results (AND too strict) or too
321
+ many flatly-scored results (OR fallback with no differentiation).
221
322
  """
222
323
  if not os.path.exists(self.db_path):
223
324
  return []
@@ -231,44 +332,42 @@ class SearchEngine:
231
332
 
232
333
  conn = self._get_connection()
233
334
 
234
- # Stage 1: try AND (best precision)
235
- and_query = " AND ".join(tokens)
335
+ # Build scope clause for path-prefix filtering
336
+ scope_clause = ""
337
+ scope_params: list[str] = []
338
+ if scope:
339
+ scope_norm = scope.rstrip("/")
340
+ scope_clause = "AND f.path LIKE ?"
341
+ scope_params = [scope_norm + "/%"]
342
+
343
+ # Always use OR semantics. Multi-token matches naturally rank higher
344
+ # via _compute_score (match_ratio scales with token coverage).
345
+ or_query = " OR ".join(tokens)
346
+ # Fetch more than needed — scoring will filter to top limit
347
+ fetch_limit = max(limit * 20, 200)
236
348
  cursor = conn.execute(
237
349
  "SELECT f.path, f.summary, f.tags, f.public_api, f.content, "
238
350
  " m.is_init, rank "
239
351
  "FROM files f "
240
352
  "JOIN file_meta m ON f.path = m.path "
241
353
  "WHERE files MATCH ? "
354
+ + scope_clause + " "
242
355
  "ORDER BY rank "
243
356
  "LIMIT ?",
244
- (and_query, limit * 5),
357
+ [or_query] + scope_params + [fetch_limit],
245
358
  )
246
359
  rows = cursor.fetchall()
247
360
 
248
- # Stage 2: fall back to OR if AND found nothing
249
- if not rows and len(tokens) > 1:
250
- or_query = " OR ".join(tokens)
251
- cursor = conn.execute(
252
- "SELECT f.path, f.summary, f.tags, f.public_api, f.content, "
253
- " m.is_init, rank "
254
- "FROM files f "
255
- "JOIN file_meta m ON f.path = m.path "
256
- "WHERE files MATCH ? "
257
- "ORDER BY rank "
258
- "LIMIT ?",
259
- (or_query, limit * 10),
260
- )
261
- rows = cursor.fetchall()
262
-
263
- # Stage 3: post-filter by token coverage
264
- min_ratio = 0.5 if len(tokens) >= 4 else 0.0
361
+ # Post-filter: for multi-token queries, require >=50% token match
362
+ min_ratio = 0.5 if len(tokens) >= 2 else 0.0
265
363
  results = []
266
364
  for row in rows:
267
365
  if _matches_exclude(row["path"], exclude):
268
366
  continue
269
367
 
270
368
  content = row["content"] or ""
271
- ratio = _token_match_ratio(tokens, content + (row["summary"] or "") + (row["tags"] or ""))
369
+ # Compute which tokens matched and the ratio
370
+ matched, ratio = _token_match_info(tokens, content, row)
272
371
  if ratio < min_ratio:
273
372
  continue
274
373
 
@@ -281,6 +380,7 @@ class SearchEngine:
281
380
  "tags": (row["tags"] or "").split(),
282
381
  "public_api": row["public_api"] or "",
283
382
  "score": score,
383
+ "matched_tokens": matched,
284
384
  })
285
385
 
286
386
  conn.close()
@@ -305,17 +405,27 @@ class SearchEngine:
305
405
  """Compute relevance score.
306
406
 
307
407
  Core ranking: BM25 from FTS5 (via 'rank') provides the base score.
308
- On top of that:
408
+ On top of that, a weighted token-count:
309
409
  - Metadata boost: public_api (1.0) > tags (0.8) > summary (0.6)
410
+ - Path boost: exact filename (0.8), filename component (0.7), dir (0.5)
411
+ - Content match: 0.3 (base weight, only if nothing above matched)
310
412
  - Match ratio: fraction of query tokens found in the document
311
413
  - init.py penalty: 0.6x (applied in search())
312
414
  """
313
415
  score = 0.0
314
416
 
417
+ path = row.get("path", "")
315
418
  public_api = (row.get("public_api") or "").lower()
316
419
  summary = (row.get("summary") or "").lower()
317
420
  tags = (row.get("tags") or "").lower()
318
421
 
422
+ filename_lower = os.path.splitext(os.path.basename(path))[0].lower()
423
+ filename_parts = set(re.split(r'[_\-.]+', filename_lower))
424
+ dir_path = os.path.dirname(path).lower()
425
+ dir_tokens = set(_tokenize(dir_path.replace(os.sep, " ").replace("_", " ").replace("-", " ")))
426
+ # Also add dir path segments directly (e.g., "mplot3d" from "mplot3d/axes3d.py")
427
+ dir_tokens.update(re.split(r'[_\-.]+', dir_path.replace(os.sep, " ")))
428
+
319
429
  api_tokens = set(_tokenize(public_api))
320
430
  tag_tokens = set(tags.split())
321
431
  summary_tokens = set(_tokenize(summary))
@@ -325,10 +435,19 @@ class SearchEngine:
325
435
  score += 1.0
326
436
  elif token in tag_tokens:
327
437
  score += 0.8
438
+ elif token == filename_lower:
439
+ # Exact filename match: token IS the filename (axes3d.py for "axes3d")
440
+ score += 0.8
441
+ elif token in filename_parts:
442
+ # Token appears as a component in the filename (e.g. "axes3d" in "rotate_axes3d_sgskip.py")
443
+ score += 0.7
328
444
  elif token in summary_tokens:
329
445
  score += 0.6
446
+ elif token in dir_tokens:
447
+ # Token appears in a directory name (e.g., "mplot3d" in path mplot3d/axes3d.py)
448
+ score += 0.5
330
449
  else:
331
- # Content match via BM25 — base weight
450
+ # Content match via FTS5 — base weight
332
451
  score += 0.3
333
452
 
334
453
  # Multiply by match ratio: files matching more query terms rank higher
@@ -373,6 +492,8 @@ class SearchEngine:
373
492
  "files_without_meta": 0,
374
493
  "total_lines": 0,
375
494
  "index_age": "No index yet",
495
+ "coverage_pct": 0.0,
496
+ "warning": None,
376
497
  }
377
498
 
378
499
  if not os.path.exists(self.db_path):
@@ -391,6 +512,17 @@ class SearchEngine:
391
512
  row = conn.execute("SELECT SUM(content_size) as total FROM file_meta").fetchone()
392
513
  result["total_lines"] = row["total"] or 0
393
514
 
515
+ if result["files_indexed"] > 0:
516
+ result["coverage_pct"] = round(
517
+ (result["files_with_meta"] / result["files_indexed"]) * 100, 1
518
+ )
519
+
520
+ if result["files_indexed"] > 0 and result["coverage_pct"] < 10.0:
521
+ result["warning"] = (
522
+ f"Only {result['coverage_pct']}% of files have metadata. "
523
+ "Search quality will be degraded. Run 'codexlr8 init .' to bootstrap."
524
+ )
525
+
394
526
  mtime = os.path.getmtime(self.db_path)
395
527
  mtime_dt = datetime.fromtimestamp(mtime)
396
528
  age = datetime.now() - mtime_dt
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codexlr8
3
- Version: 0.0.1
3
+ Version: 0.0.2
4
4
  Summary: A codebase search engine for LLM coding agents
5
5
  Author-email: Sadig Akhund <sadigaxund@gmail.com>
6
6
  License: Apache-2.0
@@ -64,11 +64,36 @@ CodeXLR8 indexes your codebase into an SQLite FTS5 database alongside optional `
64
64
 
65
65
  | Layer | Source | Boost |
66
66
  |---|---|---|
67
- | 1 | Raw file content (function names, variables, comments, docstrings) | FTS5 BM25 base |
68
- | 2 | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
67
+ | 1 | Raw file content | 0.3× per token |
68
+ | 2a | File path (filename, directory) | 0.5× – 0.8× |
69
+ | 2b | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
69
70
  | 3 | `.meta.yaml` `public_api` | 1.0× (strongest) |
70
71
 
71
- Search uses AND semantics (like Google): all query tokens must match. If no results, falls back to OR with a ≥50% token threshold.
72
+ Search uses OR semantics with token-coverage scoring: more matching tokens = higher score. A ≥50% post-filter eliminates single-token noise for multi-word queries. Path weighting (Layer 2a) provides differentiation even without metadata — a file whose name IS the query token ranks above one that merely mentions it.
73
+
74
+ ### Scoped search and clustering
75
+
76
+ ```bash
77
+ # Narrow to a specific directory (like grep -rn "pattern" dir/)
78
+ codexlr8 search . "get_visible" --scope lib/mpl_toolkits/
79
+
80
+ # Cluster results by directory to see where matches concentrate
81
+ codexlr8 search . "get_visible" --grouped
82
+ # 12 results in 3 directories (8 files) across project:
83
+ # lib/mpl_toolkits/mplot3d/ (5 files)
84
+ # ─ axes3d.py:388 [score: 0.90]
85
+ # ...
86
+
87
+ # Diagnose your query — see which terms hit, which don't
88
+ codexlr8 search . "axes not hiding" --explain
89
+ # Query analysis:
90
+ # "axes" 212 matches — broad term (212/212 results)
91
+ # "not" 77 matches
92
+ # "hiding" 0 matches — consider dropping or replacing
93
+ # Top score: 1.20 (strong match)
94
+
95
+ # Combine both — group, then scope to drill down
+ codexlr8 search . "get_visible" --grouped --scope lib/mpl_toolkits/
96
+ ```
72
97
 
73
98
  ## .meta.yaml Sidecars
74
99
 
@@ -70,3 +70,29 @@ class TestMCPServerLogic:
70
70
  results = engine.search("login")
71
71
  assert len(results) > 0
72
72
  assert "main.py" in results[0]["path"]
73
+
74
+ def test_search_with_scope(self, tmp_path):
75
+ """Scope parameter restricts search to a path prefix."""
76
+ project = tmp_path / "proj"
77
+ src_dir = project / "src"
78
+ lib_dir = project / "lib"
79
+ src_dir.mkdir(parents=True)
80
+ lib_dir.mkdir(parents=True)
81
+
82
+ (src_dir / "auth.py").write_text("def login(): pass\n")
83
+ (lib_dir / "auth.py").write_text("def login(): pass\n")
84
+
85
+ engine = SearchEngine(str(project))
86
+ engine.build_index()
87
+
88
+ # Without scope: both files match
89
+ results = engine.search("login")
90
+ paths = {r["path"] for r in results}
91
+ assert "src/auth.py" in paths
92
+ assert "lib/auth.py" in paths
93
+
94
+ # With scope: only src/ files
95
+ results = engine.search("login", scope="src")
96
+ paths = {r["path"] for r in results}
97
+ assert "src/auth.py" in paths
98
+ assert "lib/auth.py" not in paths
@@ -2,7 +2,7 @@
2
2
 
3
3
  import json
4
4
 
5
- from codexlr8.search import SearchEngine, _is_init_file, _tokenize, _matches_exclude
5
+ from codexlr8.search import SearchEngine, _is_init_file, _tokenize, _matches_exclude, _group_results, _explain_query
6
6
 
7
7
 
8
8
  class TestHelpers:
@@ -27,6 +27,85 @@ class TestHelpers:
27
27
  assert not _matches_exclude("auth/session.py", ["tests/*", "test_*"])
28
28
  assert not _matches_exclude("models.py", ["tests/*"])
29
29
 
30
+ def test_group_results_empty(self):
31
+ assert _group_results([]) == {"groups": [], "total_files": 0, "total_results": 0}
32
+
33
+ def test_group_results_multi_dir(self):
34
+ results = [
35
+ {"path": "lib/foo/bar.py", "score": 0.9, "summary": "bar module"},
36
+ {"path": "lib/foo/baz.py", "score": 0.7, "summary": "baz module"},
37
+ {"path": "lib/other/qux.py", "score": 0.8, "summary": "qux module"},
38
+ {"path": "src/main.py", "score": 0.5, "summary": "entry point"},
39
+ ]
40
+ grouped = _group_results(results, group_depth=3)
41
+ assert grouped["total_files"] == 4
42
+ assert grouped["total_results"] == 4
43
+ assert len(grouped["groups"]) == 3
44
+
45
+ # Sorted by max score: lib/foo/ (0.9), lib/other/ (0.8), src/ (0.5)
46
+ assert grouped["groups"][0]["prefix"] == "lib/foo/"
47
+ assert grouped["groups"][0]["count"] == 2
48
+ assert grouped["groups"][0]["max_score"] == 0.9
49
+ assert not grouped["groups"][0]["has_more"]
50
+ assert grouped["groups"][1]["prefix"] == "lib/other/"
51
+ assert grouped["groups"][2]["prefix"] == "src/"
52
+
53
+ def test_group_results_root_files(self):
54
+ results = [
55
+ {"path": "main.py", "score": 0.9},
56
+ {"path": "utils.py", "score": 0.7},
57
+ ]
58
+ grouped = _group_results(results)
59
+ assert len(grouped["groups"]) == 1
60
+ assert grouped["groups"][0]["prefix"] == "."
61
+
62
+ def test_group_results_depth_capping(self):
63
+ results = [
64
+ {"path": "a/b/c/d/e/file.py", "score": 0.9},
65
+ ]
66
+ grouped = _group_results(results, group_depth=2)
67
+ assert grouped["groups"][0]["prefix"] == "a/b/"
68
+
69
+ def test_group_results_truncates_per_group(self):
70
+ results = [
71
+ {"path": f"lib/many/file_{i}.py", "score": 0.9 - i * 0.01}
72
+ for i in range(10)
73
+ ]
74
+ grouped = _group_results(results)
75
+ g = grouped["groups"][0]
76
+ assert g["count"] == 10
77
+ assert len(g["files"]) == 3
78
+ assert g["has_more"]
79
+ assert g["remaining"] == 7
80
+
81
+ def test_group_results_sorts_by_max_score(self):
82
+ results = [
83
+ {"path": "lib/low/file.py", "score": 0.3},
84
+ {"path": "src/high/main.py", "score": 0.9},
85
+ {"path": "lib/low/other.py", "score": 0.1},
86
+ ]
87
+ grouped = _group_results(results)
88
+ assert grouped["groups"][0]["prefix"] == "src/high/"
89
+ assert grouped["groups"][1]["prefix"] == "lib/low/"
90
+
91
+ def test_explain_query(self):
92
+ results = [
93
+ {"path": "auth/session.py", "score": 0.9, "summary": "auth module", "tags": ["login"]},
94
+ {"path": "auth/__init__.py", "score": 0.6, "summary": "", "tags": []},
95
+ ]
96
+ data = _explain_query("login auth x", ["login", "auth", "x"], results)
97
+ assert data["query"] == "login auth x"
98
+ assert data["token_hits"]["login"] == 1 # only session.py tags match
99
+ assert data["token_hits"]["auth"] == 2 # both files have "auth" in path
100
+ assert data["token_hits"]["x"] == 0 # zero matches
101
+ assert data["top_score"] == 0.9
102
+ assert data["filtered"] == []
103
+
104
+ def test_explain_query_filtered(self):
105
+ data = _explain_query("go API v2 a", ["go", "api", "v2"], [])
106
+ assert "a" in data["filtered"]
107
+ assert data["token_hits"] == {"go": 0, "api": 0, "v2": 0}
108
+
30
109
 
31
110
  class TestSearchEngine:
32
111
  def test_build_and_search(self, sample_project):
@@ -152,10 +231,13 @@ class TestSearchEngine:
152
231
  )
153
232
  assert result.exit_code == 0
154
233
  data = json.loads(result.output)
155
- assert isinstance(data, list)
156
- if data:
157
- assert "path" in data[0]
158
- assert "score" in data[0]
234
+ assert isinstance(data, dict)
235
+ assert "results" in data
236
+ results_list = data["results"]
237
+ assert isinstance(results_list, list)
238
+ if results_list:
239
+ assert "path" in results_list[0]
240
+ assert "score" in results_list[0]
159
241
 
160
242
  def test_search_cli_exclude_flag(self, sample_project):
161
243
  from click.testing import CliRunner
@@ -174,6 +256,39 @@ class TestSearchEngine:
174
256
  auth_lines = [l for l in lines if "auth/" in l]
175
257
  assert not auth_lines
176
258
 
259
+ def test_search_cli_grouped(self, sample_project):
260
+ from click.testing import CliRunner
261
+ from codexlr8.cli import search
262
+
263
+ engine = SearchEngine(str(sample_project))
264
+ engine.build_index()
265
+
266
+ runner = CliRunner()
267
+ result = runner.invoke(
268
+ search, [str(sample_project), "login", "--grouped"]
269
+ )
270
+ assert result.exit_code == 0
271
+ # Should show directory groupings and the scope hint
272
+ assert "Use --scope" in result.output
273
+ assert "(" in result.output # file count per dir
274
+
275
+
276
+ def test_search_cli_explain(self, sample_project):
277
+ from click.testing import CliRunner
278
+ from codexlr8.cli import search
279
+
280
+ engine = SearchEngine(str(sample_project))
281
+ engine.build_index()
282
+
283
+ runner = CliRunner()
284
+ result = runner.invoke(
285
+ search, [str(sample_project), "login", "--explain"]
286
+ )
287
+ assert result.exit_code == 0
288
+ assert "Query analysis" in result.output
289
+ assert '"login"' in result.output
290
+ assert "matches" in result.output
291
+
177
292
 
178
293
  class TestCLIIndexAndStatus:
179
294
  def test_index_command(self, sample_project):
File without changes
File without changes
File without changes
File without changes
File without changes