PyPI - codexlr8 - Versions diffs - 0.0.1__tar.gz → 0.0.2__tar.gz - Mend

codexlr8 0.0.1 (tar.gz) → 0.0.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

{codexlr8-0.0.1 → codexlr8-0.0.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: codexlr8
-Version: 0.0.1
+Version: 0.0.2
 Summary: A codebase search engine for LLM coding agents
 Author-email: Sadig Akhund <sadigaxund@gmail.com>
 License: Apache-2.0
@@ -64,11 +64,36 @@ CodeXLR8 indexes your codebase into an SQLite FTS5 database alongside optional `
 | Layer | Source | Boost |
 |---|---|---|
-| 1 | Raw file content (function names, variables, comments, docstrings) | FTS5 BM25 base |
-| 2 | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
+| 1 | Raw file content | 0.3× per token |
+| 2a | File path (filename, directory) | 0.5× – 0.8× |
+| 2b | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
 | 3 | `.meta.yaml` `public_api` | 1.0× (strongest) |
-Search uses AND semantics (like Google): all query tokens must match. If no results, falls back to OR with a ≥50% token threshold.
+Search uses OR semantics with token-coverage scoring: more matching tokens = higher score. A ≥50% post-filter eliminates single-token noise for multi-word queries. Path weighting (Layer 2a) provides differentiation even without metadata — a file whose name IS the query token ranks above one that merely mentions it.
+### Scoped search and clustering
+```bash
+# Narrow to a specific directory (like grep -rn "pattern" dir/)
+codexlr8 search . "get_visible" --scope lib/mpl_toolkits/
+# Cluster results by directory to see where matches concentrate
+codexlr8 search . "get_visible" --grouped
+# 12 results in 3 directories (8 files) across project:
+#   lib/mpl_toolkits/mplot3d/  (5 files)
+#     ─ axes3d.py:388  [score: 0.90]
+#     ...
+# Diagnose your query — see which terms hit, which don't
+codexlr8 search . "axes not hiding" --explain
+# Query analysis:
+#   "axes"    212 matches  — broad term (212/212 results)
+#   "not"     77 matches
+#   "hiding"  0 matches    — consider dropping or replacing
+#   Top score: 1.20 (strong match)
+# Combine both — group, then scope to drill down
+```
 ## .meta.yaml Sidecars

{codexlr8-0.0.1 → codexlr8-0.0.2}/README.md RENAMED Viewed

@@ -35,11 +35,36 @@ CodeXLR8 indexes your codebase into an SQLite FTS5 database alongside optional `
 | Layer | Source | Boost |
 |---|---|---|
-| 1 | Raw file content (function names, variables, comments, docstrings) | FTS5 BM25 base |
-| 2 | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
+| 1 | Raw file content | 0.3× per token |
+| 2a | File path (filename, directory) | 0.5× – 0.8× |
+| 2b | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
 | 3 | `.meta.yaml` `public_api` | 1.0× (strongest) |
-Search uses AND semantics (like Google): all query tokens must match. If no results, falls back to OR with a ≥50% token threshold.
+Search uses OR semantics with token-coverage scoring: more matching tokens = higher score. A ≥50% post-filter eliminates single-token noise for multi-word queries. Path weighting (Layer 2a) provides differentiation even without metadata — a file whose name IS the query token ranks above one that merely mentions it.
+### Scoped search and clustering
+```bash
+# Narrow to a specific directory (like grep -rn "pattern" dir/)
+codexlr8 search . "get_visible" --scope lib/mpl_toolkits/
+# Cluster results by directory to see where matches concentrate
+codexlr8 search . "get_visible" --grouped
+# 12 results in 3 directories (8 files) across project:
+#   lib/mpl_toolkits/mplot3d/  (5 files)
+#     ─ axes3d.py:388  [score: 0.90]
+#     ...
+# Diagnose your query — see which terms hit, which don't
+codexlr8 search . "axes not hiding" --explain
+# Query analysis:
+#   "axes"    212 matches  — broad term (212/212 results)
+#   "not"     77 matches
+#   "hiding"  0 matches    — consider dropping or replacing
+#   Top score: 1.20 (strong match)
+# Combine both — group, then scope to drill down
+```
 ## .meta.yaml Sidecars

{codexlr8-0.0.1 → codexlr8-0.0.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "codexlr8"
-version = "0.0.1"
+version = "0.0.2"
 description = "A codebase search engine for LLM coding agents"
 readme = "README.md"
 requires-python = ">=3.10"

{codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """CodeXLR8 — A codebase search engine for LLM coding agents."""
-__version__ = "0.0.1"
+__version__ = "0.0.2"

{codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8/cli.py RENAMED Viewed

@@ -1,12 +1,13 @@
 """CodeXLR8 CLI — search-first codebase navigation for agents."""
 import asyncio
+import os
 import click
 from .config import load_config
 from .scanner import scan_project
 from .meta import generate_missing_sidecars
-from .search import SearchEngine
+from .search import SearchEngine, _group_results, _explain_query, _tokenize
 EXCLUDE_HELP = (
@@ -62,10 +63,19 @@ def scan(project_path: str, output: str | None):
 @click.argument("query")
 @click.option("--exclude", "-x", "exclude_patterns", multiple=True,
               callback=_parse_excludes, help=EXCLUDE_HELP)
+@click.option("--scope", "-s", default=None,
+              help="Restrict search to files under a path prefix (e.g. src/ or lib/mpl_toolkits/)")
+@click.option("--grouped", "-g", is_flag=True, default=False,
+              help="Cluster results by directory before listing files")
+@click.option("--explain", "-e", is_flag=True, default=False,
+              help="Show token breakdown and query diagnostics")
+@click.option("--group-depth", default=3,
+              help="Max directory depth for grouping (default: 3)")
 @click.option("--format", "-f", "output_format",
               type=click.Choice(["text", "json"]), default="text")
 @click.option("--limit", "-n", default=10, help="Maximum number of results")
 def search(project_path: str, query: str, exclude_patterns: list[str],
+           scope: str | None, grouped: bool, explain: bool, group_depth: int,
            output_format: str, limit: int):
     """Search the codebase for code matching QUERY.
@@ -74,19 +84,52 @@ def search(project_path: str, query: str, exclude_patterns: list[str],
     \b
     Examples:
       codexlr8 search . "login auth"
+      codexlr8 search . "login auth" --grouped
+      codexlr8 search . "login auth" --explain
       codexlr8 search . "login auth" --exclude "tests/*"
       codexlr8 search . "login auth" -x "tests/*" -x "vendor/*"
+      codexlr8 search . "get_visible" --scope lib/mpl_toolkits/
     """
     engine = SearchEngine(project_path)
-    results = engine.search(query, limit=limit, exclude=exclude_patterns)
+    results = engine.search(query, limit=limit, exclude=exclude_patterns, scope=scope)
     if output_format == "json":
         import json
-        click.echo(json.dumps(results, indent=2))
+        output = {"results": results}
+        if explain:
+            output["explain"] = _explain_query(query, _tokenize(query), results)
+        if grouped:
+            groups_data = _group_results(results, group_depth)
+            output["grouped"] = True
+            output["groups"] = groups_data["groups"]
+            output["summary"] = {
+                "total_results": groups_data["total_results"],
+                "total_files": groups_data["total_files"],
+                "total_groups": len(groups_data["groups"]),
+            }
+        click.echo(json.dumps(output, indent=2))
         return
     if not results:
         click.echo("No results found.")
+        if explain:
+            tokens = _tokenize(query)
+            click.echo()
+            click.echo("Query analysis:")
+            for t in tokens:
+                click.echo(f"  \"{t}\"  \u2717 no matches")
+            click.echo()
+            click.echo("0 tokens matched. All terms are absent from the codebase.")
+        return
+    if explain:
+        tokens = _tokenize(query)
+        explain_data = _explain_query(query, tokens, results)
+        _print_explain(explain_data)
+        click.echo()
+    if grouped:
+        _print_grouped(results, group_depth, scope)
         return
     for i, r in enumerate(results, 1):
@@ -96,6 +139,8 @@ def search(project_path: str, query: str, exclude_patterns: list[str],
             click.echo(f"   meta:   {r['summary']}")
         if r.get("tags"):
             click.echo(f"   tags:   {', '.join(r['tags'])}")
+        if r.get("matched_tokens"):
+            click.echo(f"   matched: {', '.join(r['matched_tokens'])}")
         if r.get("preview"):
             click.echo("   preview: |")
             for line in r["preview"].strip().splitlines()[:6]:
@@ -151,6 +196,10 @@ def status(project_path: str):
     click.echo(f"Files without .meta.yaml: {state['files_without_meta']}")
     click.echo(f"Total lines indexed: {state['total_lines']}")
     click.echo(f"Index age: {state.get('index_age', 'N/A')}")
+    click.echo(f"Coverage: {state.get('coverage_pct', 0)}%")
+    if state.get("warning"):
+        click.echo()
+        click.secho(f"  Warning: {state['warning']}", fg="yellow")
 @main.command()
@@ -240,7 +289,8 @@ def setup(project_path: str):
     include = [p.strip() for p in custom_include.split(",") if p.strip()]
     click.echo()
-    defaults = ["tests/*", "test/*", "spec/*", "__tests__/*", "test_*", "*_test.*"]
+    defaults = ["tests/*", "test/*", "spec/*", "__tests__/*", "test_*", "*_test.*",
+                "examples/*", "docs/*", "tutorials/*", "benchmarks/*"]
     custom_exclude = click.prompt(
         click.style("    Exclude (comma-separated)", fg="bright_white"),
         default=", ".join(defaults),
@@ -306,6 +356,102 @@ def setup(project_path: str):
     click.secho("  Run 'codexlr8 index .' to build your first search index.", dim=True)
+def _print_explain(data: dict):
+    """Print query diagnostic breakdown."""
+    click.secho("Query analysis:", fg="cyan", bold=True)
+    click.echo(f"  Original:  \"{data['query']}\"")
+    click.echo(f"  Tokens:    {', '.join(data['tokens'])}")
+    click.echo()
+    for token in data["tokens"]:
+        hits = data["token_hits"].get(token, 0)
+        total = data["total_results"]
+        if hits == 0:
+            status = click.style(f"{hits} matches", fg="red")
+            hint = " — consider dropping or replacing"
+        elif hits <= 3:
+            status = click.style(f"{hits} matches", fg="yellow")
+            hint = " — very specific"
+        elif hits <= total * 0.1:
+            status = click.style(f"{hits} matches", fg="green")
+            hint = ""
+        else:
+            status = click.style(f"{hits} matches", fg="yellow")
+            hint = f" — broad term ({hits}/{total} results)"
+        click.echo(f"  \"{token}\"  {status}{hint}")
+    for fw in data["filtered"]:
+        click.echo(f"  \"{fw}\"  {click.style('filtered', fg='yellow')} — single letter, ignored")
+    click.echo()
+    top = data["top_score"]
+    if top < 0.60:
+        quality = click.style("weak", fg="red")
+    elif top < 1.20:
+        quality = click.style("moderate", fg="yellow")
+    else:
+        quality = click.style("strong", fg="green")
+    click.echo(f"  Top score: {top} ({quality} match)")
+    if data["filtered"]:
+        click.echo(click.style("  Tip:", dim=True) + " single-letter words are ignored. Use full terms.")
+    zero_match = [t for t in data["tokens"] if data["token_hits"].get(t, 0) == 0]
+    if zero_match:
+        click.echo(click.style("  Tip:", dim=True) + f" \"{zero_match[0]}\" doesn't exist — try a synonym or drop it.")
+def _print_grouped(results: list[dict], group_depth: int, scope: str | None):
+    """Print search results clustered by directory."""
+    groups_data = _group_results(results, group_depth)
+    groups = groups_data["groups"]
+    total = groups_data["total_results"]
+    files = groups_data["total_files"]
+    scope_label = f"in {scope}" if scope else "across project"
+    click.echo(f"{total} results in {len(groups)} directories ({files} files) {scope_label}:")
+    click.echo()
+    top_groups = groups[:5]
+    for g in top_groups:
+        # Directory header with match count
+        label = g["prefix"].rstrip(os.sep)
+        click.echo(f"{label}/  ({g['count']} files)")
+        for f in g["files"]:
+            line_info = f"{f['path']}:{f['line_start']}-{f['line_end']}"
+            score_info = f"{f['score']:.2f}"
+            click.echo(f"  {click.style(line_info, fg='cyan')}  "
+                       f"[score: {score_info}]")
+            # Summary line from preview or metadata
+            if f.get("summary"):
+                click.echo(f"    {f['summary']}")
+            elif f.get("preview"):
+                first_line = f["preview"].strip().splitlines()[0].strip() if f["preview"].strip() else ""
+                if first_line:
+                    click.echo(f"    {first_line[:100]}")
+        if g["has_more"]:
+            click.echo(f"  ... and {g['remaining']} more files")
+        click.echo()
+    if len(groups) > 5:
+        click.echo(f"... and {len(groups) - 5} more directories")
+    # Scope hint
+    click.echo()
+    if scope:
+        click.echo(click.style("Already scoped. Remove --scope to broaden.", dim=True))
+    else:
+        click.echo(
+            click.style(
+                f"Use --scope <dir> to narrow results (e.g. --scope {top_groups[0]['prefix']})",
+                dim=True
+            )
+        )
 def _inject_mcp_config(config_path: str, mcp_json: str) -> None:
     """Inject the CodeXLR8 MCP config into an existing client config file.
@@ -406,7 +552,51 @@ codebase_search(query="stripe charge customer refund")
 codebase_search(query="shopping cart checkout payment")
 ```
-Describe what you're looking for in natural language. The engine uses AND semantics — more terms increase precision, not noise.
+### Query strategy
+Describe what you're looking for in natural language. The engine uses OR semantics with a scoring layer — more terms increase precision through token-coverage ranking, not a hard AND requirement.
+**Good queries use distinct, discriminating terms:**
+| Task | Good query | Why |
+|---|---|---|
+| Fix login bug | `"login auth session token"` | Covers auth module, session, tokens — distinct terms, not synonyms |
+| Payment refund | `"stripe refund charge customer"` | Each term narrows to a different aspect of the feature |
+| 3D plot visibility | `"axes3d draw visible renderer"` | Domain term + method + symptom — different dimensions of the bug |
+| Checkout flow | `"checkout cart payment order"` | Covers all stages of the flow |
+**What to avoid:**
+- Single-word queries (`"login"`) — too broad, returns everything mentioning login
+- Synonyms (`"login authenticate signin"`) — redundant, wastes tokens without improving coverage
+- Full sentences (`"I need to find where user login happens"`) — stop words like `"I"`, `"need"`, `"to"` are filtered out
+### Using scope and grouping
+When you know which directory the code lives in, scope the search:
+```
+codebase_search(query="get_visible", scope="lib/mpl_toolkits/")
+```
+When you don't know, run a shell command to see where results cluster:
+```bash
+codexlr8 search . "get_visible" --grouped
+```
+This prints directories ranked by their highest-scoring file, with a `--scope` hint to copy into your next MCP call.
+### When results don't look right
+Check the `matched` field on each result. If a file you expected isn't showing, the missing token tells you what to adjust. If all results only match 1 of 4 tokens, your terms are too scattered — try removing one.
+For deeper diagnostics, run:
+```bash
+codexlr8 search . "your query" --explain
+```
+This shows per-token hit counts and flags zero-match terms so you can refine before calling `codebase_search` again.
 ## Interpreting results
@@ -418,9 +608,10 @@ Results include:
 | `score` | Relevance (higher = better) |
 | `summary` | Human-written description of the file's purpose |
 | `tags` | Curated keywords (auth, payment, cart, etc.) |
+| `matched` | Which query tokens the file matched — use this to debug failed searches |
 | `preview` | First ~10 lines around the best match |
-**Ranking:** Files with curated `.meta.yaml` (summary + tags) rank highest. Raw content matches rank lower. `__init__.py` re-exports are penalized.
+**Ranking:** Files with curated `.meta.yaml` (summary + tags) rank highest, followed by filename matches, then path directory matches. Raw content matches rank lowest. `__init__.py` re-exports are penalized.
 ## Maintaining the index
@@ -508,6 +699,9 @@ Exclude patterns are globs that match file paths. Use `*` for wildcards.
 | Task | Tool call |
 |---|---|
 | Find code for a feature | `codebase_search(query="...")` |
+| Search within a directory | `codebase_search(query="...", scope="src/")` |
+| Cluster results by directory | Shell: `codexlr8 search . "query" --grouped` |
+| Diagnose query terms | Shell: `codexlr8 search . "query" --explain` |
 | Build/update index | `codebase_index(incremental=true)` |
 | Check metadata coverage | Shell: `codexlr8 status .` |
 | Bootstrap missing sidecars | Shell: `codexlr8 init .` |

{codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8/config.py RENAMED Viewed

@@ -32,6 +32,10 @@ def _defaults() -> dict:
             "__tests__/*",
             "test_*",
             "*_test.*",
+            "examples/*",
+            "docs/*",
+            "tutorials/*",
+            "benchmarks/*",
         ],
         "extensions": [
             ".py", ".js", ".ts", ".jsx", ".tsx", ".go", ".rs", ".rb",

{codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8/mcp_server.py RENAMED Viewed

@@ -61,6 +61,12 @@ async def list_tools() -> list[Tool]:
                         "description": "Glob patterns for files to exclude. "
                                        "Uses .codexlr8.yaml defaults if not set.",
                     },
+                    "scope": {
+                        "type": "string",
+                        "description": "Restrict search to files under a path prefix "
+                                       "(e.g. 'src/' or 'lib/mpl_toolkits/'). "
+                                       "Acts as grep -rn's directory filter.",
+                    },
                 },
                 "required": ["query"],
             },
@@ -111,9 +117,10 @@ async def _handle_search(args: dict) -> list[TextContent]:
     query = args["query"]
     limit = args.get("limit", 10)
     exclude = args.get("exclude")
+    scope = args.get("scope")
     engine = SearchEngine(project_path)
-    results = engine.search(query, limit=limit, exclude=exclude)
+    results = engine.search(query, limit=limit, exclude=exclude, scope=scope)
     if not results:
         return [TextContent(type="text", text="No results found.")]
@@ -128,6 +135,8 @@ async def _handle_search(args: dict) -> list[TextContent]:
             lines.append(f"   summary: {r['summary']}")
         if r.get("tags"):
             lines.append(f"   tags: {', '.join(r['tags'])}")
+        if r.get("matched_tokens"):
+            lines.append(f"   matched: {', '.join(r['matched_tokens'])}")
         if r.get("preview"):
             lines.append("   preview: |")
             for pline in r["preview"].strip().splitlines()[:6]:

{codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8/search.py RENAMED Viewed

@@ -27,6 +27,54 @@ def _tokenize(text: str) -> list[str]:
     return [t for t in tokens if len(t) > 1 or t.isdigit()]  # skip single letters
+def _explain_query(query: str, tokens: list[str], results: list[dict]) -> dict:
+    """Generate query diagnostic breakdown for --explain.
+    Returns per-token hit counts, filtered words, top score — gives
+    the agent the data it needs to course-correct a search query.
+    """
+    # Detect words in original query that were filtered by the tokenizer
+    raw_lower = query.lower()
+    raw_words = re.findall(r"[a-zA-Z_][a-zA-Z0-9_]*|\d+", raw_lower)
+    filtered = [w for w in raw_words if w not in tokens and len(w) == 1]
+    # Per-token hit counts across all results
+    token_hits: dict[str, int] = {}
+    for token in tokens:
+        count = 0
+        for r in results:
+            text = (
+                (r.get("summary") or "") + " " +
+                " ".join(r.get("tags", [])) + " " +
+                r.get("path", "")
+            ).lower()
+            if token in text:
+                count += 1
+        token_hits[token] = count
+    top_score = max((r["score"] for r in results), default=0.0)
+    return {
+        "query": query,
+        "tokens": tokens,
+        "token_hits": token_hits,
+        "filtered": filtered,
+        "total_results": len(results),
+        "top_score": round(top_score, 2),
+    }
+def _token_match_info(tokens: list[str], content: str, row) -> tuple[list[str], float]:
+    """Return which query tokens matched and the match ratio."""
+    if not tokens:
+        return [], 0.0
+    summary = (row["summary"] or "") if row["summary"] else ""
+    tags = (row["tags"] or "") if row["tags"] else ""
+    text_lower = (content + " " + summary + " " + tags).lower()
+    matched = [t for t in tokens if t in text_lower]
+    return matched, len(matched) / len(tokens)
 def _token_match_ratio(tokens: list[str], text: str) -> float:
     """What fraction of query tokens appear in the document text?"""
     if not tokens:
@@ -47,6 +95,53 @@ def _matches_exclude(path: str, excludes: list[str]) -> bool:
     return False
+def _group_results(results: list[dict], group_depth: int = 3) -> dict:
+    """Group flat search results by directory prefix for cluster display.
+    Returns a dict with 'groups', 'total_files', 'total_results'.
+    Each group has: prefix, count, max_score, files (top 3 per group).
+    """
+    if not results:
+        return {"groups": [], "total_files": 0, "total_results": 0}
+    groups: dict[str, list[dict]] = {}
+    seen_paths: set[str] = set()
+    for r in results:
+        path = r["path"]
+        dir_parts = path.split(os.sep)[:-1]  # exclude filename
+        if not dir_parts:
+            prefix = "."
+        else:
+            prefix = os.sep.join(dir_parts[:group_depth]) + os.sep
+        if prefix not in groups:
+            groups[prefix] = []
+        groups[prefix].append(r)
+        seen_paths.add(path)
+    group_list = []
+    for prefix, files in groups.items():
+        # Keep files sorted by score within group
+        files.sort(key=lambda f: f["score"], reverse=True)
+        group_list.append({
+            "prefix": prefix,
+            "count": len(files),
+            "max_score": files[0]["score"],
+            "files": files[:3],  # top 3 per group for display
+            "has_more": len(files) > 3,
+            "remaining": len(files) - 3 if len(files) > 3 else 0,
+        })
+    group_list.sort(key=lambda g: g["max_score"], reverse=True)
+    return {
+        "groups": group_list,
+        "total_files": len(seen_paths),
+        "total_results": len(results),
+    }
 class SearchEngine:
     """SQLite FTS5-backed search engine for a codebase."""
@@ -212,12 +307,18 @@ class SearchEngine:
         )
     def search(self, query: str, limit: int = 10,
-               exclude: list[str] | None = None) -> list[dict]:
+               exclude: list[str] | None = None,
+               scope: str | None = None) -> list[dict]:
         """Search the codebase and return ranked results.
-        Uses AND semantics: all query tokens must match (like Google).
-        Falls back to OR if AND returns nothing, with a post-filter
-        requiring at least 50% of query tokens to match the document.
+        Uses OR semantics: any token can match. The custom scoring layer
+        (path weighting, metadata boosts, match ratio) naturally surfaces
+        files that match more tokens. A post-filter requires >=50% of query
+        tokens to match for multi-token queries.
+        This replaces the previous AND-then-OR fallback, which caused precise
+        multi-token queries to return zero results (AND too strict) or too
+        many flatly-scored results (OR fallback with no differentiation).
         """
         if not os.path.exists(self.db_path):
             return []
@@ -231,44 +332,42 @@ class SearchEngine:
         conn = self._get_connection()
-        # Stage 1: try AND (best precision)
-        and_query = " AND ".join(tokens)
+        # Build scope clause for path-prefix filtering
+        scope_clause = ""
+        scope_params: list[str] = []
+        if scope:
+            scope_norm = scope.rstrip("/")
+            scope_clause = "AND f.path LIKE ?"
+            scope_params = [scope_norm + "/%"]
+        # Always use OR semantics. Multi-token matches naturally rank higher
+        # via _compute_score (match_ratio scales with token coverage).
+        or_query = " OR ".join(tokens)
+        # Fetch more than needed — scoring will filter to top limit
+        fetch_limit = max(limit * 20, 200)
         cursor = conn.execute(
             "SELECT f.path, f.summary, f.tags, f.public_api, f.content, "
             "       m.is_init, rank "
             "FROM files f "
             "JOIN file_meta m ON f.path = m.path "
             "WHERE files MATCH ? "
+            + scope_clause + " "
             "ORDER BY rank "
             "LIMIT ?",
-            (and_query, limit * 5),
+            [or_query] + scope_params + [fetch_limit],
         )
         rows = cursor.fetchall()
-        # Stage 2: fall back to OR if AND found nothing
-        if not rows and len(tokens) > 1:
-            or_query = " OR ".join(tokens)
-            cursor = conn.execute(
-                "SELECT f.path, f.summary, f.tags, f.public_api, f.content, "
-                "       m.is_init, rank "
-                "FROM files f "
-                "JOIN file_meta m ON f.path = m.path "
-                "WHERE files MATCH ? "
-                "ORDER BY rank "
-                "LIMIT ?",
-                (or_query, limit * 10),
-            )
-            rows = cursor.fetchall()
-        # Stage 3: post-filter by token coverage
-        min_ratio = 0.5 if len(tokens) >= 4 else 0.0
+        # Post-filter: for multi-token queries, require >=50% token match
+        min_ratio = 0.5 if len(tokens) >= 2 else 0.0
         results = []
         for row in rows:
             if _matches_exclude(row["path"], exclude):
                 continue
             content = row["content"] or ""
-            ratio = _token_match_ratio(tokens, content + (row["summary"] or "") + (row["tags"] or ""))
+            # Compute which tokens matched and the ratio
+            matched, ratio = _token_match_info(tokens, content, row)
             if ratio < min_ratio:
                 continue
@@ -281,6 +380,7 @@ class SearchEngine:
                 "tags": (row["tags"] or "").split(),
                 "public_api": row["public_api"] or "",
                 "score": score,
+                "matched_tokens": matched,
             })
         conn.close()
@@ -305,17 +405,27 @@ class SearchEngine:
         """Compute relevance score.
         Core ranking: BM25 from FTS5 (via 'rank') provides the base score.
-        On top of that:
+        On top of that, a weighted token-count:
         - Metadata boost: public_api (1.0) > tags (0.8) > summary (0.6)
+        - Path boost: exact filename (0.8), filename component (0.7), dir (0.5)
+        - Content match: 0.3 (base weight, only if nothing above matched)
         - Match ratio: fraction of query tokens found in the document
         - init.py penalty: 0.6x (applied in search())
         """
         score = 0.0
+        path = row.get("path", "")
         public_api = (row.get("public_api") or "").lower()
         summary = (row.get("summary") or "").lower()
         tags = (row.get("tags") or "").lower()
+        filename_lower = os.path.splitext(os.path.basename(path))[0].lower()
+        filename_parts = set(re.split(r'[_\-.]+', filename_lower))
+        dir_path = os.path.dirname(path).lower()
+        dir_tokens = set(_tokenize(dir_path.replace(os.sep, " ").replace("_", " ").replace("-", " ")))
+        # Also add dir path segments directly (e.g., "mplot3d" from "mplot3d/axes3d.py")
+        dir_tokens.update(re.split(r'[_\-.]+', dir_path.replace(os.sep, " ")))
         api_tokens = set(_tokenize(public_api))
         tag_tokens = set(tags.split())
         summary_tokens = set(_tokenize(summary))
@@ -325,10 +435,19 @@ class SearchEngine:
                 score += 1.0
             elif token in tag_tokens:
                 score += 0.8
+            elif token == filename_lower:
+                # Exact filename match: token IS the filename (axes3d.py for "axes3d")
+                score += 0.8
+            elif token in filename_parts:
+                # Token appears as a component in the filename (e.g. "axes3d" in "rotate_axes3d_sgskip.py")
+                score += 0.7
             elif token in summary_tokens:
                 score += 0.6
+            elif token in dir_tokens:
+                # Token appears in a directory name (e.g., "mplot3d" in path mplot3d/axes3d.py)
+                score += 0.5
             else:
-                # Content match via BM25 — base weight
+                # Content match via FTS5 — base weight
                 score += 0.3
         # Multiply by match ratio: files matching more query terms rank higher
@@ -373,6 +492,8 @@ class SearchEngine:
             "files_without_meta": 0,
             "total_lines": 0,
             "index_age": "No index yet",
+            "coverage_pct": 0.0,
+            "warning": None,
         }
         if not os.path.exists(self.db_path):
@@ -391,6 +512,17 @@ class SearchEngine:
         row = conn.execute("SELECT SUM(content_size) as total FROM file_meta").fetchone()
         result["total_lines"] = row["total"] or 0
+        if result["files_indexed"] > 0:
+            result["coverage_pct"] = round(
+                (result["files_with_meta"] / result["files_indexed"]) * 100, 1
+            )
+        if result["files_indexed"] > 0 and result["coverage_pct"] < 10.0:
+            result["warning"] = (
+                f"Only {result['coverage_pct']}% of files have metadata. "
+                "Search quality will be degraded. Run 'codexlr8 init .' to bootstrap."
+            )
         mtime = os.path.getmtime(self.db_path)
         mtime_dt = datetime.fromtimestamp(mtime)
         age = datetime.now() - mtime_dt

{codexlr8-0.0.1 → codexlr8-0.0.2}/src/codexlr8.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: codexlr8
-Version: 0.0.1
+Version: 0.0.2
 Summary: A codebase search engine for LLM coding agents
 Author-email: Sadig Akhund <sadigaxund@gmail.com>
 License: Apache-2.0
@@ -64,11 +64,36 @@ CodeXLR8 indexes your codebase into an SQLite FTS5 database alongside optional `
 | Layer | Source | Boost |
 |---|---|---|
-| 1 | Raw file content (function names, variables, comments, docstrings) | FTS5 BM25 base |
-| 2 | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
+| 1 | Raw file content | 0.3× per token |
+| 2a | File path (filename, directory) | 0.5× – 0.8× |
+| 2b | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
 | 3 | `.meta.yaml` `public_api` | 1.0× (strongest) |
-Search uses AND semantics (like Google): all query tokens must match. If no results, falls back to OR with a ≥50% token threshold.
+Search uses OR semantics with token-coverage scoring: more matching tokens = higher score. A ≥50% post-filter eliminates single-token noise for multi-word queries. Path weighting (Layer 2a) provides differentiation even without metadata — a file whose name IS the query token ranks above one that merely mentions it.
+### Scoped search and clustering
+```bash
+# Narrow to a specific directory (like grep -rn "pattern" dir/)
+codexlr8 search . "get_visible" --scope lib/mpl_toolkits/
+# Cluster results by directory to see where matches concentrate
+codexlr8 search . "get_visible" --grouped
+# 12 results in 3 directories (8 files) across project:
+#   lib/mpl_toolkits/mplot3d/  (5 files)
+#     ─ axes3d.py:388  [score: 0.90]
+#     ...
+# Diagnose your query — see which terms hit, which don't
+codexlr8 search . "axes not hiding" --explain
+# Query analysis:
+#   "axes"    212 matches  — broad term (212/212 results)
+#   "not"     77 matches
+#   "hiding"  0 matches    — consider dropping or replacing
+#   Top score: 1.20 (strong match)
+# Combine both: run with --grouped first, then re-run with --scope on the directory you want to drill into
+```
 ## .meta.yaml Sidecars

{codexlr8-0.0.1 → codexlr8-0.0.2}/tests/test_mcp_server.py RENAMED Viewed

@@ -70,3 +70,29 @@ class TestMCPServerLogic:
         results = engine.search("login")
         assert len(results) > 0
         assert "main.py" in results[0]["path"]
+    def test_search_with_scope(self, tmp_path):
+        """Scope parameter restricts search to a path prefix."""
+        project = tmp_path / "proj"
+        src_dir = project / "src"
+        lib_dir = project / "lib"
+        src_dir.mkdir(parents=True)
+        lib_dir.mkdir(parents=True)
+        (src_dir / "auth.py").write_text("def login(): pass\n")
+        (lib_dir / "auth.py").write_text("def login(): pass\n")
+        engine = SearchEngine(str(project))
+        engine.build_index()
+        # Without scope: both files match
+        results = engine.search("login")
+        paths = {r["path"] for r in results}
+        assert "src/auth.py" in paths
+        assert "lib/auth.py" in paths
+        # With scope: only src/ files
+        results = engine.search("login", scope="src")
+        paths = {r["path"] for r in results}
+        assert "src/auth.py" in paths
+        assert "lib/auth.py" not in paths

{codexlr8-0.0.1 → codexlr8-0.0.2}/tests/test_search.py RENAMED Viewed

@@ -2,7 +2,7 @@
 import json
-from codexlr8.search import SearchEngine, _is_init_file, _tokenize, _matches_exclude
+from codexlr8.search import SearchEngine, _is_init_file, _tokenize, _matches_exclude, _group_results, _explain_query
 class TestHelpers:
@@ -27,6 +27,85 @@ class TestHelpers:
         assert not _matches_exclude("auth/session.py", ["tests/*", "test_*"])
         assert not _matches_exclude("models.py", ["tests/*"])
+    def test_group_results_empty(self):
+        assert _group_results([]) == {"groups": [], "total_files": 0, "total_results": 0}
+    def test_group_results_multi_dir(self):
+        results = [
+            {"path": "lib/foo/bar.py", "score": 0.9, "summary": "bar module"},
+            {"path": "lib/foo/baz.py", "score": 0.7, "summary": "baz module"},
+            {"path": "lib/other/qux.py", "score": 0.8, "summary": "qux module"},
+            {"path": "src/main.py", "score": 0.5, "summary": "entry point"},
+        ]
+        grouped = _group_results(results, group_depth=3)
+        assert grouped["total_files"] == 4
+        assert grouped["total_results"] == 4
+        assert len(grouped["groups"]) == 3
+        # Sorted by max score: lib/foo/ (0.9), lib/other/ (0.8), src/ (0.5)
+        assert grouped["groups"][0]["prefix"] == "lib/foo/"
+        assert grouped["groups"][0]["count"] == 2
+        assert grouped["groups"][0]["max_score"] == 0.9
+        assert not grouped["groups"][0]["has_more"]
+        assert grouped["groups"][1]["prefix"] == "lib/other/"
+        assert grouped["groups"][2]["prefix"] == "src/"
+    def test_group_results_root_files(self):
+        results = [
+            {"path": "main.py", "score": 0.9},
+            {"path": "utils.py", "score": 0.7},
+        ]
+        grouped = _group_results(results)
+        assert len(grouped["groups"]) == 1
+        assert grouped["groups"][0]["prefix"] == "."
+    def test_group_results_depth_capping(self):
+        results = [
+            {"path": "a/b/c/d/e/file.py", "score": 0.9},
+        ]
+        grouped = _group_results(results, group_depth=2)
+        assert grouped["groups"][0]["prefix"] == "a/b/"
+    def test_group_results_truncates_per_group(self):
+        results = [
+            {"path": f"lib/many/file_{i}.py", "score": 0.9 - i * 0.01}
+            for i in range(10)
+        ]
+        grouped = _group_results(results)
+        g = grouped["groups"][0]
+        assert g["count"] == 10
+        assert len(g["files"]) == 3
+        assert g["has_more"]
+        assert g["remaining"] == 7
+    def test_group_results_sorts_by_max_score(self):
+        results = [
+            {"path": "lib/low/file.py", "score": 0.3},
+            {"path": "src/high/main.py", "score": 0.9},
+            {"path": "lib/low/other.py", "score": 0.1},
+        ]
+        grouped = _group_results(results)
+        assert grouped["groups"][0]["prefix"] == "src/high/"
+        assert grouped["groups"][1]["prefix"] == "lib/low/"
+    def test_explain_query(self):
+        results = [
+            {"path": "auth/session.py", "score": 0.9, "summary": "auth module", "tags": ["login"]},
+            {"path": "auth/__init__.py", "score": 0.6, "summary": "", "tags": []},
+        ]
+        data = _explain_query("login auth x", ["login", "auth", "x"], results)
+        assert data["query"] == "login auth x"
+        assert data["token_hits"]["login"] == 1  # only session.py tags match
+        assert data["token_hits"]["auth"] == 2   # both files have "auth" in path
+        assert data["token_hits"]["x"] == 0      # zero matches
+        assert data["top_score"] == 0.9
+        assert data["filtered"] == []
+    def test_explain_query_filtered(self):
+        data = _explain_query("go API v2 a", ["go", "api", "v2"], [])
+        assert "a" in data["filtered"]
+        assert data["token_hits"] == {"go": 0, "api": 0, "v2": 0}
 class TestSearchEngine:
     def test_build_and_search(self, sample_project):
@@ -152,10 +231,13 @@ class TestSearchEngine:
         )
         assert result.exit_code == 0
         data = json.loads(result.output)
-        assert isinstance(data, list)
-        if data:
-            assert "path" in data[0]
-            assert "score" in data[0]
+        assert isinstance(data, dict)
+        assert "results" in data
+        results_list = data["results"]
+        assert isinstance(results_list, list)
+        if results_list:
+            assert "path" in results_list[0]
+            assert "score" in results_list[0]
     def test_search_cli_exclude_flag(self, sample_project):
         from click.testing import CliRunner
@@ -174,6 +256,39 @@ class TestSearchEngine:
         auth_lines = [l for l in lines if "auth/" in l]
         assert not auth_lines
+    def test_search_cli_grouped(self, sample_project):
+        from click.testing import CliRunner
+        from codexlr8.cli import search
+        engine = SearchEngine(str(sample_project))
+        engine.build_index()
+        runner = CliRunner()
+        result = runner.invoke(
+            search, [str(sample_project), "login", "--grouped"]
+        )
+        assert result.exit_code == 0
+        # Should show directory groupings and the scope hint
+        assert "Use --scope" in result.output
+        assert "(" in result.output  # file count per dir
+    def test_search_cli_explain(self, sample_project):
+        from click.testing import CliRunner
+        from codexlr8.cli import search
+        engine = SearchEngine(str(sample_project))
+        engine.build_index()
+        runner = CliRunner()
+        result = runner.invoke(
+            search, [str(sample_project), "login", "--explain"]
+        )
+        assert result.exit_code == 0
+        assert "Query analysis" in result.output
+        assert '"login"' in result.output
+        assert "matches" in result.output
 class TestCLIIndexAndStatus:
     def test_index_command(self, sample_project):