PyPI - crumbs-cli - Versions diffs - 0.3.0__py3-none-any.whl - Mend

crumbs-cli 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

crumbs/__init__.py +9 -0
crumbs/__main__.py +6 -0
crumbs/cli.py +186 -0
crumbs/digest.py +75 -0
crumbs/extractors.py +255 -0
crumbs/indexer.py +133 -0
crumbs/mcp.py +291 -0
crumbs/query.py +80 -0
crumbs/store.py +117 -0
crumbs_cli-0.3.0.dist-info/METADATA +110 -0
crumbs_cli-0.3.0.dist-info/RECORD +15 -0
crumbs_cli-0.3.0.dist-info/WHEEL +5 -0
crumbs_cli-0.3.0.dist-info/entry_points.txt +2 -0
crumbs_cli-0.3.0.dist-info/licenses/LICENSE +21 -0
crumbs_cli-0.3.0.dist-info/top_level.txt +1 -0

crumbs/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""crumbs - local, token-efficient cross-repo context for LLMs.
+crumbs indexes repositories into compact "context crumbs" (file maps and symbol
+signatures, not full file bodies) stored locally. An assistant can query these
+crumbs to understand many repos at once without reading -- and paying tokens for
+-- the entire source tree.
+"""
+__version__ = "0.3.0"

crumbs/__main__.py ADDED Viewed

@@ -0,0 +1,6 @@
+import sys
+from .cli import main
+if __name__ == "__main__":
+    sys.exit(main())

crumbs/cli.py ADDED Viewed

@@ -0,0 +1,186 @@
+"""crumbs command-line interface."""
+from __future__ import annotations
+import argparse
+import json
+import sys
+import time
+from typing import List, Optional
+from . import __version__, digest, indexer, query, store
+def _fmt_age(ts: float) -> str:
+    secs = max(0, int(time.time() - ts))
+    for unit, n in (("d", 86400), ("h", 3600), ("m", 60)):
+        if secs >= n:
+            return f"{secs // n}{unit} ago"
+    return "just now"
+def cmd_index(args: argparse.Namespace) -> int:
+    paths = args.paths or ["."]
+    for p in paths:
+        try:
+            data = indexer.index_repo(p, name=args.name)
+        except (NotADirectoryError, FileNotFoundError) as e:
+            print(f"error: {e}", file=sys.stderr)
+            return 1
+        st = data["stats"]
+        m = digest.repo_map(data["id"])
+        sav = digest.savings(data, m)
+        print(
+            f"indexed {data['name']}  "
+            f"{st['files']} files, {st['symbols']} symbols  "
+            f"(map ~{sav['map_tokens']} tok vs ~{sav['source_tokens']} tok source, "
+            f"-{sav['saved_pct']}%)"
+        )
+    return 0
+def cmd_list(args: argparse.Namespace) -> int:
+    reg = store.load_registry()
+    if not reg:
+        print("no repos indexed. run: crumbs index <path>")
+        return 0
+    if args.json:
+        print(json.dumps(reg, indent=2))
+        return 0
+    rows = sorted(reg.items(), key=lambda kv: kv[1]["name"])
+    name_w = max((len(m["name"]) for _, m in rows), default=4)
+    for rid, m in rows:
+        st = m["stats"]
+        print(
+            f"{m['name']:<{name_w}}  {rid}  "
+            f"{st['files']:>4} files  {st['symbols']:>5} symbols  "
+            f"{_fmt_age(m['indexed_at'])}"
+        )
+    return 0
+def cmd_map(args: argparse.Namespace) -> int:
+    rid = store.resolve(args.repo)
+    if not rid:
+        print(f"error: no indexed repo matches '{args.repo}'", file=sys.stderr)
+        return 1
+    text = digest.repo_map(rid, max_symbols_per_file=args.max_symbols)
+    print(text)
+    if args.stats:
+        data = store.load_repo(rid)
+        sav = digest.savings(data, text)
+        print(
+            f"\n_~{sav['map_tokens']} tokens (vs ~{sav['source_tokens']} for full source, "
+            f"-{sav['saved_pct']}%)_",
+            file=sys.stderr,
+        )
+    return 0
+def cmd_search(args: argparse.Namespace) -> int:
+    hits = query.search(args.query, repo=args.repo, limit=args.limit)
+    if args.json:
+        print(json.dumps(hits, indent=2))
+        return 0
+    if not hits:
+        print("no matches")
+        return 0
+    for h in hits:
+        sig = h["sig"] or f"{h['kind']} {h['name']}"
+        loc = f":{h['line']}" if h.get("line") else ""
+        print(f"{h['repo']}:{h['path']}{loc}  {sig}")
+    return 0
+def cmd_context(args: argparse.Namespace) -> int:
+    print(query.context(args.query, repo=args.repo, limit=args.limit))
+    return 0
+def cmd_remove(args: argparse.Namespace) -> int:
+    rid = store.resolve(args.repo)
+    if not rid:
+        print(f"error: no indexed repo matches '{args.repo}'", file=sys.stderr)
+        return 1
+    name = store.load_registry().get(rid, {}).get("name", rid)
+    store.remove_repo(rid)
+    print(f"removed {name}")
+    return 0
+def cmd_mcp(args: argparse.Namespace) -> int:
+    from . import mcp
+    return mcp.serve()
+def cmd_refresh(args: argparse.Namespace) -> int:
+    reg = store.load_registry()
+    if not reg:
+        print("nothing to refresh")
+        return 0
+    for rid, m in list(reg.items()):
+        try:
+            indexer.index_repo(m["path"], name=m["name"])
+            print(f"refreshed {m['name']}")
+        except (NotADirectoryError, FileNotFoundError):
+            print(f"skip {m['name']} (path missing: {m['path']})", file=sys.stderr)
+    return 0
+def build_parser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(
+        prog="crumbs",
+        description="Local, token-efficient cross-repo context for LLMs.",
+    )
+    p.add_argument("--version", action="version", version=f"crumbs {__version__}")
+    sub = p.add_subparsers(dest="cmd", required=True)
+    pi = sub.add_parser("index", help="index one or more repos")
+    pi.add_argument("paths", nargs="*", help="repo paths (default: .)")
+    pi.add_argument("--name", help="override repo name")
+    pi.set_defaults(func=cmd_index)
+    pl = sub.add_parser("list", help="list indexed repos")
+    pl.add_argument("--json", action="store_true")
+    pl.set_defaults(func=cmd_list)
+    pm = sub.add_parser("map", help="print compact map of a repo")
+    pm.add_argument("repo", help="repo name, id, or path")
+    pm.add_argument("--max-symbols", type=int, default=12)
+    pm.add_argument("--stats", action="store_true", help="print token estimate to stderr")
+    pm.set_defaults(func=cmd_map)
+    ps = sub.add_parser("search", help="search symbols across repos")
+    ps.add_argument("query")
+    ps.add_argument("--repo", help="limit to one repo")
+    ps.add_argument("--limit", type=int, default=30)
+    ps.add_argument("--json", action="store_true")
+    ps.set_defaults(func=cmd_search)
+    pc = sub.add_parser("context", help="LLM-ready context slice for a query")
+    pc.add_argument("query")
+    pc.add_argument("--repo", help="limit to one repo")
+    pc.add_argument("--limit", type=int, default=20)
+    pc.set_defaults(func=cmd_context)
+    pr = sub.add_parser("remove", help="remove a repo from the index")
+    pr.add_argument("repo")
+    pr.set_defaults(func=cmd_remove)
+    prf = sub.add_parser("refresh", help="re-index all known repos")
+    prf.set_defaults(func=cmd_refresh)
+    pmcp = sub.add_parser("mcp", help="run as an MCP server over stdio")
+    pmcp.set_defaults(func=cmd_mcp)
+    return p
+def main(argv: Optional[List[str]] = None) -> int:
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    return args.func(args)
+if __name__ == "__main__":
+    sys.exit(main())

crumbs/digest.py ADDED Viewed

@@ -0,0 +1,75 @@
+"""Render a compact, token-efficient map of an indexed repo."""
+from __future__ import annotations
+from typing import Any, Dict, List
+from . import store
+def _est_tokens(chars: int) -> int:
+    """Rough token estimate (~4 chars/token)."""
+    return chars // 4
+def loc(sym: Dict[str, Any]) -> str:
+    """Compact source location tag, e.g. ``L40-92`` or ``L40``."""
+    start = sym.get("line")
+    if not start:
+        return ""
+    end = sym.get("end_line", start)
+    return f"L{start}" if end == start else f"L{start}-{end}"
+def repo_map(rid: str, max_symbols_per_file: int = 12) -> str:
+    data = store.load_repo(rid)
+    if not data:
+        return ""
+    lines: List[str] = []
+    g = data.get("git", {})
+    header = f"# {data['name']}"
+    lines.append(header)
+    meta = []
+    if g.get("remote"):
+        meta.append(g["remote"])
+    if g.get("branch"):
+        meta.append(f"@{g['branch']}")
+    if meta:
+        lines.append(" ".join(meta))
+    st = data["stats"]
+    lines.append(
+        f"_{st['files']} files, {st['symbols']} symbols indexed_"
+    )
+    lines.append("")
+    if data.get("readme"):
+        excerpt = data["readme"].strip().replace("\n\n", "\n")
+        lines.append("> " + excerpt.replace("\n", "\n> "))
+        lines.append("")
+    for f in data["files"]:
+        syms = f["symbols"]
+        if not syms:
+            continue
+        lines.append(f"### {f['path']}")
+        for sym in syms[:max_symbols_per_file]:
+            sig = sym["sig"] or f"{sym['kind']} {sym['name']}"
+            tag = loc(sym)
+            where = f" [{tag}]" if tag else ""
+            doc = f"  — {sym['doc']}" if sym.get("doc") else ""
+            lines.append(f"- {sig}{where}{doc}")
+        if len(syms) > max_symbols_per_file:
+            lines.append(f"- … +{len(syms) - max_symbols_per_file} more")
+        lines.append("")
+    return "\n".join(lines)
+def savings(data: Dict[str, Any], map_text: str) -> Dict[str, int]:
+    src_tokens = _est_tokens(data["stats"]["source_bytes"])
+    map_tokens = _est_tokens(len(map_text))
+    pct = 0 if src_tokens == 0 else round(100 * (1 - map_tokens / src_tokens))
+    return {
+        "source_tokens": src_tokens,
+        "map_tokens": map_tokens,
+        "saved_pct": pct,
+    }

crumbs/extractors.py ADDED Viewed

@@ -0,0 +1,255 @@
+"""Extract compact symbol signatures from source files.
+The goal is a high-signal, low-token summary of what a file *contains* and
+*exposes* -- function/class/type signatures and one-line docs -- never the full
+bodies. Python is parsed with the stdlib ``ast`` for accuracy; other languages
+use lightweight regex that captures declarations without trying to be a parser.
+"""
+from __future__ import annotations
+import ast
+import re
+from typing import Dict, List
+# Map file extension -> language label used in output.
+LANGS: Dict[str, str] = {
+    ".py": "python",
+    ".js": "javascript",
+    ".jsx": "javascript",
+    ".mjs": "javascript",
+    ".cjs": "javascript",
+    ".ts": "typescript",
+    ".tsx": "typescript",
+    ".go": "go",
+    ".rs": "rust",
+    ".java": "java",
+    ".rb": "ruby",
+    ".php": "php",
+    ".c": "c",
+    ".h": "c",
+    ".cpp": "cpp",
+    ".cc": "cpp",
+    ".hpp": "cpp",
+    ".cs": "csharp",
+    ".swift": "swift",
+    ".kt": "kotlin",
+    ".md": "markdown",
+}
+def lang_for(filename: str) -> str:
+    for ext, lang in LANGS.items():
+        if filename.endswith(ext):
+            return lang
+    return ""
+def _first_line(text: str) -> str:
+    text = (text or "").strip()
+    return text.splitlines()[0].strip() if text else ""
+def extract(path: str, text: str) -> List[Dict[str, str]]:
+    """Return a list of symbols.
+    Each symbol is ``{kind, name, sig, doc, line, end_line}`` where ``line`` /
+    ``end_line`` are 1-based source line numbers so a reader can open just the
+    symbol's slice (e.g. ``path:line-end_line``) instead of the whole file.
+    """
+    lang = lang_for(path)
+    if lang == "python":
+        return _python(text)
+    if lang in ("javascript", "typescript"):
+        return _js_ts(text)
+    if lang == "go":
+        return _go(text)
+    if lang == "rust":
+        return _rust(text)
+    if lang == "markdown":
+        return _markdown(text)
+    if lang:
+        return _generic(text)
+    return []
+# --------------------------------------------------------------------------- #
+# Python (AST-based, accurate)
+# --------------------------------------------------------------------------- #
+def _unparse(node) -> str:
+    """Best-effort source for an annotation/default node (3.9+ has ast.unparse)."""
+    if node is None:
+        return ""
+    if hasattr(ast, "unparse"):
+        try:
+            return ast.unparse(node)
+        except Exception:
+            return ""
+    return ""  # Python 3.8: omit annotation rather than guess
+def _arg(arg: ast.arg, default=None) -> str:
+    s = arg.arg
+    ann = _unparse(getattr(arg, "annotation", None))
+    if ann:
+        s += ": " + ann
+    if default is not None:
+        d = _unparse(default)
+        if d:
+            s += ("=" if not ann else " = ") + d
+    return s
+def _py_args(node: ast.AST) -> str:
+    """Render the parameter list (and return annotation, if any) of ``node``.
+    Produces text such as ``(a, /, b: int = 1, *args, c, **kw) -> str``. A node
+    without an ``args`` attribute yields ``()``.
+    """
+    try:
+        a = node.args  # type: ignore[attr-defined]
+    except AttributeError:
+        return "()"
+    parts: List[str] = []
+    pos = list(a.posonlyargs) + list(a.args)
+    # defaults align to the tail of the positional args.
+    pad = [None] * (len(pos) - len(a.defaults)) + list(a.defaults)
+    for arg, default in zip(pos, pad):
+        parts.append(_arg(arg, default))
+    # The "/" marker goes right after the last positional-only parameter.
+    if a.posonlyargs:
+        parts.insert(len(a.posonlyargs), "/")
+    # "*args" already separates keyword-only parameters; without it a bare "*"
+    # is needed whenever keyword-only parameters exist.
+    if a.vararg:
+        parts.append("*" + _arg(a.vararg))
+    elif a.kwonlyargs:
+        parts.append("*")
+    # kw_defaults is paired one-to-one with kwonlyargs by zip; a None entry is
+    # passed through to _arg, which then renders no default.
+    for arg, default in zip(a.kwonlyargs, a.kw_defaults):
+        parts.append(_arg(arg, default))
+    if a.kwarg:
+        parts.append("**" + _arg(a.kwarg))
+    sig = "(" + ", ".join(parts) + ")"
+    ret = _unparse(getattr(node, "returns", None))
+    if ret:
+        sig += " -> " + ret
+    return sig
+def _python(text: str) -> List[Dict[str, str]]:
+    try:
+        tree = ast.parse(text)
+    except SyntaxError:
+        return _generic(text)
+    out: List[Dict[str, str]] = []
+    for node in tree.body:
+        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+            prefix = "async def" if isinstance(node, ast.AsyncFunctionDef) else "def"
+            out.append({
+                "kind": "function",
+                "name": node.name,
+                "sig": f"{prefix} {node.name}{_py_args(node)}",
+                "doc": _first_line(ast.get_docstring(node) or ""),
+                "line": node.lineno,
+                "end_line": getattr(node, "end_lineno", node.lineno),
+            })
+        elif isinstance(node, ast.ClassDef):
+            bases = ", ".join(
+                b.id for b in node.bases if isinstance(b, ast.Name)
+            )
+            sig = f"class {node.name}" + (f"({bases})" if bases else "")
+            methods = [
+                n.name
+                for n in node.body
+                if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))
+                and not n.name.startswith("_")
+            ]
+            doc = _first_line(ast.get_docstring(node) or "")
+            if methods:
+                doc = (doc + " " if doc else "") + "methods: " + ", ".join(methods[:12])
+            out.append({
+                "kind": "class",
+                "name": node.name,
+                "sig": sig,
+                "doc": doc,
+                "line": node.lineno,
+                "end_line": getattr(node, "end_lineno", node.lineno),
+            })
+    return out
+# --------------------------------------------------------------------------- #
+# Regex-based extractors for other languages
+# --------------------------------------------------------------------------- #
+def _collect(text: str, patterns: List[tuple]) -> List[Dict[str, str]]:
+    out: List[Dict[str, str]] = []
+    seen = set()
+    for kind, rx in patterns:
+        for m in rx.finditer(text):
+            name = m.group("name")
+            if not name or name in seen:
+                continue
+            seen.add(name)
+            sig = m.group(0).strip().rstrip("{(=").strip()
+            sig = re.sub(r"\s+", " ", sig)[:120]
+            line = text.count("\n", 0, m.start()) + 1
+            out.append({
+                "kind": kind, "name": name, "sig": sig, "doc": "",
+                "line": line, "end_line": line,
+            })
+    return out
+_JS_TS = [
+    ("function", re.compile(r"^\s*export\s+(?:default\s+)?(?:async\s+)?function\s+(?P<name>\w+)", re.M)),
+    ("function", re.compile(r"^\s*(?:async\s+)?function\s+(?P<name>\w+)", re.M)),
+    ("class", re.compile(r"^\s*export\s+(?:default\s+)?(?:abstract\s+)?class\s+(?P<name>\w+)", re.M)),
+    ("class", re.compile(r"^\s*(?:abstract\s+)?class\s+(?P<name>\w+)", re.M)),
+    ("const", re.compile(r"^\s*export\s+const\s+(?P<name>\w+)", re.M)),
+    ("type", re.compile(r"^\s*export\s+(?:type|interface)\s+(?P<name>\w+)", re.M)),
+    ("type", re.compile(r"^\s*(?:type|interface)\s+(?P<name>\w+)", re.M)),
+]
+_GO = [
+    ("function", re.compile(r"^\s*func\s+(?:\([^)]*\)\s*)?(?P<name>\w+)\s*\(", re.M)),
+    ("type", re.compile(r"^\s*type\s+(?P<name>\w+)\s+(?:struct|interface)", re.M)),
+]
+_RUST = [
+    ("function", re.compile(r"^\s*(?:pub\s+)?(?:async\s+)?fn\s+(?P<name>\w+)", re.M)),
+    ("struct", re.compile(r"^\s*(?:pub\s+)?struct\s+(?P<name>\w+)", re.M)),
+    ("enum", re.compile(r"^\s*(?:pub\s+)?enum\s+(?P<name>\w+)", re.M)),
+    ("trait", re.compile(r"^\s*(?:pub\s+)?trait\s+(?P<name>\w+)", re.M)),
+]
+def _js_ts(text: str) -> List[Dict[str, str]]:
+    return _collect(text, _JS_TS)
+def _go(text: str) -> List[Dict[str, str]]:
+    return _collect(text, _GO)
+def _rust(text: str) -> List[Dict[str, str]]:
+    return _collect(text, _RUST)
+_HEADING = re.compile(r"^(#{1,3})\s+(?P<name>.+?)\s*#*$", re.M)
+def _markdown(text: str) -> List[Dict[str, str]]:
+    out: List[Dict[str, str]] = []
+    for m in _HEADING.finditer(text):
+        level = len(m.group(1))
+        line = text.count("\n", 0, m.start()) + 1
+        out.append({
+            "kind": f"h{level}",
+            "name": m.group("name").strip(),
+            "sig": "",
+            "doc": "",
+            "line": line,
+            "end_line": line,
+        })
+    return out[:30]
+_GENERIC = [
+    ("def", re.compile(r"^\s*(?:public|private|protected|static|\s)*\b(?:func|function|def|fn|sub|method)\s+(?P<name>\w+)", re.M)),
+]
+def _generic(text: str) -> List[Dict[str, str]]:
+    return _collect(text, _GENERIC)

crumbs/indexer.py ADDED Viewed

@@ -0,0 +1,133 @@
+"""Walk a repository and build its compact crumb data."""
+from __future__ import annotations
+import os
+import subprocess
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from . import extractors, store
+# Directories never worth indexing.
+SKIP_DIRS = {
+    ".git", "node_modules", "__pycache__", ".venv", "venv", "env",
+    "dist", "build", "target", ".next", ".nuxt", "out", "vendor",
+    ".idea", ".vscode", "coverage", ".pytest_cache", ".mypy_cache",
+    ".ruff_cache", "site-packages", ".tox", "bin", "obj", ".cache",
+    ".remember", ".crumbs",
+}
+# Files to skip by name.
+SKIP_FILES = {"package-lock.json", "yarn.lock", "poetry.lock", "Cargo.lock", "pnpm-lock.yaml"}
+MAX_FILE_BYTES = 1_500_000  # skip files larger than this (likely generated/binary)
+DOC_NAMES = {"readme.md", "readme.rst", "readme.txt", "readme"}
+def _is_text(path: Path) -> bool:
+    try:
+        with path.open("rb") as f:
+            chunk = f.read(2048)
+        return b"\x00" not in chunk
+    except OSError:
+        return False
+def _git_info(root: Path) -> Dict[str, str]:
+    info: Dict[str, str] = {}
+    try:
+        remote = subprocess.run(
+            ["git", "-C", str(root), "remote", "get-url", "origin"],
+            capture_output=True, text=True, timeout=5,
+        )
+        if remote.returncode == 0:
+            info["remote"] = remote.stdout.strip()
+        branch = subprocess.run(
+            ["git", "-C", str(root), "rev-parse", "--abbrev-ref", "HEAD"],
+            capture_output=True, text=True, timeout=5,
+        )
+        if branch.returncode == 0:
+            info["branch"] = branch.stdout.strip()
+    except (OSError, subprocess.SubprocessError):
+        pass
+    return info
+def index_repo(path: str, name: Optional[str] = None) -> Dict[str, Any]:
+    """Index a repository at ``path`` and persist its crumbs.
+    Returns the crumb data dict.
+    """
+    root = Path(path).expanduser().resolve()
+    if not root.is_dir():
+        raise NotADirectoryError(f"not a directory: {root}")
+    rid = store.repo_id(str(root))
+    name = name or root.name
+    files: List[Dict[str, Any]] = []
+    total_source_bytes = 0
+    readme_excerpt = ""
+    for dirpath, dirnames, filenames in os.walk(root):
+        dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS and not d.startswith(".") or d in (".github",)]
+        for fn in filenames:
+            if fn in SKIP_FILES:
+                continue
+            fpath = Path(dirpath) / fn
+            rel = str(fpath.relative_to(root))
+            lang = extractors.lang_for(fn)
+            try:
+                size = fpath.stat().st_size
+            except OSError:
+                continue
+            if size > MAX_FILE_BYTES:
+                continue
+            # Capture a top-level README excerpt for the repo summary.
+            if fn.lower() in DOC_NAMES and "/" not in rel and not readme_excerpt:
+                if _is_text(fpath):
+                    readme_excerpt = _read(fpath)[:600]
+            if not lang:
+                continue
+            if not _is_text(fpath):
+                continue
+            text = _read(fpath)
+            total_source_bytes += len(text)
+            symbols = extractors.extract(rel, text)
+            files.append({
+                "path": rel,
+                "lang": lang,
+                "loc": text.count("\n") + 1,
+                "symbols": symbols,
+            })
+    files.sort(key=lambda f: f["path"])
+    sym_count = sum(len(f["symbols"]) for f in files)
+    data: Dict[str, Any] = {
+        "id": rid,
+        "name": name,
+        "path": str(root),
+        "indexed_at": store.now(),
+        "git": _git_info(root),
+        "readme": readme_excerpt,
+        "files": files,
+        "stats": {
+            "files": len(files),
+            "symbols": sym_count,
+            "source_bytes": total_source_bytes,
+        },
+    }
+    store.save_repo(rid, data)
+    return data
+def _read(path: Path) -> str:
+    try:
+        return path.read_text(encoding="utf-8", errors="ignore")
+    except OSError:
+        return ""

crumbs/mcp.py ADDED Viewed

@@ -0,0 +1,291 @@
+"""A minimal MCP (Model Context Protocol) server for crumbs.
+This speaks the MCP wire protocol directly over stdio with **zero
+dependencies** -- no SDK -- to keep crumbs pure-stdlib. An MCP host (Claude
+Code, Claude Desktop, or any MCP client) launches ``crumbs mcp`` as a
+subprocess and talks to it in JSON-RPC 2.0 over stdin/stdout.
+Wire format (stdio transport): newline-delimited JSON. Each message is one
+JSON object on its own line. stdout is reserved for protocol traffic only;
+all logging goes to stderr.
+Lifecycle:
+    client -> initialize            -> server: capabilities + serverInfo
+    client -> notifications/initialized   (no response)
+    client -> tools/list            -> server: the tool catalog
+    client -> tools/call            -> server: the tool's output
+The tools are thin adapters over the existing crumbs modules; the MCP layer
+only translates JSON-RPC <-> Python calls and formats results as text.
+"""
+from __future__ import annotations
+import json
+import sys
+from typing import Any, Callable, Dict, List, Optional
+from . import __version__, digest, indexer, query, store
+# Protocol version we default to if the client doesn't propose one. We echo
+# the client's requested version when present for forward compatibility.
+DEFAULT_PROTOCOL_VERSION = "2025-06-18"
+# JSON-RPC error codes we use.
+PARSE_ERROR = -32700
+INVALID_REQUEST = -32600
+METHOD_NOT_FOUND = -32601
+INVALID_PARAMS = -32602
+INTERNAL_ERROR = -32603
+def _log(msg: str) -> None:
+    print(f"[crumbs-mcp] {msg}", file=sys.stderr, flush=True)
+# --------------------------------------------------------------------------- #
+# Tool implementations -- each returns a plain string (rendered as text).
+# --------------------------------------------------------------------------- #
+def _tool_index(args: Dict[str, Any]) -> str:
+    paths = args.get("paths") or ([args["path"]] if args.get("path") else ["."])
+    name = args.get("name")
+    out: List[str] = []
+    for p in paths:
+        data = indexer.index_repo(p, name=name)
+        st = data["stats"]
+        m = digest.repo_map(data["id"])
+        sav = digest.savings(data, m)
+        out.append(
+            f"indexed {data['name']}: {st['files']} files, {st['symbols']} symbols "
+            f"(map ~{sav['map_tokens']} tok vs ~{sav['source_tokens']} source, -{sav['saved_pct']}%)"
+        )
+    return "\n".join(out)
+def _tool_list(args: Dict[str, Any]) -> str:
+    reg = store.load_registry()
+    if not reg:
+        return "No repos indexed yet. Use crumbs_index with a path first."
+    rows = sorted(reg.items(), key=lambda kv: kv[1]["name"])
+    lines = []
+    for rid, m in rows:
+        st = m["stats"]
+        lines.append(f"{m['name']} ({rid}): {st['files']} files, {st['symbols']} symbols")
+    return "\n".join(lines)
+def _resolve_or_index(selector: str) -> Optional[str]:
+    """Resolve a repo selector; if it's an unindexed path, index it first."""
+    rid = store.resolve(selector)
+    if rid:
+        return rid
+    try:
+        indexer.index_repo(selector)
+    except (NotADirectoryError, FileNotFoundError):
+        return None
+    return store.resolve(selector)
+def _tool_map(args: Dict[str, Any]) -> str:
+    repo = args["repo"]
+    rid = _resolve_or_index(repo)
+    if not rid:
+        return f"No indexed repo matches '{repo}' (and it is not an indexable path)."
+    return digest.repo_map(rid, max_symbols_per_file=int(args.get("max_symbols", 12)))
+def _tool_search(args: Dict[str, Any]) -> str:
+    repo = args.get("repo")
+    if repo:
+        _resolve_or_index(repo)
+    hits = query.search(args["query"], repo=repo, limit=int(args.get("limit", 30)))
+    if not hits:
+        return "No matches."
+    lines = []
+    for h in hits:
+        sig = h["sig"] or f"{h['kind']} {h['name']}"
+        loc = f":{h['line']}" if h.get("line") else ""
+        lines.append(f"{h['repo']}:{h['path']}{loc}  {sig}")
+    return "\n".join(lines)
+def _tool_context(args: Dict[str, Any]) -> str:
+    repo = args.get("repo")
+    if repo:
+        _resolve_or_index(repo)
+    return query.context(args["query"], repo=repo, limit=int(args.get("limit", 20)))
+# --------------------------------------------------------------------------- #
+# Tool catalog: name -> {description, inputSchema, handler}. The description and
+# schema are what the model uses to decide *whether* and *how* to call a tool.
+# --------------------------------------------------------------------------- #
+def _str(desc: str) -> Dict[str, str]:
+    return {"type": "string", "description": desc}
+TOOLS: Dict[str, Dict[str, Any]] = {
+    "crumbs_map": {
+        "description": (
+            "Get a compact, token-efficient map of a repository: every file with "
+            "its typed function/class signatures, one-line docs, and source line "
+            "ranges (e.g. [L40-92]) -- but NOT the file bodies. Use this FIRST to "
+            "orient yourself in a repo instead of reading files; then open only the "
+            "line ranges it points to. Indexes the repo automatically if needed."
+        ),
+        "inputSchema": {
+            "type": "object",
+            "properties": {
+                "repo": _str("Repo name, id, or filesystem path."),
+                "max_symbols": {"type": "integer", "description": "Max symbols shown per file (default 12)."},
+            },
+            "required": ["repo"],
+        },
+        "handler": _tool_map,
+    },
+    "crumbs_search": {
+        "description": (
+            "Search for symbols (functions, classes, types) by keyword across all "
+            "indexed repos, ranked by relevance. Returns repo:path:line plus the "
+            "signature for each hit, so you can open the exact slice. Use to find "
+            "where something lives across one or many repos."
+        ),
+        "inputSchema": {
+            "type": "object",
+            "properties": {
+                "query": _str("Keywords to search for, e.g. 'auth token'."),
+                "repo": _str("Optional: limit to one repo (name, id, or path)."),
+                "limit": {"type": "integer", "description": "Max results (default 30)."},
+            },
+            "required": ["query"],
+        },
+        "handler": _tool_search,
+    },
+    "crumbs_context": {
+        "description": (
+            "Build an LLM-ready context slice for a topic: the most relevant symbols "
+            "across indexed repos, grouped by repo and file, with signatures, docs, "
+            "and line ranges. Use when you want focused context on a topic rather "
+            "than a whole repo map."
+        ),
+        "inputSchema": {
+            "type": "object",
+            "properties": {
+                "query": _str("Topic to gather context for, e.g. 'rate limiting'."),
+                "repo": _str("Optional: limit to one repo."),
+                "limit": {"type": "integer", "description": "Max symbols (default 20)."},
+            },
+            "required": ["query"],
+        },
+        "handler": _tool_context,
+    },
+    "crumbs_index": {
+        "description": (
+            "Index one or more repositories so their maps/searches are available. "
+            "Usually unnecessary -- the other tools auto-index a path on first use -- "
+            "but call this to (re)index explicitly."
+        ),
+        "inputSchema": {
+            "type": "object",
+            "properties": {
+                "paths": {"type": "array", "items": {"type": "string"}, "description": "Repo paths to index."},
+                "path": _str("A single repo path (alternative to 'paths')."),
+                "name": _str("Optional override name for the repo."),
+            },
+        },
+        "handler": _tool_index,
+    },
+    "crumbs_list": {
+        "description": "List all indexed repositories with their file and symbol counts.",
+        "inputSchema": {"type": "object", "properties": {}},
+        "handler": _tool_list,
+    },
+}
+# --------------------------------------------------------------------------- #
+# JSON-RPC plumbing
+# --------------------------------------------------------------------------- #
+def _result(req_id: Any, result: Any) -> Dict[str, Any]:
+    return {"jsonrpc": "2.0", "id": req_id, "result": result}
+def _error(req_id: Any, code: int, message: str) -> Dict[str, Any]:
+    return {"jsonrpc": "2.0", "id": req_id, "error": {"code": code, "message": message}}
+def _handle(msg: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+    """Process one JSON-RPC message; return a response, or None for notifications."""
+    method = msg.get("method")
+    req_id = msg.get("id")
+    is_notification = "id" not in msg
+    params = msg.get("params") or {}
+    if method == "initialize":
+        proto = params.get("protocolVersion", DEFAULT_PROTOCOL_VERSION)
+        return _result(req_id, {
+            "protocolVersion": proto,
+            "capabilities": {"tools": {"listChanged": False}},
+            "serverInfo": {"name": "crumbs", "version": __version__},
+        })
+    if method in ("notifications/initialized", "initialized"):
+        return None  # notification: acknowledge by doing nothing
+    if method == "ping":
+        return _result(req_id, {})
+    if method == "tools/list":
+        tools = [
+            {"name": name, "description": t["description"], "inputSchema": t["inputSchema"]}
+            for name, t in TOOLS.items()
+        ]
+        return _result(req_id, {"tools": tools})
+    if method == "tools/call":
+        name = params.get("name")
+        arguments = params.get("arguments") or {}
+        tool = TOOLS.get(name)
+        if not tool:
+            return _error(req_id, INVALID_PARAMS, f"unknown tool: {name}")
+        try:
+            text = tool["handler"](arguments)
+        except KeyError as e:
+            # a required argument was missing -- report as a tool error, not a crash
+            return _result(req_id, {
+                "content": [{"type": "text", "text": f"missing argument: {e}"}],
+                "isError": True,
+            })
+        except Exception as e:  # noqa: BLE001 -- surface any tool failure to the client
+            _log(f"tool {name} failed: {e}")
+            return _result(req_id, {
+                "content": [{"type": "text", "text": f"error: {e}"}],
+                "isError": True,
+            })
+        return _result(req_id, {"content": [{"type": "text", "text": text}], "isError": False})
+    if is_notification:
+        return None  # ignore unknown notifications
+    return _error(req_id, METHOD_NOT_FOUND, f"method not found: {method}")
+def serve(stdin=None, stdout=None) -> int:
+    """Run the stdio MCP server loop until stdin closes."""
+    stdin = stdin or sys.stdin
+    stdout = stdout or sys.stdout
+    _log(f"crumbs {__version__} MCP server ready (stdio)")
+    for line in stdin:
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            msg = json.loads(line)
+        except json.JSONDecodeError:
+            stdout.write(json.dumps(_error(None, PARSE_ERROR, "invalid JSON")) + "\n")
+            stdout.flush()
+            continue
+        response = _handle(msg)
+        if response is not None:
+            stdout.write(json.dumps(response) + "\n")
+            stdout.flush()
+    return 0

crumbs/query.py ADDED Viewed

@@ -0,0 +1,80 @@
+"""Search across indexed repos and build LLM-ready context slices."""
+from __future__ import annotations
+import re
+from typing import Any, Dict, List, Optional
+from . import digest, store
+def _tokens(q: str) -> List[str]:
+    return [t for t in re.split(r"[^A-Za-z0-9_]+", q.lower()) if t]
+def _score(terms: List[str], path: str, sym: Dict[str, str]) -> int:
+    hay = f"{path} {sym['name']} {sym['sig']} {sym.get('doc', '')}".lower()
+    name = sym["name"].lower()
+    score = 0
+    for t in terms:
+        if not t:
+            continue
+        if t == name:
+            score += 10
+        elif t in name:
+            score += 5
+        if t in hay:
+            score += 1
+    return score
+def search(query: str, repo: Optional[str] = None, limit: int = 30) -> List[Dict[str, Any]]:
+    """Return ranked symbol matches across indexed repos."""
+    terms = _tokens(query)
+    if not terms:
+        return []
+    rids = [store.resolve(repo)] if repo else store.all_repos()
+    rids = [r for r in rids if r]
+    results: List[Dict[str, Any]] = []
+    for rid in rids:
+        data = store.load_repo(rid)
+        if not data:
+            continue
+        for f in data["files"]:
+            for sym in f["symbols"]:
+                s = _score(terms, f["path"], sym)
+                if s > 0:
+                    results.append({
+                        "repo": data["name"],
+                        "path": f["path"],
+                        "lang": f["lang"],
+                        "score": s,
+                        **sym,
+                    })
+    results.sort(key=lambda r: r["score"], reverse=True)
+    return results[:limit]
+def context(query: str, repo: Optional[str] = None, limit: int = 20) -> str:
+    """Format the most relevant crumbs for a query as compact markdown."""
+    hits = search(query, repo=repo, limit=limit)
+    if not hits:
+        return f"# crumbs context: {query}\n\n_No matches across indexed repos._\n"
+    lines = [f"# crumbs context: {query}", ""]
+    by_repo: Dict[str, List[Dict[str, Any]]] = {}
+    for h in hits:
+        by_repo.setdefault(h["repo"], []).append(h)
+    for repo_name, items in by_repo.items():
+        lines.append(f"## {repo_name}")
+        cur_path = None
+        for it in items:
+            if it["path"] != cur_path:
+                cur_path = it["path"]
+                lines.append(f"- `{it['path']}`")
+            sig = it["sig"] or f"{it['kind']} {it['name']}"
+            tag = digest.loc(it)
+            where = f" [{tag}]" if tag else ""
+            doc = f"  — {it['doc']}" if it.get("doc") else ""
+            lines.append(f"    - {sig}{where}{doc}")
+        lines.append("")
+    return "\n".join(lines)

crumbs/store.py ADDED Viewed

@@ -0,0 +1,117 @@
+"""Local on-disk store for crumb data.
+Layout (default ~/.crumbs, override with CRUMBS_HOME):
+    <home>/
+        registry.json        # id -> {name, path, indexed_at, stats}
+        repos/<id>.json      # full crumb data for one repo
+"""
+from __future__ import annotations
+import hashlib
+import json
+import os
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+def home() -> Path:
+    """Return the crumbs home directory, creating it if needed."""
+    root = Path(os.environ.get("CRUMBS_HOME", Path.home() / ".crumbs"))
+    (root / "repos").mkdir(parents=True, exist_ok=True)
+    return root
+def repo_id(path: str) -> str:
+    """Stable short id for a repo, derived from its absolute path."""
+    abspath = str(Path(path).expanduser().resolve())
+    return hashlib.sha1(abspath.encode()).hexdigest()[:12]
+def _registry_path() -> Path:
+    return home() / "registry.json"
+def load_registry() -> Dict[str, Any]:
+    p = _registry_path()
+    if not p.exists():
+        return {}
+    try:
+        return json.loads(p.read_text())
+    except (json.JSONDecodeError, OSError):
+        return {}
+def save_registry(reg: Dict[str, Any]) -> None:
+    _registry_path().write_text(json.dumps(reg, indent=2, sort_keys=True))
+def save_repo(rid: str, data: Dict[str, Any]) -> None:
+    """Persist one repo's crumb data and update the registry."""
+    (home() / "repos" / f"{rid}.json").write_text(json.dumps(data))
+    reg = load_registry()
+    reg[rid] = {
+        "name": data["name"],
+        "path": data["path"],
+        "indexed_at": data["indexed_at"],
+        "stats": data["stats"],
+    }
+    save_registry(reg)
+def load_repo(rid: str) -> Optional[Dict[str, Any]]:
+    p = home() / "repos" / f"{rid}.json"
+    if not p.exists():
+        return None
+    try:
+        return json.loads(p.read_text())
+    except (json.JSONDecodeError, OSError):
+        return None
+def remove_repo(rid: str) -> bool:
+    p = home() / "repos" / f"{rid}.json"
+    existed = p.exists()
+    if existed:
+        p.unlink()
+    reg = load_registry()
+    if rid in reg:
+        del reg[rid]
+        save_registry(reg)
+    return existed
+def resolve(selector: str) -> Optional[str]:
+    """Resolve a user-supplied selector to a repo id.
+    Accepts an exact id, a repo name, or a filesystem path.
+    """
+    reg = load_registry()
+    if selector in reg:
+        return selector
+    # by name (exact, then unique prefix)
+    by_name = [rid for rid, m in reg.items() if m["name"] == selector]
+    if len(by_name) == 1:
+        return by_name[0]
+    # by path
+    try:
+        rid = repo_id(selector)
+        if rid in reg:
+            return rid
+    except OSError:
+        pass
+    # by name prefix
+    pref = [rid for rid, m in reg.items() if m["name"].startswith(selector)]
+    if len(pref) == 1:
+        return pref[0]
+    return None
+def now() -> float:
+    return time.time()
+def all_repos() -> List[str]:
+    return list(load_registry().keys())

crumbs_cli-0.3.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,110 @@
+Metadata-Version: 2.4
+Name: crumbs-cli
+Version: 0.3.0
+Summary: Local, token-efficient cross-repo context for LLMs. CLI + MCP server.
+Author: crumbs
+License: MIT
+Project-URL: Homepage, https://github.com/crumbs1505/crumbs
+Project-URL: Repository, https://github.com/crumbs1505/crumbs
+Project-URL: Issues, https://github.com/crumbs1505/crumbs/issues
+Keywords: llm,context,claude,code,repo,tokens,mcp
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: Software Development :: Documentation
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Dynamic: license-file
+# crumbs
+**Local, token-efficient cross-repo context for LLMs.**
+`crumbs` indexes your repositories into compact *context crumbs* — file maps and
+symbol signatures (typed function/class/type declarations + one-line docs + line
+ranges), **never the full file bodies**. An assistant like Claude can then
+understand many repos at once by reading a tiny map instead of paying tokens to
+read the entire source tree.
+Indexing this very tool produces a map of **~1,200 tokens** standing in for
+**~8,400 tokens** of source — an **~86% reduction** — while still naming every
+file and symbol. Each symbol carries its full type signature and a source line
+range (e.g. `def build_parser() -> ArgumentParser [L125-168]`), so the assistant
+can open *just that slice* of a file rather than the whole thing.
+- 🪶 **Zero dependencies.** Pure Python 3.8+ stdlib. Runs on any device.
+- 🔒 **Fully local.** Crumbs live in `~/.crumbs`. Nothing leaves your machine.
+- 🧠 **Cross-repo.** Search and pull context across every repo you've indexed.
+- 🎯 **High signal.** Python is parsed via `ast`; JS/TS/Go/Rust/etc. via fast
+  regex. Skips `node_modules`, `.git`, build dirs, lockfiles, and binaries.
+## Install
+```bash
+pip install -e .        # provides the `crumbs` command
+# or run without installing:
+python3 -m crumbs --help
+```
+## Usage
+```bash
+crumbs index ~/code/my-api ~/code/my-web   # index one or more repos
+crumbs list                                # show indexed repos + stats
+crumbs map my-api --stats                  # compact map of one repo (+ token estimate)
+crumbs search "auth token"                 # rank matching symbols across all repos
+crumbs context "rate limiting" --repo my-api   # LLM-ready context slice
+crumbs refresh                             # re-index everything
+crumbs remove my-web                       # drop a repo from the index
+```
+A repo can be referenced by name, id, or path.
+## Workflow with Claude
+1. `crumbs index` the repos you work across (once, or on a `crumbs refresh` cron).
+2. Ask Claude to run `crumbs map <repo>` or `crumbs context "<topic>"` instead of
+   reading whole files. It gets the structure and the relevant symbols for a
+   fraction of the tokens, then reads full files only where it actually needs to.
+## How it stays cheap
+| | Full repo read | `crumbs map` |
+|---|---|---|
+| What | every byte of every file | file tree + typed signatures + 1-line docs + line ranges |
+| Bodies | yes | no |
+| Cost | grows with codebase | grows with *interface* size |
+Because every symbol records its line range, the follow-up step is cheap too: the
+assistant reads `path:start-end` for the one function it needs instead of opening
+the entire file.
+Storage layout (`~/.crumbs`, override with `CRUMBS_HOME`):
+```
+registry.json        # id -> {name, path, indexed_at, stats}
+repos/<id>.json      # full crumb data for one repo
+```
+## Supported languages
+Python (AST), JavaScript/TypeScript, Go, Rust, and a generic declaration
+matcher for Java, Ruby, PHP, C/C++, C#, Swift, Kotlin. Markdown is indexed by
+heading. Files of any other type are safely skipped during symbol
+extraction.
+## Tests
+```bash
+python3 -m unittest discover -s tests -v
+```
+## License
+MIT

crumbs_cli-0.3.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,15 @@
+crumbs/__init__.py,sha256=q35zpXLx5u_N4VcaLi8c9MtXCaiqWvLOEYy1XHNHTAs,359
+crumbs/__main__.py,sha256=4JMK66Wj4uLZTKbF-sT3LAxOsr6buig77PmOkJCRRxw,83
+crumbs/cli.py,sha256=EB8MufECSES4I1mlRQNzbvZgE1DfxdDX53OWc13Qb1g,5903
+crumbs/digest.py,sha256=57U4aBx9DRDdRm-bzyisgNBElTi3gYg5P6qbyZp7FqM,2255
+crumbs/extractors.py,sha256=SFGQeeHyTU29Vp1YXia0uY2xhOXb1yL81JyMTw0xJoQ,8265
+crumbs/indexer.py,sha256=udba2FnVg_cegc7PO9pEYBKQiPPRmjtpGoBG7rjw2JM,4069
+crumbs/mcp.py,sha256=yDXRTj9JzxuhQJd7V89Xt_KBFXsL3RWB3t5dwhTqWM4,11211
+crumbs/query.py,sha256=qImdrwV_3uGYTX9seBjZ1RlUOG_2aN_15ASMh3T4imU,2686
+crumbs/store.py,sha256=msocCNaLpoAprhwx40t-0LXcvyb3kFwAM3A_xHL9SL0,2976
+crumbs_cli-0.3.0.dist-info/licenses/LICENSE,sha256=VKhnYSB3LGOqeE0zQqoresvhyX33yvdZ5MUA0g8ReSE,1068
+crumbs_cli-0.3.0.dist-info/METADATA,sha256=nZscACFHl1QGIDWMZGsMipFxOflGtS6UUXRmDn7pwoU,4174
+crumbs_cli-0.3.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+crumbs_cli-0.3.0.dist-info/entry_points.txt,sha256=b8-FzjddBJ7krJaLq301KWLvmOvyMIZoaiSYTM5qsd4,43
+crumbs_cli-0.3.0.dist-info/top_level.txt,sha256=l3c3J2z_MFKJW73ZMrHj8sF9XlsHdUpNm8how5Of7sY,7
+crumbs_cli-0.3.0.dist-info/RECORD,,

crumbs_cli-0.3.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

crumbs_cli-0.3.0.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ crumbs = crumbs.cli:main

crumbs_cli-0.3.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 SufyanShaik
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

crumbs_cli-0.3.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ crumbs