PyPI - codebase-index - Versions diffs - 1.6.0__py3-none-any.whl - Mend

codebase-index 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

codebase_index/__init__.py +7 -0
codebase_index/__main__.py +3 -0
codebase_index/cli.py +916 -0
codebase_index/config.py +110 -0
codebase_index/discovery/__init__.py +10 -0
codebase_index/discovery/classify.py +151 -0
codebase_index/discovery/ignore.py +58 -0
codebase_index/discovery/walker.py +75 -0
codebase_index/doctor.py +138 -0
codebase_index/embeddings/__init__.py +2 -0
codebase_index/embeddings/backend.py +67 -0
codebase_index/embeddings/external.py +56 -0
codebase_index/embeddings/local.py +41 -0
codebase_index/embeddings/noop.py +15 -0
codebase_index/graph/__init__.py +8 -0
codebase_index/graph/analysis.py +468 -0
codebase_index/graph/builder.py +160 -0
codebase_index/graph/expand.py +136 -0
codebase_index/graph/export.py +381 -0
codebase_index/graph/navigate.py +201 -0
codebase_index/indexer/__init__.py +8 -0
codebase_index/indexer/doc_chunks.py +202 -0
codebase_index/indexer/freshness.py +109 -0
codebase_index/indexer/pipeline.py +423 -0
codebase_index/mcp/__init__.py +2 -0
codebase_index/mcp/server.py +354 -0
codebase_index/models.py +145 -0
codebase_index/output/__init__.py +6 -0
codebase_index/output/json.py +13 -0
codebase_index/output/markdown.py +316 -0
codebase_index/output/redact.py +31 -0
codebase_index/parsers/__init__.py +9 -0
codebase_index/parsers/base.py +47 -0
codebase_index/parsers/languages.py +290 -0
codebase_index/parsers/line_chunker.py +39 -0
codebase_index/parsers/symbol_chunks.py +62 -0
codebase_index/parsers/treesitter.py +439 -0
codebase_index/retrieval/__init__.py +9 -0
codebase_index/retrieval/budget.py +82 -0
codebase_index/retrieval/fusion.py +62 -0
codebase_index/retrieval/intent.py +56 -0
codebase_index/retrieval/pipeline.py +207 -0
codebase_index/retrieval/rerank.py +69 -0
codebase_index/retrieval/searchers.py +291 -0
codebase_index/retrieval/skeleton.py +251 -0
codebase_index/retrieval/types.py +79 -0
codebase_index/scaffold.py +399 -0
codebase_index/service.py +158 -0
codebase_index/skill_template/SKILL.md +198 -0
codebase_index/skill_template/examples/hooks/settings.json +16 -0
codebase_index/skill_template/scripts/cbx +25 -0
codebase_index/skill_template/scripts/cbx.ps1 +25 -0
codebase_index/skill_update.py +150 -0
codebase_index/storage/__init__.py +8 -0
codebase_index/storage/db.py +116 -0
codebase_index/storage/repo.py +701 -0
codebase_index/storage/schema.sql +125 -0
codebase_index/watch/__init__.py +5 -0
codebase_index/watch/watcher.py +93 -0
codebase_index-1.6.0.dist-info/METADATA +748 -0
codebase_index-1.6.0.dist-info/RECORD +64 -0
codebase_index-1.6.0.dist-info/WHEEL +4 -0
codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0

codebase_index/graph/navigate.py ADDED Viewed

@@ -0,0 +1,201 @@
+"""Graph navigation: shortest path between two nodes, and a node "card".
+graphify ships `path A B` (how are two things connected?) and `explain Symbol`
+(what is this node?). codebase-index already uses `explain` for how-it-works
+retrieval, so the node card lives under `describe` to avoid colliding with it.
+Both walk the *resolved* edge graph and carry the Phase-1 confidence trail, so a
+path through an `inferred`/`ambiguous` edge is visibly less certain than one
+through `extracted` edges.
+"""
+from __future__ import annotations
+import sqlite3
+from collections import deque
+from typing import Optional
+from ..storage import repo
+# BFS safety valve: stop exploring after this many nodes so `path` stays cheap on
+# very large graphs (the shortest path, if short, is found long before this).
+_MAX_VISITS = 20000
+Node = tuple[str, int]
+def _freshness(conn: sqlite3.Connection) -> dict:
+    return {
+        "exists": True,
+        "stale": False,
+        "built_at": repo.get_meta(conn, "built_at"),
+        "head_commit": repo.get_meta(conn, "head_commit"),
+    }
+def _resolve_targets(conn: sqlite3.Connection, token: str) -> list[Node]:
+    """Resolve a path/symbol token to one or more graph nodes (file or symbols)."""
+    frow = repo.file_by_path(conn, token)
+    if frow is not None:
+        return [("file", int(frow["id"]))]
+    sym_rows = repo.symbols_by_name(conn, token, exact=True)
+    if sym_rows:
+        return [("symbol", int(r["id"])) for r in sym_rows]
+    suffix = repo.files_with_suffix(conn, token)
+    if len(suffix) == 1:
+        return [("file", int(suffix[0]["id"]))]
+    return []
+def _node_ref(conn: sqlite3.Connection, kind: str, node_id: int) -> Optional[dict]:
+    if kind == "file":
+        row = conn.execute("SELECT path FROM files WHERE id = ?", (node_id,)).fetchone()
+        if row is None:
+            return None
+        return {"kind": "file", "name": row["path"].rsplit("/", 1)[-1], "path": row["path"],
+                "line_start": None}
+    row = conn.execute(
+        "SELECT s.name AS name, s.kind AS kind, s.line_start AS line_start, f.path AS path "
+        "FROM symbols s JOIN files f ON f.id = s.file_id WHERE s.id = ?",
+        (node_id,),
+    ).fetchone()
+    if row is None:
+        return None
+    return {"kind": "symbol", "name": row["name"], "symbol_kind": row["kind"],
+            "path": row["path"], "line_start": row["line_start"]}
+def _undirected_neighbors(conn: sqlite3.Connection, kind: str, node_id: int):
+    """Yield (next_kind, next_id, edge_type, confidence, direction) ignoring edge
+    direction — `path` answers "how are these connected", not "who calls whom"."""
+    for e in repo.incoming_edges(conn, kind, node_id):
+        yield e["src_kind"], int(e["src_id"]), e["edge_type"], e["confidence"], "in"
+    for e in repo.outgoing_edges(conn, kind, node_id):
+        if e["dst_id"] is not None:
+            yield e["dst_kind"], int(e["dst_id"]), e["edge_type"], e["confidence"], "out"
+# ---------------------------------------------------------------------------
+# path A B
+# ---------------------------------------------------------------------------
+def path_payload(conn: sqlite3.Connection, src: str, dst: str) -> dict:
+    """Shortest undirected path between two nodes, with the edge audit trail."""
+    src_seeds = _resolve_targets(conn, src)
+    dst_seeds = set(_resolve_targets(conn, dst))
+    base = {"src": src, "dst": dst, "index": _freshness(conn), "nodes": [], "steps": []}
+    if not src_seeds or not dst_seeds:
+        missing = src if not src_seeds else dst
+        return {**base, "found": False, "reason": f"Could not resolve `{missing}` to an indexed node."}
+    # Multi-source BFS from every src node; stop at the first dst node reached.
+    parent: dict[Node, Optional[Node]] = {seed: None for seed in src_seeds}
+    via: dict[Node, tuple] = {}
+    queue: deque[Node] = deque(src_seeds)
+    found: Optional[Node] = None
+    visits = 0
+    while queue and visits < _MAX_VISITS:
+        node = queue.popleft()
+        visits += 1
+        if node in dst_seeds:
+            found = node
+            break
+        for nk, nid, etype, conf, direction in _undirected_neighbors(conn, *node):
+            nxt = (nk, nid)
+            if nxt not in parent:
+                parent[nxt] = node
+                via[nxt] = (etype, conf, direction)
+                queue.append(nxt)
+    if found is None:
+        return {**base, "found": False,
+                "reason": "No path found between the two nodes in the resolved graph."}
+    # Reconstruct from `found` back to a src seed.
+    chain: list[Node] = []
+    cur: Optional[Node] = found
+    while cur is not None:
+        chain.append(cur)
+        cur = parent[cur]
+    chain.reverse()
+    nodes = [ref for n in chain if (ref := _node_ref(conn, *n)) is not None]
+    steps = []
+    for prev, nxt in zip(chain, chain[1:]):
+        etype, conf, direction = via[nxt]
+        a, b = _node_ref(conn, *prev), _node_ref(conn, *nxt)
+        if a and b:
+            steps.append({"from": a, "to": b, "edge_type": etype,
+                          "confidence": conf, "direction": direction})
+    return {**base, "found": True, "hops": len(steps), "nodes": nodes, "steps": steps}
+# ---------------------------------------------------------------------------
+# describe <symbol>
+# ---------------------------------------------------------------------------
+def describe_payload(conn: sqlite3.Connection, query: str) -> dict:
+    """A node card: definition(s), callers, callees, centrality, module, god status."""
+    base = {"query": query, "index": _freshness(conn)}
+    sym_rows = repo.symbols_by_name(conn, query, exact=True)
+    if not sym_rows:
+        return {**base, "found": False,
+                "reason": f"No symbol named `{query}` is indexed. Try `search` or `symbol`."}
+    definitions = [
+        {
+            "name": r["name"],
+            "qualified": r["qualified"],
+            "kind": r["kind"],
+            "path": r["path"],
+            "line_start": r["line_start"],
+            "line_end": r["line_end"],
+            "signature": r["signature"],
+            "in_degree": int(r["in_degree"]),
+            "out_degree": int(r["out_degree"]),
+        }
+        for r in sym_rows
+    ]
+    # Primary = most-connected definition (the one worth describing in depth).
+    primary_row = max(sym_rows, key=lambda r: int(r["in_degree"]) + int(r["out_degree"]))
+    primary_id = int(primary_row["id"])
+    callers = [
+        {"path": r["path"], "line": r["line"], "confidence": r["confidence"]}
+        for r in repo.refs_for_name(conn, query)
+    ]
+    callees = []
+    for e in repo.outgoing_edges(conn, "symbol", primary_id):
+        if e["dst_id"] is None:
+            continue
+        ref = _node_ref(conn, e["dst_kind"], int(e["dst_id"]))
+        if ref is not None:
+            callees.append({**ref, "edge_type": e["edge_type"], "confidence": e["confidence"]})
+    module = primary_row["path"].rsplit("/", 1)[0] if "/" in primary_row["path"] else "(root)"
+    god = _god_rank(conn, primary_row["name"], primary_row["path"])
+    return {
+        **base,
+        "found": True,
+        "definitions": definitions,
+        "primary": {"name": primary_row["name"], "path": primary_row["path"],
+                    "module": module, "god_rank": god,
+                    "in_degree": int(primary_row["in_degree"]),
+                    "out_degree": int(primary_row["out_degree"])},
+        "callers": callers,
+        "callees": callees,
+    }
+def _god_rank(conn: sqlite3.Connection, name: str, path: str) -> Optional[int]:
+    """1-based rank of this symbol among the cached god nodes, or None."""
+    from . import analysis
+    summary = analysis.load_analysis(conn)
+    if not summary:
+        return None
+    for idx, g in enumerate(summary.get("god_nodes", []), start=1):
+        if g.get("name") == name and g.get("path") == path:
+            return idx
+    return None

codebase_index/indexer/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""Indexing orchestration.
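+pipeline.py   : full + incremental build = discovery -> parse -> store chunks/symbols ->
+                graph build -> summaries -> FTS sync -> (optional) embeddings.
+doc_chunks.py : extract kind="doc" chunks (markdown headings, README sections, test names,
+                docstrings, exception messages, config keys) for FTS5 indexing.
+freshness.py  : compute the `index` freshness block (exists / stale / files_changed_since_build).
+Planned responsibilities, not shipped as separate modules in this package layout:
+incremental.py : decide which files to (re)process from sha256 + mtime_ns + git status; handle
+                 deletions (cascade) and config_hash changes (rebuild affected rows).
+summarize.py   : file/module/package summaries (heuristic/extractive by default; pluggable later).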
+"""

codebase_index/indexer/doc_chunks.py ADDED Viewed

@@ -0,0 +1,202 @@
+"""Extract document-style chunks from non-code content for FTS5 indexing.
+Produces chunks of kind="doc" from:
+- Markdown headings (# Heading)
+- README sections (first 200 chars under each heading)
+- Test function names (test_* in Python)
+- Function/class docstrings
+- Exception messages (raise X("message"))
+- Config keys (.codeindex.json, pyproject.toml)
+"""
+from __future__ import annotations
+import json
+import re
+from typing import Optional
+from ..parsers.base import Chunk
+# ATX-style markdown heading: 1-6 leading '#', whitespace, then the heading text.
+_MD_HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
+# `def test_<name>(` — group 1 captures the test function name.
+_TEST_FUNC_RE = re.compile(r'def\s+(test_\w+)\s*\(', re.MULTILINE)
+# A `def`/`class` name followed, lazily, by the next triple-double-quoted string.
+# NOTE(review): nothing ties the string to that definition's own body, so a
+# definition without a docstring appears to pair with the next `"""..."""` block
+# further down — confirm this is acceptable for search purposes.
+_DOCSTRING_RE = re.compile(r'(?:def|class)\s+\w+[\s\S]*?("""[\s\S]*?""")')
+# `raise Name("message"` — group 1 captures the first quoted argument's text.
+_EXCEPTION_RE = re.compile(r'raise\s+\w+\s*\(\s*["\'](.+?)["\']', re.MULTILINE)
+def extract_doc_chunks(text: str, rel_path: str, lang: Optional[str]) -> list[Chunk]:
+    """Extract all doc-style chunks from a file."""
+    chunks: list[Chunk] = []
+    if lang == "markdown":
+        chunks.extend(_extract_md_headings(text))
+        chunks.extend(_extract_readme_sections(text))
+    elif lang == "python":
+        chunks.extend(_extract_test_names(text))
+        chunks.extend(_extract_docstrings(text))
+        chunks.extend(_extract_exception_messages(text))
+    elif lang in ("json", "toml"):
+        chunks.extend(_extract_config_keys(text, lang))
+    elif rel_path.endswith(".py"):
+        chunks.extend(_extract_test_names(text))
+        chunks.extend(_extract_docstrings(text))
+        chunks.extend(_extract_exception_messages(text))
+    return chunks
+def _extract_md_headings(text: str) -> list[Chunk]:
+    """Extract markdown headings as searchable chunks."""
+    chunks = []
+    for match in _MD_HEADING_RE.finditer(text):
+        line_num = text[:match.start()].count('\n') + 1
+        heading = match.group(0).strip()
+        token_est = max(1, len(heading) // 4)
+        chunks.append(Chunk(
+            line_start=line_num,
+            line_end=line_num,
+            content=heading,
+            token_est=token_est,
+            kind="doc",
+        ))
+    return chunks
+def _extract_readme_sections(text: str) -> list[Chunk]:
+    """Extract first 200 chars under each markdown heading."""
+    chunks = []
+    headings = list(_MD_HEADING_RE.finditer(text))
+    for i, match in enumerate(headings):
+        heading_text = match.group(0).strip()
+        start = match.end()
+        end = headings[i + 1].start() if i + 1 < len(headings) else len(text)
+        section_body = text[start:end].strip()[:200]
+        if section_body:
+            line_start = text[:match.start()].count('\n') + 1
+            line_end = text[:start + len(section_body)].count('\n') + 1
+            content = f"{heading_text}: {section_body}"
+            token_est = max(1, len(content) // 4)
+            chunks.append(Chunk(
+                line_start=line_start,
+                line_end=line_end,
+                content=content,
+                token_est=token_est,
+                kind="doc",
+            ))
+    return chunks
+def _extract_test_names(text: str) -> list[Chunk]:
+    """Extract test function names as searchable chunks."""
+    chunks = []
+    for match in _TEST_FUNC_RE.finditer(text):
+        line_num = text[:match.start()].count('\n') + 1
+        func_name = match.group(1)
+        token_est = max(1, len(func_name) // 4)
+        chunks.append(Chunk(
+            line_start=line_num,
+            line_end=line_num,
+            content=f"test function: {func_name}",
+            token_est=token_est,
+            kind="doc",
+        ))
+    return chunks
+def _extract_docstrings(text: str) -> list[Chunk]:
+    """Extract function/class docstrings as searchable chunks."""
+    chunks = []
+    for match in _DOCSTRING_RE.finditer(text):
+        line_start = text[:match.start()].count('\n') + 1
+        docstring = match.group(1).strip('"""').strip()
+        if docstring and len(docstring) > 10:
+            line_end = text[:match.end()].count('\n') + 1
+            token_est = max(1, len(docstring) // 4)
+            chunks.append(Chunk(
+                line_start=line_start,
+                line_end=line_end,
+                content=docstring[:500],
+                token_est=token_est,
+                kind="doc",
+            ))
+    return chunks
+def _extract_exception_messages(text: str) -> list[Chunk]:
+    """Extract exception messages as searchable chunks."""
+    chunks = []
+    for match in _EXCEPTION_RE.finditer(text):
+        line_num = text[:match.start()].count('\n') + 1
+        msg = match.group(1)
+        token_est = max(1, len(msg) // 4)
+        chunks.append(Chunk(
+            line_start=line_num,
+            line_end=line_num,
+            content=f"exception: {msg}",
+            token_est=token_est,
+            kind="doc",
+        ))
+    return chunks
+def _extract_config_keys(text: str, lang: str) -> list[Chunk]:
+    """Extract config keys from JSON/TOML files."""
+    chunks = []
+    if lang == "json":
+        try:
+            data = json.loads(text)
+            keys = _flatten_json_keys(data)
+            for key_path, value in keys:
+                line_est = 1
+                content = f"config key: {key_path} = {_truncate_value(value)}"
+                token_est = max(1, len(content) // 4)
+                chunks.append(Chunk(
+                    line_start=line_est,
+                    line_end=line_est,
+                    content=content,
+                    token_est=token_est,
+                    kind="doc",
+                ))
+        except json.JSONDecodeError:
+            pass
+    elif lang == "toml":
+        for match in re.finditer(r'^([\w.]+)\s*=', text, re.MULTILINE):
+            line_num = text[:match.start()].count('\n') + 1
+            key = match.group(1)
+            content = f"config key: {key}"
+            token_est = max(1, len(content) // 4)
+            chunks.append(Chunk(
+                line_start=line_num,
+                line_end=line_num,
+                content=content,
+                token_est=token_est,
+                kind="doc",
+            ))
+    return chunks
+def _flatten_json_keys(data, prefix: str = "") -> list[tuple[str, str]]:
+    """Flatten nested JSON into dot-notation key paths."""
+    result = []
+    if isinstance(data, dict):
+        for k, v in data.items():
+            path = f"{prefix}.{k}" if prefix else k
+            if isinstance(v, (dict, list)):
+                result.extend(_flatten_json_keys(v, path))
+            else:
+                result.append((path, v))
+    elif isinstance(data, list):
+        for i, v in enumerate(data):
+            path = f"{prefix}[{i}]"
+            if isinstance(v, (dict, list)):
+                result.extend(_flatten_json_keys(v, path))
+            else:
+                result.append((path, v))
+    return result
+def _truncate_value(value, max_len: int = 100) -> str:
+    s = str(value)
+    return s if len(s) <= max_len else s[:max_len] + "..."

codebase_index/indexer/freshness.py ADDED Viewed

@@ -0,0 +1,109 @@
+"""Compute index freshness for the `index` block of every response.
+Contract (consumed by SKILL.md step 2):
+  exists -> a build has happened (meta.built_at present).
+  stale  -> the working tree differs from what was indexed.
+  files_changed_since_build -> how many indexable files differ.
+Strategy:
+  * Git fast-path: if the repo is a clean git tree AT the indexed commit, nothing
+    changed -> not stale (cheap; no walk).
+  * Accurate fallback (dirty tree, different commit, or no git): walk the current
+    indexable set and diff (path, mtime_ns) against the `files` table. This reuses
+    the discovery gates, so ignored/secret/binary files never count as changes.
+"""
+from __future__ import annotations
+import hashlib
+import subprocess
+from pathlib import Path
+from ..config import Config
+from ..discovery.walker import walk
+from ..models import IndexFreshness
+from ..storage import repo
+def compute_freshness(conn, root: Path, config: Config) -> IndexFreshness:
+    built_at = repo.get_meta(conn, "built_at")
+    if built_at is None:
+        return IndexFreshness(exists=False, stale=False)
+    head = repo.get_meta(conn, "head_commit")
+    root = Path(root)
+    if _git_clean_at(root, head):
+        changed = 0
+    else:
+        changed = _changed_count(conn, root, config)
+    return IndexFreshness(
+        exists=True,
+        stale=changed > 0,
+        files_changed_since_build=changed,
+        built_at=built_at,
+        head_commit=head,
+    )
+def _changed_count(conn, root: Path, config: Config) -> int:
+    """Added + removed + content-modified indexable files vs. the index.
+    Mirrors the incremental update's decision (indexer/pipeline.py): a file is
+    unchanged when (mtime, size) match, and even when they differ it is only
+    counted as changed if its sha256 differs. A bare `touch` that rewrites mtime
+    without changing bytes is a no-op for update_index, so it must not register as
+    stale here either.
+    """
+    indexed = repo.fingerprints(conn)  # path -> (mtime_ns, size_bytes, sha256)
+    seen: set[str] = set()
+    changed = 0
+    for cand in walk(root, config):
+        try:
+            st = cand.path.stat()
+        except OSError:
+            continue
+        seen.add(cand.rel_path)
+        prior = indexed.get(cand.rel_path)
+        if prior is None:
+            changed += 1
+            continue
+        if prior[0] == st.st_mtime_ns and prior[1] == cand.size_bytes:
+            continue
+        try:
+            if prior[2] == _sha256_file(cand.path):
+                continue
+        except OSError:
+            pass
+        changed += 1
+    changed += sum(1 for path in indexed if path not in seen)
+    return changed
+def _sha256_file(path: Path) -> str:
+    h = hashlib.sha256()
+    with path.open("rb") as fh:
+        for block in iter(lambda: fh.read(65536), b""):
+            h.update(block)
+    return h.hexdigest()
+def _git_clean_at(root: Path, indexed_head: "str | None") -> bool:
+    """True iff git is available, HEAD == indexed_head, and the tree has no changes."""
+    if indexed_head is None or not (root / ".git").exists():
+        return False
+    try:
+        head = subprocess.run(
+            ["git", "-C", str(root), "rev-parse", "HEAD"],
+            capture_output=True, text=True, timeout=5, check=False,
+        )
+        if head.returncode != 0 or head.stdout.strip() != indexed_head:
+            return False
+        status = subprocess.run(
+            ["git", "-C", str(root), "status", "--porcelain"],
+            capture_output=True, text=True, timeout=5, check=False,
+        )
+    except (OSError, subprocess.SubprocessError):
+        return False
+    return status.returncode == 0 and status.stdout.strip() == ""