PyPI - second-brain-graph - Versions diffs - 0.1.0__py3-none-any.whl - Mend

second-brain-graph 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

second_brain/__init__.py +21 -0
second_brain/__main__.py +8 -0
second_brain/assess.py +211 -0
second_brain/classify.py +87 -0
second_brain/cli.py +185 -0
second_brain/freshness.py +121 -0
second_brain/gate.py +79 -0
second_brain/ignore.py +76 -0
second_brain/indexer.py +295 -0
second_brain/mcp_server.py +91 -0
second_brain/model.py +213 -0
second_brain/operational.py +103 -0
second_brain/py.typed +0 -0
second_brain/pycode.py +61 -0
second_brain/query.py +177 -0
second_brain/references.py +80 -0
second_brain/store.py +66 -0
second_brain/ui/3d-force-graph.min.js +5 -0
second_brain/ui/template.html +341 -0
second_brain/viewer.py +57 -0
second_brain_graph-0.1.0.dist-info/METADATA +258 -0
second_brain_graph-0.1.0.dist-info/RECORD +26 -0
second_brain_graph-0.1.0.dist-info/WHEEL +5 -0
second_brain_graph-0.1.0.dist-info/entry_points.txt +3 -0
second_brain_graph-0.1.0.dist-info/licenses/LICENSE +21 -0
second_brain_graph-0.1.0.dist-info/top_level.txt +1 -0

second_brain/__init__.py ADDED Viewed

@@ -0,0 +1,21 @@
+"""Second Brain (SB) — a living, low-token map of every project.
+Public API re-exports the graph model so callers can do ``from second_brain import Graph``.
+"""
+from __future__ import annotations
+from second_brain.model import EDGE_COLORS, NODE_COLORS, Edge, EdgeType, Graph, Node, NodeType
+# Package version string.
+__version__ = "0.1.0"
+# Public API of the package: the graph-model names re-exported from ``second_brain.model``
+# by the import above, plus the version string.
+__all__ = [
+    "EDGE_COLORS",
+    "NODE_COLORS",
+    "Edge",
+    "EdgeType",
+    "Graph",
+    "Node",
+    "NodeType",
+    "__version__",
+]

second_brain/__main__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""Allow running the CLI as ``python -m second_brain`` (no PATH setup needed)."""
+from __future__ import annotations
+from second_brain.cli import main
+# Only when executed as a script/module: run the CLI and exit with the value it returns.
+if __name__ == "__main__":
+    raise SystemExit(main())

second_brain/assess.py ADDED Viewed

@@ -0,0 +1,211 @@
+"""One-shot project assessment: what Second Brain reveals, and what it saves.
+``second-brain assess`` indexes a project read-only and writes a before/after report a user can
+run on their own codebase before adopting the tool: hidden problems (truncated/empty/orphan
+files, broken links), the project's scale, the decisions and cross-references it surfaces, and
+the token cost of orienting an assistant WITHOUT vs WITH Second Brain. The single most useful
+thing for someone deciding whether it is worth installing.
+"""
+from __future__ import annotations
+import os
+from pathlib import Path
+from second_brain import query
+from second_brain.freshness import index
+from second_brain.model import Graph, NodeType
+# Extensions whose files ``scan_integrity`` opens and checks for null bytes.
+_TEXT_EXTS = {
+    ".md", ".markdown", ".rst", ".txt", ".py", ".js", ".ts", ".tsx", ".jsx", ".json", ".jsonl",
+    ".html", ".htm", ".css", ".toml", ".ini", ".cfg", ".yaml", ".yml", ".xml", ".sql", ".ps1",
+    ".psm1", ".sh", ".log", ".csv", ".go", ".rs", ".java", ".c", ".cc", ".cpp", ".h", ".hpp",
+    ".rb", ".php", ".cs",
+}
+# Node types whose sizes ``assess`` sums into the "read all the documents" token estimate.
+_DOC_TYPES = {
+    NodeType.STRUCTURE, NodeType.REPORT, NodeType.DESIGN, NodeType.DECISION, NodeType.MEMORY,
+}
+# Upper bound, in bytes, on what ``scan_integrity`` reads from the start of a file.
+_SCAN_CAP = 2_000_000
+_LIST_CAP = 50  # how many truncated/empty file names to record in the report
+def _ext(name: str) -> str:
+    """Return the lower-cased extension of ``name`` (dot included), or an empty string."""
+    _root, suffix = os.path.splitext(name)
+    return suffix.lower()
+# Sixteen consecutive null bytes; ``_is_corrupt`` and ``scan_integrity`` search for this run.
+_NULL_RUN = b"\x00" * 16  # a contiguous null run this long = zero-fill/truncation, not encoding
+def _looks_utf16(chunk: bytes) -> bool:
+    """Tell whether the null bytes in ``chunk`` look like a by-product of UTF-16 encoding.
+    A UTF-16 byte-order mark at the start settles it. Otherwise the nulls must sit almost
+    entirely on one byte parity: more than 40% nulls on one side and fewer than 5% on the
+    other, which is what mostly-ASCII text looks like once encoded as UTF-16. A chunk shorter
+    than four bytes without a BOM is never accepted. Contiguous null runs are not examined
+    here; ``_is_corrupt`` checks for those before calling this function.
+    """
+    if chunk.startswith((b"\xff\xfe", b"\xfe\xff")):
+        return True
+    if len(chunk) < 4:
+        return False
+    even_bytes, odd_bytes = chunk[::2], chunk[1::2]
+    zeros_even, zeros_odd = even_bytes.count(0), odd_bytes.count(0)
+    # Little-endian ASCII puts the zero high byte on odd offsets, big-endian on even ones.
+    little_endian = zeros_odd > 0.4 * len(odd_bytes) and zeros_even < 0.05 * len(even_bytes)
+    big_endian = zeros_even > 0.4 * len(even_bytes) and zeros_odd < 0.05 * len(odd_bytes)
+    return little_endian or big_endian
+def _is_corrupt(chunk: bytes) -> bool:
+    """Report whether ``chunk`` shows truncation/corruption rather than ordinary text.
+    A chunk without null bytes is clean. A contiguous ``_NULL_RUN`` always counts as corrupt,
+    whatever the encoding; any other null bytes count only when ``_looks_utf16`` cannot
+    explain them.
+    """
+    has_nulls = b"\x00" in chunk
+    if has_nulls and _NULL_RUN in chunk:
+        return True
+    return has_nulls and not _looks_utf16(chunk)
+def scan_integrity(root: str | os.PathLike[str], graph: Graph) -> dict[str, list[str]]:
+    """List the empty and the truncated/corrupted files among the graph's nodes.
+    Nodes without a path, and files that cannot be stat-ed or read, are skipped silently.
+    A zero-byte file is reported as empty unless it is named ``__init__.py``. Files with an
+    extension in ``_TEXT_EXTS`` have their first ``_SCAN_CAP`` bytes checked with
+    ``_is_corrupt``; for larger files the final 64 KiB is additionally searched for a
+    ``_NULL_RUN``.
+    Returns ``{"empty": [...], "truncated": [...]}`` with both path lists sorted.
+    """
+    base = Path(root)
+    zero_byte: list[str] = []
+    damaged: list[str] = []
+    for node in graph.nodes.values():
+        rel = node.path
+        if not rel:
+            continue
+        target = base / rel
+        try:
+            size = target.stat().st_size
+        except OSError:
+            continue
+        if size == 0:
+            if os.path.basename(rel) != "__init__.py":
+                zero_byte.append(rel)
+            continue
+        if _ext(rel) not in _TEXT_EXTS:
+            continue
+        try:
+            with open(target, "rb") as fh:
+                head = fh.read(_SCAN_CAP)
+                if size > _SCAN_CAP:  # also scan the file's end, where zero-fill usually lands
+                    fh.seek(-min(65536, size), os.SEEK_END)
+                    tail = fh.read()
+                else:
+                    tail = b""
+        except OSError:
+            continue
+        if _is_corrupt(head) or _NULL_RUN in tail:
+            damaged.append(rel)
+    zero_byte.sort()
+    damaged.sort()
+    return {"empty": zero_byte, "truncated": damaged}
+def _tok(chars: int) -> int:
+    """Estimate a token count from a character count, at four characters per token."""
+    estimate = chars / 4
+    return round(estimate)
+def _human(n: int) -> str:
+    """Format a byte count for display: whole bytes below 1024, else one decimal up to TB."""
+    value = float(n)
+    for unit in ("B", "KB", "MB", "GB"):
+        if value < 1024:
+            return f"{value:.0f} {unit}" if unit == "B" else f"{value:.1f} {unit}"
+        value /= 1024
+    # Anything that survived four divisions is reported in TB, however large.
+    return f"{value:.1f} TB"
+def _digest_text(m: dict) -> str:
+    """Render the project map ``m`` as compact plain text: a header, area rows, hub rows."""
+    header = f"{m['project']}: {m['files']} files, {m['areas']} areas, {m['links']} links"
+    area_rows = [
+        f"{a['area']}: {a['files']} files {a['size']}B [{','.join(a['top_types'])}]"
+        for a in m["by_area"]
+    ]
+    hub_rows = [f"{x['degree']} {x['id']} {x['type']}" for x in m["most_connected"]]
+    return "\n".join([header, *area_rows, *hub_rows])
+def assess(root: str | os.PathLike[str]) -> dict:
+    """Index ``root`` and return every assessment metric as plain data.
+    The result combines the project map (scale, node/edge types, orphans, broken references,
+    most connected nodes), the integrity scan (empty and truncated files, with the name lists
+    capped at ``_LIST_CAP``) and token estimates for orienting an assistant with and without
+    the digest. ``render_markdown`` consumes this dict.
+    """
+    project_root = Path(root).resolve()
+    graph, _manifest = index(project_root)
+    digest = query.project_map(graph, top=10)
+    integrity = scan_integrity(project_root, graph)
+    file_nodes = [node for node in graph.nodes.values() if node.path]
+    # One extra character per path stands for the separator in a plain file listing.
+    inventory_chars = sum(len(node.path) + 1 for node in file_nodes)
+    doc_bytes = sum(
+        int(node.meta.get("size", 0)) for node in file_nodes if node.type in _DOC_TYPES
+    )
+    graph_bytes = len(graph.to_json(indent=None).encode("utf-8"))
+    digest_chars = len(_digest_text(digest))
+    # max(1, ...) keeps the percentage defined for a project with no files.
+    orphans_pct = round(100 * digest["orphans"] / max(1, digest["files"]))
+    node_types = digest["node_types"]
+    return {
+        "project": graph.project,
+        "files": digest["files"],
+        "areas": digest["areas"],
+        "links": digest["links"],
+        "size": digest["size"],
+        "node_types": node_types,
+        "edge_types": digest["edge_types"],
+        "decisions": node_types.get("decision", 0),
+        "sessions": node_types.get("session", 0),
+        "orphans": digest["orphans"],
+        "orphans_pct": orphans_pct,
+        "broken_refs": digest["broken_refs"],
+        "empty": len(integrity["empty"]),
+        "truncated": len(integrity["truncated"]),
+        "empty_files": integrity["empty"][:_LIST_CAP],
+        "truncated_files": integrity["truncated"][:_LIST_CAP],
+        "most_connected": digest["most_connected"],
+        "by_area": digest["by_area"],
+        "tokens_inventory": _tok(inventory_chars),
+        "tokens_read_all_docs": _tok(doc_bytes),
+        "tokens_read_all_files": _tok(digest["size"]),
+        "tokens_digest": _tok(digest_chars),
+        "graph_bytes": graph_bytes,
+    }
+def _file_lines(names: list[str], total: int) -> list[str]:
+    """Return one indented Markdown bullet per name, plus an "...and N more" bullet.
+    The trailing bullet appears only when ``total`` exceeds the number of names given.
+    """
+    bullets = []
+    for name in names:
+        bullets.append(f"  - `{name}`")
+    hidden = total - len(names)
+    if hidden > 0:
+        bullets.append(f"  - ...and {hidden} more")
+    return bullets
+def render_markdown(r: dict) -> str:
+    """Render an assessment dict as a Markdown before/after report.
+    ``r`` is read with the keys that ``assess`` returns. The report has four sections (scale,
+    what was hidden, token cost to orient an assistant, most connected files) followed by a
+    "generated by" footer. Returns the whole document as one newline-joined string that ends
+    with a newline.
+    """
+    # Clamp to 1 so the ratio computed below can never divide by zero.
+    digest = max(1, r["tokens_digest"])
+    # NOTE(review): this takes the LARGER of the two estimates although the trailing comment
+    # says "cheapest" - confirm which one is intended.
+    orient = max(r["tokens_read_all_docs"], r["tokens_inventory"])  # cheapest realistic orient
+    # None when the digest is not cheaper than ``orient``; that selects the fallback sentence.
+    factor = round(orient / digest) if orient > digest else None
+    saving = (
+        f"- **~{factor}x less** than reading the docs to get oriented - and roughly constant "
+        "as the project grows"
+        if factor
+        else "- at this scale the digest already costs no more than just listing the files"
+    )
+    # Node types as "name count" pairs, highest count first.
+    types = ", ".join(f"{k} {v}" for k, v in sorted(r["node_types"].items(), key=lambda kv: -kv[1]))
+    out = [
+        f"# Second Brain - assessment of `{r['project']}`",
+        "",
+        "Read-only snapshot. Re-run after changes; use `second-brain gate` to catch drift.",
+        "",
+        "## Scale",
+        "",
+        f"- **{r['files']} files** in **{r['areas']} areas**, **{r['links']} links**, "
+        f"{_human(r['size'])}",
+        f"- node types: {types}",
+        "",
+        "## What was hidden (before Second Brain)",
+        "",
+        f"- **{r['truncated']}** truncated/corrupted files (null bytes)",
+        # The name lists are read with .get, so a dict that lacks them still renders.
+        *_file_lines(r.get("truncated_files", []), r["truncated"]),
+        f"- **{r['empty']}** empty files (zero bytes)",
+        *_file_lines(r.get("empty_files", []), r["empty"]),
+        f"- **{r['orphans']}** orphan files (~{r['orphans_pct']}%) - linked to nothing",
+        f"- **{r['broken_refs']}** broken references",
+        f"- **{r['decisions']}** decisions scattered in docs - now queryable nodes",
+        "",
+        "## Token cost to orient an assistant",
+        "",
+        f"- WITHOUT Second Brain: ~**{r['tokens_read_all_files']:,} tokens** to read every "
+        f"indexed file, ~{r['tokens_read_all_docs']:,} for just the documents, or "
+        f"~{r['tokens_inventory']:,} just to list every file's location",
+        f"- WITH Second Brain: ~**{r['tokens_digest']:,} tokens** (the `map` digest); the full "
+        f"index is {_human(r['graph_bytes'])}, queried on demand and never loaded into context",
+        saving,
+        "",
+        "## Most connected files",
+        "",
+    ]
+    # One bullet per hub, with the degree right-aligned in a three-character column.
+    for x in r["most_connected"]:
+        out.append(f"- {x['degree']:>3}  `{x['id']}` ({x['type']})")
+    out += ["", "*Generated by `second-brain assess` (read-only).*", ""]
+    return "\n".join(out)

second_brain/classify.py ADDED Viewed

@@ -0,0 +1,87 @@
+"""Classify a file into a NodeType from its path and name (heuristic, tunable).
+The order of checks matters: more specific signals win. The classification is deliberately
+conservative and documented; it is meant to be refined per project over time.
+"""
+from __future__ import annotations
+import os
+import re
+from second_brain.model import NodeType
+# Extensions that ``classify`` maps to NodeType.PROGRAM.
+_PROGRAM_EXTS = {
+    ".py", ".js", ".ts", ".tsx", ".jsx", ".mjs", ".cjs",
+    ".go", ".rs", ".java", ".kt", ".c", ".cc", ".cpp", ".h", ".hpp",
+    ".rb", ".php", ".cs", ".swift", ".scala", ".lua",
+    ".ps1", ".psm1", ".sh", ".bash", ".bat", ".cmd", ".sql", ".vcl", ".r",
+}
+# Extensions that ``classify`` maps to NodeType.DATA (checked before the program extensions).
+_DATA_EXTS = {
+    ".db", ".sqlite", ".sqlite3", ".duckdb", ".parquet", ".csv", ".tsv", ".jsonl", ".ndjson",
+}
+# Extensions that ``classify`` maps to NodeType.CONFIG.
+_CONFIG_EXTS = {
+    ".toml", ".ini", ".cfg", ".conf", ".yaml", ".yml", ".env", ".json", ".xml", ".properties",
+}
+# Document-like extensions; these files are eligible for keyword/date sub-typing.
+_DOC_EXTS = {".md", ".markdown", ".rst", ".txt", ".pdf", ".html", ".htm", ".docx", ".pptx", ".odt"}
+# Exact lower-cased file names that ``classify`` maps to NodeType.STRUCTURE.
+_STRUCTURE_NAMES = {
+    "progetto.md", "progetto-storia.md", "readme.md", "readme", "index.md",
+    "data-map.md", "changelog.md", "contributing.md", "license", "license.md",
+    "license.txt", "authors", "notice",
+}
+# Keyword patterns (case-insensitive, English and Italian terms). A keyword only matches when
+# delimited on the left by the start of the string or one of - _ / and on the right by one of
+# - _ / . or the end of the string.
+_DECISION_RE = re.compile(r"(?i)(?:^|[-_/])(?:adr|decision|decisione|decisioni)(?:[-_/.]|$)")
+_DESIGN_RE = re.compile(
+    r"(?i)(?:^|[-_/])(?:disegno|design|piano|plan|roadmap|spec|blueprint|brief|"
+    r"architettura|architecture)(?:[-_/.]|$)"
+)
+_REPORT_RE = re.compile(
+    r"(?i)(?:^|[-_/])(?:report|rapporto|collaudo|diagnosi|revisione|readiness|analisi|analysis|audit|verifica|backtest|indagine|strumentazione)(?:[-_/.]|$)"
+)
+# A 20xx date: year-month-day with optional - or _ separators, or year-month with a required
+# separator; in both cases not embedded in a longer run of digits.
+_DATE_RE = re.compile(r"(?<!\d)(?:20\d{2}[-_]?\d{2}[-_]?\d{2}|20\d{2}[-_]\d{2})(?!\d)")
+def _ext(name: str) -> str:
+    """Return the lower-cased extension of ``name``; a dotfile yields an empty string."""
+    # os.path.splitext keeps a leading dot in the root, so ".gitignore" has no "extension"
+    # here - which is exactly what the classifier wants.
+    suffix = os.path.splitext(name)[1]
+    return suffix.lower()
+def classify(rel_posix: str) -> NodeType:
+    """Return the NodeType for a file given its POSIX relative path.
+    Checks run from the most specific signal to the least, and the first match wins:
+    1. memory: a path component named ``memory``, or a file name starting with ``memory.``;
+    2. data, then program, then config, by extension or well-known config file name;
+    3. foundation documents by exact file name (``_STRUCTURE_NAMES``);
+    4. for document-like or extensionless files: decision, design, report (keyword or date),
+       else structure;
+    5. anything left (an unknown extension) is treated as config.
+    """
+    name = rel_posix.rsplit("/", 1)[-1]
+    low = name.lower()
+    ext = _ext(low)
+    parts = [p.lower() for p in rel_posix.split("/")]
+    # 1. Memory (path- or name-based). "memory.md" is already covered by the prefix test.
+    if "memory" in parts or low.startswith("memory."):
+        return NodeType.MEMORY
+    # 2. Config / data / program by extension (strong signals)
+    if ext in _DATA_EXTS:
+        return NodeType.DATA
+    if ext in _PROGRAM_EXTS:
+        return NodeType.PROGRAM
+    # Well-known config files that carry no extension. ``_ext`` returns "" for dotfiles, so a
+    # bare ".env" can never match the ".env" entry of _CONFIG_EXTS and must be named here;
+    # without it the file falls through to step 4 and is filed as a document.
+    config_names = {
+        ".gitignore", ".secondbrainignore", ".env", "dockerfile", "caddyfile", "makefile",
+    }
+    if ext in _CONFIG_EXTS or low in config_names:
+        return NodeType.CONFIG
+    # 3. Foundation structure docs by name
+    if low in _STRUCTURE_NAMES:
+        return NodeType.STRUCTURE
+    # 4. Document sub-typing by keyword / date (only for document-like files)
+    if ext in _DOC_EXTS or ext == "":
+        # Keywords are searched in the whole path, the date only in the file name.
+        if _DECISION_RE.search(rel_posix):
+            return NodeType.DECISION
+        if _DESIGN_RE.search(rel_posix):
+            return NodeType.DESIGN
+        if _REPORT_RE.search(rel_posix) or _DATE_RE.search(name):
+            return NodeType.REPORT
+        # Fallback for loose documents: treat as project structure/knowledge.
+        return NodeType.STRUCTURE
+    # 5. Anything else (unknown extension) -> config-like by default.
+    return NodeType.CONFIG

second_brain/cli.py ADDED Viewed

@@ -0,0 +1,185 @@
+"""Command-line interface: build, gate, view, stats, and the query commands.
+Read-only on your sources: every command only reads the project and writes derived files
+under ``.secondbrain/``.
+"""
+from __future__ import annotations
+import argparse
+import sys
+from second_brain import __version__, assess, gate, query, store
+from second_brain.freshness import build_manifest, index
+from second_brain.model import Graph
+from second_brain.viewer import write_view
+def _human(n: int) -> str:
+    """Format a byte count for display: whole bytes below 1024, else one decimal up to TB."""
+    units = ("B", "KB", "MB", "GB", "TB")
+    value, idx = float(n), 0
+    # Step up one unit at a time; TB is the ceiling, however large the value.
+    while value >= 1024 and idx < len(units) - 1:
+        value /= 1024
+        idx += 1
+    return f"{value:.0f} B" if idx == 0 else f"{value:.1f} {units[idx]}"
+def _build(path: str) -> tuple[Graph, dict[str, str]]:
+    """Index ``path`` and return ``(graph, manifest)``."""
+    # A single filesystem walk yields both the graph and the manifest.
+    built = index(path)
+    return built
+def _load_or_build(path: str) -> Graph:
+    """Return the stored graph for ``path``, indexing the project only when none is stored.
+    The loaded value is compared against ``None`` explicitly, as ``cmd_gate`` does, instead of
+    being tested for truthiness, so a stored graph is used even if it evaluates as false.
+    """
+    graph = store.load_graph(path)
+    if graph is None:
+        graph = _build(path)[0]
+    return graph
+def cmd_build(args: argparse.Namespace) -> int:
+    """``build``: index the project, save graph and manifest, print a summary; returns 0."""
+    graph, manifest = _build(args.path)
+    store.save(args.path, graph, manifest)
+    counts = graph.counts()
+    node_total, edge_total = len(graph.nodes), len(graph.edges)
+    print(f"built '{graph.project}': {node_total} nodes, {edge_total} edges")
+    print("  nodes:", counts["nodes"])
+    print("  edges:", counts["edges"])
+    print(f"  store: {store.store_dir(args.path)}")
+    return 0
+def cmd_gate(args: argparse.Namespace) -> int:
+    """``gate``: compare the stored graph and manifest with the project as it is now.
+    Returns 2 when no stored graph or manifest exists, 0 when the report is ok, 1 otherwise.
+    """
+    graph = store.load_graph(args.path)
+    stored_manifest = store.load_manifest(args.path)
+    if graph is None or stored_manifest is None:
+        print("no graph found \u2014 run 'second-brain build' first", file=sys.stderr)
+        return 2
+    report = gate.evaluate(graph, stored_manifest, build_manifest(args.path))
+    print(report.summary())
+    if report.ok:
+        print("OK: fresh and clean")
+        return 0
+    print("DRIFT: rebuild and/or fix the issues above")
+    return 1
+# Node-count threshold used by ``cmd_view`` to switch to the backbone rendering on its own.
+_BACKBONE_AUTO = 8000  # graphs bigger than this auto-render as a backbone to stay light
+def cmd_view(args: argparse.Namespace) -> int:
+    """``view``: rebuild the graph and write the viewer file for it; returns 0.
+    The graph is reduced to its backbone when ``--backbone`` is given or when it holds more
+    than ``_BACKBONE_AUTO`` nodes.
+    """
+    graph, _manifest = _build(args.path)
+    total = len(graph.nodes)
+    use_backbone = args.backbone or total > _BACKBONE_AUTO
+    if use_backbone:
+        graph = query.backbone(graph)
+        print(f"backbone: rendering {len(graph.nodes)} of {total} nodes "
+              "(isolated data files summarized on their area)")
+    target = write_view(args.path, graph)
+    print(f"view written: {target}")
+    print("open it in a browser (double-click).")
+    return 0
+def cmd_stats(args: argparse.Namespace) -> int:
+    """``stats``: rebuild the graph and print node and edge counts per type; returns 0."""
+    graph, _manifest = _build(args.path)
+    counts = graph.counts()
+    print(f"'{graph.project}': {len(graph.nodes)} nodes, {len(graph.edges)} edges")
+    node_counts, edge_counts = counts["nodes"], counts["edges"]
+    for kind in sorted(node_counts):
+        print(f"  {kind:10} {node_counts[kind]}")
+    # Edge rows carry a leading dash to tell them apart from node rows.
+    for kind in sorted(edge_counts):
+        print(f"  -{kind:9} {edge_counts[kind]}")
+    return 0
+def cmd_map(args: argparse.Namespace) -> int:
+    """``map``: print the project digest (totals, per-area rows, most connected); returns 0."""
+    digest = query.project_map(_load_or_build(args.path))
+    print(f"{digest['project']}: {digest['files']} files \u00b7 {digest['areas']} areas \u00b7 "
+          f"{digest['links']} links \u00b7 {_human(digest['size'])}")
+    print("by area:")
+    for area in digest["by_area"]:
+        kinds = ", ".join(area["top_types"])
+        print(f"  {area['area']:16} {area['files']:4} files  {_human(area['size']):>9}  [{kinds}]")
+    print("most connected:")
+    for hub in digest["most_connected"]:
+        print(f"  {hub['degree']:3}  {hub['id']}  ({hub['type']})")
+    print(f"orphans: {digest['orphans']} \u00b7 broken refs: {digest['broken_refs']}")
+    return 0
+def cmd_find(args: argparse.Namespace) -> int:
+    """``find``: print every node matching ``args.query`` and the match count; returns 0."""
+    matches = query.find(_load_or_build(args.path), args.query)
+    for hit in matches:
+        print(f"  {hit['type']:9} {hit['id']}")
+    print(f"({len(matches)} matches)")
+    return 0
+def cmd_neighbors(args: argparse.Namespace) -> int:
+    """``neighbors``: print one node with its outgoing and incoming connections.
+    Returns 1 (with a message on stderr) when the node does not exist, 0 otherwise.
+    """
+    info = query.neighbors(_load_or_build(args.path), args.node)
+    if info is None:
+        print(f"node not found: {args.node}", file=sys.stderr)
+        return 1
+    print(f"{info['id']} ({info['type']}) \u00b7 {_human(info['size'])}")
+    # Description and broken references are shown only when present.
+    if info["description"]:
+        print(f"  {info['description']}")
+    if info["broken_refs"]:
+        print(f"  broken refs: {info['broken_refs']}")
+    print(f"  outgoing ({len(info['outgoing'])}):")
+    for link in info["outgoing"]:
+        print(f"    -{link['edge']}-> {link['id']} ({link['type']})")
+    print(f"  incoming ({len(info['incoming'])}):")
+    for link in info["incoming"]:
+        print(f"    <-{link['edge']}- {link['id']} ({link['type']})")
+    return 0
+def cmd_assess(args: argparse.Namespace) -> int:
+    """``assess``: run the assessment, write ``assessment.md`` to the store, print a summary.
+    The store directory is created when missing. Returns 0.
+    """
+    report = assess.assess(args.path)
+    store_path = store.store_dir(args.path)
+    store_path.mkdir(parents=True, exist_ok=True)
+    report_file = store_path / "assessment.md"
+    # newline="\n" keeps the written report LF-terminated on every platform.
+    report_file.write_text(assess.render_markdown(report), encoding="utf-8", newline="\n")
+    print(f"{report['project']}: {report['files']} files, {report['areas']} areas, "
+          f"{report['links']} links")
+    print(f"  hidden: {report['truncated']} truncated, {report['empty']} empty, "
+          f"{report['orphans']} orphans (~{report['orphans_pct']}%), "
+          f"{report['broken_refs']} broken, {report['decisions']} decisions")
+    print(f"  orient an assistant: ~{report['tokens_read_all_files']} tokens to read all files "
+          f"(~{report['tokens_read_all_docs']} for docs) -> ~{report['tokens_digest']} with SB")
+    print(f"  report: {report_file}")
+    return 0
+def main(argv: list[str] | None = None) -> int:
+    """Parse ``argv``, run the selected sub-command and return its exit code.
+    ``argv`` is handed to ``argparse`` unchanged, so ``None`` means the process arguments.
+    A sub-command is required; every sub-command takes an optional project root (default
+    ``.``) as its last positional argument.
+    """
+    parser = argparse.ArgumentParser(
+        prog="second-brain",
+        description="Second Brain \u2014 a living, low-token map of a project.",
+    )
+    parser.add_argument("--version", action="version", version=f"second-brain {__version__}")
+    commands = parser.add_subparsers(dest="cmd", required=True)
+    def add_path(sub_parser: argparse.ArgumentParser) -> None:
+        # The optional project-root positional shared by every sub-command.
+        sub_parser.add_argument("path", nargs="?", default=".", help="project root (default: .)")
+    # Sub-commands whose only argument is the project root.
+    path_only = (
+        ("build", cmd_build, "index the project -> .secondbrain/graph.json"),
+        ("gate", cmd_gate, "anti-drift check (broken refs, stale files, orphans)"),
+        ("stats", cmd_stats, "quick counts by node/edge type"),
+        ("map", cmd_map, "compact project digest (areas, sizes, most connected)"),
+        ("assess", cmd_assess, "one-shot before/after report: problems + token savings"),
+    )
+    for name, handler, summary in path_only:
+        simple = commands.add_parser(name, help=summary)
+        add_path(simple)
+        simple.set_defaults(func=handler)
+    view = commands.add_parser(
+        "view", help="write a self-contained 3D viewer -> .secondbrain/view.html"
+    )
+    add_path(view)
+    view.add_argument("--backbone", action="store_true",
+                      help="render only areas + knowledge-connected files (auto for huge graphs)")
+    view.set_defaults(func=cmd_view)
+    # For find/neighbors the search term comes first and the project root second.
+    finder = commands.add_parser("find", help="find nodes by name or path substring")
+    finder.add_argument("query", help="substring to search for")
+    add_path(finder)
+    finder.set_defaults(func=cmd_find)
+    neighbors = commands.add_parser("neighbors", help="show a node and its connections")
+    neighbors.add_argument("node", help="node id (relative path)")
+    add_path(neighbors)
+    neighbors.set_defaults(func=cmd_neighbors)
+    namespace = parser.parse_args(argv)
+    return int(namespace.func(namespace))
+# Direct execution of this file: run the CLI and exit with the code it returns.
+if __name__ == "__main__":  # pragma: no cover
+    raise SystemExit(main())

second_brain/freshness.py ADDED Viewed

@@ -0,0 +1,121 @@
+"""Content-hash freshness + the single-walk ``index`` entry point.
+Hashing uses the standard-library BLAKE2b (fast, no dependency). For text files the hash is
+computed on newline-normalized bytes so a CRLF<->LF flip (e.g. a git checkout on Windows)
+does not look like a content change. The manifest is a small ``{relative_path: hash}`` map
+stored next to the graph; diffing it against the current files tells the gate whether the
+brain is still in sync with the project.
+"""
+from __future__ import annotations
+import hashlib
+import os
+from pathlib import Path
+from second_brain.ignore import load_ignore_patterns
+from second_brain.indexer import build_graph, iter_files
+from second_brain.model import Graph
+# Read size, in bytes, used by ``file_hash`` when it streams a file through the hasher.
+_CHUNK = 65536
+# Above this size a text file is hashed raw instead of normalized: it bounds memory, and a
+# CRLF flip on a huge file then reads as "changed" - a SAFE false positive for the gate
+# (it never hides a real change), unlike loading a multi-hundred-MB file into RAM.
+_NORMALIZE_CAP = 8_000_000
+# Files hashed with normalized line endings (CRLF/CR -> LF) to avoid cross-platform churn.
+_TEXT_HASH_EXTS = {
+    ".md", ".markdown", ".rst", ".txt", ".py", ".js", ".ts", ".tsx", ".jsx", ".mjs", ".cjs",
+    ".toml", ".ini", ".cfg", ".conf", ".yaml", ".yml", ".html",
+    ".htm", ".css", ".sql", ".ps1", ".psm1", ".sh", ".bash", ".go", ".rs", ".java", ".c",
+    ".cc", ".cpp", ".h", ".hpp", ".rb", ".php", ".cs",
+}
+# Content-hash text source/docs up to this size; data, binaries and larger files use a cheap
+# size+mtime signature instead (no bytes read) so indexing stays light on data-heavy projects.
+# Because ``_hash_rel`` applies this cap first and it is lower than ``_NORMALIZE_CAP``, files
+# hashed through ``_hash_rel`` always take the normalized path.
+_CONTENT_HASH_CAP = 1_000_000
+def _ext(name: str) -> str:
+    """Return the lower-cased extension of ``name`` (dot included), or an empty string."""
+    _stem, suffix = os.path.splitext(name)
+    return suffix.lower()
+def file_hash(path: Path, *, normalize_newlines: bool = False) -> str:
+    """Return a short, stable content hash of a file (BLAKE2b, 32 hex characters).
+    With ``normalize_newlines``, a file of at most ``_NORMALIZE_CAP`` bytes is read whole and
+    CRLF/CR are collapsed to LF before hashing, which stays correct across chunk boundaries.
+    In every other case the raw bytes are streamed through the hasher ``_CHUNK`` bytes at a
+    time. Errors from the filesystem (``OSError``) are not caught here.
+    """
+    hasher = hashlib.blake2b(digest_size=16)
+    read_whole = normalize_newlines and path.stat().st_size <= _NORMALIZE_CAP
+    if read_whole:
+        raw = path.read_bytes()
+        # CRLF goes first so that a pair collapses to one LF rather than two.
+        hasher.update(raw.replace(b"\r\n", b"\n").replace(b"\r", b"\n"))
+        return hasher.hexdigest()
+    with open(path, "rb") as stream:
+        while True:
+            block = stream.read(_CHUNK)
+            if not block:
+                break
+            hasher.update(block)
+    return hasher.hexdigest()
+def _hash_rel(root: Path, rel: str) -> str | None:
+    """Return the freshness signature of ``root / rel``, or ``None`` if it cannot be read.
+    Files with an extension in ``_TEXT_HASH_EXTS`` and at most ``_CONTENT_HASH_CAP`` bytes
+    get a newline-normalized content hash (precise and stable across platforms). Everything
+    else gets a cheap ``s<size>:m<mtime>`` signature for which no file content is read; the
+    modification time is truncated to whole seconds.
+    """
+    target = root / rel
+    try:
+        info = target.stat()
+    except OSError:
+        return None
+    hash_content = _ext(rel) in _TEXT_HASH_EXTS and info.st_size <= _CONTENT_HASH_CAP
+    if not hash_content:
+        return f"s{info.st_size}:m{int(info.st_mtime)}"
+    try:
+        return file_hash(target, normalize_newlines=True)
+    except OSError:
+        return None
+def build_manifest(root: str | os.PathLike[str]) -> dict[str, str]:
+    """Return ``{relative_path: signature}`` for every indexable file under ``root``.
+    ``root`` is resolved first; files for which ``_hash_rel`` yields ``None`` are left out.
+    """
+    base = Path(root).resolve()
+    manifest: dict[str, str] = {}
+    for rel in iter_files(base, load_ignore_patterns(base)):
+        signature = _hash_rel(base, rel)
+        if signature is None:
+            continue
+        manifest[rel] = signature
+    return manifest
+def index(
+    root: str | os.PathLike[str], *, operational: bool = True
+) -> tuple[Graph, dict[str, str]]:
+    """Build the graph and the manifest from one directory walk.
+    The filesystem is enumerated once (``iter_files``); file *contents* are still read again to
+    hash them for the manifest, so this is not zero double-I/O - just a single directory listing
+    shared by graph build and manifest.
+    Args:
+        root: project root; resolved to an absolute path before the walk.
+        operational: when true (the default), ``second_brain.operational.enrich`` is run on
+            the graph after it is built.
+    Returns:
+        ``(graph, manifest)``. The manifest maps each relative path to its freshness
+        signature; files whose signature cannot be computed are left out.
+    """
+    root_p = Path(root).resolve()
+    # The listing is consumed twice - by the graph build and by the manifest loop - so it is
+    # materialized first: if ``iter_files`` hands back a lazy iterator, the second pass would
+    # otherwise see nothing and the manifest would silently come out empty.
+    rels = list(iter_files(root_p, load_ignore_patterns(root_p)))
+    graph = build_graph(root_p, _rels=rels)
+    if operational:
+        # Imported on demand rather than at module level (presumably to avoid an import
+        # cycle - confirm before moving it).
+        from second_brain.operational import enrich
+        enrich(graph, root_p)
+    manifest: dict[str, str] = {}
+    for rel in rels:
+        hv = _hash_rel(root_p, rel)
+        if hv is not None:
+            manifest[rel] = hv
+    return graph, manifest
+def diff_manifest(old: dict[str, str], new: dict[str, str]) -> dict[str, list[str]]:
+    """Compare two manifests and return sorted ``added``, ``removed`` and ``changed`` paths.
+    A path counts as changed when it is present in both manifests with different signatures.
+    """
+    before, after = set(old), set(new)
+    added = sorted(after - before)
+    removed = sorted(before - after)
+    changed = sorted(path for path in before & after if old[path] != new[path])
+    return {"added": added, "removed": removed, "changed": changed}