second-brain-graph 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ """Second Brain (SB) — a living, low-token map of every project.
2
+
3
+ Public API re-exports the graph model so callers can do ``from second_brain import Graph``.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from second_brain.model import EDGE_COLORS, NODE_COLORS, Edge, EdgeType, Graph, Node, NodeType
9
+
10
+ __version__ = "0.1.0"
11
+
12
+ __all__ = [
13
+ "EDGE_COLORS",
14
+ "NODE_COLORS",
15
+ "Edge",
16
+ "EdgeType",
17
+ "Graph",
18
+ "Node",
19
+ "NodeType",
20
+ "__version__",
21
+ ]
@@ -0,0 +1,8 @@
1
+ """Allow running the CLI as ``python -m second_brain`` (no PATH setup needed)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from second_brain.cli import main
6
+
7
# Entry point for ``python -m second_brain``: run the CLI and exit with its return code.
if __name__ == "__main__":
    raise SystemExit(main())
second_brain/assess.py ADDED
@@ -0,0 +1,211 @@
1
+ """One-shot project assessment: what Second Brain reveals, and what it saves.
2
+
3
+ ``second-brain assess`` indexes a project read-only and writes a before/after report a user can
4
+ run on their own codebase before adopting the tool: hidden problems (truncated/empty/orphan
5
+ files, broken links), the project's scale, the decisions and cross-references it surfaces, and
6
+ the token cost of orienting an assistant WITHOUT vs WITH Second Brain. The single most useful
7
+ thing for someone deciding whether it is worth installing.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ from pathlib import Path
14
+
15
+ from second_brain import query
16
+ from second_brain.freshness import index
17
+ from second_brain.model import Graph, NodeType
18
+
19
# Extensions treated as text: only files with one of these suffixes are opened and
# scanned for null bytes by ``scan_integrity``.
_TEXT_EXTS = {
    ".md", ".markdown", ".rst", ".txt", ".py", ".js", ".ts", ".tsx", ".jsx", ".json", ".jsonl",
    ".html", ".htm", ".css", ".toml", ".ini", ".cfg", ".yaml", ".yml", ".xml", ".sql", ".ps1",
    ".psm1", ".sh", ".log", ".csv", ".go", ".rs", ".java", ".c", ".cc", ".cpp", ".h", ".hpp",
    ".rb", ".php", ".cs",
}
# Node types counted as "documents" when ``assess`` sums the bytes of all docs.
_DOC_TYPES = {
    NodeType.STRUCTURE, NodeType.REPORT, NodeType.DESIGN, NodeType.DECISION, NodeType.MEMORY,
}
# Maximum number of bytes read from the start of a file during the integrity scan.
_SCAN_CAP = 2_000_000
_LIST_CAP = 50  # how many truncated/empty file names to record in the report
30
+
31
+
32
+ def _ext(name: str) -> str:
33
+ return os.path.splitext(name)[1].lower()
34
+
35
+
36
# A contiguous null run this long = zero-fill/truncation, not encoding. Any chunk containing
# it is reported as corrupt regardless of how the rest of the bytes look.
_NULL_RUN = b"\x00" * 16
37
+
38
+
39
+ def _looks_utf16(chunk: bytes) -> bool:
40
+ """True if the null bytes are explained by UTF-16 encoding (valid text, not corruption).
41
+
42
+ UTF-16 text has a BOM, or null bytes concentrated on alternating positions (the high byte
43
+ of mostly-ASCII codepoints). A contiguous run of nulls is never normal UTF-16 text (it would
44
+ be consecutive U+0000), so that case is handled separately by ``_is_corrupt``.
45
+ """
46
+ if chunk[:2] in (b"\xff\xfe", b"\xfe\xff"):
47
+ return True
48
+ if len(chunk) < 4:
49
+ return False
50
+ even = chunk[0::2]
51
+ odd = chunk[1::2]
52
+ even_nulls, odd_nulls = even.count(0), odd.count(0)
53
+ return (odd_nulls > 0.4 * len(odd) and even_nulls < 0.05 * len(even)) or (
54
+ even_nulls > 0.4 * len(even) and odd_nulls < 0.05 * len(odd)
55
+ )
56
+
57
+
58
def _is_corrupt(chunk: bytes) -> bool:
    """Return True when ``chunk`` contains null bytes that UTF-16 encoding cannot explain.

    A chunk without any null byte is clean. A chunk containing a ``_NULL_RUN`` is always
    corrupt (zero-fill), whatever its encoding. Otherwise the verdict is the negation of
    ``_looks_utf16``.
    """
    if b"\x00" not in chunk:
        return False
    # Contiguous zero-fill wins over any encoding heuristic.
    return _NULL_RUN in chunk or not _looks_utf16(chunk)
65
+
66
+
67
def scan_integrity(root: str | os.PathLike[str], graph: Graph) -> dict[str, list[str]]:
    """Report zero-byte and null-byte-damaged files among the graph's path-backed nodes.

    Every node with a ``path`` is resolved under ``root``. Files that cannot be stat'ed or
    read are silently skipped. Zero-byte files are listed under ``"empty"``, except
    ``__init__.py`` which is conventionally empty. Non-empty files whose extension is in
    ``_TEXT_EXTS`` have their first ``_SCAN_CAP`` bytes checked with ``_is_corrupt``; files
    larger than that also have their final 64 KiB checked for a ``_NULL_RUN``.

    Returns ``{"empty": [...], "truncated": [...]}`` with both lists sorted.
    """
    base = Path(root)
    zero_byte: list[str] = []
    damaged: list[str] = []
    for node in graph.nodes.values():
        rel = node.path
        if not rel:
            continue
        target = base / rel
        try:
            size = target.stat().st_size
        except OSError:
            continue
        if size == 0:
            if os.path.basename(rel) != "__init__.py":
                zero_byte.append(rel)
            continue
        if _ext(rel) not in _TEXT_EXTS:
            continue
        try:
            with open(target, "rb") as fh:
                head = fh.read(_SCAN_CAP)
                if size > _SCAN_CAP:
                    # Also scan the file's end, where zero-fill usually lands.
                    fh.seek(-min(65536, size), os.SEEK_END)
                    tail = fh.read()
                else:
                    tail = b""
        except OSError:
            continue
        if _is_corrupt(head) or _NULL_RUN in tail:
            damaged.append(rel)
    zero_byte.sort()
    damaged.sort()
    return {"empty": zero_byte, "truncated": damaged}
100
+
101
+
102
+ def _tok(chars: int) -> int:
103
+ return round(chars / 4)
104
+
105
+
106
+ def _human(n: int) -> str:
107
+ f = float(n)
108
+ for u in ("B", "KB", "MB", "GB", "TB"):
109
+ if f < 1024 or u == "TB":
110
+ return f"{f:.0f} {u}" if u == "B" else f"{f:.1f} {u}"
111
+ f /= 1024
112
+ return f"{f:.1f} TB"
113
+
114
+
115
+ def _digest_text(m: dict) -> str:
116
+ lines = [f"{m['project']}: {m['files']} files, {m['areas']} areas, {m['links']} links"]
117
+ for a in m["by_area"]:
118
+ lines.append(f"{a['area']}: {a['files']} files {a['size']}B [{','.join(a['top_types'])}]")
119
+ for x in m["most_connected"]:
120
+ lines.append(f"{x['degree']} {x['id']} {x['type']}")
121
+ return "\n".join(lines)
122
+
123
+
124
def assess(root: str | os.PathLike[str]) -> dict:
    """Index ``root`` and return the assessment metrics as a plain dict.

    The project is indexed with ``index``, summarized with ``query.project_map`` (top 10
    most-connected nodes) and checked with ``scan_integrity``. Token figures are derived
    with ``_tok`` from: the total length of all node paths (inventory), the recorded
    ``size`` metadata of document-type nodes, the project size reported by the map, and
    the length of the digest text. File-name lists are capped at ``_LIST_CAP`` entries
    while the ``empty``/``truncated`` counts stay uncapped.
    """
    project_root = Path(root).resolve()
    graph, _manifest = index(project_root)
    digest = query.project_map(graph, top=10)
    integrity = scan_integrity(project_root, graph)

    # One pass over path-backed nodes gathers both the inventory and the document totals.
    inventory_chars = 0
    doc_bytes = 0
    for node in graph.nodes.values():
        if not node.path:
            continue
        inventory_chars += len(node.path) + 1
        if node.type in _DOC_TYPES:
            doc_bytes += int(node.meta.get("size", 0))

    graph_bytes = len(graph.to_json(indent=None).encode("utf-8"))
    digest_chars = len(_digest_text(digest))
    node_types = digest["node_types"]
    empty_files = integrity["empty"]
    truncated_files = integrity["truncated"]

    report: dict = {"project": graph.project}
    for key in ("files", "areas", "links", "size", "node_types", "edge_types"):
        report[key] = digest[key]
    report["decisions"] = node_types.get("decision", 0)
    report["sessions"] = node_types.get("session", 0)
    report["orphans"] = digest["orphans"]
    # max(1, ...) keeps the percentage defined for a project with no files.
    report["orphans_pct"] = round(100 * digest["orphans"] / max(1, digest["files"]))
    report["broken_refs"] = digest["broken_refs"]
    report["empty"] = len(empty_files)
    report["truncated"] = len(truncated_files)
    report["empty_files"] = empty_files[:_LIST_CAP]
    report["truncated_files"] = truncated_files[:_LIST_CAP]
    report["most_connected"] = digest["most_connected"]
    report["by_area"] = digest["by_area"]
    report["tokens_inventory"] = _tok(inventory_chars)
    report["tokens_read_all_docs"] = _tok(doc_bytes)
    report["tokens_read_all_files"] = _tok(digest["size"])
    report["tokens_digest"] = _tok(digest_chars)
    report["graph_bytes"] = graph_bytes
    return report
153
+
154
+
155
+ def _file_lines(names: list[str], total: int) -> list[str]:
156
+ """Indented bullets listing file names, with an '...and N more' when capped."""
157
+ out = [f" - `{n}`" for n in names]
158
+ if total > len(names):
159
+ out.append(f" - ...and {total - len(names)} more")
160
+ return out
161
+
162
+
163
def render_markdown(r: dict) -> str:
    """Turn an assessment dict (as returned by ``assess``) into a Markdown report.

    Sections, in order: title and note, scale, hidden problems, token cost, most connected
    files, footer. The "Nx less" line is emitted only when the orientation estimate exceeds
    the digest estimate; otherwise a fixed fallback sentence is used.
    """
    digest_tokens = max(1, r["tokens_digest"])  # floor at 1 so the ratio never divides by zero
    # NOTE(review): the original comment called this the "cheapest realistic orient", yet
    # ``max`` picks the larger of the two estimates - confirm the intent.
    orient_tokens = max(r["tokens_read_all_docs"], r["tokens_inventory"])
    factor = None
    if orient_tokens > digest_tokens:
        factor = round(orient_tokens / digest_tokens)
    if factor:
        saving = (
            f"- **~{factor}x less** than reading the docs to get oriented - and roughly constant "
            "as the project grows"
        )
    else:
        saving = "- at this scale the digest already costs no more than just listing the files"

    # Most frequent node type first; ties keep their original order (stable sort).
    by_count = sorted(r["node_types"].items(), key=lambda kv: kv[1], reverse=True)
    types = ", ".join(f"{kind} {count}" for kind, count in by_count)

    header = [
        f"# Second Brain - assessment of `{r['project']}`",
        "",
        "Read-only snapshot. Re-run after changes; use `second-brain gate` to catch drift.",
        "",
    ]
    scale = [
        "## Scale",
        "",
        f"- **{r['files']} files** in **{r['areas']} areas**, **{r['links']} links**, "
        f"{_human(r['size'])}",
        f"- node types: {types}",
        "",
    ]
    hidden = ["## What was hidden (before Second Brain)", ""]
    hidden.append(f"- **{r['truncated']}** truncated/corrupted files (null bytes)")
    hidden.extend(_file_lines(r.get("truncated_files", []), r["truncated"]))
    hidden.append(f"- **{r['empty']}** empty files (zero bytes)")
    hidden.extend(_file_lines(r.get("empty_files", []), r["empty"]))
    hidden.append(f"- **{r['orphans']}** orphan files (~{r['orphans_pct']}%) - linked to nothing")
    hidden.append(f"- **{r['broken_refs']}** broken references")
    hidden.append(f"- **{r['decisions']}** decisions scattered in docs - now queryable nodes")
    hidden.append("")
    tokens = [
        "## Token cost to orient an assistant",
        "",
        f"- WITHOUT Second Brain: ~**{r['tokens_read_all_files']:,} tokens** to read every "
        f"indexed file, ~{r['tokens_read_all_docs']:,} for just the documents, or "
        f"~{r['tokens_inventory']:,} just to list every file's location",
        f"- WITH Second Brain: ~**{r['tokens_digest']:,} tokens** (the `map` digest); the full "
        f"index is {_human(r['graph_bytes'])}, queried on demand and never loaded into context",
        saving,
        "",
    ]
    connected = ["## Most connected files", ""]
    connected.extend(
        f"- {x['degree']:>3} `{x['id']}` ({x['type']})" for x in r["most_connected"]
    )
    footer = ["", "*Generated by `second-brain assess` (read-only).*", ""]
    return "\n".join(header + scale + hidden + tokens + connected + footer)
@@ -0,0 +1,87 @@
1
+ """Classify a file into a NodeType from its path and name (heuristic, tunable).
2
+
3
+ The order of checks matters: more specific signals win. The classification is deliberately
4
+ conservative and documented; it is meant to be refined per project over time.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ import re
11
+
12
+ from second_brain.model import NodeType
13
+
14
# Extensions classified as NodeType.PROGRAM (source code and scripts).
_PROGRAM_EXTS = {
    ".py", ".js", ".ts", ".tsx", ".jsx", ".mjs", ".cjs",
    ".go", ".rs", ".java", ".kt", ".c", ".cc", ".cpp", ".h", ".hpp",
    ".rb", ".php", ".cs", ".swift", ".scala", ".lua",
    ".ps1", ".psm1", ".sh", ".bash", ".bat", ".cmd", ".sql", ".vcl", ".r",
}
# Extensions classified as NodeType.DATA (databases and tabular/line-oriented data files).
_DATA_EXTS = {
    ".db", ".sqlite", ".sqlite3", ".duckdb", ".parquet", ".csv", ".tsv", ".jsonl", ".ndjson",
}
# Extensions classified as NodeType.CONFIG.
_CONFIG_EXTS = {
    ".toml", ".ini", ".cfg", ".conf", ".yaml", ".yml", ".env", ".json", ".xml", ".properties",
}
# Extensions of document-like files; these (and extensionless files) get keyword sub-typing.
_DOC_EXTS = {".md", ".markdown", ".rst", ".txt", ".pdf", ".html", ".htm", ".docx", ".pptx", ".odt"}

# Lower-cased file names that are always NodeType.STRUCTURE, regardless of keywords.
_STRUCTURE_NAMES = {
    "progetto.md", "progetto-storia.md", "readme.md", "readme", "index.md",
    "data-map.md", "changelog.md", "contributing.md", "license", "license.md",
    "license.txt", "authors", "notice",
}

# Keyword patterns (English and Italian terms). Each keyword must be delimited: preceded by
# the start of the path or one of "-", "_", "/", and followed by "-", "_", "/", "." or the end.
# All three are case-insensitive.
_DECISION_RE = re.compile(r"(?i)(?:^|[-_/])(?:adr|decision|decisione|decisioni)(?:[-_/.]|$)")
_DESIGN_RE = re.compile(
    r"(?i)(?:^|[-_/])(?:disegno|design|piano|plan|roadmap|spec|blueprint|brief|"
    r"architettura|architecture)(?:[-_/.]|$)"
)
_REPORT_RE = re.compile(
    r"(?i)(?:^|[-_/])(?:report|rapporto|collaudo|diagnosi|revisione|readiness|analisi|analysis|audit|verifica|backtest|indagine|strumentazione)(?:[-_/.]|$)"
)
# A 20xx date not adjacent to other digits: year-month-day with optional "-"/"_" separators,
# or year-month with a mandatory separator.
_DATE_RE = re.compile(r"(?<!\d)(?:20\d{2}[-_]?\d{2}[-_]?\d{2}|20\d{2}[-_]\d{2})(?!\d)")
43
+
44
+
45
+ def _ext(name: str) -> str:
46
+ # os.path.splitext treats leading-dot names (".gitignore") as extensionless, which is
47
+ # what we want (a dotfile has no "extension").
48
+ return os.path.splitext(name)[1].lower()
49
+
50
+
51
def classify(rel_posix: str) -> NodeType:
    """Return the NodeType for a file given its POSIX relative path.

    Checks run from the most specific signal to the least; the first match wins:

    1. memory: a path component named ``memory`` or a file name starting with ``memory.``;
    2. data, program, then config, by extension or by well-known config file name;
    3. foundation documents by exact file name (``_STRUCTURE_NAMES``);
    4. for document-like or extensionless files: decision, design, then report
       keywords (a date in the file name also means report), else structure;
    5. anything else is treated as config.

    Matching on names and extensions is case-insensitive.
    """
    name = rel_posix.rsplit("/", 1)[-1]
    low = name.lower()
    ext = _ext(low)
    parts = [p.lower() for p in rel_posix.split("/")]

    # 1. Memory (path- or name-based). "memory.md" is already covered by the prefix test.
    if "memory" in parts or low.startswith("memory."):
        return NodeType.MEMORY

    # 2. Config / data / program by extension (strong signals)
    if ext in _DATA_EXTS:
        return NodeType.DATA
    if ext in _PROGRAM_EXTS:
        return NodeType.PROGRAM
    config_names = {".gitignore", ".secondbrainignore", "dockerfile", "caddyfile", "makefile"}
    # ``_ext`` reports "" for a bare dotfile, so a file literally named ".env" (or ".conf",
    # ...) can never match ``_CONFIG_EXTS`` through ``ext``. Compare the whole name as well,
    # otherwise such files fall through to the document branch and come out as STRUCTURE.
    if ext in _CONFIG_EXTS or low in _CONFIG_EXTS or low in config_names:
        return NodeType.CONFIG

    # 3. Foundation structure docs by name
    if low in _STRUCTURE_NAMES:
        return NodeType.STRUCTURE

    # 4. Document sub-typing by keyword / date (only for document-like files)
    if ext in _DOC_EXTS or ext == "":
        if _DECISION_RE.search(rel_posix):
            return NodeType.DECISION
        if _DESIGN_RE.search(rel_posix):
            return NodeType.DESIGN
        if _REPORT_RE.search(rel_posix) or _DATE_RE.search(name):
            return NodeType.REPORT
        # Fallback for loose documents: treat as project structure/knowledge.
        return NodeType.STRUCTURE

    # 5. Anything else (unknown extension) -> config-like by default.
    return NodeType.CONFIG
second_brain/cli.py ADDED
@@ -0,0 +1,185 @@
1
+ """Command-line interface: build, gate, view, stats, and the query commands.
2
+
3
+ Read-only on your sources: every command only reads the project and writes derived files
4
+ under ``.secondbrain/``.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import argparse
10
+ import sys
11
+
12
+ from second_brain import __version__, assess, gate, query, store
13
+ from second_brain.freshness import build_manifest, index
14
+ from second_brain.model import Graph
15
+ from second_brain.viewer import write_view
16
+
17
+
18
+ def _human(n: int) -> str:
19
+ f = float(n)
20
+ for unit in ("B", "KB", "MB", "GB", "TB"):
21
+ if f < 1024 or unit == "TB":
22
+ return f"{f:.0f} {unit}" if unit == "B" else f"{f:.1f} {unit}"
23
+ f /= 1024
24
+ return f"{f:.1f} TB"
25
+
26
+
27
def _build(path: str) -> tuple[Graph, dict[str, str]]:
    """Index ``path`` and return ``(graph, manifest)``."""
    # Single filesystem walk produces both the graph and the manifest.
    graph, manifest = index(path)
    return graph, manifest
30
+
31
+
32
def _load_or_build(path: str) -> Graph:
    """Return the stored graph for ``path``, or a freshly built one when the store yields a falsy result."""
    stored = store.load_graph(path)
    if stored:
        return stored
    graph, _manifest = _build(path)
    return graph
34
+
35
+
36
def cmd_build(args: argparse.Namespace) -> int:
    """``build``: index the project, persist graph and manifest, and print a summary.

    Returns 0.
    """
    graph, manifest = _build(args.path)
    store.save(args.path, graph, manifest)
    counts = graph.counts()
    print(f"built '{graph.project}': {len(graph.nodes)} nodes, {len(graph.edges)} edges")
    for kind in ("nodes", "edges"):
        print(f" {kind}:", counts[kind])
    print(f" store: {store.store_dir(args.path)}")
    return 0
45
+
46
+
47
def cmd_gate(args: argparse.Namespace) -> int:
    """``gate``: compare the stored graph and manifest against the current files.

    Returns 2 when no stored graph or manifest exists (message on stderr), 0 when the
    gate report is ok, and 1 otherwise.
    """
    graph = store.load_graph(args.path)
    previous = store.load_manifest(args.path)
    if graph is None or previous is None:
        print("no graph found \u2014 run 'second-brain build' first", file=sys.stderr)
        return 2
    report = gate.evaluate(graph, previous, build_manifest(args.path))
    print(report.summary())
    if report.ok:
        print("OK: fresh and clean")
        return 0
    print("DRIFT: rebuild and/or fix the issues above")
    return 1
58
+
59
+
60
# Node-count threshold: graphs bigger than this auto-render as a backbone to stay light,
# even when ``--backbone`` was not passed to ``view``.
_BACKBONE_AUTO = 8000
61
+
62
+
63
def cmd_view(args: argparse.Namespace) -> int:
    """``view``: build the graph and write the viewer file for it.

    The graph is reduced with ``query.backbone`` when ``--backbone`` is given or when it
    has more than ``_BACKBONE_AUTO`` nodes. Returns 0.
    """
    graph, _manifest = _build(args.path)
    total = len(graph.nodes)
    use_backbone = args.backbone or total > _BACKBONE_AUTO
    if use_backbone:
        graph = query.backbone(graph)
        shown = len(graph.nodes)
        print(f"backbone: rendering {shown} of {total} nodes "
              "(isolated data files summarized on their area)")
    destination = write_view(args.path, graph)
    print(f"view written: {destination}")
    print("open it in a browser (double-click).")
    return 0
74
+
75
+
76
def cmd_stats(args: argparse.Namespace) -> int:
    """``stats``: build the graph and print node and edge counts per type, sorted by type name.

    Returns 0.
    """
    graph, _manifest = _build(args.path)
    counts = graph.counts()
    print(f"'{graph.project}': {len(graph.nodes)} nodes, {len(graph.edges)} edges")
    node_counts = sorted(counts["nodes"].items())
    for kind, total in node_counts:
        print(f" {kind:10} {total}")
    edge_counts = sorted(counts["edges"].items())
    for kind, total in edge_counts:
        print(f" -{kind:9} {total}")
    return 0
85
+
86
+
87
def cmd_map(args: argparse.Namespace) -> int:
    """``map``: print the compact project digest (totals, areas, most connected, problems).

    Uses the stored graph when available, otherwise builds one. Returns 0.
    """
    digest = query.project_map(_load_or_build(args.path))
    total_size = _human(digest["size"])
    print(f"{digest['project']}: {digest['files']} files \u00b7 {digest['areas']} areas \u00b7 "
          f"{digest['links']} links \u00b7 {total_size}")
    print("by area:")
    for area in digest["by_area"]:
        type_list = ", ".join(area["top_types"])
        area_size = _human(area["size"])
        print(f" {area['area']:16} {area['files']:4} files {area_size:>9} [{type_list}]")
    print("most connected:")
    for hub in digest["most_connected"]:
        print(f" {hub['degree']:3} {hub['id']} ({hub['type']})")
    print(f"orphans: {digest['orphans']} \u00b7 broken refs: {digest['broken_refs']}")
    return 0
100
+
101
+
102
def cmd_find(args: argparse.Namespace) -> int:
    """``find``: print every node matching ``args.query``, then the match count. Returns 0."""
    graph = _load_or_build(args.path)
    matches = query.find(graph, args.query)
    for hit in matches:
        print(f" {hit['type']:9} {hit['id']}")
    print(f"({len(matches)} matches)")
    return 0
108
+
109
+
110
def cmd_neighbors(args: argparse.Namespace) -> int:
    """``neighbors``: print one node with its description, broken refs and connections.

    Returns 1 (message on stderr) when the node is unknown, 0 otherwise.
    """
    info = query.neighbors(_load_or_build(args.path), args.node)
    if info is None:
        print(f"node not found: {args.node}", file=sys.stderr)
        return 1
    print(f"{info['id']} ({info['type']}) \u00b7 {_human(info['size'])}")
    if info["description"]:
        print(f" {info['description']}")
    if info["broken_refs"]:
        print(f" broken refs: {info['broken_refs']}")
    outgoing = info["outgoing"]
    print(f" outgoing ({len(outgoing)}):")
    for link in outgoing:
        print(f" -{link['edge']}-> {link['id']} ({link['type']})")
    incoming = info["incoming"]
    print(f" incoming ({len(incoming)}):")
    for link in incoming:
        print(f" <-{link['edge']}- {link['id']} ({link['type']})")
    return 0
127
+
128
+
129
def cmd_assess(args: argparse.Namespace) -> int:
    """``assess``: run the assessment, write ``assessment.md`` into the store, print a summary.

    The report is written as UTF-8 with ``\\n`` line endings on every platform. Returns 0.
    """
    report = assess.assess(args.path)
    out_dir = store.store_dir(args.path)
    out_dir.mkdir(parents=True, exist_ok=True)
    report_path = out_dir / "assessment.md"
    # Render before opening so a rendering failure never leaves a truncated report behind.
    markdown = assess.render_markdown(report)
    with report_path.open("w", encoding="utf-8", newline="\n") as fh:
        fh.write(markdown)
    print(f"{report['project']}: {report['files']} files, {report['areas']} areas, "
          f"{report['links']} links")
    print(f" hidden: {report['truncated']} truncated, {report['empty']} empty, "
          f"{report['orphans']} orphans (~{report['orphans_pct']}%), "
          f"{report['broken_refs']} broken, "
          f"{report['decisions']} decisions")
    print(f" orient an assistant: ~{report['tokens_read_all_files']} tokens to read all files "
          f"(~{report['tokens_read_all_docs']} for docs) -> ~{report['tokens_digest']} with SB")
    print(f" report: {report_path}")
    return 0
143
+
144
+
145
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` (``sys.argv[1:]`` when None), run the chosen sub-command, return its exit code.

    A sub-command is required. Every sub-command accepts an optional ``path`` positional
    (default ``.``); ``find`` and ``neighbors`` take their own positional first, and
    ``view`` additionally accepts ``--backbone``.
    """
    parser = argparse.ArgumentParser(
        prog="second-brain",
        description="Second Brain \u2014 a living, low-token map of a project.",
    )
    parser.add_argument("--version", action="version", version=f"second-brain {__version__}")
    commands = parser.add_subparsers(dest="cmd", required=True)

    def _add_path(sub_parser: argparse.ArgumentParser) -> None:
        # Shared optional positional: the project root.
        sub_parser.add_argument("path", nargs="?", default=".", help="project root (default: .)")

    # Commands whose only argument is the project path.
    path_only = (
        ("build", cmd_build, "index the project -> .secondbrain/graph.json"),
        ("gate", cmd_gate, "anti-drift check (broken refs, stale files, orphans)"),
        ("stats", cmd_stats, "quick counts by node/edge type"),
        ("map", cmd_map, "compact project digest (areas, sizes, most connected)"),
        ("assess", cmd_assess, "one-shot before/after report: problems + token savings"),
    )
    for name, handler, summary in path_only:
        simple = commands.add_parser(name, help=summary)
        _add_path(simple)
        simple.set_defaults(func=handler)

    view = commands.add_parser(
        "view", help="write a self-contained 3D viewer -> .secondbrain/view.html"
    )
    _add_path(view)
    view.add_argument(
        "--backbone",
        action="store_true",
        help="render only areas + knowledge-connected files (auto for huge graphs)",
    )
    view.set_defaults(func=cmd_view)

    find = commands.add_parser("find", help="find nodes by name or path substring")
    find.add_argument("query", help="substring to search for")
    _add_path(find)
    find.set_defaults(func=cmd_find)

    neighbors = commands.add_parser("neighbors", help="show a node and its connections")
    neighbors.add_argument("node", help="node id (relative path)")
    _add_path(neighbors)
    neighbors.set_defaults(func=cmd_neighbors)

    parsed = parser.parse_args(argv)
    exit_code = parsed.func(parsed)
    return int(exit_code)
182
+
183
+
184
# Allow running this module directly as a script; exit with the CLI's return code.
if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())
@@ -0,0 +1,121 @@
1
+ """Content-hash freshness + the single-walk ``index`` entry point.
2
+
3
+ Hashing uses the standard-library BLAKE2b (fast, no dependency). For text files the hash is
4
+ computed on newline-normalized bytes so a CRLF<->LF flip (e.g. a git checkout on Windows)
5
+ does not look like a content change. The manifest is a small ``{relative_path: hash}`` map
6
+ stored next to the graph; diffing it against the current files tells the gate whether the
7
+ brain is still in sync with the project.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import hashlib
13
+ import os
14
+ from pathlib import Path
15
+
16
+ from second_brain.ignore import load_ignore_patterns
17
+ from second_brain.indexer import build_graph, iter_files
18
+ from second_brain.model import Graph
19
+
20
# Read size, in bytes, used when a file is streamed into the hash.
_CHUNK = 65536
# Above this size a text file is hashed raw instead of normalized: it bounds memory, and a
# CRLF flip on a huge file then reads as "changed" - a SAFE false positive for the gate
# (it never hides a real change), unlike loading a multi-hundred-MB file into RAM.
_NORMALIZE_CAP = 8_000_000

# Files hashed with normalized line endings (CRLF/CR -> LF) to avoid cross-platform churn.
_TEXT_HASH_EXTS = {
    ".md", ".markdown", ".rst", ".txt", ".py", ".js", ".ts", ".tsx", ".jsx", ".mjs", ".cjs",
    ".toml", ".ini", ".cfg", ".conf", ".yaml", ".yml", ".html",
    ".htm", ".css", ".sql", ".ps1", ".psm1", ".sh", ".bash", ".go", ".rs", ".java", ".c",
    ".cc", ".cpp", ".h", ".hpp", ".rb", ".php", ".cs",
}
# Content-hash text source/docs up to this size; data, binaries and larger files use a cheap
# size+mtime signature instead (no bytes read) so indexing stays light on data-heavy projects.
# This cap is below ``_NORMALIZE_CAP``, so every content-hashed file is also normalized.
_CONTENT_HASH_CAP = 1_000_000
36
+
37
+
38
+ def _ext(name: str) -> str:
39
+ return os.path.splitext(name)[1].lower()
40
+
41
+
42
def file_hash(path: Path, *, normalize_newlines: bool = False) -> str:
    """Return the 16-byte BLAKE2b digest of a file as a hex string.

    With ``normalize_newlines`` and a file no larger than ``_NORMALIZE_CAP``, the whole file
    is read and CRLF/CR are collapsed to LF before hashing (reading it whole keeps the
    replacement correct across chunk boundaries). In every other case the raw bytes are
    streamed through the hash in ``_CHUNK``-sized reads.

    Raises ``OSError`` if the file cannot be stat'ed or read.
    """
    digest = hashlib.blake2b(digest_size=16)
    normalize = normalize_newlines and path.stat().st_size <= _NORMALIZE_CAP
    if normalize:
        raw = path.read_bytes()
        unified = raw.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
        digest.update(unified)
        return digest.hexdigest()
    with open(path, "rb") as fh:
        while True:
            block = fh.read(_CHUNK)
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()
57
+
58
+
59
def _hash_rel(root: Path, rel: str) -> str | None:
    """Return the freshness signature of ``root / rel``, or None when the file is unreadable.

    A file whose extension is in ``_TEXT_HASH_EXTS`` and whose size is at most
    ``_CONTENT_HASH_CAP`` gets a newline-normalized content hash. Everything else gets a
    ``s<size>:m<mtime-in-whole-seconds>`` signature computed from ``stat`` alone, without
    reading the file.
    """
    target = root / rel
    try:
        info = target.stat()
    except OSError:
        return None
    small_text = _ext(rel) in _TEXT_HASH_EXTS and info.st_size <= _CONTENT_HASH_CAP
    if not small_text:
        return f"s{info.st_size}:m{int(info.st_mtime)}"
    try:
        return file_hash(target, normalize_newlines=True)
    except OSError:
        return None
77
+
78
+
79
def build_manifest(root: str | os.PathLike[str]) -> dict[str, str]:
    """Map every indexable file under ``root`` to its freshness signature.

    Files are enumerated with ``iter_files`` using the project's ignore patterns; files for
    which ``_hash_rel`` returns None (unreadable) are left out of the result.
    """
    base = Path(root).resolve()
    manifest: dict[str, str] = {}
    for rel in iter_files(base, load_ignore_patterns(base)):
        signature = _hash_rel(base, rel)
        if signature is None:
            continue
        manifest[rel] = signature
    return manifest
89
+
90
+
91
def index(
    root: str | os.PathLike[str], *, operational: bool = True
) -> tuple[Graph, dict[str, str]]:
    """Build the graph and the manifest from one directory walk.

    The filesystem is enumerated once (``iter_files``); file *contents* are still read again to
    hash them for the manifest, so this is not zero double-I/O - just a single directory listing
    shared by graph build and manifest.

    Args:
        root: project root; resolved to an absolute path before use.
        operational: when True (the default) the graph is passed through
            ``second_brain.operational.enrich`` after it is built.

    Returns:
        ``(graph, manifest)`` where the manifest maps each relative path to its freshness
        signature; files whose signature cannot be computed are omitted.
    """
    root_p = Path(root).resolve()
    # The listing is consumed twice (graph build, then manifest). Materialize it so the
    # second pass still sees every file even if ``iter_files`` yields lazily; a one-shot
    # iterator would otherwise leave the manifest silently empty.
    rels = list(iter_files(root_p, load_ignore_patterns(root_p)))
    graph = build_graph(root_p, _rels=rels)
    if operational:
        # Imported here rather than at module top (presumably to avoid an import cycle or
        # to keep the dependency optional - confirm).
        from second_brain.operational import enrich
        enrich(graph, root_p)
    manifest: dict[str, str] = {}
    for rel in rels:
        hv = _hash_rel(root_p, rel)
        if hv is not None:
            manifest[rel] = hv
    return graph, manifest
112
+
113
+
114
def diff_manifest(old: dict[str, str], new: dict[str, str]) -> dict[str, list[str]]:
    """Compare two manifests and return sorted ``added``, ``removed`` and ``changed`` path lists.

    ``added`` holds paths only in ``new``, ``removed`` paths only in ``old``, and ``changed``
    paths present in both whose signatures differ.
    """
    before, after = set(old), set(new)
    added = sorted(after - before)
    removed = sorted(before - after)
    changed = sorted(path for path, sig in new.items() if path in old and old[path] != sig)
    return {"added": added, "removed": removed, "changed": changed}