veridge 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
veridge/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ """Veridge — the always-fresh, low-token map of a whole project.
2
+
3
+ Veridge indexes a project read-only into a typed graph that unifies four layers most
4
+ tools keep apart:
5
+
6
+ * **documents** (with references — including plain path mentions written in prose),
7
+ * **code** down to the *symbol* (functions/classes), via pluggable parsers,
8
+ * **decisions** (ADR / RFC / D-XXX ids found in docs), and
9
+ * **sessions** (git commits and the files they touched).
10
+
11
+ It then *ranks* the graph with PageRank, so queries can return the **minimal relevant
12
+ subgraph within a token budget** — the cheap, accurate context an AI assistant needs to
13
+ orient itself, and a map a human can read. The core has zero runtime dependencies.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
# Package version string; the CLI's ``--version`` flag prints it.
__version__ = "0.1.0"

# Names exported by ``from veridge import *`` — only the version string.
__all__ = ["__version__"]
veridge/__main__.py ADDED
@@ -0,0 +1,8 @@
1
+ """Enable ``python -m veridge`` as an alias for the ``veridge`` CLI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from veridge.cli import main
6
+
7
if __name__ == "__main__":
    # Run the CLI and hand its integer status back to the interpreter as the exit code.
    exit_status = main()
    raise SystemExit(exit_status)
veridge/budget.py ADDED
@@ -0,0 +1,59 @@
1
+ """Token budgeting: turn a ranking into the *most* context that fits a token ceiling.
2
+
3
+ The point of Veridge is to hand an assistant the **minimal relevant slice** of a project,
4
+ not the whole thing. Given a ranked list of node ids and a token budget, we greedily admit
5
+ the highest-ranked nodes whose compact rows still fit. Token cost uses the standard ~4
6
+ chars/token heuristic on the *compact row* an assistant would actually read — ids, kinds,
7
+ sizes and edge counts, never file contents.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Any
13
+
14
+ from veridge.model import Graph, Kind
15
+
16
+
17
def estimate_tokens(text: str) -> int:
    """Estimate how many tokens *text* costs, at roughly four characters per token.

    The quotient is rounded with the built-in ``round`` and never drops below 1, so even
    an empty string is charged a single token.
    """
    approx = round(len(text) / 4)
    return approx if approx > 1 else 1
19
+
20
+
21
def node_row(graph: Graph, nid: str) -> dict[str, Any]:
    """Build the compact, contents-free row describing node *nid*.

    The row always carries ``id``, ``kind`` and ``deg`` (from ``graph.degree``). It gains
    ``cat`` when the node has a category, ``size`` for file nodes (0 when the metadata has
    no size), and ``line`` for symbol nodes whose metadata holds a truthy line.

    Raises ``KeyError`` if *nid* is not in ``graph.nodes``.
    """
    node = graph.nodes[nid]
    kind = node.kind
    row: dict[str, Any] = {"id": node.id, "kind": kind.value}

    category = node.category
    if category:
        row["cat"] = category.value

    if kind is Kind.FILE:
        row["size"] = int(node.meta.get("size", 0))
    if kind is Kind.SYMBOL:
        line = node.meta.get("line")
        if line:
            row["line"] = line

    # Key order matters: it is the order in which the row is serialized for costing.
    row["deg"] = graph.degree(nid)
    return row
33
+
34
+
35
def _row_cost(row: dict[str, Any]) -> int:
    """Return the estimated token cost of *row* rendered as ``key=value`` pairs."""
    # Comma-joined ``k=v`` pairs approximate the serialized footprint the assistant pays for.
    pairs = [f"{key}={value}" for key, value in row.items()]
    serialized = ",".join(pairs)
    return estimate_tokens(serialized)
38
+
39
+
40
def select_within_budget(
    graph: Graph, ranked_ids: list[str], budget_tokens: int,
) -> tuple[list[dict[str, Any]], int]:
    """Take ranked nodes, best first, until the next row would overflow the budget.

    Ids that are not in ``graph.nodes`` are skipped. Selection stops at the first row that
    does not fit; later (cheaper) rows are not considered. The very first row is admitted
    unconditionally, so the result is non-empty whenever any ranked id exists in the graph.

    Returns ``(rows, used_tokens)``.
    """
    selected: list[dict[str, Any]] = []
    spent = 0
    for node_id in ranked_ids:
        if node_id in graph.nodes:
            candidate = node_row(graph, node_id)
            price = _row_cost(candidate)
            # Only enforce the ceiling once something has already been admitted.
            if selected and spent + price > budget_tokens:
                break
            selected.append(candidate)
            spent += price
    return selected, spent
veridge/classify.py ADDED
@@ -0,0 +1,63 @@
1
+ """Classify a file into a (Kind.FILE, Category) from its path — generic and language-agnostic.
2
+
3
+ Unlike a project-specific taxonomy, the rules here key on widely shared conventions
4
+ (extensions, well-known names) so they work out of the box on any repository. They are
5
+ intentionally simple and tunable.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+
12
+ from veridge.model import Category
13
+
14
# Extension tables used by ``classify``; every entry is lower-case because the file name
# is lower-cased before lookup.
_CODE_EXTS = {
    # Python and the JavaScript/TypeScript family
    ".py", ".js", ".jsx", ".mjs", ".cjs", ".ts", ".tsx",
    # compiled languages
    ".c", ".h", ".cc", ".cpp", ".hpp", ".go", ".rs", ".java", ".kt",
    ".cs", ".swift", ".scala",
    # scripting, shells and SQL
    ".rb", ".php", ".lua", ".sh", ".bash", ".ps1", ".psm1", ".sql",
}
_DATA_EXTS = {
    ".csv", ".tsv", ".jsonl", ".ndjson", ".parquet",
    ".db", ".sqlite", ".sqlite3", ".duckdb",
}
_CONFIG_EXTS = {
    ".ini", ".cfg", ".conf", ".env", ".properties",
    ".toml", ".yaml", ".yml", ".json", ".xml",
}
_DOC_EXTS = {
    ".md", ".markdown", ".rst", ".adoc", ".txt",
    ".html", ".htm", ".pdf", ".docx",
}

# Whole (lower-cased) file names recognised regardless of extension.
_STRUCTURE_NAMES = {
    "readme", "readme.md", "readme.rst",
    "index.md", "architecture.md", "changelog.md", "contributing.md",
    "license", "license.md", "authors", "notice",
}
_CONFIG_NAMES = {
    "dockerfile", "makefile", "caddyfile",
    ".gitignore", ".veridgeignore", ".editorconfig",
}
38
+
39
+
40
def _ext(name: str) -> str:
    """Return the lower-cased final extension of *name* (``""`` when it has none)."""
    _stem, suffix = os.path.splitext(name)
    return suffix.lower()
42
+
43
+
44
def classify(rel_posix: str) -> Category:
    """Return the Category for a file given its POSIX relative path.

    Rules are checked in order and the first match wins:

    1. any path component equal to ``memory`` or a file name starting with ``memory.``
       gives MEMORY;
    2. a data extension gives DATA;
    3. a code extension gives CODE;
    4. a config extension, a well-known config file name, or a bare dotfile whose whole
       name is a config extension (e.g. ``.env``) gives CONFIG;
    5. a well-known structure file name gives STRUCTURE;
    6. a doc extension, or no extension at all, gives DOC;
    7. anything else gives CONFIG.

    All comparisons are case-insensitive.
    """
    name = rel_posix.rsplit("/", 1)[-1]
    low = name.lower()
    ext = _ext(low)
    parts = [p.lower() for p in rel_posix.split("/")]

    if "memory" in parts or low.startswith("memory."):
        return Category.MEMORY
    if ext in _DATA_EXTS:
        return Category.DATA
    if ext in _CODE_EXTS:
        return Category.CODE
    # os.path.splitext treats a leading dot as part of the root, so a bare dotfile such as
    # ".env" has ext == "" and would otherwise fall through to the "no extension => DOC"
    # rule. Match the whole name against the config extensions in that case.
    bare_config_dotfile = ext == "" and low in _CONFIG_EXTS
    if ext in _CONFIG_EXTS or low in _CONFIG_NAMES or bare_config_dotfile:
        return Category.CONFIG
    if low in _STRUCTURE_NAMES:
        return Category.STRUCTURE
    if ext in _DOC_EXTS or ext == "":
        return Category.DOC
    return Category.CONFIG
veridge/cli.py ADDED
@@ -0,0 +1,276 @@
1
+ """Command line: build, stats, gate, map, find, neighbors, focus, impact, why, tour.
2
+
3
+ Read-only on your sources: commands only read the project and write derived files under
4
+ ``.veridge/``.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import argparse
10
+ import json
11
+ import sys
12
+
13
+ from veridge import __version__, query, store
14
+ from veridge.freshness import build_manifest, evaluate, index
15
+ from veridge.model import Graph
16
+
17
+
18
def _human(n: int) -> str:
    """Format a byte count with binary (1024-based) units from B up to TB.

    Plain bytes are shown without decimals; every larger unit gets one decimal place.
    Values beyond the TB range stay expressed in TB.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    value = float(n)
    idx = 0
    # Step up one unit at a time, stopping at the largest unit available.
    while value >= 1024 and idx < len(units) - 1:
        value /= 1024
        idx += 1
    if idx == 0:
        return f"{value:.0f} B"
    return f"{value:.1f} {units[idx]}"
25
+
26
+
27
def _load_or_build(path: str) -> Graph:
    """Return the stored graph for *path*, indexing the project when none is usable.

    A falsy result from ``store.load_graph`` triggers a fresh ``index``; the freshly built
    graph is returned without being saved here.
    """
    stored = store.load_graph(path)
    if stored:
        return stored
    return index(path)[0]
29
+
30
+
31
def cmd_build(args: argparse.Namespace) -> int:
    """``veridge build``: index ``args.path``, save graph and manifest, print a summary.

    Always returns 0.
    """
    project_root = args.path
    graph, manifest = index(project_root)
    store.save(project_root, graph, manifest)

    totals = graph.counts()
    print(f"built '{graph.project}': {len(graph.nodes)} nodes, {len(graph.edges)} edges")
    print(" nodes:", totals["nodes"])
    print(" edges:", totals["edges"])
    print(f" store: {store.store_dir(project_root)}")
    return 0
40
+
41
+
42
def cmd_stats(args: argparse.Namespace) -> int:
    """``veridge stats``: print node and edge counts per type, sorted by type name.

    Always returns 0.
    """
    graph = _load_or_build(args.path)
    totals = graph.counts()
    print(f"'{graph.project}': {len(graph.nodes)} nodes, {len(graph.edges)} edges")
    for node_type, count in sorted(totals["nodes"].items()):
        print(f" {node_type:9} {count}")
    # Edge types carry a leading dash to set them apart from node types.
    for edge_type, count in sorted(totals["edges"].items()):
        print(f" -{edge_type:8} {count}")
    return 0
51
+
52
+
53
def cmd_map(args: argparse.Namespace) -> int:
    """``veridge map``: print the project digest, as JSON (``--json``) or as text.

    The text form lists totals, a per-area and per-layer breakdown, the highest-ranked
    nodes, and the orphan / broken-reference counts. Always returns 0.
    """
    digest = query.project_map(_load_or_build(args.path))
    if args.json:
        print(json.dumps(digest, ensure_ascii=False, indent=2))
        return 0

    total_size = _human(digest["size"])
    print(f"{digest['project']}: {digest['files']} files · {digest['symbols']} symbols · "
          f"{digest['areas']} areas · {digest['edges']} edges · {total_size}")

    print("by area:")
    for area in digest["by_area"]:
        size = _human(area["size"])
        cats = ", ".join(area["top_cats"])
        print(f" {area['area']:16} {area['files']:4} files {size:>9} [{cats}]")

    print("by layer:")
    for layer in digest["by_layer"]:
        size = _human(layer["size"])
        print(f" {layer['layer']:11} {layer['files']:4} files {size:>9}")

    print("most important (PageRank):")
    for entry in digest["most_important"]:
        print(f" {entry['score']:.4f} {entry['id']} ({entry['kind']})")

    print(f"orphans: {digest['orphans']} · broken refs: {digest['broken_refs']}")
    return 0
72
+
73
+
74
def cmd_find(args: argparse.Namespace) -> int:
    """``veridge find``: print every node ``query.find`` returns, then the match count.

    Always returns 0, even when nothing matched.
    """
    graph = _load_or_build(args.path)
    matches = query.find(graph, args.query)
    for match in matches:
        print(f" {match['kind']:8} {match['id']}")
    print(f"({len(matches)} matches)")
    return 0
80
+
81
+
82
def cmd_neighbors(args: argparse.Namespace) -> int:
    """``veridge neighbors``: print one node with its outgoing and incoming edges.

    Returns 1 (with a message on stderr) when the node is unknown, otherwise 0.
    """
    info = query.neighbors(_load_or_build(args.path), args.node)
    if info is None:
        print(f"node not found: {args.node}", file=sys.stderr)
        return 1

    print(f"{info['id']} ({info['kind']}) · {_human(info['size'])}")
    # Description and broken references are only shown when present.
    if info["description"]:
        print(f" {info['description']}")
    if info["broken_refs"]:
        print(f" broken refs: {info['broken_refs']}")

    print(f" outgoing ({len(info['outgoing'])}):")
    for link in info["outgoing"]:
        print(f" -{link['edge']}-> {link['id']} ({link['kind']})")

    print(f" incoming ({len(info['incoming'])}):")
    for link in info["incoming"]:
        print(f" <-{link['edge']}- {link['id']} ({link['kind']})")
    return 0
99
+
100
+
101
def cmd_focus(args: argparse.Namespace) -> int:
    """``veridge focus``: print the budgeted subgraph ``query.focus`` returns for a query.

    With ``--json`` the raw result is dumped. Otherwise a header, the seeds and one line
    per node are printed, or the result's note (default "no matches") when no node came
    back. Always returns 0.
    """
    graph = _load_or_build(args.path)
    result = query.focus(graph, args.query, budget_tokens=args.budget)
    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return 0

    nodes = result["nodes"]
    if not nodes:
        print(result.get("note", "no matches"))
        return 0

    print(f"focus '{result['query']}' · {len(nodes)} nodes · "
          f"~{result['used_tokens']}/{result['budget_tokens']} tokens")
    print(f" seeds: {', '.join(result['seeds'])}")
    for row in nodes:
        score = row.get("score", 0)
        # The category suffix is optional; rows without one print nothing extra.
        extra = f" ({row['cat']})" if row.get("cat") else ""
        print(f" {score:.4f} {row['id']}{extra} [{row['kind']}, deg {row['deg']}]")
    return 0
116
+
117
+
118
def cmd_impact(args: argparse.Namespace) -> int:
    """``veridge impact``: print what a seed (or the current git diff) affects.

    Seeds come either from ``args.seed`` or, with ``--diff``, from ``git_changed_files``.
    ``--deps`` asks for the seed's dependencies instead of its dependents.

    Returns 2 when neither a seed nor ``--diff`` was given; otherwise 0, including the case
    where ``--diff`` found no changed files.
    """
    if not args.diff and not args.seed:
        print("provide a seed (file/symbol) or use --diff", file=sys.stderr)
        return 2

    seed_ids = None
    query_str = args.seed or ""
    proj = args.path
    if args.diff:
        # In --diff mode there is no seed, so a lone positional is the project path.
        if args.seed and args.path == ".":
            proj = args.seed
        from veridge.sessions import git_changed_files
        seed_ids = git_changed_files(proj)
        query_str = "git diff (HEAD)"
        if not seed_ids:
            print("no changed files vs HEAD (or not a git repository)")
            return 0

    graph = _load_or_build(proj)
    if args.deps:
        direction, verb = "dependencies", "depends on"
    else:
        direction, verb = "dependents", "affected by"
    result = query.impact(graph, query_str, seed_ids=seed_ids, budget_tokens=args.budget,
                          hops=args.hops, direction=direction)
    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return 0

    print(f"impact ({direction}) of '{result['query']}' · {result['total_affected']} {verb}")
    if result["seeds"]:
        print(f" seeds: {', '.join(result['seeds'])}")
    nodes = result["nodes"]
    if not nodes:
        print(f" {result.get('note', 'nothing found')}")
        return 0

    print(f" showing {len(nodes)} · ~{result['used_tokens']}/{result['budget_tokens']} tokens")
    for row in nodes:
        extra = f" ({row['cat']})" if row.get("cat") else ""
        score = row.get("score", 0)
        dist = row.get("dist", "?")
        print(f" {score:.4f} d{dist} {row['id']}{extra} [{row['kind']}]")
    return 0
154
+
155
+
156
def cmd_why(args: argparse.Namespace) -> int:
    """``veridge why``: print the path ``query.why`` found between two nodes.

    With ``--json`` the raw result is dumped and 0 is returned. In text mode, returns 1
    (after printing the result's note, default "no path") when no path was found, and 0
    after printing the path otherwise.
    """
    result = query.why(_load_or_build(args.path), args.a, args.b)
    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return 0
    if not result["found"]:
        print(result.get("note", "no path"))
        return 1

    first, *rest = result["path"]
    print(f"why: {result['a']} -> {result['b']} · {result['length']} hops")
    print(f" {first['id']} ({first['kind']})")
    for step in rest:
        # Draw the arrow in the direction the edge actually points.
        if step["dir"] == "->":
            connector = f"--{step['edge']}-->"
        else:
            connector = f"<--{step['edge']}--"
        print(f" {connector} {step['id']} ({step['kind']})")
    return 0
171
+
172
+
173
def cmd_tour(args: argparse.Namespace) -> int:
    """``veridge tour``: print the reading tour ``query.tour`` builds within the budget.

    With ``--json`` the raw result is dumped; otherwise each stop is printed with the
    files it uses and the files that use it, when those lists are non-empty. Always
    returns 0.
    """
    graph = _load_or_build(args.path)
    result = query.tour(graph, budget_tokens=args.budget)
    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return 0

    stops = result["stops"]
    print(f"tour of '{result['project']}' · {len(stops)}/{result['total_files']} stops · "
          f"~{result['used_tokens']}/{result['budget_tokens']} tokens")
    print("(read top to bottom: dependencies before the files that use them)")
    for stop in stops:
        print(f" {stop['step']:2}. {stop['id']} [{stop['layer']}]")
        uses = stop["uses"]
        if uses:
            print(f" uses: {', '.join(uses)}")
        used_by = stop["used_by"]
        if used_by:
            print(f" used by: {', '.join(used_by)}")
    return 0
188
+
189
+
190
def cmd_gate(args: argparse.Namespace) -> int:
    """``veridge gate``: compare the stored graph and manifest against the current tree.

    Returns 2 when no stored graph or manifest exists, 1 when the report is not ok,
    and 0 when it is. The gate never builds a graph itself.
    """
    graph = store.load_graph(args.path)
    stored_manifest = store.load_manifest(args.path)
    if graph is None or stored_manifest is None:
        print("no graph found — run 'veridge build' first", file=sys.stderr)
        return 2

    report = evaluate(graph, stored_manifest, build_manifest(args.path))
    print(report.summary())
    if report.ok:
        print("OK: fresh and clean")
        return 0
    print("DRIFT: rebuild and/or fix the issues above")
    return 1
200
+
201
+
202
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and run the selected subcommand, returning its exit status.

    ``argv`` is passed straight to ``parse_args``, which falls back to the process
    arguments when it is ``None``. A subcommand is required; argparse exits on usage errors.
    """
    # Print UTF-8 regardless of the console's locale (Windows defaults to cp1252, which
    # would mangle the '·' separators). Best-effort: ignore if the stream can't reconfigure.
    for stream in (sys.stdout, sys.stderr):
        try:
            stream.reconfigure(encoding="utf-8")  # type: ignore[union-attr]
        except (AttributeError, ValueError):
            continue

    # Small helpers for the arguments several subcommands share. Arguments are still
    # registered in the same order per subcommand, so parsing and help output are unchanged.
    def add_path(target: argparse.ArgumentParser) -> None:
        target.add_argument("path", nargs="?", default=".", help="project root (default: .)")

    def add_json(target: argparse.ArgumentParser) -> None:
        target.add_argument("--json", action="store_true", help="emit JSON")

    def add_budget(target: argparse.ArgumentParser, default: int) -> None:
        target.add_argument("--budget", type=int, default=default,
                            help=f"token budget (default: {default})")

    parser = argparse.ArgumentParser(
        prog="veridge", description="Veridge — the always-fresh, low-token map of a project.")
    parser.add_argument("--version", action="version", version=f"veridge {__version__}")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # Subcommands that take nothing but the optional project path.
    path_only = (
        ("build", cmd_build, "index the project -> .veridge/graph.json"),
        ("stats", cmd_stats, "counts by node/edge type"),
        ("gate", cmd_gate, "anti-drift check (broken refs, stale files, orphans)"),
    )
    for name, handler, summary in path_only:
        sp = commands.add_parser(name, help=summary)
        add_path(sp)
        sp.set_defaults(func=handler)

    sp = commands.add_parser("map", help="compact project digest (PageRank-ranked)")
    add_path(sp)
    add_json(sp)
    sp.set_defaults(func=cmd_map)

    sp = commands.add_parser("find", help="find nodes by name/path substring")
    sp.add_argument("query", help="substring to search for")
    add_path(sp)
    sp.set_defaults(func=cmd_find)

    sp = commands.add_parser("neighbors", help="a node and its connections")
    sp.add_argument("node", help="node id")
    add_path(sp)
    sp.set_defaults(func=cmd_neighbors)

    sp = commands.add_parser(
        "focus", help="minimal relevant subgraph for a task, within a budget")
    sp.add_argument("query", help="a task description, a file path, or a symbol name")
    add_path(sp)
    add_budget(sp, 1500)
    add_json(sp)
    sp.set_defaults(func=cmd_focus)

    sp = commands.add_parser(
        "impact", help="blast-radius: what a change to a file/symbol affects")
    sp.add_argument("seed", nargs="?", help="a file path or symbol name (omit with --diff)")
    add_path(sp)
    sp.add_argument("--diff", action="store_true", help="seed from files changed vs git HEAD")
    sp.add_argument("--deps", action="store_true",
                    help="invert: what the seed depends ON, not what depends on it")
    sp.add_argument("--hops", type=int, default=None, help="max propagation distance")
    add_budget(sp, 1500)
    add_json(sp)
    sp.set_defaults(func=cmd_impact)

    sp = commands.add_parser("why", help="shortest typed path between two nodes")
    sp.add_argument("a", help="first node (id, path, or name)")
    sp.add_argument("b", help="second node (id, path, or name)")
    add_path(sp)
    add_json(sp)
    sp.set_defaults(func=cmd_why)

    sp = commands.add_parser("tour", help="dependency-ordered reading tour of the key files")
    add_path(sp)
    add_budget(sp, 2000)
    add_json(sp)
    sp.set_defaults(func=cmd_tour)

    namespace = parser.parse_args(argv)
    return int(namespace.func(namespace))
273
+
274
+
275
if __name__ == "__main__":  # pragma: no cover
    # Direct execution of this module behaves like the ``veridge`` console command.
    exit_status = main()
    raise SystemExit(exit_status)
veridge/freshness.py ADDED
@@ -0,0 +1,145 @@
1
+ """Content-hash freshness, the single-walk ``index`` entry point, and the anti-drift gate.
2
+
3
+ Text source/docs up to a cap are content-hashed with newline normalisation (precise and
4
+ stable across a CRLF/LF flip); data, binaries and large files use a cheap ``size+mtime``
5
+ signature so indexing stays light on data-heavy projects. The manifest is a small
6
+ ``{path: hash}`` map; diffing it against the current tree tells the gate whether the map is
7
+ still in sync.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import hashlib
13
+ import os
14
+ from dataclasses import dataclass, field
15
+ from pathlib import Path
16
+
17
+ from veridge.indexer import build_graph
18
+ from veridge.model import EdgeType, Graph, Kind
19
+ from veridge.walk import iter_files
20
+
21
# Read size (bytes) used when a file is hashed as a stream.
_CHUNK = 64 * 1024
# Largest file (bytes) that is read whole for newline-normalised hashing.
_NORMALIZE_CAP = 8 * 10**6
# Largest text file (bytes) that is content-hashed; bigger ones get a size+mtime signature.
_CONTENT_HASH_CAP = 10**6
# Extensions (lower-case) treated as text and therefore eligible for content hashing.
_TEXT_HASH_EXTS = {
    # documents and web assets
    ".md", ".markdown", ".rst", ".txt", ".adoc", ".html", ".htm", ".css",
    # configuration
    ".toml", ".ini", ".cfg", ".conf", ".yaml", ".yml",
    # Python and the JavaScript/TypeScript family
    ".py", ".js", ".jsx", ".mjs", ".cjs", ".ts", ".tsx",
    # shells and SQL
    ".sql", ".ps1", ".psm1", ".sh", ".bash",
    # other programming languages
    ".go", ".rs", ".java", ".c", ".cc", ".cpp", ".h", ".hpp", ".rb", ".php", ".cs",
}
30
+
31
+
32
def _ext(name: str) -> str:
    """Return the lower-cased final extension of *name* (``""`` when it has none)."""
    _stem, suffix = os.path.splitext(name)
    return suffix.lower()
34
+
35
+
36
def file_hash(path: Path, *, normalize_newlines: bool = False) -> str:
    """Return the 16-byte BLAKE2b hex digest of the file at *path*.

    With ``normalize_newlines`` set and the file no larger than ``_NORMALIZE_CAP``, the
    file is read whole and CRLF / lone CR are rewritten to LF before hashing. In every
    other case the raw bytes are hashed in ``_CHUNK``-sized reads.

    Raises ``OSError`` if the file cannot be stat'ed or read.
    """
    digest = hashlib.blake2b(digest_size=16)
    if normalize_newlines and path.stat().st_size <= _NORMALIZE_CAP:
        raw = path.read_bytes()
        # CRLF must be collapsed first so it does not turn into two newlines.
        digest.update(raw.replace(b"\r\n", b"\n").replace(b"\r", b"\n"))
        return digest.hexdigest()

    with open(path, "rb") as handle:
        while piece := handle.read(_CHUNK):
            digest.update(piece)
    return digest.hexdigest()
46
+
47
+
48
def _hash_rel(root: Path, rel: str) -> str | None:
    """Return the freshness signature for ``root / rel``, or ``None`` if it is unreadable.

    Files with a text extension and a size within ``_CONTENT_HASH_CAP`` get a
    newline-normalised content hash. Everything else gets an ``s<size>:m<mtime>``
    signature, with the mtime truncated to whole seconds.
    """
    target = root / rel
    try:
        info = target.stat()
    except OSError:
        return None

    wants_content_hash = (
        _ext(rel) in _TEXT_HASH_EXTS and info.st_size <= _CONTENT_HASH_CAP
    )
    if not wants_content_hash:
        return f"s{info.st_size}:m{int(info.st_mtime)}"
    try:
        return file_hash(target, normalize_newlines=True)
    except OSError:
        # The file vanished or became unreadable between stat() and the read.
        return None
60
+
61
+
62
def build_manifest(root: str | os.PathLike[str]) -> dict[str, str]:
    """Return a ``{relative path: signature}`` map for the files ``iter_files`` yields.

    *root* is resolved to an absolute path first. Files whose signature cannot be computed
    (``_hash_rel`` returns ``None``) are left out.
    """
    base = Path(root).resolve()
    return {
        rel: signature
        for rel in iter_files(base)
        if (signature := _hash_rel(base, rel)) is not None
    }
70
+
71
+
72
def index(root: str | os.PathLike[str]) -> tuple[Graph, dict[str, str]]:
    """Build the graph and the manifest from one directory walk.

    *root* is resolved to an absolute path, walked once with ``iter_files``, and the
    resulting relative paths feed both ``build_graph`` and the manifest.

    Returns ``(graph, manifest)``, where the manifest maps each relative path to its
    freshness signature; files whose signature cannot be computed are omitted.
    """
    root_p = Path(root).resolve()
    # The paths are consumed twice below (graph build, then manifest). ``iter_files`` is
    # defined in veridge.walk and not visible here; if it yields lazily, reusing its
    # result directly would leave the second pass empty, so materialise it once.
    rels = list(iter_files(root_p))
    graph = build_graph(root_p, _rels=rels)
    manifest = {rel: hv for rel in rels if (hv := _hash_rel(root_p, rel)) is not None}
    return graph, manifest
79
+
80
+
81
def diff_manifest(old: dict[str, str], new: dict[str, str]) -> dict[str, list[str]]:
    """Compare two manifests and report which paths were added, removed or changed.

    Returns a dict with the keys ``added``, ``removed`` and ``changed``, each holding a
    sorted list of paths. A path counts as changed when it is in both manifests with
    different signatures.
    """
    added = sorted(path for path in new if path not in old)
    removed = sorted(path for path in old if path not in new)
    changed = sorted(path for path in old if path in new and old[path] != new[path])
    return {"added": added, "removed": removed, "changed": changed}
88
+
89
+
90
+ # -- the gate ---------------------------------------------------------------
91
@dataclass
class GateReport:
    """Outcome of the anti-drift gate: broken references, stale files and orphans.

    Only broken references and stale files decide ``ok``; orphans are reported as
    information.
    """

    # (source node id, unresolved target) pairs.
    broken: list[tuple[str, str]] = field(default_factory=list)
    # Manifest differences, keyed by "added" / "removed" / "changed".
    stale: dict[str, list[str]] = field(
        default_factory=lambda: {"added": [], "removed": [], "changed": []})
    # Ids of file nodes reported as orphans.
    orphans: list[str] = field(default_factory=list)

    @property
    def stale_count(self) -> int:
        """Total number of paths across every ``stale`` bucket."""
        total = 0
        for paths in self.stale.values():
            total += len(paths)
        return total

    @property
    def ok(self) -> bool:
        """True when there are no broken references and no stale files."""
        if self.broken:
            return False
        return self.stale_count == 0

    def summary(self) -> str:
        """Return a multi-line report: three count lines, then up to 20 broken refs."""
        added = len(self.stale["added"])
        removed = len(self.stale["removed"])
        changed = len(self.stale["changed"])
        header = [
            f"broken references: {len(self.broken)}",
            f"stale files: {self.stale_count} (+{added} / -{removed} / ~{changed})",
            f"orphans: {len(self.orphans)} (info)",
        ]
        # Only the first 20 broken references are listed individually.
        details = [f" [broken] {src} -> {tgt}" for src, tgt in self.broken[:20]]
        return "\n".join(header + details)
116
+
117
+
118
def find_broken(graph: Graph) -> list[tuple[str, str]]:
    """Return sorted ``(node id, target)`` pairs for every broken reference in *graph*.

    Broken references are read from each node's ``meta["broken_refs"]``; nodes without
    that key contribute nothing.
    """
    pairs = [
        (node.id, target)
        for node in graph.nodes.values()
        for target in node.meta.get("broken_refs", [])
    ]
    pairs.sort()
    return pairs
124
+
125
+
126
def find_orphans(graph: Graph) -> list[str]:
    """Return the sorted ids of file nodes whose every edge is a ``BELONGS_TO`` edge.

    Both outgoing and incoming edges are inspected; a file node with no edges at all also
    counts as an orphan. Non-file nodes are never reported.
    """
    orphans: list[str] = []
    for node in graph.nodes.values():
        if node.kind is not Kind.FILE:
            continue
        touching = graph.out_edges(node.id) + graph.in_edges(node.id)
        if all(edge.type is EdgeType.BELONGS_TO for edge in touching):
            orphans.append(node.id)
    orphans.sort()
    return orphans
139
+
140
+
141
def evaluate(graph: Graph, old_manifest: dict[str, str] | None,
             new_manifest: dict[str, str]) -> GateReport:
    """Assemble the gate report for *graph* and the two manifests.

    When *old_manifest* is ``None`` nothing is reported as stale; otherwise the stale
    section is the difference between the old and the new manifest.
    """
    if old_manifest is None:
        stale = {"added": [], "removed": [], "changed": []}
    else:
        stale = diff_manifest(old_manifest, new_manifest)
    return GateReport(broken=find_broken(graph), stale=stale, orphans=find_orphans(graph))
veridge/ignore.py ADDED
@@ -0,0 +1,61 @@
1
+ """What the indexer skips: derived/vendor directories, binary noise, and user patterns.
2
+
3
+ A project may add a ``.veridgeignore`` at its root — one glob per line, ``#`` comments
4
+ allowed — matched against the POSIX relative path or the bare file name (case-sensitive, identical on every OS).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import fnmatch
10
+ import os
11
+ from pathlib import Path
12
+
13
# Directory names that are ignored (matched exactly by ``is_ignored_dir``).
DEFAULT_IGNORE_DIRS: frozenset[str] = frozenset({
    # version control and Veridge's own store
    ".git", ".hg", ".svn", ".veridge",
    # caches
    "__pycache__", ".pytest_cache", ".ruff_cache", ".mypy_cache", ".cache",
    # environments and installed packages
    ".venv", "venv", "env", "node_modules", "site-packages",
    # build output
    "dist", "build", "target", ".eggs", ".tox", ".next", ".nuxt",
    # editor settings
    ".idea", ".vscode",
})

# File names that are ignored (matched exactly, case-sensitively).
DEFAULT_IGNORE_FILES: frozenset[str] = frozenset({
    ".DS_Store", "Thumbs.db", ".veridgeignore",
    "package-lock.json", "pnpm-lock.yaml", "yarn.lock", "poetry.lock",
})

# File extensions that are ignored (compared against the lower-cased extension).
DEFAULT_IGNORE_EXTS: frozenset[str] = frozenset({
    # Python bytecode and extension modules
    ".pyc", ".pyo", ".pyd",
    # images
    ".png", ".jpg", ".jpeg", ".gif", ".ico", ".webp", ".bmp",
    # fonts
    ".woff", ".woff2", ".ttf", ".eot",
    # archives
    ".zip", ".gz", ".tar", ".7z", ".rar",
    # native binaries and objects
    ".exe", ".dll", ".so", ".dylib", ".bin", ".o", ".a",
    # lock files
    ".lock",
})
34
+
35
+
36
def load_ignore_patterns(root: Path) -> list[str]:
    """Read the glob patterns from ``<root>/.veridgeignore``.

    Each line is stripped of surrounding whitespace; blank lines and lines starting with
    ``#`` are dropped. Undecodable bytes are ignored, and a leading UTF-8 byte-order mark
    is removed so it cannot end up inside the first pattern.

    Returns an empty list when the file does not exist.
    """
    ignore_file = root / ".veridgeignore"
    if not ignore_file.is_file():
        return []
    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM (written by some
    # Windows editors). str.strip() does not remove U+FEFF, so with plain "utf-8" the
    # first pattern or comment line would silently stop matching.
    text = ignore_file.read_text(encoding="utf-8-sig", errors="ignore")
    candidates = (line.strip() for line in text.splitlines())
    return [entry for entry in candidates if entry and not entry.startswith("#")]
46
+
47
+
48
def is_ignored_dir(name: str) -> bool:
    """Return True for a directory *name* that is ignored by default.

    That is any name in ``DEFAULT_IGNORE_DIRS`` or any name ending in ``.egg-info``.
    """
    if name.endswith(".egg-info"):
        return True
    return name in DEFAULT_IGNORE_DIRS
50
+
51
+
52
def is_ignored_file(rel_posix: str, name: str, patterns: list[str]) -> bool:
    """Return True when a file is ignored by a default rule or a user pattern.

    Checks, in order: the exact file *name*, its lower-cased extension, and then every
    glob in *patterns*, matched case-sensitively against both the relative path and the
    bare file name.
    """
    if name in DEFAULT_IGNORE_FILES:
        return True
    _stem, suffix = os.path.splitext(name)
    if suffix.lower() in DEFAULT_IGNORE_EXTS:
        return True
    return any(
        fnmatch.fnmatchcase(rel_posix, glob) or fnmatch.fnmatchcase(name, glob)
        for glob in patterns
    )