veridge 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- veridge/__init__.py +20 -0
- veridge/__main__.py +8 -0
- veridge/budget.py +59 -0
- veridge/classify.py +63 -0
- veridge/cli.py +276 -0
- veridge/freshness.py +145 -0
- veridge/ignore.py +61 -0
- veridge/impact.py +69 -0
- veridge/indexer.py +292 -0
- veridge/layers.py +70 -0
- veridge/mcp_server.py +106 -0
- veridge/model.py +220 -0
- veridge/parse_docs.py +80 -0
- veridge/parse_python.py +83 -0
- veridge/py.typed +0 -0
- veridge/query.py +340 -0
- veridge/rank.py +87 -0
- veridge/sessions.py +73 -0
- veridge/store.py +62 -0
- veridge/treesitter.py +211 -0
- veridge/walk.py +29 -0
- veridge-0.1.0.dist-info/METADATA +193 -0
- veridge-0.1.0.dist-info/RECORD +27 -0
- veridge-0.1.0.dist-info/WHEEL +5 -0
- veridge-0.1.0.dist-info/entry_points.txt +3 -0
- veridge-0.1.0.dist-info/licenses/LICENSE +21 -0
- veridge-0.1.0.dist-info/top_level.txt +1 -0
veridge/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Veridge — the always-fresh, low-token map of a whole project.
|
|
2
|
+
|
|
3
|
+
Veridge indexes a project read-only into a typed graph that unifies four layers most
|
|
4
|
+
tools keep apart:
|
|
5
|
+
|
|
6
|
+
* **documents** (with references — including plain path mentions written in prose),
|
|
7
|
+
* **code** down to the *symbol* (functions/classes), via pluggable parsers,
|
|
8
|
+
* **decisions** (ADR / RFC / D-XXX ids found in docs), and
|
|
9
|
+
* **sessions** (git commits and the files they touched).
|
|
10
|
+
|
|
11
|
+
It then *ranks* the graph with PageRank, so queries can return the **minimal relevant
|
|
12
|
+
subgraph within a token budget** — the cheap, accurate context an AI assistant needs to
|
|
13
|
+
orient itself, and a map a human can read. The core has zero runtime dependencies.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
# Package version string; the CLI interpolates it into its ``--version`` output.
__version__ = "0.1.0"

# Names exported by ``from veridge import *``.
__all__ = ["__version__"]
|
veridge/__main__.py
ADDED
veridge/budget.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Token budgeting: turn a ranking into the *most* context that fits a token ceiling.
|
|
2
|
+
|
|
3
|
+
The point of Veridge is to hand an assistant the **minimal relevant slice** of a project,
|
|
4
|
+
not the whole thing. Given a ranked list of node ids and a token budget, we greedily admit
|
|
5
|
+
the highest-ranked nodes whose compact rows still fit. Token cost uses the standard ~4
|
|
6
|
+
chars/token heuristic on the *compact row* an assistant would actually read — ids, kinds,
|
|
7
|
+
sizes and edge counts, never file contents.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from veridge.model import Graph, Kind
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def estimate_tokens(text: str) -> int:
    """Approximate the token count of *text* at roughly four characters per token.

    The estimate is floored at 1, so even an empty string costs one token.
    """
    approx = round(len(text) / 4)
    return approx if approx > 1 else 1
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def node_row(graph: Graph, nid: str) -> dict[str, Any]:
    """Build the compact, contents-free row describing node *nid*.

    Every row carries ``id``, ``kind`` and ``deg``. ``cat`` is added when the node has a
    category, ``size`` for file nodes, and ``line`` for symbol nodes whose metadata holds a
    truthy line number. Raises ``KeyError`` if *nid* is not in ``graph.nodes``.
    """
    node = graph.nodes[nid]
    summary: dict[str, Any] = {"id": node.id, "kind": node.kind.value}

    category = node.category
    if category:
        summary["cat"] = category.value

    if node.kind is Kind.FILE:
        # A missing size is reported as 0 rather than omitted.
        summary["size"] = int(node.meta.get("size", 0))

    if node.kind is Kind.SYMBOL:
        line = node.meta.get("line")
        if line:
            summary["line"] = line

    summary["deg"] = graph.degree(nid)
    return summary
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _row_cost(row: dict[str, Any]) -> int:
    """Estimate the token cost of *row* from its ``key=value,key=value`` rendering."""
    # Approximate the serialized footprint the assistant pays for.
    pairs = [f"{key}={value}" for key, value in row.items()]
    serialized = ",".join(pairs)
    return estimate_tokens(serialized)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def select_within_budget(
    graph: Graph, ranked_ids: list[str], budget_tokens: int,
) -> tuple[list[dict[str, Any]], int]:
    """Take rows for *ranked_ids*, best first, until the next one would overflow the budget.

    Ids that are not present in ``graph.nodes`` are skipped. Selection stops at the first
    row that does not fit; later (possibly smaller) rows are not considered. The first
    known node is always admitted, even when it alone costs more than *budget_tokens*, so
    the result is only empty when no id is known to the graph.

    Returns ``(rows, used_tokens)``.
    """
    admitted: list[dict[str, Any]] = []
    spent = 0
    for candidate in ranked_ids:
        if candidate in graph.nodes:
            row = node_row(graph, candidate)
            total = spent + _row_cost(row)
            # The budget only binds once something has been admitted.
            if admitted and total > budget_tokens:
                break
            admitted.append(row)
            spent = total
    return admitted, spent
|
veridge/classify.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Classify a file into a (Kind.FILE, Category) from its path — generic and language-agnostic.
|
|
2
|
+
|
|
3
|
+
Unlike a project-specific taxonomy, the rules here key on widely shared conventions
|
|
4
|
+
(extensions, well-known names) so they work out of the box on any repository. They are
|
|
5
|
+
intentionally simple and tunable.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
|
|
12
|
+
from veridge.model import Category
|
|
13
|
+
|
|
14
|
+
# Lower-case extensions (leading dot included) that classify() maps to Category.CODE.
_CODE_EXTS = {
    ".py", ".js", ".ts", ".tsx", ".jsx", ".mjs", ".cjs",
    ".go", ".rs", ".java", ".kt", ".c", ".cc", ".cpp", ".h", ".hpp",
    ".rb", ".php", ".cs", ".swift", ".scala", ".lua",
    ".ps1", ".psm1", ".sh", ".bash", ".sql",
}
# Extensions that classify() maps to Category.DATA (checked before the code extensions).
_DATA_EXTS = {
    ".db", ".sqlite", ".sqlite3", ".duckdb", ".parquet", ".csv", ".tsv",
    ".jsonl", ".ndjson",
}
# Extensions that classify() maps to Category.CONFIG.
_CONFIG_EXTS = {
    ".toml", ".ini", ".cfg", ".conf", ".yaml", ".yml", ".env", ".json",
    ".xml", ".properties",
}
# Extensions that classify() maps to Category.DOC.
_DOC_EXTS = {".md", ".markdown", ".rst", ".txt", ".pdf", ".html", ".htm", ".docx", ".adoc"}

# Whole lower-cased file names that classify() maps to Category.STRUCTURE.
_STRUCTURE_NAMES = {
    "readme", "readme.md", "readme.rst", "index.md", "architecture.md",
    "changelog.md", "contributing.md", "license", "license.md", "authors", "notice",
}
# Whole lower-cased file names that classify() maps to Category.CONFIG.
_CONFIG_NAMES = {
    ".gitignore", ".veridgeignore", "dockerfile", "makefile", "caddyfile",
    ".editorconfig",
}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _ext(name: str) -> str:
|
|
41
|
+
return os.path.splitext(name)[1].lower()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def classify(rel_posix: str) -> Category:
    """Return the Category for a file given its POSIX relative path.

    Rules are applied in order and the first match wins:

    1. any path component equal to ``memory`` (case-insensitive), or a file name starting
       with ``memory.`` -> ``Category.MEMORY``
    2. data extension -> ``Category.DATA``
    3. code extension -> ``Category.CODE``
    4. config extension or well-known config file name -> ``Category.CONFIG``
    5. well-known structure file name (README, LICENSE, ...) -> ``Category.STRUCTURE``
    6. doc extension, or no extension at all -> ``Category.DOC``
    7. anything else -> ``Category.CONFIG``
    """
    name = rel_posix.rsplit("/", 1)[-1]
    low = name.lower()
    ext = _ext(low)
    if not ext and low in _CONFIG_EXTS:
        # os.path.splitext treats a leading dot as part of the root, so a bare dotfile
        # such as ".env" has an empty extension and used to fall through to DOC even
        # though ".env" is listed as a config extension. Use the whole name instead.
        ext = low
    parts = [p.lower() for p in rel_posix.split("/")]

    if "memory" in parts or low.startswith("memory."):
        return Category.MEMORY
    if ext in _DATA_EXTS:
        return Category.DATA
    if ext in _CODE_EXTS:
        return Category.CODE
    if ext in _CONFIG_EXTS or low in _CONFIG_NAMES:
        return Category.CONFIG
    if low in _STRUCTURE_NAMES:
        return Category.STRUCTURE
    if ext in _DOC_EXTS or ext == "":
        return Category.DOC
    # Unknown extensions default to config.
    return Category.CONFIG
|
veridge/cli.py
ADDED
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
"""Command line: build, map, find, neighbors, focus, gate, stats.
|
|
2
|
+
|
|
3
|
+
Read-only on your sources: commands only read the project and write derived files under
|
|
4
|
+
``.veridge/``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import json
|
|
11
|
+
import sys
|
|
12
|
+
|
|
13
|
+
from veridge import __version__, query, store
|
|
14
|
+
from veridge.freshness import build_manifest, evaluate, index
|
|
15
|
+
from veridge.model import Graph
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _human(n: int) -> str:
|
|
19
|
+
f = float(n)
|
|
20
|
+
for unit in ("B", "KB", "MB", "GB", "TB"):
|
|
21
|
+
if f < 1024 or unit == "TB":
|
|
22
|
+
return f"{f:.0f} {unit}" if unit == "B" else f"{f:.1f} {unit}"
|
|
23
|
+
f /= 1024
|
|
24
|
+
return f"{f:.1f} TB"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _load_or_build(path: str) -> Graph:
    """Return the stored graph for *path*, or index the project when nothing usable loads.

    The fallback runs whenever ``store.load_graph`` returns a falsy value. A graph built
    this way is returned to the caller but not saved.
    """
    stored = store.load_graph(path)
    if stored:
        return stored
    built = index(path)
    return built[0]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def cmd_build(args: argparse.Namespace) -> int:
    """Index the project at ``args.path``, save graph and manifest, and print a summary.

    Always returns exit code 0.
    """
    root = args.path
    graph, manifest = index(root)
    store.save(root, graph, manifest)

    tallies = graph.counts()
    node_total = len(graph.nodes)
    edge_total = len(graph.edges)
    print(f"built '{graph.project}': {node_total} nodes, {edge_total} edges")
    print(" nodes:", tallies["nodes"])
    print(" edges:", tallies["edges"])
    print(f" store: {store.store_dir(root)}")
    return 0
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def cmd_stats(args: argparse.Namespace) -> int:
    """Print node and edge totals for the project, then one line per node/edge type.

    Types are listed in sorted order, node types first; edge types are prefixed with ``-``.
    Always returns exit code 0.
    """
    graph = _load_or_build(args.path)
    tallies = graph.counts()
    print(f"'{graph.project}': {len(graph.nodes)} nodes, {len(graph.edges)} edges")

    for kind, total in sorted(tallies["nodes"].items()):
        print(f" {kind:9} {total}")

    for kind, total in sorted(tallies["edges"].items()):
        print(f" -{kind:8} {total}")

    return 0
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def cmd_map(args: argparse.Namespace) -> int:
    """Print the project digest from ``query.project_map``, as JSON or as a text report.

    The text report has a headline, per-area and per-layer breakdowns, the highest-ranked
    nodes, and orphan / broken-reference counts. Always returns exit code 0.
    """
    digest = query.project_map(_load_or_build(args.path))
    if args.json:
        print(json.dumps(digest, ensure_ascii=False, indent=2))
        return 0

    headline = (f"{digest['project']}: {digest['files']} files · {digest['symbols']} symbols · "
                f"{digest['areas']} areas · {digest['edges']} edges · {_human(digest['size'])}")
    print(headline)

    print("by area:")
    for area in digest["by_area"]:
        print(f" {area['area']:16} {area['files']:4} files {_human(area['size']):>9} "
              f"[{', '.join(area['top_cats'])}]")

    print("by layer:")
    for layer in digest["by_layer"]:
        print(f" {layer['layer']:11} {layer['files']:4} files {_human(layer['size']):>9}")

    print("most important (PageRank):")
    for item in digest["most_important"]:
        print(f" {item['score']:.4f} {item['id']} ({item['kind']})")

    print(f"orphans: {digest['orphans']} · broken refs: {digest['broken_refs']}")
    return 0
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def cmd_find(args: argparse.Namespace) -> int:
    """Print one ``kind id`` line per node matching ``args.query``, then the match count.

    Always returns exit code 0, including when nothing matches.
    """
    graph = _load_or_build(args.path)
    matches = query.find(graph, args.query)
    for hit in matches:
        print(f" {hit['kind']:8} {hit['id']}")
    print(f"({len(matches)} matches)")
    return 0
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def cmd_neighbors(args: argparse.Namespace) -> int:
    """Print the node ``args.node`` with its outgoing and incoming edges.

    Returns 1 and reports on stderr when the node does not exist, otherwise 0.
    """
    info = query.neighbors(_load_or_build(args.path), args.node)
    if info is None:
        print(f"node not found: {args.node}", file=sys.stderr)
        return 1

    print(f"{info['id']} ({info['kind']}) · {_human(info['size'])}")
    # Description and broken-reference lines appear only when there is something to show.
    if info["description"]:
        print(f" {info['description']}")
    if info["broken_refs"]:
        print(f" broken refs: {info['broken_refs']}")

    outgoing = info["outgoing"]
    print(f" outgoing ({len(outgoing)}):")
    for link in outgoing:
        print(f" -{link['edge']}-> {link['id']} ({link['kind']})")

    incoming = info["incoming"]
    print(f" incoming ({len(incoming)}):")
    for link in incoming:
        print(f" <-{link['edge']}- {link['id']} ({link['kind']})")
    return 0
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def cmd_focus(args: argparse.Namespace) -> int:
    """Print the budgeted subgraph that ``query.focus`` selects for ``args.query``.

    With ``--json`` the raw result is dumped. Otherwise a headline, the seeds, and one
    scored line per node are printed, or the result's note when no node was selected.
    Always returns exit code 0.
    """
    result = query.focus(_load_or_build(args.path), args.query, budget_tokens=args.budget)
    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return 0

    rows = result["nodes"]
    if not rows:
        print(result.get("note", "no matches"))
        return 0

    print(f"focus '{result['query']}' · {len(rows)} nodes · "
          f"~{result['used_tokens']}/{result['budget_tokens']} tokens")
    print(f" seeds: {', '.join(result['seeds'])}")
    for row in rows:
        # The category suffix is shown only for rows that carry one.
        if row.get("cat"):
            extra = f" ({row['cat']})"
        else:
            extra = ""
        print(f" {row.get('score', 0):.4f} {row['id']}{extra} [{row['kind']}, deg {row['deg']}]")
    return 0
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def cmd_impact(args: argparse.Namespace) -> int:
    """Print what a change to a seed affects (or, with ``--deps``, what the seed depends on).

    The seed is either ``args.seed`` or, with ``--diff``, the files reported by
    ``git_changed_files``. Returns 2 when neither a seed nor ``--diff`` is given, and 0
    otherwise (including when there are no changed files or nothing is found).
    """
    seed_ids = None
    query_str = args.seed or ""
    proj = args.path
    if args.diff:
        # In --diff mode there is no seed, so a lone positional is the project path.
        if args.seed and args.path == ".":
            proj = args.seed
        # Imported lazily, so only --diff runs load veridge.sessions.
        from veridge.sessions import git_changed_files
        seed_ids = git_changed_files(proj)
        query_str = "git diff (HEAD)"
        if not seed_ids:
            print("no changed files vs HEAD (or not a git repository)")
            return 0
    elif not args.seed:
        print("provide a seed (file/symbol) or use --diff", file=sys.stderr)
        return 2
    g = _load_or_build(proj)
    # --deps inverts the traversal direction.
    direction = "dependencies" if args.deps else "dependents"
    res = query.impact(g, query_str, seed_ids=seed_ids, budget_tokens=args.budget,
                       hops=args.hops, direction=direction)
    if args.json:
        print(json.dumps(res, ensure_ascii=False, indent=2))
        return 0
    verb = "depends on" if args.deps else "affected by"
    print(f"impact ({direction}) of '{res['query']}' · {res['total_affected']} {verb}")
    if res["seeds"]:
        print(f" seeds: {', '.join(res['seeds'])}")
    if not res["nodes"]:
        print(f" {res.get('note', 'nothing found')}")
        return 0
    print(f" showing {len(res['nodes'])} · ~{res['used_tokens']}/{res['budget_tokens']} tokens")
    for r in res["nodes"]:
        extra = f" ({r['cat']})" if r.get("cat") else ""
        # Rows without a distance are printed as "d?".
        print(f" {r.get('score', 0):.4f} d{r.get('dist', '?')} {r['id']}{extra} [{r['kind']}]")
    return 0
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def cmd_why(args: argparse.Namespace) -> int:
    """Print the typed path that ``query.why`` finds between ``args.a`` and ``args.b``.

    Each hop shows its edge type with an arrow in the direction of the edge. Returns 1
    when no path is found (the result's note is printed), otherwise 0. With ``--json``
    the raw result is dumped and the exit code is 0 regardless.
    """
    result = query.why(_load_or_build(args.path), args.a, args.b)
    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return 0
    if not result["found"]:
        print(result.get("note", "no path"))
        return 1

    trail = result["path"]
    origin = trail[0]
    print(f"why: {result['a']} -> {result['b']} · {result['length']} hops")
    print(f" {origin['id']} ({origin['kind']})")
    for step in trail[1:]:
        if step["dir"] == "->":
            connector = f"--{step['edge']}-->"
        else:
            connector = f"<--{step['edge']}--"
        print(f" {connector} {step['id']} ({step['kind']})")
    return 0
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def cmd_tour(args: argparse.Namespace) -> int:
    """Print the reading tour from ``query.tour``, as JSON or as a numbered list of stops.

    Each stop lists the files it uses and the files that use it, when it has any.
    Always returns exit code 0.
    """
    result = query.tour(_load_or_build(args.path), budget_tokens=args.budget)
    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return 0

    stops = result["stops"]
    print(f"tour of '{result['project']}' · {len(stops)}/{result['total_files']} stops · "
          f"~{result['used_tokens']}/{result['budget_tokens']} tokens")
    print("(read top to bottom: dependencies before the files that use them)")
    for stop in stops:
        print(f" {stop['step']:2}. {stop['id']} [{stop['layer']}]")
        if stop["uses"]:
            print(f" uses: {', '.join(stop['uses'])}")
        if stop["used_by"]:
            print(f" used by: {', '.join(stop['used_by'])}")
    return 0
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def cmd_gate(args: argparse.Namespace) -> int:
    """Run the anti-drift check of the stored graph against the current tree.

    Exit codes: 0 when the report is clean, 1 on drift, 2 when no stored graph or
    manifest exists. Unlike the query commands, this one never builds a graph itself.
    """
    graph = store.load_graph(args.path)
    recorded = store.load_manifest(args.path)
    if graph is None or recorded is None:
        print("no graph found — run 'veridge build' first", file=sys.stderr)
        return 2

    current = build_manifest(args.path)
    report = evaluate(graph, recorded, current)
    print(report.summary())
    if report.ok:
        print("OK: fresh and clean")
        return 0
    print("DRIFT: rebuild and/or fix the issues above")
    return 1
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse *argv*, dispatch to the selected ``cmd_*`` handler and return its exit code.

    A subcommand is required. *argv* is passed straight to ``parser.parse_args``.
    """
    # Print UTF-8 regardless of the console's locale (Windows defaults to cp1252, which
    # would mangle the '·' separators). Best-effort: ignore if the stream can't reconfigure.
    for stream in (sys.stdout, sys.stderr):
        try:
            stream.reconfigure(encoding="utf-8")  # type: ignore[union-attr]
        except (AttributeError, ValueError):
            pass

    parser = argparse.ArgumentParser(
        prog="veridge", description="Veridge — the always-fresh, low-token map of a project.")
    parser.add_argument("--version", action="version", version=f"veridge {__version__}")
    sub = parser.add_subparsers(dest="cmd", required=True)

    # Commands whose only argument is the optional project path.
    for name, fn, help_text in [
        ("build", cmd_build, "index the project -> .veridge/graph.json"),
        ("stats", cmd_stats, "counts by node/edge type"),
        ("gate", cmd_gate, "anti-drift check (broken refs, stale files, orphans)"),
    ]:
        sp = sub.add_parser(name, help=help_text)
        sp.add_argument("path", nargs="?", default=".", help="project root (default: .)")
        sp.set_defaults(func=fn)

    sp = sub.add_parser("map", help="compact project digest (PageRank-ranked)")
    sp.add_argument("path", nargs="?", default=".", help="project root (default: .)")
    sp.add_argument("--json", action="store_true", help="emit JSON")
    sp.set_defaults(func=cmd_map)

    # For the commands below, the query/node positional comes before the optional path.
    sp = sub.add_parser("find", help="find nodes by name/path substring")
    sp.add_argument("query", help="substring to search for")
    sp.add_argument("path", nargs="?", default=".", help="project root (default: .)")
    sp.set_defaults(func=cmd_find)

    sp = sub.add_parser("neighbors", help="a node and its connections")
    sp.add_argument("node", help="node id")
    sp.add_argument("path", nargs="?", default=".", help="project root (default: .)")
    sp.set_defaults(func=cmd_neighbors)

    sp = sub.add_parser("focus", help="minimal relevant subgraph for a task, within a budget")
    sp.add_argument("query", help="a task description, a file path, or a symbol name")
    sp.add_argument("path", nargs="?", default=".", help="project root (default: .)")
    sp.add_argument("--budget", type=int, default=1500, help="token budget (default: 1500)")
    sp.add_argument("--json", action="store_true", help="emit JSON")
    sp.set_defaults(func=cmd_focus)

    # "seed" is optional here because --diff supplies the seeds; cmd_impact then treats a
    # lone positional as the project path.
    sp = sub.add_parser("impact", help="blast-radius: what a change to a file/symbol affects")
    sp.add_argument("seed", nargs="?", help="a file path or symbol name (omit with --diff)")
    sp.add_argument("path", nargs="?", default=".", help="project root (default: .)")
    sp.add_argument("--diff", action="store_true", help="seed from files changed vs git HEAD")
    sp.add_argument("--deps", action="store_true",
                    help="invert: what the seed depends ON, not what depends on it")
    sp.add_argument("--hops", type=int, default=None, help="max propagation distance")
    sp.add_argument("--budget", type=int, default=1500, help="token budget (default: 1500)")
    sp.add_argument("--json", action="store_true", help="emit JSON")
    sp.set_defaults(func=cmd_impact)

    sp = sub.add_parser("why", help="shortest typed path between two nodes")
    sp.add_argument("a", help="first node (id, path, or name)")
    sp.add_argument("b", help="second node (id, path, or name)")
    sp.add_argument("path", nargs="?", default=".", help="project root (default: .)")
    sp.add_argument("--json", action="store_true", help="emit JSON")
    sp.set_defaults(func=cmd_why)

    sp = sub.add_parser("tour", help="dependency-ordered reading tour of the key files")
    sp.add_argument("path", nargs="?", default=".", help="project root (default: .)")
    sp.add_argument("--budget", type=int, default=2000, help="token budget (default: 2000)")
    sp.add_argument("--json", action="store_true", help="emit JSON")
    sp.set_defaults(func=cmd_tour)

    args = parser.parse_args(argv)
    # Each subparser stored its handler under "func" via set_defaults.
    return int(args.func(args))
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
# When run as a script, exit the process with main()'s return code.
if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())
|
veridge/freshness.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Content-hash freshness, the single-walk ``index`` entry point, and the anti-drift gate.
|
|
2
|
+
|
|
3
|
+
Text source/docs up to a cap are content-hashed with newline normalisation (precise and
|
|
4
|
+
stable across a CRLF/LF flip); data, binaries and large files use a cheap ``size+mtime``
|
|
5
|
+
signature so indexing stays light on data-heavy projects. The manifest is a small
|
|
6
|
+
``{path: hash}`` map; diffing it against the current tree tells the gate whether the map is
|
|
7
|
+
still in sync.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import hashlib
|
|
13
|
+
import os
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
from veridge.indexer import build_graph
|
|
18
|
+
from veridge.model import EdgeType, Graph, Kind
|
|
19
|
+
from veridge.walk import iter_files
|
|
20
|
+
|
|
21
|
+
# Read size, in bytes, used by file_hash() when it streams a file.
_CHUNK = 65536
# Largest file, in bytes, that file_hash() reads whole to normalise newlines.
_NORMALIZE_CAP = 8_000_000
# Largest file, in bytes, that _hash_rel() content-hashes; bigger files get size+mtime.
_CONTENT_HASH_CAP = 1_000_000
# Lower-case extensions that _hash_rel() content-hashes (subject to the cap above).
_TEXT_HASH_EXTS = {
    ".md", ".markdown", ".rst", ".txt", ".py", ".js", ".ts", ".tsx", ".jsx", ".mjs", ".cjs",
    ".toml", ".ini", ".cfg", ".conf", ".yaml", ".yml", ".html", ".htm", ".css", ".sql",
    ".ps1", ".psm1", ".sh", ".bash", ".go", ".rs", ".java", ".c", ".cc", ".cpp", ".h",
    ".hpp", ".rb", ".php", ".cs", ".adoc",
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _ext(name: str) -> str:
|
|
33
|
+
return os.path.splitext(name)[1].lower()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def file_hash(path: Path, *, normalize_newlines: bool = False) -> str:
    """Return the 16-byte BLAKE2b hex digest of the file at *path*.

    With ``normalize_newlines`` set and a file no larger than ``_NORMALIZE_CAP``, CRLF and
    lone CR are rewritten to LF before hashing, so the digest survives a line-ending flip.
    Otherwise (flag unset, or file over the cap) the raw bytes are hashed in chunks.
    """
    digest = hashlib.blake2b(digest_size=16)
    in_memory = normalize_newlines and path.stat().st_size <= _NORMALIZE_CAP
    if not in_memory:
        # Stream the raw bytes; no normalisation on this path.
        with open(path, "rb") as stream:
            while block := stream.read(_CHUNK):
                digest.update(block)
        return digest.hexdigest()

    raw = path.read_bytes()
    # CRLF first, then any remaining lone CR.
    unified = raw.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
    digest.update(unified)
    return digest.hexdigest()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _hash_rel(root: Path, rel: str) -> str | None:
    """Return the freshness signature for *rel* under *root*, or ``None`` if unreadable.

    Files with a text extension and a size within ``_CONTENT_HASH_CAP`` get a
    newline-normalised content hash. Everything else gets ``s<size>:m<mtime>`` with the
    mtime truncated to whole seconds.
    """
    target = root / rel
    try:
        info = target.stat()
    except OSError:
        return None

    hash_contents = _ext(rel) in _TEXT_HASH_EXTS and info.st_size <= _CONTENT_HASH_CAP
    if not hash_contents:
        return f"s{info.st_size}:m{int(info.st_mtime)}"

    try:
        return file_hash(target, normalize_newlines=True)
    except OSError:
        # The file vanished or became unreadable between stat() and the read.
        return None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def build_manifest(root: str | os.PathLike[str]) -> dict[str, str]:
    """Walk *root* and return a ``{relative path: signature}`` map of its files.

    Files for which no signature can be computed are left out.
    """
    base = Path(root).resolve()
    return {
        rel: signature
        for rel in iter_files(base)
        if (signature := _hash_rel(base, rel)) is not None
    }
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def index(root: str | os.PathLike[str]) -> tuple[Graph, dict[str, str]]:
    """Build the graph and the manifest from one directory walk.

    Returns ``(graph, manifest)``. The manifest maps each relative path to its freshness
    signature; files for which no signature can be computed are left out.
    """
    root_p = Path(root).resolve()
    # The file list is consumed twice (graph build, then manifest). Materialise it so the
    # second pass still sees every path even if iter_files yields a single-use iterator.
    rels = list(iter_files(root_p))
    graph = build_graph(root_p, _rels=rels)
    manifest = {rel: hv for rel in rels if (hv := _hash_rel(root_p, rel)) is not None}
    return graph, manifest
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def diff_manifest(old: dict[str, str], new: dict[str, str]) -> dict[str, list[str]]:
    """Compare two manifests and return sorted ``added`` / ``removed`` / ``changed`` paths.

    ``added`` paths exist only in *new*, ``removed`` only in *old*, and ``changed`` in both
    with different signatures.
    """
    added = sorted(path for path in new if path not in old)
    removed = sorted(path for path in old if path not in new)
    changed = sorted(
        path for path, signature in old.items()
        if path in new and new[path] != signature
    )
    return {"added": added, "removed": removed, "changed": changed}
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# -- the gate ---------------------------------------------------------------
|
|
91
|
+
@dataclass
class GateReport:
    """Result of the anti-drift gate: broken references, stale files and orphans."""

    # (source node id, unresolved target) pairs.
    broken: list[tuple[str, str]] = field(default_factory=list)
    # Manifest drift, keyed by "added" / "removed" / "changed".
    stale: dict[str, list[str]] = field(
        default_factory=lambda: {"added": [], "removed": [], "changed": []})
    # Ids of orphaned nodes; reported for information and not part of ``ok``.
    orphans: list[str] = field(default_factory=list)

    @property
    def stale_count(self) -> int:
        """Total number of added, removed and changed paths."""
        return sum(map(len, self.stale.values()))

    @property
    def ok(self) -> bool:
        """True when there are no broken references and no stale files."""
        if self.broken:
            return False
        return self.stale_count == 0

    def summary(self) -> str:
        """Return the multi-line report, listing at most the first 20 broken references."""
        added = self.stale["added"]
        removed = self.stale["removed"]
        changed = self.stale["changed"]
        header = [
            f"broken references: {len(self.broken)}",
            f"stale files: {self.stale_count} (+{len(added)} / -{len(removed)} / ~{len(changed)})",
            f"orphans: {len(self.orphans)} (info)",
        ]
        details = [f" [broken] {src} -> {tgt}" for src, tgt in self.broken[:20]]
        return "\n".join(header + details)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def find_broken(graph: Graph) -> list[tuple[str, str]]:
    """Collect every ``(node id, target)`` pair from the nodes' ``broken_refs`` metadata.

    Nodes without a ``broken_refs`` entry contribute nothing. The result is sorted.
    """
    pairs = [
        (node.id, target)
        for node in graph.nodes.values()
        for target in node.meta.get("broken_refs", [])
    ]
    pairs.sort()
    return pairs
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def find_orphans(graph: Graph) -> list[str]:
    """Return the sorted ids of file nodes whose only edges are ``BELONGS_TO`` edges.

    A file node with no edges at all also counts as an orphan.
    """
    lonely: list[str] = []
    for node in graph.nodes.values():
        if node.kind is Kind.FILE:
            touching = graph.out_edges(node.id) + graph.in_edges(node.id)
            # Orphaned means every incident edge, in either direction, is the area edge.
            if all(edge.type is EdgeType.BELONGS_TO for edge in touching):
                lonely.append(node.id)
    lonely.sort()
    return lonely
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def evaluate(graph: Graph, old_manifest: dict[str, str] | None,
             new_manifest: dict[str, str]) -> GateReport:
    """Assemble the gate report for *graph* and the drift between the two manifests.

    When *old_manifest* is ``None`` there is nothing to compare against, so the report
    carries empty added/removed/changed lists.
    """
    if old_manifest is None:
        drift = {"added": [], "removed": [], "changed": []}
    else:
        drift = diff_manifest(old_manifest, new_manifest)
    return GateReport(broken=find_broken(graph), stale=drift, orphans=find_orphans(graph))
|
veridge/ignore.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""What the indexer skips: derived/vendor directories, binary noise, and user patterns.
|
|
2
|
+
|
|
3
|
+
A project may add a ``.veridgeignore`` at its root — one glob per line, ``#`` comments
|
|
4
|
+
allowed — matched against the POSIX relative path (case-sensitive, identical on every OS).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import fnmatch
|
|
10
|
+
import os
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
# Directory names that is_ignored_dir() rejects by exact, case-sensitive match.
DEFAULT_IGNORE_DIRS: frozenset[str] = frozenset({
    ".git", ".hg", ".svn", ".veridge",
    "__pycache__", ".pytest_cache", ".ruff_cache", ".mypy_cache", ".cache",
    ".venv", "venv", "env", "node_modules", "site-packages",
    "dist", "build", ".eggs", ".tox", ".next", ".nuxt", "target",
    ".idea", ".vscode",
})

# File names that is_ignored_file() rejects by exact, case-sensitive match.
DEFAULT_IGNORE_FILES: frozenset[str] = frozenset({
    ".DS_Store", "Thumbs.db", ".veridgeignore",
    "package-lock.json", "poetry.lock", "yarn.lock", "pnpm-lock.yaml",
})

# Extensions that is_ignored_file() rejects; compared after lower-casing the extension.
DEFAULT_IGNORE_EXTS: frozenset[str] = frozenset({
    ".pyc", ".pyo", ".pyd",
    ".png", ".jpg", ".jpeg", ".gif", ".ico", ".webp", ".bmp",
    ".woff", ".woff2", ".ttf", ".eot",
    ".zip", ".gz", ".tar", ".7z", ".rar",
    ".exe", ".dll", ".so", ".dylib", ".bin", ".o", ".a",
    ".lock",
})
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def load_ignore_patterns(root: Path) -> list[str]:
    """Read the glob patterns from ``<root>/.veridgeignore``.

    Returns one stripped pattern per non-blank line that does not start with ``#``, in
    file order, or an empty list when the file does not exist. Undecodable bytes are
    dropped rather than raising.
    """
    ignore_file = root / ".veridgeignore"
    if not ignore_file.is_file():
        return []
    # "utf-8-sig" drops a leading byte-order mark. With plain "utf-8" the BOM stays glued
    # to the first line (str.strip does not remove U+FEFF), so a first-line pattern never
    # matches and a first-line comment is taken as a pattern.
    text = ignore_file.read_text(encoding="utf-8-sig", errors="ignore")
    stripped = (line.strip() for line in text.splitlines())
    return [entry for entry in stripped if entry and not entry.startswith("#")]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def is_ignored_dir(name: str) -> bool:
    """Tell whether a directory called *name* is skipped by the indexer.

    True for any ``*.egg-info`` directory and for every name in ``DEFAULT_IGNORE_DIRS``.
    """
    if name.endswith(".egg-info"):
        return True
    return name in DEFAULT_IGNORE_DIRS
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def is_ignored_file(rel_posix: str, name: str, patterns: list[str]) -> bool:
    """Tell whether a file is skipped by the indexer.

    A file is ignored when its name is in ``DEFAULT_IGNORE_FILES``, when its lower-cased
    extension is in ``DEFAULT_IGNORE_EXTS``, or when any of *patterns* matches either the
    POSIX relative path or the bare file name (case-sensitive glob match).
    """
    if name in DEFAULT_IGNORE_FILES:
        return True

    _root, suffix = os.path.splitext(name)
    if suffix.lower() in DEFAULT_IGNORE_EXTS:
        return True

    return any(
        fnmatch.fnmatchcase(rel_posix, pattern) or fnmatch.fnmatchcase(name, pattern)
        for pattern in patterns
    )
|