@oriro/orirocli 0.1.9 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -18
- package/dist/cli.js +4776 -2964
- package/package.json +2 -2
- package/skills/craft/ai-engineering/SKILL.md +2 -2
- package/skills/graphify/SKILL.md +0 -619
- package/skills/graphify/__init__.py +0 -28
- package/skills/graphify/__main__.py +0 -4582
- package/skills/graphify/affected.py +0 -154
- package/skills/graphify/always_on/agents-md.md +0 -12
- package/skills/graphify/always_on/antigravity-rules.md +0 -14
- package/skills/graphify/always_on/claude-md.md +0 -9
- package/skills/graphify/always_on/gemini-md.md +0 -9
- package/skills/graphify/always_on/kiro-steering.md +0 -5
- package/skills/graphify/always_on/vscode-instructions.md +0 -17
- package/skills/graphify/analyze.py +0 -724
- package/skills/graphify/benchmark.py +0 -155
- package/skills/graphify/build.py +0 -487
- package/skills/graphify/cache.py +0 -417
- package/skills/graphify/callflow_html.py +0 -2020
- package/skills/graphify/cluster.py +0 -272
- package/skills/graphify/command-kilo.md +0 -15
- package/skills/graphify/dedup.py +0 -429
- package/skills/graphify/detect.py +0 -1379
- package/skills/graphify/diagnostics.py +0 -390
- package/skills/graphify/export.py +0 -1408
- package/skills/graphify/extract.py +0 -11570
- package/skills/graphify/global_graph.py +0 -159
- package/skills/graphify/google_workspace.py +0 -223
- package/skills/graphify/hooks.py +0 -457
- package/skills/graphify/ingest.py +0 -331
- package/skills/graphify/llm.py +0 -1896
- package/skills/graphify/manifest.py +0 -4
- package/skills/graphify/mcp_ingest.py +0 -392
- package/skills/graphify/multigraph_compat.py +0 -212
- package/skills/graphify/pg_introspect.py +0 -142
- package/skills/graphify/prs.py +0 -748
- package/skills/graphify/querylog.py +0 -70
- package/skills/graphify/report.py +0 -218
- package/skills/graphify/scip_ingest.py +0 -363
- package/skills/graphify/security.py +0 -336
- package/skills/graphify/semantic_cleanup.py +0 -319
- package/skills/graphify/serve.py +0 -1309
- package/skills/graphify/skill-aider.md +0 -1246
- package/skills/graphify/skill-amp.md +0 -613
- package/skills/graphify/skill-claw.md +0 -616
- package/skills/graphify/skill-codex.md +0 -613
- package/skills/graphify/skill-copilot.md +0 -616
- package/skills/graphify/skill-devin.md +0 -1372
- package/skills/graphify/skill-droid.md +0 -613
- package/skills/graphify/skill-kilo.md +0 -625
- package/skills/graphify/skill-kiro.md +0 -615
- package/skills/graphify/skill-opencode.md +0 -608
- package/skills/graphify/skill-pi.md +0 -615
- package/skills/graphify/skill-trae.md +0 -614
- package/skills/graphify/skill-vscode.md +0 -612
- package/skills/graphify/skill-windows.md +0 -651
- package/skills/graphify/skills/amp/references/add-watch.md +0 -56
- package/skills/graphify/skills/amp/references/exports.md +0 -71
- package/skills/graphify/skills/amp/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/amp/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/amp/references/hooks.md +0 -33
- package/skills/graphify/skills/amp/references/query.md +0 -249
- package/skills/graphify/skills/amp/references/transcribe.md +0 -48
- package/skills/graphify/skills/amp/references/update.md +0 -179
- package/skills/graphify/skills/claude/references/add-watch.md +0 -56
- package/skills/graphify/skills/claude/references/exports.md +0 -71
- package/skills/graphify/skills/claude/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/claude/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/claude/references/hooks.md +0 -33
- package/skills/graphify/skills/claude/references/query.md +0 -103
- package/skills/graphify/skills/claude/references/transcribe.md +0 -48
- package/skills/graphify/skills/claude/references/update.md +0 -179
- package/skills/graphify/skills/claw/references/add-watch.md +0 -56
- package/skills/graphify/skills/claw/references/exports.md +0 -71
- package/skills/graphify/skills/claw/references/extraction-spec.md +0 -29
- package/skills/graphify/skills/claw/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/claw/references/hooks.md +0 -33
- package/skills/graphify/skills/claw/references/query.md +0 -249
- package/skills/graphify/skills/claw/references/transcribe.md +0 -48
- package/skills/graphify/skills/claw/references/update.md +0 -179
- package/skills/graphify/skills/codex/references/add-watch.md +0 -56
- package/skills/graphify/skills/codex/references/exports.md +0 -71
- package/skills/graphify/skills/codex/references/extraction-spec.md +0 -29
- package/skills/graphify/skills/codex/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/codex/references/hooks.md +0 -33
- package/skills/graphify/skills/codex/references/query.md +0 -249
- package/skills/graphify/skills/codex/references/transcribe.md +0 -48
- package/skills/graphify/skills/codex/references/update.md +0 -179
- package/skills/graphify/skills/copilot/references/add-watch.md +0 -56
- package/skills/graphify/skills/copilot/references/exports.md +0 -71
- package/skills/graphify/skills/copilot/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/copilot/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/copilot/references/hooks.md +0 -33
- package/skills/graphify/skills/copilot/references/query.md +0 -249
- package/skills/graphify/skills/copilot/references/transcribe.md +0 -48
- package/skills/graphify/skills/copilot/references/update.md +0 -179
- package/skills/graphify/skills/droid/references/add-watch.md +0 -56
- package/skills/graphify/skills/droid/references/exports.md +0 -71
- package/skills/graphify/skills/droid/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/droid/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/droid/references/hooks.md +0 -33
- package/skills/graphify/skills/droid/references/query.md +0 -249
- package/skills/graphify/skills/droid/references/transcribe.md +0 -48
- package/skills/graphify/skills/droid/references/update.md +0 -179
- package/skills/graphify/skills/kilo/references/add-watch.md +0 -56
- package/skills/graphify/skills/kilo/references/exports.md +0 -71
- package/skills/graphify/skills/kilo/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/kilo/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/kilo/references/hooks.md +0 -33
- package/skills/graphify/skills/kilo/references/query.md +0 -249
- package/skills/graphify/skills/kilo/references/transcribe.md +0 -48
- package/skills/graphify/skills/kilo/references/update.md +0 -179
- package/skills/graphify/skills/kiro/references/add-watch.md +0 -56
- package/skills/graphify/skills/kiro/references/exports.md +0 -71
- package/skills/graphify/skills/kiro/references/extraction-spec.md +0 -29
- package/skills/graphify/skills/kiro/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/kiro/references/hooks.md +0 -33
- package/skills/graphify/skills/kiro/references/query.md +0 -249
- package/skills/graphify/skills/kiro/references/transcribe.md +0 -48
- package/skills/graphify/skills/kiro/references/update.md +0 -179
- package/skills/graphify/skills/opencode/references/add-watch.md +0 -56
- package/skills/graphify/skills/opencode/references/exports.md +0 -71
- package/skills/graphify/skills/opencode/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/opencode/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/opencode/references/hooks.md +0 -33
- package/skills/graphify/skills/opencode/references/query.md +0 -249
- package/skills/graphify/skills/opencode/references/transcribe.md +0 -48
- package/skills/graphify/skills/opencode/references/update.md +0 -179
- package/skills/graphify/skills/pi/references/add-watch.md +0 -56
- package/skills/graphify/skills/pi/references/exports.md +0 -71
- package/skills/graphify/skills/pi/references/extraction-spec.md +0 -29
- package/skills/graphify/skills/pi/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/pi/references/hooks.md +0 -33
- package/skills/graphify/skills/pi/references/query.md +0 -249
- package/skills/graphify/skills/pi/references/transcribe.md +0 -48
- package/skills/graphify/skills/pi/references/update.md +0 -179
- package/skills/graphify/skills/trae/references/add-watch.md +0 -56
- package/skills/graphify/skills/trae/references/exports.md +0 -71
- package/skills/graphify/skills/trae/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/trae/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/trae/references/hooks.md +0 -35
- package/skills/graphify/skills/trae/references/query.md +0 -249
- package/skills/graphify/skills/trae/references/transcribe.md +0 -48
- package/skills/graphify/skills/trae/references/update.md +0 -179
- package/skills/graphify/skills/vscode/references/add-watch.md +0 -56
- package/skills/graphify/skills/vscode/references/exports.md +0 -71
- package/skills/graphify/skills/vscode/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/vscode/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/vscode/references/hooks.md +0 -33
- package/skills/graphify/skills/vscode/references/query.md +0 -249
- package/skills/graphify/skills/vscode/references/transcribe.md +0 -48
- package/skills/graphify/skills/vscode/references/update.md +0 -179
- package/skills/graphify/skills/windows/references/add-watch.md +0 -56
- package/skills/graphify/skills/windows/references/exports.md +0 -71
- package/skills/graphify/skills/windows/references/extraction-spec.md +0 -68
- package/skills/graphify/skills/windows/references/github-and-merge.md +0 -46
- package/skills/graphify/skills/windows/references/hooks.md +0 -33
- package/skills/graphify/skills/windows/references/query.md +0 -249
- package/skills/graphify/skills/windows/references/transcribe.md +0 -48
- package/skills/graphify/skills/windows/references/update.md +0 -179
- package/skills/graphify/symbol_resolution.py +0 -538
- package/skills/graphify/transcribe.py +0 -184
- package/skills/graphify/tree_html.py +0 -582
- package/skills/graphify/validate.py +0 -72
- package/skills/graphify/watch.py +0 -898
- package/skills/graphify/wiki.py +0 -282
|
@@ -1,155 +0,0 @@
|
|
|
1
|
-
"""Token-reduction benchmark - measures how much context graphify saves vs naive full-corpus approach."""
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
import json
|
|
4
|
-
import sys
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
import networkx as nx
|
|
7
|
-
from networkx.readwrite import json_graph
|
|
8
|
-
|
|
9
|
-
from graphify.build import edge_data
|
|
10
|
-
from graphify.serve import _query_terms
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
_CHARS_PER_TOKEN = 4  # standard approximation: about four characters of text per token
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
def _safe(unicode_char: str, ascii_fallback: str) -> str:
    """Pick a glyph that the current stdout encoding is able to represent.

    Windows consoles often default to cp1252, which cannot encode box-drawing
    or arrow glyphs; printing them raises UnicodeEncodeError mid-output.

    Args:
        unicode_char: Preferred glyph.
        ascii_fallback: Replacement used when the glyph cannot be encoded.

    Returns:
        ``unicode_char`` if it encodes with ``sys.stdout.encoding``, otherwise
        ``ascii_fallback``.
    """
    # A missing or empty encoding collapses to "", which str.encode rejects
    # with LookupError, so the ASCII fallback is chosen in that case too.
    stdout_encoding = getattr(sys.stdout, "encoding", None) or ""
    try:
        unicode_char.encode(stdout_encoding)
    except (UnicodeEncodeError, LookupError):
        return ascii_fallback
    return unicode_char
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def _hr(width: int = 50) -> str:
    """Build a horizontal rule ``width`` characters long.

    Uses the box-drawing dash when stdout can encode it and a plain hyphen
    otherwise, so the rule survives non-UTF-8 stdout (e.g. a Windows cp1252
    console).
    """
    rule_char = _safe("─", "-")
    return rule_char * width
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def _estimate_tokens(text: str) -> int:
    """Approximate the token count of ``text`` from its character length.

    The length is integer-divided by ``_CHARS_PER_TOKEN``; the result is never
    lower than one, even for an empty string.
    """
    approx = len(text) // _CHARS_PER_TOKEN
    return approx if approx > 1 else 1
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def _query_subgraph_tokens(G: nx.Graph, question: str, depth: int = 3) -> int:
    """Run BFS from best-matching nodes and return estimated tokens in the subgraph context.

    Args:
        G: Graph whose nodes may carry ``label``, ``source_file`` and
            ``source_location`` attributes.
        question: Free-text question; it is split into terms by ``_query_terms``.
        depth: Number of BFS expansion rounds from the seed nodes.

    Returns:
        Estimated token count of the rendered NODE/EDGE lines, or 0 when no
        node label contains any query term.
    """
    terms = _query_terms(question)

    # Score each node by how many query terms occur in its lower-cased label.
    scored = []
    for nid, data in G.nodes(data=True):
        # Coerce defensively: a label that is present but None (or not a
        # string) used to raise AttributeError on .lower().
        label = str(data.get("label") or "").lower()
        score = sum(1 for t in terms if t in label)
        if score > 0:
            scored.append((score, nid))
    scored.sort(reverse=True)

    # The three best-scoring nodes seed the traversal.
    start_nodes = [nid for _, nid in scored[:3]]
    if not start_nodes:
        return 0

    visited: set[str] = set(start_nodes)
    frontier = set(start_nodes)
    edges_seen: list[tuple] = []
    for _ in range(depth):
        next_frontier: set[str] = set()
        for n in frontier:
            for neighbor in G.neighbors(n):
                if neighbor not in visited:
                    next_frontier.add(neighbor)
                    edges_seen.append((n, neighbor))
        visited.update(next_frontier)
        frontier = next_frontier

    lines = []
    for nid in visited:
        d = G.nodes[nid]
        lines.append(f"NODE {d.get('label', nid)} src={d.get('source_file', '')} loc={d.get('source_location', '')}")
    # Every recorded edge joins a frontier node (already visited) to a node
    # added to `visited` in the same round, so no membership re-check is needed.
    for u, v in edges_seen:
        d = edge_data(G, u, v)
        lines.append(f"EDGE {G.nodes[u].get('label', u)} --{d.get('relation', '')}--> {G.nodes[v].get('label', v)}")

    return _estimate_tokens("\n".join(lines))
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
# Default questions used by run_benchmark() when the caller supplies none.
_SAMPLE_QUESTIONS = [
    "how does authentication work",
    "what is the main entry point",
    "how are errors handled",
    "what connects the data layer to the api",
    "what are the core abstractions",
]
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def run_benchmark(
    graph_path: str = "graphify-out/graph.json",
    corpus_words: int | None = None,
    questions: list[str] | None = None,
) -> dict:
    """Measure token reduction: corpus tokens vs graphify query tokens.

    Args:
        graph_path: path to the built graph
        corpus_words: total word count from detect() output; if None, estimated from graph
        questions: list of questions to benchmark; defaults to _SAMPLE_QUESTIONS

    Returns:
        A dict with corpus_tokens, corpus_words, nodes, edges,
        avg_query_tokens, reduction_ratio and per_question, or a dict holding
        only an "error" message when no question matched any node.
    """
    from graphify.security import check_graph_file_size_cap

    graph_file = Path(graph_path)
    check_graph_file_size_cap(graph_file)
    payload = json.loads(graph_file.read_text(encoding="utf-8"))
    try:
        G = json_graph.node_link_graph(payload, edges="links")
    except TypeError:
        # Presumably a networkx release whose node_link_graph() has no
        # `edges` keyword; retry with its defaults.
        G = json_graph.node_link_graph(payload)

    if corpus_words is None:
        # No word count supplied: assume about 50 corpus words per graph node.
        corpus_words = G.number_of_nodes() * 50

    # words → tokens (100 words ≈ 133 tokens)
    corpus_tokens = corpus_words * 100 // 75

    per_question = []
    for question in questions or _SAMPLE_QUESTIONS:
        query_tokens = _query_subgraph_tokens(G, question)
        if query_tokens <= 0:
            # Nothing in the graph matched this question; leave it out.
            continue
        per_question.append(
            {
                "question": question,
                "query_tokens": query_tokens,
                "reduction": round(corpus_tokens / query_tokens, 1),
            }
        )

    if not per_question:
        return {"error": "No matching nodes found for sample questions. Build the graph first."}

    total_query_tokens = sum(entry["query_tokens"] for entry in per_question)
    avg_query_tokens = total_query_tokens // len(per_question)
    if avg_query_tokens > 0:
        reduction_ratio = round(corpus_tokens / avg_query_tokens, 1)
    else:
        reduction_ratio = 0

    return {
        "corpus_tokens": corpus_tokens,
        "corpus_words": corpus_words,
        "nodes": G.number_of_nodes(),
        "edges": G.number_of_edges(),
        "avg_query_tokens": avg_query_tokens,
        "reduction_ratio": reduction_ratio,
        "per_question": per_question,
    }
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
def print_benchmark(result: dict) -> None:
    """Print a human-readable benchmark report.

    Args:
        result: Dict returned by run_benchmark(). When it contains an "error"
            key only that message is printed.
    """
    if "error" in result:
        print(f"Benchmark error: {result['error']}")
        return

    corpus_words = result["corpus_words"]
    corpus_tokens = result["corpus_tokens"]

    print("\ngraphify token reduction benchmark")
    print(_hr(50))
    # Resolve the arrow only after the header lines, matching the order in
    # which stdout is consulted.
    arrow = _safe("→", "->")
    print(f" Corpus: {corpus_words:,} words {arrow} ~{corpus_tokens:,} tokens (naive)")
    print(f" Graph: {result['nodes']:,} nodes, {result['edges']:,} edges")
    print(f" Avg query cost: ~{result['avg_query_tokens']:,} tokens")
    print(f" Reduction: {result['reduction_ratio']}x fewer tokens per query")
    print("\n Per question:")
    for entry in result["per_question"]:
        # Questions are cut to 55 characters to keep each row short.
        shown_question = entry["question"][:55]
        print(f" [{entry['reduction']}x] {shown_question}")
    print()
|
package/skills/graphify/build.py
DELETED
|
@@ -1,487 +0,0 @@
|
|
|
1
|
-
# assemble node+edge dicts into a NetworkX graph, preserving edge direction
|
|
2
|
-
#
|
|
3
|
-
# Node deduplication — three layers:
|
|
4
|
-
#
|
|
5
|
-
# 1. Within a file (AST): each extractor tracks a `seen_ids` set. A node ID is
|
|
6
|
-
# emitted at most once per file, so duplicate class/function definitions in
|
|
7
|
-
# the same source file are collapsed to the first occurrence.
|
|
8
|
-
#
|
|
9
|
-
# 2. Between files (build): NetworkX G.add_node() is idempotent — calling it
|
|
10
|
-
# twice with the same ID overwrites the attributes with the second call's
|
|
11
|
-
# values. Nodes are added in extraction order (AST first, then semantic),
|
|
12
|
-
# so if the same entity is extracted by both passes the semantic node
|
|
13
|
-
# silently overwrites the AST node. This is intentional: semantic nodes
|
|
14
|
-
# carry richer labels and cross-file context, while AST nodes have precise
|
|
15
|
-
# source_location. If you need to change the priority, reorder extractions
|
|
16
|
-
# passed to build().
|
|
17
|
-
#
|
|
18
|
-
# 3. Semantic merge (skill): before calling build(), the skill merges cached
|
|
19
|
-
# and new semantic results using an explicit `seen` set keyed on node["id"],
|
|
20
|
-
# so duplicates across cache hits and new extractions are resolved there
|
|
21
|
-
# before any graph construction happens.
|
|
22
|
-
#
|
|
23
|
-
from __future__ import annotations
|
|
24
|
-
import json
|
|
25
|
-
import os
|
|
26
|
-
import re
|
|
27
|
-
import sys
|
|
28
|
-
import unicodedata
|
|
29
|
-
from pathlib import Path
|
|
30
|
-
import networkx as nx
|
|
31
|
-
from .validate import validate_extraction
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
# Synonym mapper for known invalid file_type values that LLM subagents commonly
# emit. Keeps semantic intent close (markdown→document, tool→code) and falls
# back to "concept" for any other invalid value (see #840).
_FILE_TYPE_SYNONYMS = {
    # document-like
    "markdown": "document",
    "text": "document",
    # code-like
    "tool": "code",
    "library": "code",
    # everything else listed here is treated as a concept
    "pattern": "concept",
    "principle": "concept",
    "constraint": "concept",
    "tech": "concept",
    "technology": "concept",
    "data-source": "concept",
    "data_source": "concept",
    "gotcha": "concept",
    "framework": "concept",
}
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def _normalize_id(s: str) -> str:
    r"""Normalize an ID string the same way extract._make_id does.

    Used to reconcile edge endpoints when the LLM generates IDs with slightly
    different punctuation or casing than the AST extractor. Must stay in sync
    with extract._make_id — NFKC normalization, \w with re.UNICODE, underscore
    collapse, and casefold must all match (#811).
    """
    folded = unicodedata.normalize("NFKC", s)
    # One pass: any run of non-word characters and/or underscores becomes a
    # single "_". This equals replacing non-word runs first and collapsing
    # repeated underscores afterwards.
    collapsed = re.sub(r"[\W_]+", "_", folded, flags=re.UNICODE)
    trimmed = collapsed.strip("_")
    return trimmed.casefold()
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
def _norm_source_file(p: str | None, root: str | None = None) -> str | None:
    """Normalize path separators and relativize absolute paths.

    Backslashes become forward slashes (Windows compatibility). When ``root``
    is given and the path is absolute, the ``root`` prefix is stripped so that
    source_file values produced by semantic subagents are repo-relative
    (fixes #932). Paths outside ``root`` are returned unchanged apart from the
    separator conversion; falsy input is returned as-is.
    """
    if not p:
        return p

    posix_style = p.replace("\\", "/")
    if not root or not os.path.isabs(posix_style):
        return posix_style

    try:
        return Path(posix_style).relative_to(root).as_posix()
    except ValueError:
        # Not located under root: keep the absolute path.
        return posix_style
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
def edge_data(G: nx.Graph, u: str, v: str) -> dict:
    """Return one edge attribute dict for (u, v), tolerating MultiGraph.

    A MultiGraph/MultiDiGraph can hold several parallel edges between the same
    pair of nodes; the first one is returned, which is sufficient for callers
    that only need relation/confidence for rendering. An empty dict is
    returned if the multi-edge mapping is empty. Fixes #796.
    """
    adjacency = G[u][v]
    if not isinstance(G, (nx.MultiGraph, nx.MultiDiGraph)):
        # Simple graphs store the attribute dict directly.
        return adjacency
    for attrs in adjacency.values():
        return attrs
    return {}
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
def edge_datas(G: nx.Graph, u: str, v: str) -> list[dict]:
    """Return every edge attribute dict for (u, v); always a list.

    Multigraphs yield one dict per parallel edge; simple graphs yield a
    one-element list holding the single attribute dict.
    """
    adjacency = G[u][v]
    is_multi = isinstance(G, (nx.MultiGraph, nx.MultiDiGraph))
    return list(adjacency.values()) if is_multi else [adjacency]
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
def build_from_json(extraction: dict, *, directed: bool = False, root: str | Path | None = None) -> nx.Graph:
    """Build a NetworkX graph from an extraction dict.

    directed=True produces a DiGraph that preserves edge direction (source→target).
    directed=False (default) produces an undirected Graph for backward compatibility.
    root: if given, absolute source_file paths from semantic subagents are made
    relative to root so all nodes share a consistent path key (#932).

    Steps, in order: remap a legacy "links" key to "edges"; canonicalize node
    fields ("source" → "source_file", default/synonym file_type); validate;
    add nodes; merge location-less duplicates into their located counterparts;
    add edges with ID reconciliation; attach hyperedges to ``G.graph``.

    Note: node and edge dicts inside ``extraction`` are canonicalized in place.
    """
    _root = str(Path(root).resolve()) if root else None
    # NetworkX <= 3.1 serialised edges as "links"; remap to "edges" for compatibility.
    if "edges" not in extraction and "links" in extraction:
        extraction = dict(extraction, edges=extraction["links"])

    # Canonicalize legacy node/edge schema before validation.
    for node in extraction.get("nodes", []):
        if not isinstance(node, dict):
            continue
        if "source" in node and "source_file" not in node:
            # Count edges that reference this node so the warning is actionable (#479)
            node_id = node.get("id", "?")
            affected_edges = sum(
                1 for e in extraction.get("edges", [])
                if e.get("source") == node_id or e.get("target") == node_id
            )
            print(
                f"[graphify] WARNING: node '{node_id}' uses field 'source' instead of "
                f"'source_file' — {affected_edges} edge(s) may be misrouted. "
                f"Rename the field to 'source_file' to silence this warning.",
                file=sys.stderr,
            )
            node["source_file"] = node.pop("source")
        # Default missing/None file_type to "concept" so legacy graph.json
        # entries (and stub nodes preserved by `_rebuild_code` from older
        # graphify versions that didn't always populate file_type) don't
        # trigger spurious "invalid file_type 'None'" validator warnings (#660).
        if node.get("file_type") in (None, ""):
            node["file_type"] = "concept"
        ft = node.get("file_type", "")
        if ft and ft not in {"code", "document", "paper", "image", "rationale", "concept"}:
            node["file_type"] = _FILE_TYPE_SYNONYMS.get(ft, "concept")

    errors = validate_extraction(extraction)
    # Dangling edges (stdlib/external imports) are expected - only warn about real schema errors.
    real_errors = [e for e in errors if "does not match any node id" not in e]
    if real_errors:
        print(f"[graphify] Extraction warning ({len(real_errors)} issues): {real_errors[0]}", file=sys.stderr)
    G: nx.Graph = nx.DiGraph() if directed else nx.Graph()
    for node in extraction.get("nodes", []):
        # Mirror the guard of the canonicalization pass: a non-dict entry
        # cannot become a node, so skip it instead of crashing on node["id"].
        if not isinstance(node, dict):
            continue
        if "source_file" in node:
            node["source_file"] = _norm_source_file(node["source_file"], _root)
        G.add_node(node["id"], **{k: v for k, v in node.items() if k != "id"})
    node_set = set(G.nodes())

    # #1145: merge semantic ghost-duplicate nodes into AST nodes.
    # When AST and semantic extractors emit different IDs for the same symbol
    # (one has source_location=L<n>, the other has source_location=None), find
    # pairs that share (source_file basename, label) and collapse the semantic
    # copy into the AST copy so edges re-point to a single node.
    # Two passes: first collect all AST (located) nodes, then find ghosts.
    _loc_nodes: dict[tuple[str, str], str] = {}  # (basename, label) -> AST node id
    _noloc_nodes: dict[tuple[str, str], str] = {}  # (basename, label) -> semantic node id
    for nid in node_set:
        attrs = G.nodes[nid]
        label = str(attrs.get("label", "")).strip()
        sf = str(attrs.get("source_file", ""))
        basename = Path(sf).name if sf else ""
        if not label or not basename:
            continue
        if attrs.get("source_location"):
            _loc_nodes[(basename, label)] = nid
    for nid in node_set:
        attrs = G.nodes[nid]
        label = str(attrs.get("label", "")).strip()
        sf = str(attrs.get("source_file", ""))
        basename = Path(sf).name if sf else ""
        if not label or not basename or attrs.get("source_location"):
            continue
        key = (basename, label)
        if key in _loc_nodes and _loc_nodes[key] != nid:
            _noloc_nodes[key] = nid
    # For every ghost that has an AST counterpart, record a remap.
    _ghost_remap: dict[str, str] = {}  # ghost_id -> canonical_id
    for key, sem_id in _noloc_nodes.items():
        ast_id = _loc_nodes.get(key)
        if ast_id is not None:
            _ghost_remap[sem_id] = ast_id
    # Remove ghost nodes from the graph; edges will be re-pointed via norm_to_id.
    for ghost_id in _ghost_remap:
        G.remove_node(ghost_id)
        node_set.discard(ghost_id)

    # Normalized ID map: lets edges survive when the LLM generates IDs with
    # slightly different casing or punctuation than the AST extractor.
    # e.g. "Session_ValidateToken" maps to "session_validatetoken".
    norm_to_id: dict[str, str] = {_normalize_id(nid): nid for nid in node_set}
    # Also map ghost IDs to their canonical AST replacements.
    for ghost_id, canonical_id in _ghost_remap.items():
        norm_to_id[_normalize_id(ghost_id)] = canonical_id
        norm_to_id[ghost_id] = canonical_id

    # File-extension → language-family table used below to drop cross-language
    # INFERRED `calls` edges. Built once here rather than once per edge.
    _LANG_FAMILY: dict[str, str] = {
        ".py": "py", ".pyi": "py",
        ".js": "js", ".mjs": "js", ".cjs": "js", ".jsx": "js",
        ".ts": "js", ".tsx": "js",
        ".go": "go", ".rs": "rs",
        ".java": "jvm", ".kt": "jvm", ".scala": "jvm", ".groovy": "jvm",
        ".c": "c", ".h": "c", ".cc": "cpp", ".cpp": "cpp", ".hpp": "cpp",
        ".rb": "rb", ".php": "php", ".cs": "cs", ".swift": "swift", ".lua": "lua",
    }

    # Iterate edges in a deterministic order. The graph is undirected and stores
    # direction in _src/_tgt; when two edges collapse onto the same node pair the
    # last write wins, so an unstable iteration order flips _src/_tgt run-to-run
    # and makes the serialized graph churn. Sorting fixes the last-write outcome.
    for edge in sorted(
        extraction.get("edges", []),
        key=lambda e: (
            str(e.get("source", e.get("from", ""))),
            str(e.get("target", e.get("to", ""))),
            str(e.get("relation", "")),
        ),
    ):
        if "source" not in edge and "from" in edge:
            edge["source"] = edge["from"]
        if "target" not in edge and "to" in edge:
            edge["target"] = edge["to"]
        if "source" not in edge or "target" not in edge:
            continue
        src, tgt = edge["source"], edge["target"]
        # Remap mismatched IDs via normalization before dropping the edge.
        if src not in node_set:
            src = norm_to_id.get(_normalize_id(src), src)
        if tgt not in node_set:
            tgt = norm_to_id.get(_normalize_id(tgt), tgt)
        if src not in node_set or tgt not in node_set:
            continue  # skip edges to external/stdlib nodes - expected, not an error
        attrs = {k: v for k, v in edge.items() if k not in ("source", "target")}
        if "source_file" in attrs:
            attrs["source_file"] = _norm_source_file(attrs["source_file"], _root)
        # Drop cross-language INFERRED `calls` edges — same short names (render,
        # parse, etc.) appear across language boundaries in multi-language chunks,
        # producing phantom edges that don't represent real call relationships.
        if attrs.get("relation") == "calls" and attrs.get("confidence") == "INFERRED":
            src_ext = Path(G.nodes[src].get("source_file") or "").suffix.lower()
            tgt_ext = Path(G.nodes[tgt].get("source_file") or "").suffix.lower()
            if src_ext and tgt_ext and _LANG_FAMILY.get(src_ext) != _LANG_FAMILY.get(tgt_ext):
                continue
        # Preserve original edge direction - undirected graphs lose it otherwise,
        # causing display functions to show edges backwards.
        attrs["_src"] = src
        attrs["_tgt"] = tgt
        # When the graph is undirected and the same node pair appears twice with
        # the same relation but opposite directions (e.g. a `calls` b and b `calls` a),
        # nx.Graph collapses them into one edge. The deterministic sort above means
        # the lexicographically-later direction would systematically overwrite the
        # earlier one's _src/_tgt, silently flipping the surviving edge's caller
        # and callee. First-seen direction wins instead — drop the redundant
        # reverse-direction duplicate so the original direction is preserved (#1061).
        if not G.is_directed() and G.has_edge(src, tgt):
            existing = edge_data(G, src, tgt)
            if existing.get("relation") == attrs.get("relation") and (
                existing.get("_src") == tgt and existing.get("_tgt") == src
            ):
                continue
        G.add_edge(src, tgt, **attrs)
    hyperedges = extraction.get("hyperedges", [])
    if hyperedges:
        G.graph["hyperedges"] = hyperedges
    return G
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
def build(
    extractions: list[dict],
    *,
    directed: bool = False,
    dedup: bool = True,
    dedup_llm_backend: str | None = None,
    root: str | Path | None = None,
) -> nx.Graph:
    """Combine several extraction results and build a single graph from them.

    The ``nodes``, ``edges`` and ``hyperedges`` lists of every extraction are
    concatenated in the order given, and the ``input_tokens`` /
    ``output_tokens`` counters are summed. The combined payload is then handed
    to ``build_from_json``.

    Args:
        extractions: extraction dicts; missing keys are treated as empty
            lists / zero counters.
        directed: forwarded to ``build_from_json``. Per the original
            documentation, True yields a DiGraph that keeps edge direction
            (source→target) and False (default) yields an undirected Graph
            for backward compatibility.
        dedup: when True (default) and at least one node was collected,
            ``deduplicate_entities`` is run on the combined nodes and edges
            before the graph is built.
        dedup_llm_backend: forwarded to ``deduplicate_entities``. Per the
            original documentation, if set (e.g. "gemini", "claude", or
            "kimi") an LLM resolves ambiguous pairs in the 75–92
            Jaro-Winkler score zone.
        root: forwarded to ``build_from_json``. Per the original
            documentation, absolute source_file paths are made relative to
            root (#932).

    Returns:
        Whatever ``build_from_json`` returns for the combined payload.

    Ordering note (from the original documentation): for nodes sharing an ID
    the last extraction's attributes win, so pass AST results before semantic
    results if semantic labels should take precedence, or reverse the order
    if AST source_location precision should win.
    """
    # Imported unconditionally, exactly like the original, so a missing
    # dedup module surfaces even when dedup=False.
    from graphify.dedup import deduplicate_entities

    list_fields = ("nodes", "edges", "hyperedges")
    counter_fields = ("input_tokens", "output_tokens")

    merged: dict = {name: [] for name in list_fields}
    merged.update({name: 0 for name in counter_fields})

    for extraction in extractions:
        for name in list_fields:
            merged[name].extend(extraction.get(name, []))
        for name in counter_fields:
            merged[name] += extraction.get(name, 0)

    if dedup and merged["nodes"]:
        unique_nodes, rewired_edges = deduplicate_entities(
            merged["nodes"],
            merged["edges"],
            communities={},
            dedup_llm_backend=dedup_llm_backend,
        )
        merged["nodes"] = unique_nodes
        merged["edges"] = rewired_edges

    return build_from_json(merged, directed=directed, root=root)
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
def _norm_label(label: str) -> str:
|
|
315
|
-
"""Canonical dedup key — Unicode-aware, preserves CJK/word characters."""
|
|
316
|
-
label = unicodedata.normalize("NFKC", label)
|
|
317
|
-
return re.sub(r"[\W_ ]+", " ", label.casefold(), flags=re.UNICODE).strip()
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
def deduplicate_by_label(nodes: list[dict], edges: list[dict]) -> tuple[list[dict], list[dict]]:
|
|
321
|
-
"""Merge nodes that share a normalised label, rewriting edge references.
|
|
322
|
-
|
|
323
|
-
Prefers IDs without chunk suffixes (_c\\d+) and shorter IDs when tied.
|
|
324
|
-
Drops self-loops created by the merge. Called in build() automatically.
|
|
325
|
-
"""
|
|
326
|
-
_CHUNK_SUFFIX = re.compile(r"_c\d+$")
|
|
327
|
-
canonical: dict[str, dict] = {} # norm_label -> surviving node
|
|
328
|
-
remap: dict[str, str] = {} # old_id -> surviving_id
|
|
329
|
-
|
|
330
|
-
for node in nodes:
|
|
331
|
-
key = _norm_label(node.get("label", node.get("id", "")))
|
|
332
|
-
if not key:
|
|
333
|
-
continue
|
|
334
|
-
existing = canonical.get(key)
|
|
335
|
-
if existing is None:
|
|
336
|
-
canonical[key] = node
|
|
337
|
-
else:
|
|
338
|
-
has_suffix = bool(_CHUNK_SUFFIX.search(node["id"]))
|
|
339
|
-
existing_has_suffix = bool(_CHUNK_SUFFIX.search(existing["id"]))
|
|
340
|
-
if has_suffix and not existing_has_suffix:
|
|
341
|
-
remap[node["id"]] = existing["id"]
|
|
342
|
-
elif existing_has_suffix and not has_suffix:
|
|
343
|
-
remap[existing["id"]] = node["id"]
|
|
344
|
-
canonical[key] = node
|
|
345
|
-
elif len(node["id"]) < len(existing["id"]):
|
|
346
|
-
remap[existing["id"]] = node["id"]
|
|
347
|
-
canonical[key] = node
|
|
348
|
-
else:
|
|
349
|
-
remap[node["id"]] = existing["id"]
|
|
350
|
-
|
|
351
|
-
if not remap:
|
|
352
|
-
return nodes, edges
|
|
353
|
-
|
|
354
|
-
print(f"[graphify] Deduplicated {len(remap)} duplicate node(s) by label.", file=sys.stderr)
|
|
355
|
-
deduped_nodes = list(canonical.values())
|
|
356
|
-
deduped_edges = []
|
|
357
|
-
for edge in edges:
|
|
358
|
-
e = dict(edge)
|
|
359
|
-
e["source"] = remap.get(e["source"], e["source"])
|
|
360
|
-
e["target"] = remap.get(e["target"], e["target"])
|
|
361
|
-
if e["source"] != e["target"]:
|
|
362
|
-
deduped_edges.append(e)
|
|
363
|
-
return deduped_nodes, deduped_edges
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
def build_merge(
    new_chunks: list[dict],
    graph_path: str | Path = "graphify-out/graph.json",
    prune_sources: list[str] | None = None,
    *,
    directed: bool = False,
    dedup: bool = True,
    dedup_llm_backend: str | None = None,
    root: str | Path | None = None,
) -> nx.Graph:
    """Rebuild the graph from an existing graph.json plus freshly extracted chunks.

    The nodes and edges stored at ``graph_path`` (if the file exists) are
    placed ahead of ``new_chunks`` and the whole list is passed to
    ``build``. Nodes and edges whose ``source_file`` matches an entry of
    ``prune_sources`` are then removed. The resulting graph is returned;
    this function itself does not write anything to ``graph_path``.

    Args:
        new_chunks: extraction dicts to merge on top of the stored graph.
        graph_path: location of the stored graph JSON.
        prune_sources: source files that were deleted; falsy entries are
            ignored when matching.
        directed, dedup, dedup_llm_backend, root: forwarded to ``build``.
            ``root`` is additionally used to normalise ``prune_sources``
            entries (#932, #1007).

    Returns:
        The merged (and possibly pruned) graph.

    Raises:
        ValueError: when the stored file exists, neither ``dedup`` nor
            ``prune_sources`` is active, and the merged graph has fewer
            nodes than the stored one (#479).
    """
    graph_path = Path(graph_path)
    existing_nodes: list = []
    base: list[dict] = []

    if graph_path.exists():
        # The JSON is read directly rather than through node_link_graph().
        # Per the original author's note (#760): rebuilding an undirected
        # nx.Graph and enumerating edges() yields endpoints by node insertion
        # order, which silently flips directional edges (e.g. `calls`) when
        # the callee was inserted before the caller, and the _src/_tgt
        # direction attrs are popped before saving in export.py, so that
        # round-trip would lose direction permanently.
        from graphify.security import check_graph_file_size_cap

        check_graph_file_size_cap(graph_path)
        data = json.loads(graph_path.read_text(encoding="utf-8"))
        links_key = "links" if "links" in data else "edges"
        existing_nodes = list(data.get("nodes", []))
        stored_edges = list(data.get(links_key, []))
        base = [{"nodes": existing_nodes, "edges": stored_edges}]

    G = build(
        base + list(new_chunks),
        directed=directed,
        dedup=dedup,
        dedup_llm_backend=dedup_llm_backend,
        root=root,
    )

    # Drop everything that came from deleted source files.
    if prune_sources:
        # Each entry is matched both verbatim (nodes that kept an absolute
        # source_file) and in the form produced by _norm_source_file (nodes
        # relativised at build time). The root is resolved first; per the
        # original note (#1007) this copes with symlinked roots and redundant
        # ".." / "./" segments so manifest absolute paths line up with the
        # graph's relative source_file values.
        resolved_root = None if root is None else str(Path(root).resolve())
        prune_set: set[str] = set()
        for deleted in prune_sources:
            if deleted:
                prune_set.add(deleted)
                relative = _norm_source_file(deleted, resolved_root)
                if relative:
                    prune_set.add(relative)

        doomed_nodes = [
            node_id
            for node_id, attrs in G.nodes(data=True)
            if attrs.get("source_file") in prune_set
        ]
        G.remove_nodes_from(doomed_nodes)
        n_files = len(prune_sources)
        n_nodes = len(doomed_nodes)
        if n_nodes:
            print(
                f"[graphify] Pruned {n_nodes} node(s) from {n_files} deleted source file(s).",
                file=sys.stderr,
            )

        # Edges are scanned after node removal, so only edges between
        # surviving nodes are considered here.
        doomed_edges = [
            (left, right)
            for left, right, attrs in G.edges(data=True)
            if attrs.get("source_file") in prune_set
        ]
        if doomed_edges:
            G.remove_edges_from(doomed_edges)
            print(
                f"[graphify] Pruned {len(doomed_edges)} edge(s) from deleted source file(s).",
                file=sys.stderr,
            )

        if not (n_nodes or doomed_edges):
            print(
                f"[graphify] {n_files} source file(s) deleted since last run — "
                f"no matching nodes or edges in graph, already clean.",
                file=sys.stderr,
            )

    # Safety net (#479): never shrink the graph silently. Skipped when dedup
    # or prune_sources is active, because shrinkage is intentional there.
    if graph_path.exists() and not (dedup or prune_sources):
        existing_n = len(existing_nodes)
        new_n = G.number_of_nodes()
        if new_n < existing_n:
            raise ValueError(
                f"graphify: build_merge would shrink graph from {existing_n} → {new_n} nodes. "
                f"Pass prune_sources explicitly if you intend to remove nodes."
            )

    return G
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
def prefix_graph_for_global(G: nx.Graph, repo_tag: str) -> nx.Graph:
    """Return a copy of G with all node IDs prefixed with ``repo_tag::``.

    Labels are preserved unchanged (for display). The 'repo' attribute is
    set to ``repo_tag`` on every node, and a 'local_id' attribute holding
    the un-prefixed ID (as a string) is added unless the node already has
    one. Edges follow the relabelled nodes.

    Args:
        G: graph to copy; it is relabelled with ``copy=True``.
        repo_tag: tag placed in front of every node ID and stored as 'repo'.

    Returns:
        The relabelled copy.
    """
    prefix = f"{repo_tag}::"
    relabel = {n: f"{prefix}{n}" for n in G.nodes}
    H = nx.relabel_nodes(G, relabel, copy=True)
    for node, data in H.nodes(data=True):
        data["repo"] = repo_tag
        # Strip the known prefix by length instead of splitting on the first
        # "::": a repo_tag that itself contains "::" (or ends with ":") would
        # otherwise leave part of the tag inside local_id.
        data.setdefault("local_id", node[len(prefix):])
    return H
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
def prune_repo_from_graph(G: nx.Graph, repo_tag: str) -> int:
    """Delete, in place, every node of G whose 'repo' attribute equals repo_tag.

    Returns:
        The number of nodes that were selected for removal.
    """
    doomed = []
    # Collect first, remove afterwards: the node view must not be mutated
    # while it is being iterated.
    for node_id, attrs in G.nodes(data=True):
        if attrs.get("repo") == repo_tag:
            doomed.append(node_id)
    G.remove_nodes_from(doomed)
    return len(doomed)
|