@oriro/orirocli 0.1.9 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. package/README.md +16 -18
  2. package/dist/cli.js +4776 -2964
  3. package/package.json +2 -2
  4. package/skills/craft/ai-engineering/SKILL.md +2 -2
  5. package/skills/graphify/SKILL.md +0 -619
  6. package/skills/graphify/__init__.py +0 -28
  7. package/skills/graphify/__main__.py +0 -4582
  8. package/skills/graphify/affected.py +0 -154
  9. package/skills/graphify/always_on/agents-md.md +0 -12
  10. package/skills/graphify/always_on/antigravity-rules.md +0 -14
  11. package/skills/graphify/always_on/claude-md.md +0 -9
  12. package/skills/graphify/always_on/gemini-md.md +0 -9
  13. package/skills/graphify/always_on/kiro-steering.md +0 -5
  14. package/skills/graphify/always_on/vscode-instructions.md +0 -17
  15. package/skills/graphify/analyze.py +0 -724
  16. package/skills/graphify/benchmark.py +0 -155
  17. package/skills/graphify/build.py +0 -487
  18. package/skills/graphify/cache.py +0 -417
  19. package/skills/graphify/callflow_html.py +0 -2020
  20. package/skills/graphify/cluster.py +0 -272
  21. package/skills/graphify/command-kilo.md +0 -15
  22. package/skills/graphify/dedup.py +0 -429
  23. package/skills/graphify/detect.py +0 -1379
  24. package/skills/graphify/diagnostics.py +0 -390
  25. package/skills/graphify/export.py +0 -1408
  26. package/skills/graphify/extract.py +0 -11570
  27. package/skills/graphify/global_graph.py +0 -159
  28. package/skills/graphify/google_workspace.py +0 -223
  29. package/skills/graphify/hooks.py +0 -457
  30. package/skills/graphify/ingest.py +0 -331
  31. package/skills/graphify/llm.py +0 -1896
  32. package/skills/graphify/manifest.py +0 -4
  33. package/skills/graphify/mcp_ingest.py +0 -392
  34. package/skills/graphify/multigraph_compat.py +0 -212
  35. package/skills/graphify/pg_introspect.py +0 -142
  36. package/skills/graphify/prs.py +0 -748
  37. package/skills/graphify/querylog.py +0 -70
  38. package/skills/graphify/report.py +0 -218
  39. package/skills/graphify/scip_ingest.py +0 -363
  40. package/skills/graphify/security.py +0 -336
  41. package/skills/graphify/semantic_cleanup.py +0 -319
  42. package/skills/graphify/serve.py +0 -1309
  43. package/skills/graphify/skill-aider.md +0 -1246
  44. package/skills/graphify/skill-amp.md +0 -613
  45. package/skills/graphify/skill-claw.md +0 -616
  46. package/skills/graphify/skill-codex.md +0 -613
  47. package/skills/graphify/skill-copilot.md +0 -616
  48. package/skills/graphify/skill-devin.md +0 -1372
  49. package/skills/graphify/skill-droid.md +0 -613
  50. package/skills/graphify/skill-kilo.md +0 -625
  51. package/skills/graphify/skill-kiro.md +0 -615
  52. package/skills/graphify/skill-opencode.md +0 -608
  53. package/skills/graphify/skill-pi.md +0 -615
  54. package/skills/graphify/skill-trae.md +0 -614
  55. package/skills/graphify/skill-vscode.md +0 -612
  56. package/skills/graphify/skill-windows.md +0 -651
  57. package/skills/graphify/skills/amp/references/add-watch.md +0 -56
  58. package/skills/graphify/skills/amp/references/exports.md +0 -71
  59. package/skills/graphify/skills/amp/references/extraction-spec.md +0 -68
  60. package/skills/graphify/skills/amp/references/github-and-merge.md +0 -46
  61. package/skills/graphify/skills/amp/references/hooks.md +0 -33
  62. package/skills/graphify/skills/amp/references/query.md +0 -249
  63. package/skills/graphify/skills/amp/references/transcribe.md +0 -48
  64. package/skills/graphify/skills/amp/references/update.md +0 -179
  65. package/skills/graphify/skills/claude/references/add-watch.md +0 -56
  66. package/skills/graphify/skills/claude/references/exports.md +0 -71
  67. package/skills/graphify/skills/claude/references/extraction-spec.md +0 -68
  68. package/skills/graphify/skills/claude/references/github-and-merge.md +0 -46
  69. package/skills/graphify/skills/claude/references/hooks.md +0 -33
  70. package/skills/graphify/skills/claude/references/query.md +0 -103
  71. package/skills/graphify/skills/claude/references/transcribe.md +0 -48
  72. package/skills/graphify/skills/claude/references/update.md +0 -179
  73. package/skills/graphify/skills/claw/references/add-watch.md +0 -56
  74. package/skills/graphify/skills/claw/references/exports.md +0 -71
  75. package/skills/graphify/skills/claw/references/extraction-spec.md +0 -29
  76. package/skills/graphify/skills/claw/references/github-and-merge.md +0 -46
  77. package/skills/graphify/skills/claw/references/hooks.md +0 -33
  78. package/skills/graphify/skills/claw/references/query.md +0 -249
  79. package/skills/graphify/skills/claw/references/transcribe.md +0 -48
  80. package/skills/graphify/skills/claw/references/update.md +0 -179
  81. package/skills/graphify/skills/codex/references/add-watch.md +0 -56
  82. package/skills/graphify/skills/codex/references/exports.md +0 -71
  83. package/skills/graphify/skills/codex/references/extraction-spec.md +0 -29
  84. package/skills/graphify/skills/codex/references/github-and-merge.md +0 -46
  85. package/skills/graphify/skills/codex/references/hooks.md +0 -33
  86. package/skills/graphify/skills/codex/references/query.md +0 -249
  87. package/skills/graphify/skills/codex/references/transcribe.md +0 -48
  88. package/skills/graphify/skills/codex/references/update.md +0 -179
  89. package/skills/graphify/skills/copilot/references/add-watch.md +0 -56
  90. package/skills/graphify/skills/copilot/references/exports.md +0 -71
  91. package/skills/graphify/skills/copilot/references/extraction-spec.md +0 -68
  92. package/skills/graphify/skills/copilot/references/github-and-merge.md +0 -46
  93. package/skills/graphify/skills/copilot/references/hooks.md +0 -33
  94. package/skills/graphify/skills/copilot/references/query.md +0 -249
  95. package/skills/graphify/skills/copilot/references/transcribe.md +0 -48
  96. package/skills/graphify/skills/copilot/references/update.md +0 -179
  97. package/skills/graphify/skills/droid/references/add-watch.md +0 -56
  98. package/skills/graphify/skills/droid/references/exports.md +0 -71
  99. package/skills/graphify/skills/droid/references/extraction-spec.md +0 -68
  100. package/skills/graphify/skills/droid/references/github-and-merge.md +0 -46
  101. package/skills/graphify/skills/droid/references/hooks.md +0 -33
  102. package/skills/graphify/skills/droid/references/query.md +0 -249
  103. package/skills/graphify/skills/droid/references/transcribe.md +0 -48
  104. package/skills/graphify/skills/droid/references/update.md +0 -179
  105. package/skills/graphify/skills/kilo/references/add-watch.md +0 -56
  106. package/skills/graphify/skills/kilo/references/exports.md +0 -71
  107. package/skills/graphify/skills/kilo/references/extraction-spec.md +0 -68
  108. package/skills/graphify/skills/kilo/references/github-and-merge.md +0 -46
  109. package/skills/graphify/skills/kilo/references/hooks.md +0 -33
  110. package/skills/graphify/skills/kilo/references/query.md +0 -249
  111. package/skills/graphify/skills/kilo/references/transcribe.md +0 -48
  112. package/skills/graphify/skills/kilo/references/update.md +0 -179
  113. package/skills/graphify/skills/kiro/references/add-watch.md +0 -56
  114. package/skills/graphify/skills/kiro/references/exports.md +0 -71
  115. package/skills/graphify/skills/kiro/references/extraction-spec.md +0 -29
  116. package/skills/graphify/skills/kiro/references/github-and-merge.md +0 -46
  117. package/skills/graphify/skills/kiro/references/hooks.md +0 -33
  118. package/skills/graphify/skills/kiro/references/query.md +0 -249
  119. package/skills/graphify/skills/kiro/references/transcribe.md +0 -48
  120. package/skills/graphify/skills/kiro/references/update.md +0 -179
  121. package/skills/graphify/skills/opencode/references/add-watch.md +0 -56
  122. package/skills/graphify/skills/opencode/references/exports.md +0 -71
  123. package/skills/graphify/skills/opencode/references/extraction-spec.md +0 -68
  124. package/skills/graphify/skills/opencode/references/github-and-merge.md +0 -46
  125. package/skills/graphify/skills/opencode/references/hooks.md +0 -33
  126. package/skills/graphify/skills/opencode/references/query.md +0 -249
  127. package/skills/graphify/skills/opencode/references/transcribe.md +0 -48
  128. package/skills/graphify/skills/opencode/references/update.md +0 -179
  129. package/skills/graphify/skills/pi/references/add-watch.md +0 -56
  130. package/skills/graphify/skills/pi/references/exports.md +0 -71
  131. package/skills/graphify/skills/pi/references/extraction-spec.md +0 -29
  132. package/skills/graphify/skills/pi/references/github-and-merge.md +0 -46
  133. package/skills/graphify/skills/pi/references/hooks.md +0 -33
  134. package/skills/graphify/skills/pi/references/query.md +0 -249
  135. package/skills/graphify/skills/pi/references/transcribe.md +0 -48
  136. package/skills/graphify/skills/pi/references/update.md +0 -179
  137. package/skills/graphify/skills/trae/references/add-watch.md +0 -56
  138. package/skills/graphify/skills/trae/references/exports.md +0 -71
  139. package/skills/graphify/skills/trae/references/extraction-spec.md +0 -68
  140. package/skills/graphify/skills/trae/references/github-and-merge.md +0 -46
  141. package/skills/graphify/skills/trae/references/hooks.md +0 -35
  142. package/skills/graphify/skills/trae/references/query.md +0 -249
  143. package/skills/graphify/skills/trae/references/transcribe.md +0 -48
  144. package/skills/graphify/skills/trae/references/update.md +0 -179
  145. package/skills/graphify/skills/vscode/references/add-watch.md +0 -56
  146. package/skills/graphify/skills/vscode/references/exports.md +0 -71
  147. package/skills/graphify/skills/vscode/references/extraction-spec.md +0 -68
  148. package/skills/graphify/skills/vscode/references/github-and-merge.md +0 -46
  149. package/skills/graphify/skills/vscode/references/hooks.md +0 -33
  150. package/skills/graphify/skills/vscode/references/query.md +0 -249
  151. package/skills/graphify/skills/vscode/references/transcribe.md +0 -48
  152. package/skills/graphify/skills/vscode/references/update.md +0 -179
  153. package/skills/graphify/skills/windows/references/add-watch.md +0 -56
  154. package/skills/graphify/skills/windows/references/exports.md +0 -71
  155. package/skills/graphify/skills/windows/references/extraction-spec.md +0 -68
  156. package/skills/graphify/skills/windows/references/github-and-merge.md +0 -46
  157. package/skills/graphify/skills/windows/references/hooks.md +0 -33
  158. package/skills/graphify/skills/windows/references/query.md +0 -249
  159. package/skills/graphify/skills/windows/references/transcribe.md +0 -48
  160. package/skills/graphify/skills/windows/references/update.md +0 -179
  161. package/skills/graphify/symbol_resolution.py +0 -538
  162. package/skills/graphify/transcribe.py +0 -184
  163. package/skills/graphify/tree_html.py +0 -582
  164. package/skills/graphify/validate.py +0 -72
  165. package/skills/graphify/watch.py +0 -898
  166. package/skills/graphify/wiki.py +0 -282
@@ -1,429 +0,0 @@
1
- """Entity deduplication pipeline for graphify knowledge graphs.
2
-
3
- Pipeline: exact normalization → entropy gate → MinHash/LSH blocking →
4
- Jaro-Winkler verification → same-community boost → union-find merge.
5
- """
6
- from __future__ import annotations
7
- import math
8
- import re
9
- import unicodedata
10
- from collections import defaultdict
11
-
12
- from datasketch import MinHash, MinHashLSH
13
- from rapidfuzz.distance import JaroWinkler
14
-
15
-
16
- # ── helpers ───────────────────────────────────────────────────────────────────
17
-
18
- def _norm(label: str) -> str:
19
- """Lowercase + collapse non-alphanumeric runs to space (Unicode-aware)."""
20
- label = unicodedata.normalize("NFKC", label)
21
- return re.sub(r"[\W_]+", " ", label.casefold(), flags=re.UNICODE).strip()
22
-
23
-
24
def _entropy(label: str) -> float:
    """Return the Shannon entropy, in bits per character, of the normalised label.

    The label is passed through ``_norm`` first; a label that normalises to
    the empty string has entropy 0.0.
    """
    text = _norm(label)
    if not text:
        return 0.0

    # Tally how often each character occurs (insertion-ordered).
    counts: dict[str, int] = {}
    for char in text:
        counts[char] = counts.get(char, 0) + 1

    total = len(text)
    probabilities = [count / total for count in counts.values()]
    return -sum(p * math.log2(p) for p in probabilities)
34
-
35
-
36
- def _shingles(text: str, k: int = 3) -> set[str]:
37
- """Return k-gram character shingles of text."""
38
- if len(text) < k:
39
- return {text}
40
- return {text[i : i + k] for i in range(len(text) - k + 1)}
41
-
42
-
43
def _make_minhash(text: str, num_perm: int = 128) -> MinHash:
    """Build a MinHash signature from the character shingles of *text*.

    Spaces are removed before shingling so that "graph extractor" and
    "graphextractor" share shingles.
    """
    signature = MinHash(num_perm=num_perm)
    compact = text.replace(" ", "")
    for gram in _shingles(compact):
        signature.update(gram.encode("utf-8"))
    return signature
49
-
50
-
51
- # Matches labels whose trailing token is a version/variant suffix:
52
- # digits optionally followed by letters (chip SKUs: ASR1603, M1, Cortex-A55)
53
- # or 2+ letters (codename revisions: cranelr vs cranel).
54
- # Requires the stem to end in a letter so plain words don't accidentally match.
55
- _VARIANT_SUFFIX = re.compile(r"^(.*[a-z])([0-9]+[a-z]*|[a-z]{2,})$")
56
-
57
-
58
- def _is_variant_pair(a: str, b: str) -> bool:
59
- """True if a and b are sibling model/SKU variants (same stem, different suffix).
60
-
61
- Only applied to short labels (< 12 chars); long labels go through JW normally.
62
- """
63
- if a == b:
64
- return False
65
- if max(len(a), len(b)) >= 12:
66
- return False
67
- ma, mb = _VARIANT_SUFFIX.match(a), _VARIANT_SUFFIX.match(b)
68
- if not (ma and mb):
69
- return False
70
- return ma.group(1) == mb.group(1) and ma.group(2) != mb.group(2)
71
-
72
-
73
- def _short_label_blocked(a: str, b: str, jw_score: float) -> bool:
74
- """Block fuzzy merge for short labels unless it's a same-length single-char substitution.
75
-
76
- Insertions/deletions on short strings (cranel/cranelr, M1/M1 Pro) produce
77
- high Jaro-Winkler scores due to the prefix bonus but are almost never true
78
- duplicates — they're abbreviations or variants.
79
- """
80
- if max(len(a), len(b)) >= 12:
81
- return False
82
- from rapidfuzz.distance import DamerauLevenshtein
83
- # Allow only same-length single-char substitutions (true typos like "Extractor"/"Extractar").
84
- # Block length-differing pairs regardless of score.
85
- if jw_score >= 97.0 and len(a) == len(b) and DamerauLevenshtein.distance(a, b) <= 1:
86
- return False
87
- return True
88
-
89
-
90
- # ── union-find ────────────────────────────────────────────────────────────────
91
-
92
- class _UF:
93
- def __init__(self) -> None:
94
- self._parent: dict[str, str] = {}
95
-
96
- def find(self, x: str) -> str:
97
- self._parent.setdefault(x, x)
98
- while self._parent[x] != x:
99
- self._parent[x] = self._parent[self._parent[x]]
100
- x = self._parent[x]
101
- return x
102
-
103
- def union(self, x: str, y: str) -> None:
104
- self._parent.setdefault(x, x)
105
- self._parent.setdefault(y, y)
106
- rx, ry = self.find(x), self.find(y)
107
- if rx != ry:
108
- self._parent[ry] = rx
109
-
110
- def components(self) -> dict[str, list[str]]:
111
- groups: dict[str, list[str]] = defaultdict(list)
112
- for x in self._parent:
113
- groups[self.find(x)].append(x)
114
- return dict(groups)
115
-
116
-
117
- # ── constants ─────────────────────────────────────────────────────────────────
118
-
119
# Minimum Shannon entropy (bits/char, as computed by _entropy) a label must
# reach before deduplicate_entities considers it for fuzzy matching.
_ENTROPY_THRESHOLD = 2.5
# Similarity threshold passed to MinHashLSH when blocking candidate pairs.
_LSH_THRESHOLD = 0.7
_MERGE_THRESHOLD = 92.0   # rapidfuzz normalized_similarity * 100
_COMMUNITY_BOOST = 5.0    # score bonus when both nodes share community
# Number of MinHash permutations passed to MinHashLSH; the default num_perm
# of _make_minhash has the same value and the two must stay in sync.
_NUM_PERM = 128
# Matches a trailing "_c<digits>" in a node id; _pick_winner prefers ids
# that do not carry this suffix.
_CHUNK_SUFFIX = re.compile(r"_c\d+$")
125
-
126
-
127
- # ── main entry point ──────────────────────────────────────────────────────────
128
-
129
def deduplicate_entities(
    nodes: list[dict],
    edges: list[dict],
    *,
    communities: dict[str, int],
    dedup_llm_backend: str | None = None,
) -> tuple[list[dict], list[dict]]:
    """Deduplicate near-identical entities in a knowledge graph.

    Pipeline:
      1. exact match on the normalised label, within one ``source_file``;
      2. MinHash/LSH blocking plus Jaro-Winkler scoring for labels whose
         entropy reaches ``_ENTROPY_THRESHOLD``;
      3. optional LLM tiebreaker for ambiguous pairs;
      4. union-find components are collapsed onto one survivor each and
         edges are rewired to the survivors.

    A one-line summary is printed to stdout when at least one node was
    merged.

    Args:
        nodes: list of node dicts with at minimum {"id": str, "label": str}
        edges: list of edge dicts with {"source": str, "target": str, ...};
            "from"/"to" are accepted as fallbacks for missing keys.
        communities: mapping of node_id -> community_id (from cluster())
        dedup_llm_backend: if set, use LLM to resolve ambiguous pairs

    Returns:
        (deduped_nodes, deduped_edges) with edges rewired to survivors.
        Self-loops created by a merge and edges lacking an endpoint are
        dropped. When nothing is merged the input edges are returned as-is.

    Raises:
        ValueError: if the nodes carry more than one distinct "repo" value.
    """
    # Guard: cross-project dedup is not supported — nodes from different repos
    # share label names by coincidence and must never be merged by string similarity.
    # If you need to dedup a global graph, run deduplicate_entities per-repo first.
    repos_seen = {n.get("repo") for n in nodes if n.get("repo")}
    if len(repos_seen) > 1:
        raise ValueError(
            f"deduplicate_entities: nodes span multiple repos {sorted(repos_seen)!r}. "
            f"Cross-project dedup is disabled — run dedup per-repo before merging."
        )

    if len(nodes) <= 1:
        return nodes, edges

    # Pre-deduplicate: keep the first occurrence of each non-empty id.
    seen_ids: dict[str, dict] = {}
    for node in nodes:
        nid = node.get("id", "")
        if nid and nid not in seen_ids:
            seen_ids[nid] = node
    unique_nodes = list(seen_ids.values())

    if len(unique_nodes) <= 1:
        return unique_nodes, edges

    # Normalise every label exactly once; all passes below reuse this table.
    norm_of: dict[str, str] = {
        nid: _norm(node.get("label", node.get("id", "")))
        for nid, node in seen_ids.items()
    }

    # ── pass 1: exact normalization ───────────────────────────────────────────
    norm_to_nodes: dict[str, list[dict]] = defaultdict(list)
    for node in unique_nodes:
        key = norm_of[node["id"]]
        if key:
            norm_to_nodes[key].append(node)

    uf = _UF()
    exact_merges = 0
    for group in norm_to_nodes.values():
        if len(group) <= 1:
            continue
        # Partition by source_file — only merge within the same file in Pass 1.
        by_file: dict[str, list[dict]] = defaultdict(list)
        for node in group:
            by_file[node.get("source_file") or ""].append(node)
        for sf, file_group in by_file.items():
            # No source_file — cannot prove same symbol; skip to avoid
            # collapsing distinct nodes that happen to share a label (#1178).
            if not sf or len(file_group) <= 1:
                continue
            winner = _pick_winner(file_group)
            for node in file_group:
                uf.union(winner["id"], node["id"])
            exact_merges += len(file_group) - 1

    # ── pass 2: MinHash/LSH + Jaro-Winkler (high-entropy nodes only) ─────────
    # One representative per normalised label (the first one seen).
    candidates: list[dict] = []
    seen_norms: set[str] = set()
    for node in unique_nodes:
        key = norm_of[node["id"]]
        if key and key not in seen_norms:
            seen_norms.add(key)
            if _entropy(node.get("label", "")) >= _ENTROPY_THRESHOLD:
                candidates.append(node)

    fuzzy_merges = 0
    if len(candidates) >= 2:
        # O(1) id -> node lookup instead of rescanning `candidates` per hit.
        candidate_by_id = {c["id"]: c for c in candidates}
        lsh = MinHashLSH(threshold=_LSH_THRESHOLD, num_perm=_NUM_PERM)
        minhashes: dict[str, MinHash] = {}

        for node in candidates:
            signature = _make_minhash(norm_of[node["id"]], num_perm=_NUM_PERM)
            minhashes[node["id"]] = signature
            try:
                lsh.insert(node["id"], signature)
            except ValueError:
                pass  # duplicate key in LSH — already inserted

        for node in candidates:
            node_id = node["id"]
            norm_label = norm_of[node_id]

            for neighbor_id in lsh.query(minhashes[node_id]):
                if neighbor_id == node_id:
                    continue
                if uf.find(node_id) == uf.find(neighbor_id):
                    continue

                neighbor = candidate_by_id.get(neighbor_id)
                if neighbor is None:
                    continue

                neighbor_norm = norm_of[neighbor_id]
                score = JaroWinkler.normalized_similarity(norm_label, neighbor_norm) * 100

                if _is_variant_pair(norm_label, neighbor_norm):
                    continue
                if _short_label_blocked(norm_label, neighbor_norm, score):
                    continue

                c1 = communities.get(node_id)
                c2 = communities.get(neighbor_id)
                if (c1 is not None and c2 is not None and c1 == c2
                        and min(len(norm_label), len(neighbor_norm)) >= 12):
                    score += _COMMUNITY_BOOST

                if score < _MERGE_THRESHOLD:
                    continue

                # Identical labels across different source files almost always
                # means same-named-but-different symbols (trait impls, wrapper
                # methods, common type names). Mirror Pass 1's source_file
                # partition for this sub-case. (#1046, leaks #895's fix)
                if norm_label == neighbor_norm:
                    sf_a = node.get("source_file") or ""
                    sf_b = neighbor.get("source_file") or ""
                    if sf_a != sf_b:
                        continue

                # Unite exactly the matched pair. Picking a "winner" from every
                # node that shares either label could pull a same-label node
                # from another file into the component, undoing the Pass 1
                # partition. The survivor is chosen per component below.
                uf.union(node_id, neighbor_id)
                fuzzy_merges += 1

    # ── pass 3: LLM tiebreaker for ambiguous pairs (opt-in) ──────────────────
    if dedup_llm_backend is not None:
        _llm_tiebreak(candidates, uf, communities, backend=dedup_llm_backend)

    # ── build remap table from union-find components ──────────────────────────
    # Members are visited in input order so ties in _pick_winner resolve the
    # same way as a scan over unique_nodes would.
    position = {nid: index for index, nid in enumerate(seen_ids)}
    remap: dict[str, str] = {}

    for root, members in uf.components().items():
        if len(members) == 1:
            continue
        known = sorted((m for m in members if m in seen_ids), key=position.__getitem__)
        group_nodes = [seen_ids[m] for m in known]
        winner = _pick_winner(group_nodes) if group_nodes else {"id": root}
        winner_id = winner["id"]
        for member in members:
            if member != winner_id:
                remap[member] = winner_id

    # ── apply remap ───────────────────────────────────────────────────────────
    if not remap:
        return unique_nodes, edges

    # Report each non-zero merge kind, so fuzzy-only runs show a breakdown too.
    breakdown = []
    if exact_merges:
        breakdown.append(f"{exact_merges} exact")
    if fuzzy_merges:
        breakdown.append(f"{fuzzy_merges} fuzzy")
    msg = f"[graphify] Deduplicated {len(remap)} node(s)"
    if breakdown:
        msg += f" ({', '.join(breakdown)})"
    print(msg + ".", flush=True)

    deduped_nodes = [n for n in unique_nodes if n["id"] not in remap]
    deduped_edges = []
    for edge in edges:
        e = dict(edge)
        # Tolerate "from"/"to" keys from LLM backends that don't follow the
        # schema exactly — build_from_json normalises later but dedup runs
        # first so bracket access would KeyError here (#803).
        # Use explicit key presence check (not `or`) so empty-string src/tgt
        # aren't silently replaced by the fallback key.
        src = e["source"] if "source" in e else e.get("from")
        tgt = e["target"] if "target" in e else e.get("to")
        if src is None or tgt is None:
            continue
        e["source"] = remap.get(src, src)
        e["target"] = remap.get(tgt, tgt)
        # Remove legacy keys so they don't leak into edge attrs in graph.json.
        e.pop("from", None)
        e.pop("to", None)
        if e["source"] != e["target"]:
            deduped_edges.append(e)

    return deduped_nodes, deduped_edges
323
-
324
-
325
- def _pick_winner(nodes: list[dict]) -> dict:
326
- """Pick the canonical survivor: prefer no chunk suffix, then shorter ID."""
327
- if not nodes:
328
- raise ValueError("Cannot pick winner from empty list")
329
-
330
- def _score(n: dict) -> tuple[int, int]:
331
- has_suffix = bool(_CHUNK_SUFFIX.search(n["id"]))
332
- return (1 if has_suffix else 0, len(n["id"]))
333
-
334
- return min(nodes, key=_score)
335
-
336
-
337
- def _llm_tiebreak(
338
- candidates: list[dict],
339
- uf: _UF,
340
- communities: dict[str, int],
341
- *,
342
- backend: str,
343
- batch_size: int = 30,
344
- low: float = 75.0,
345
- high: float = 92.0,
346
- ) -> None:
347
- """Batch-resolve ambiguous pairs (score in [low, high)) via LLM."""
348
- try:
349
- from graphify.llm import BACKENDS, _format_backend_env_keys, _get_backend_api_key
350
- if backend not in BACKENDS:
351
- print(f"[graphify] --dedup-llm: unknown backend {backend!r}, skipping LLM tiebreaker.", flush=True)
352
- return
353
- if not _get_backend_api_key(backend):
354
- env_keys = _format_backend_env_keys(backend)
355
- print(f"[graphify] --dedup-llm: {env_keys} not set, skipping LLM tiebreaker.", flush=True)
356
- return
357
- except ImportError:
358
- return
359
-
360
- ambiguous: list[tuple[dict, dict, float]] = []
361
- for i, node in enumerate(candidates):
362
- norm_i = _norm(node.get("label", node.get("id", "")))
363
- for j in range(i + 1, len(candidates)):
364
- neighbor = candidates[j]
365
- if uf.find(node["id"]) == uf.find(neighbor["id"]):
366
- continue
367
- norm_j = _norm(neighbor.get("label", neighbor.get("id", "")))
368
- score = JaroWinkler.normalized_similarity(norm_i, norm_j) * 100
369
- if _is_variant_pair(norm_i, norm_j):
370
- continue
371
- if _short_label_blocked(norm_i, norm_j, score):
372
- continue
373
- c1 = communities.get(node["id"])
374
- c2 = communities.get(neighbor["id"])
375
- if (c1 is not None and c2 is not None and c1 == c2
376
- and min(len(norm_i), len(norm_j)) >= 12):
377
- score += _COMMUNITY_BOOST
378
- if low <= score < high:
379
- ambiguous.append((node, neighbor, score))
380
-
381
- if not ambiguous:
382
- return
383
-
384
- try:
385
- from graphify.llm import _call_llm
386
- except ImportError as exc:
387
- # F-038: previously this silent fallback hid the fact that `_call_llm`
388
- # didn't exist in `graphify.llm` at all, so `--dedup-llm` was a no-op.
389
- # Surface the import failure so future regressions are visible.
390
- print(
391
- f"[graphify] --dedup-llm: cannot import _call_llm ({exc}); skipping LLM tiebreaker.",
392
- flush=True,
393
- )
394
- return
395
-
396
- for batch_start in range(0, len(ambiguous), batch_size):
397
- batch = ambiguous[batch_start : batch_start + batch_size]
398
- pairs_text = "\n".join(
399
- f"{i+1}. \"{a['label']}\" vs \"{b['label']}\""
400
- for i, (a, b, _) in enumerate(batch)
401
- )
402
- prompt = (
403
- "For each pair below, answer only 'yes' or 'no': are they the same real-world concept?\n\n"
404
- f"{pairs_text}\n\n"
405
- "Reply with one line per pair: '1. yes', '2. no', etc."
406
- )
407
- try:
408
- response = _call_llm(prompt, backend=backend, max_tokens=200)
409
- lines = response.strip().splitlines()
410
- for line in lines:
411
- line = line.strip()
412
- if not line:
413
- continue
414
- parts = line.split(".", 1)
415
- if len(parts) != 2:
416
- continue
417
- try:
418
- idx = int(parts[0].strip()) - 1
419
- except ValueError:
420
- continue
421
- if 0 <= idx < len(batch):
422
- answer = parts[1].strip().lower()
423
- if answer.startswith("yes"):
424
- a, b, _ = batch[idx]
425
- winner = _pick_winner([a, b])
426
- uf.union(winner["id"], a["id"])
427
- uf.union(winner["id"], b["id"])
428
- except Exception as exc:
429
- print(f"[graphify] --dedup-llm batch failed: {exc}", flush=True)