ultimate-pi 0.2.4 → 0.2.6

Files changed (37)
  1. package/.pi/PACKAGING.md +35 -0
  2. package/.pi/extensions/lib/harness-paths.ts +8 -0
  3. package/.pi/extensions/sentrux-rules-sync.ts +2 -8
  4. package/.pi/harness/docs/adrs/0006-sentrux-dual-layer.md +1 -1
  5. package/.pi/harness/docs/adrs/0009-sentrux-rules-lifecycle.md +2 -2
  6. package/.pi/harness/sentrux/architecture.manifest.json +3 -3
  7. package/.pi/prompts/harness-setup.md +61 -24
  8. package/.pi/scripts/README.md +17 -0
  9. package/{scripts → .pi/scripts}/harness-verify.mjs +3 -3
  10. package/{scripts → .pi/scripts}/sentrux-rules-sync.mjs +2 -2
  11. package/.pi/{settings.json → settings.example.json} +1 -1
  12. package/.sentrux/.harness-rules-meta.json +2 -2
  13. package/.sentrux/rules.toml +3 -3
  14. package/CHANGELOG.md +17 -0
  15. package/package.json +47 -8
  16. package/.ckignore +0 -41
  17. package/.codex/hooks.json +0 -15
  18. package/.env.example +0 -21
  19. package/.gitattributes +0 -1
  20. package/.github/banner-v2.png +0 -0
  21. package/.github/workflows/lint.yml +0 -33
  22. package/.github/workflows/publish-github-packages.yml +0 -35
  23. package/.github/workflows/publish-npm.yml +0 -32
  24. package/.pi/harness/browser.json +0 -1
  25. package/.pi/harness/router/README.md +0 -35
  26. package/.pi/harness/router/apply-router-proposal.mjs +0 -153
  27. package/.pi/harness/router/propose-router-tuning.mjs +0 -149
  28. package/.pi/npm/.gitignore +0 -2
  29. package/CONTRIBUTING.md +0 -166
  30. package/lefthook.yml +0 -9
  31. package/scripts/__pycache__/merge_graphify_corpora.cpython-314.pyc +0 -0
  32. package/scripts/index_youtube_urls.py +0 -376
  33. package/scripts/merge_graphify_corpora.py +0 -398
  34. package/scripts/regen_graphify_html.py +0 -46
  35. package/test/harness-verify.test.mjs +0 -33
  36. package/{scripts → .pi/scripts}/harness-cli-verify.sh +0 -0
  37. package/{scripts → .pi/scripts}/harness-graphify-bootstrap.sh +0 -0
package/scripts/merge_graphify_corpora.py (deleted)
@@ -1,398 +0,0 @@
-#!/usr/bin/env python3
-"""
-Merge graphify-out with optional graphify-books-out and graphify-yt-transcripts-out into graphify-out.
-
-(Books/YouTube dirs were removed after a successful one-time merge; restore them from git to re-run.)
-
-- Prefixes all book and YouTube node IDs to avoid collisions and preserve provenance.
-- Merges hyperedges (normalizing books' member_nodes -> nodes).
-- Adds cross-corpus INFERRED edges via token overlap / Jaccard on normalized labels.
-- Re-clusters with graphify, writes graph.json, GRAPH_REPORT.md, analysis, labels, and graph.html (full viz via explicit node_limit).
-"""
-from __future__ import annotations
-
-import json
-import re
-import shutil
-import sys
-from collections import defaultdict
-from datetime import datetime, timezone
-from pathlib import Path
-
-import networkx as nx
-from networkx.readwrite import json_graph
-
-from graphify.analyze import god_nodes, surprising_connections, suggest_questions
-from graphify.cluster import cluster, score_all
-from graphify.export import to_html, to_json
-from graphify.report import generate
-
-ROOT = Path(__file__).resolve().parents[1]
-OUT = ROOT / "graphify-out"
-MAIN_JSON = ROOT / "graphify-out" / "graph.json"
-BOOKS_JSON = ROOT / "graphify-books-out" / "graph.json"
-YT_JSON = ROOT / "graphify-yt-transcripts-out" / "graph.json"
-YT_SEM = ROOT / "graphify-yt-transcripts-out" / "semantic_extraction.json"
-
-BOOK_PREFIX = "books__"
-YT_PREFIX = "yt__"
-
-
-def _norm_tokens(text: str) -> set[str]:
-    s = re.sub(r"[^a-z0-9\s]", " ", (text or "").lower())
-    return {t for t in s.split() if len(t) > 2}
-
-
-def load_node_link(path: Path) -> nx.Graph:
-    data = json.loads(path.read_text(encoding="utf-8"))
-    return json_graph.node_link_graph(data, edges="links")
-
-
-def load_youtube_nx(path: Path) -> nx.Graph:
-    data = json.loads(path.read_text(encoding="utf-8"))
-    G = nx.Graph()
-    for n in data.get("nodes", []):
-        nid = n["id"]
-        attrs = {k: v for k, v in n.items() if k != "id"}
-        if "source_file" not in attrs or attrs["source_file"] in (None, ""):
-            attrs["source_file"] = "graphify-yt-transcripts-out/transcripts"
-        if "file_type" not in attrs:
-            attrs["file_type"] = "document"
-        G.add_node(nid, **attrs)
-    for e in data.get("edges", []):
-        u, v = e["source"], e["target"]
-        if u not in G or v not in G:
-            continue
-        ed = {k: v for k, v in e.items() if k not in ("source", "target")}
-        G.add_edge(u, v, **ed)
-    return G
-
-
-def prefix_graph(G: nx.Graph, prefix: str) -> tuple[nx.Graph, dict[str, str]]:
-    """Return new graph with prefixed node ids; mapping old_id -> new_id."""
-    mapping = {n: f"{prefix}{n}" for n in G.nodes()}
-    H = nx.relabel_nodes(G, mapping, copy=True)
-    return H, mapping
-
-
-def strip_community(G: nx.Graph) -> None:
-    for _, d in G.nodes(data=True):
-        d.pop("community", None)
-
-
-def collect_hyperedges_main(data: dict) -> list[dict]:
-    g = data.get("graph") or {}
-    return list(g.get("hyperedges") or [])
-
-
-def collect_hyperedges_books(data: dict, id_map: dict[str, str]) -> list[dict]:
-    out: list[dict] = []
-    for h in (data.get("graph") or {}).get("hyperedges") or []:
-        members = h.get("member_nodes") or h.get("nodes") or []
-        remapped = [id_map[m] for m in members if m in id_map]
-        if len(remapped) < 2:
-            continue
-        h2 = dict(h)
-        h2["nodes"] = remapped
-        h2.pop("member_nodes", None)
-        if "label" not in h2 and h2.get("description"):
-            h2["label"] = str(h2["description"])[:200]
-        if "relation" not in h2:
-            h2["relation"] = "participate_in"
-        if "confidence" not in h2:
-            h2["confidence"] = "INFERRED"
-        if "confidence_score" not in h2:
-            h2["confidence_score"] = 0.7
-        out.append(h2)
-    return out
-
-
-def collect_hyperedges_yt(semantic: dict, id_map: dict[str, str]) -> list[dict]:
-    out: list[dict] = []
-    for h in semantic.get("hyperedges") or []:
-        nodes = h.get("nodes") or []
-        remapped = [id_map[n] for n in nodes if n in id_map]
-        if len(remapped) < 2:
-            continue
-        h2 = dict(h)
-        h2["nodes"] = remapped
-        out.append(h2)
-    return out
-
-
-def build_token_index(G: nx.Graph) -> tuple[dict[str, set[str]], dict[str, str]]:
-    """node_id -> tokens, node_id -> display string for matching."""
-    tokens: dict[str, set[str]] = {}
-    labels: dict[str, str] = {}
-    for nid, d in G.nodes(data=True):
-        lab = d.get("norm_label") or d.get("label") or str(nid)
-        labels[nid] = lab if isinstance(lab, str) else str(lab)
-        tokens[nid] = _norm_tokens(labels[nid])
-    return tokens, labels
-
-
-def add_cross_corpus_edges(
-    G: nx.Graph,
-    parts: list[tuple[str, nx.Graph, dict[str, set[str]], dict[str, str]]],
-    *,
-    max_edges: int = 12000,
-    min_jaccard: float = 0.32,
-    min_shared: int = 2,
-    max_per_target_corpus: int = 2,
-) -> int:
-    """
-    parts: (name, subgraph, tokens_map, labels_map) for each corpus.
-    Adds INFERRED semantically_similar_to edges only between different corpora (id prefix).
-    """
-    inverted: dict[str, list[tuple[str, str]]] = defaultdict(list)
-    for corpus, _Sg, tok_map, _lab in parts:
-        for nid, toks in tok_map.items():
-            for t in toks:
-                inverted[t].append((corpus, nid))
-
-    token_maps = {name: tm for name, _Sg, tm, _ in parts}
-    def corpus_of(nid: str) -> str:
-        if nid.startswith(BOOK_PREFIX):
-            return "books"
-        if nid.startswith(YT_PREFIX):
-            return "yt"
-        return "main"
-
-    existing = {frozenset((u, v)) for u, v in G.edges()}
-    added = 0
-
-    for corpus_a, _Ga, tok_a, _lab_a in parts:
-        for u, tu in tok_a.items():
-            if not tu:
-                continue
-            cand: set[str] = set()
-            for t in tu:
-                for corp_b, v in inverted[t]:
-                    if corp_b == corpus_a:
-                        continue
-                    if corpus_of(u) == corpus_of(v):
-                        continue
-                    cand.add(v)
-
-            scored: list[tuple[float, str]] = []
-            for v in cand:
-                tv = None
-                for name in token_maps:
-                    if v in token_maps[name]:
-                        tv = token_maps[name][v]
-                        break
-                if not tv:
-                    continue
-                inter = len(tu & tv)
-                if inter < min_shared:
-                    continue
-                union = len(tu | tv) or 1
-                j = inter / union
-                if j < min_jaccard:
-                    continue
-                scored.append((j, v))
-
-            scored.sort(reverse=True)
-            tgt_corpus_count: dict[str, int] = defaultdict(int)
-            for j, v in scored:
-                if added >= max_edges:
-                    return added
-                cb = corpus_of(v)
-                if tgt_corpus_count[cb] >= max_per_target_corpus:
-                    continue
-                pair = frozenset((u, v))
-                if pair in existing:
-                    continue
-                existing.add(pair)
-                tgt_corpus_count[cb] += 1
-                rationale = f"cross_corpus token overlap jaccard={j:.2f}"
-                G.add_edge(
-                    u,
-                    v,
-                    relation="semantically_similar_to",
-                    confidence="INFERRED",
-                    confidence_score=min(0.95, 0.55 + 0.4 * j),
-                    source_file="graphify_merge/cross_corpus",
-                    source_location=f"{corpus_a}->{cb}",
-                    weight=1.0,
-                    rationale=rationale[:500],
-                )
-                added += 1
-    return added
-
-
-def auto_community_labels(
-    G: nx.Graph, communities: dict[int, list[str]]
-) -> dict[int, str]:
-    """Short names from highest-degree node labels in each community."""
-    deg = dict(G.degree())
-    out: dict[int, str] = {}
-    for cid, members in communities.items():
-        ranked = sorted(members, key=lambda n: deg.get(n, 0), reverse=True)
-        bits: list[str] = []
-        seen_words: set[str] = set()
-        for nid in ranked[:12]:
-            lab = G.nodes[nid].get("label") or nid
-            if not isinstance(lab, str):
-                lab = str(lab)
-            # shorten
-            short = lab.strip()
-            if len(short) > 42:
-                short = short[:39] + "…"
-            w = _norm_tokens(short)
-            if not w:
-                continue
-            if short and short not in bits:
-                bits.append(short)
-                seen_words |= w
-            if len(bits) >= 3:
-                break
-        if bits:
-            name = " · ".join(bits[:3])
-        else:
-            name = f"Community {cid}"
-        if len(name) > 90:
-            name = name[:87] + "…"
-        out[cid] = name
-    return out
-
-
-def polish_labels(labels: dict[int, str], G: nx.Graph, communities: dict[int, list[str]]) -> dict[int, str]:
-    """Short-circuit noisy labels from ingested graph-report summary nodes."""
-    out = dict(labels)
-    for cid, name in list(out.items()):
-        nlow = name.lower()
-        if "graph report" in nlow and "communities" in nlow:
-            out[cid] = "Ingested graph-report hubs (books merge artifact)"
-        elif "communities (" in nlow and "thin omitted" in nlow:
-            out[cid] = "Book community index nodes (metadata)"
-    return out
-
-
-def main() -> None:
-    for p in (BOOKS_JSON, YT_JSON):
-        if not p.exists():
-            print(
-                f"Missing {p}. Books/YouTube graphs were merged into graphify-out and "
-                "the source dirs were removed; restore graphify-books-out/ and "
-                "graphify-yt-transcripts-out/ from git (or a backup) to re-run this merge.",
-                file=sys.stderr,
-            )
-            raise SystemExit(1)
-
-    ts = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
-    backup = OUT / f"graph.json.pre-merge-{ts}.bak"
-    if MAIN_JSON.exists():
-        shutil.copy2(MAIN_JSON, backup)
-        print(f"Backed up graph.json -> {backup.name}")
-
-    raw_main = json.loads(MAIN_JSON.read_text(encoding="utf-8"))
-    raw_books = json.loads(BOOKS_JSON.read_text(encoding="utf-8"))
-
-    G_main = load_node_link(MAIN_JSON)
-    G_books = load_node_link(BOOKS_JSON)
-    G_yt = load_youtube_nx(YT_JSON)
-
-    strip_community(G_main)
-    strip_community(G_books)
-    strip_community(G_yt)
-
-    G_books_p, map_b = prefix_graph(G_books, BOOK_PREFIX)
-    G_yt_p, map_y = prefix_graph(G_yt, YT_PREFIX)
-
-    G = nx.compose_all([G_main, G_books_p, G_yt_p])
-
-    hyper: list[dict] = []
-    hyper += collect_hyperedges_main(raw_main)
-    hyper += collect_hyperedges_books(raw_books, map_b)
-    if YT_SEM.exists():
-        sem = json.loads(YT_SEM.read_text(encoding="utf-8"))
-        hyper += collect_hyperedges_yt(sem, map_y)
-    G.graph["hyperedges"] = hyper
-    print(f"Merged hyperedges: {len(hyper)}")
-
-    parts = []
-    for name, sub in (
-        ("main", G_main),
-        ("books", G_books_p),
-        ("yt", G_yt_p),
-    ):
-        tm, lm = build_token_index(sub)
-        parts.append((name, sub, tm, lm))
-
-    n_cross = add_cross_corpus_edges(G, parts)
-    print(f"Cross-corpus edges added: {n_cross}")
-    print(f"Combined graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
-
-    communities = cluster(G)
-    cohesion = score_all(G, communities)
-    gods = god_nodes(G)
-    surprises = surprising_connections(G, communities)
-
-    labels = polish_labels(auto_community_labels(G, communities), G, communities)
-    questions = suggest_questions(G, communities, labels)
-
-    detection = {
-        "total_files": 0,
-        "total_words": 0,
-        "needs_graph": True,
-        "warning": None,
-        "files": {"paper": [], "code": [], "document": [], "image": [], "video": []},
-        "skipped_sensitive": [],
-        "graphifyignore_patterns": 0,
-    }
-    tokens = {"input": 0, "output": 0}
-
-    report = generate(
-        G,
-        communities,
-        cohesion,
-        labels,
-        gods,
-        surprises,
-        detection,
-        tokens,
-        str(ROOT),
-        suggested_questions=questions,
-    )
-    OUT.mkdir(parents=True, exist_ok=True)
-    (OUT / "GRAPH_REPORT.md").write_text(report, encoding="utf-8")
-
-    ok = to_json(G, communities, str(OUT / "graph.json"), force=True)
-    if not ok:
-        raise SystemExit("to_json refused to write; check stderr")
-
-    analysis = {
-        "communities": {str(k): v for k, v in communities.items()},
-        "cohesion": {str(k): v for k, v in cohesion.items()},
-        "gods": gods,
-        "surprises": surprises,
-        "questions": questions,
-        "merge_meta": {
-            "merged_at": datetime.now(timezone.utc).isoformat(),
-            "sources": ["graphify-out", "graphify-books-out", "graphify-yt-transcripts-out"],
-            "cross_corpus_edges": n_cross,
-            "hyperedges": len(hyper),
-        },
-    }
-    (OUT / ".graphify_analysis.json").write_text(
-        json.dumps(analysis, indent=2), encoding="utf-8"
-    )
-    (OUT / ".graphify_labels.json").write_text(
-        json.dumps({str(k): v for k, v in labels.items()}, indent=2),
-        encoding="utf-8",
-    )
-
-    n = G.number_of_nodes()
-    to_html(
-        G,
-        communities,
-        str(OUT / "graph.html"),
-        community_labels=labels,
-        node_limit=n,
-    )
-    print(f"Wrote graph.html ({n} nodes, node_limit=n for graphify viz cap)")
-
-
-if __name__ == "__main__":
-    main()
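The cross-corpus linking rule above is easy to miss inside the loop nesting: `add_cross_corpus_edges` connects nodes from different corpora only when their normalized labels share at least `min_shared=2` tokens and clear a Jaccard floor of 0.32, then maps the Jaccard score linearly onto the edge's `confidence_score`. A minimal standalone sketch of that gate, with two invented labels for illustration:

```python
import re

def norm_tokens(text: str) -> set[str]:
    # Same normalization as _norm_tokens above: lowercase, drop punctuation,
    # keep only tokens longer than two characters.
    s = re.sub(r"[^a-z0-9\s]", " ", (text or "").lower())
    return {t for t in s.split() if len(t) > 2}

# Hypothetical labels from two different corpora (invented for illustration).
a = norm_tokens("Retrieval augmented generation pipelines")
b = norm_tokens("Evaluating retrieval-augmented generation")

inter = len(a & b)             # shared tokens: 3
j = inter / (len(a | b) or 1)  # Jaccard: 3 / 5 = 0.6

# Same gates as add_cross_corpus_edges (min_shared=2, min_jaccard=0.32).
if inter >= 2 and j >= 0.32:
    score = min(0.95, 0.55 + 0.4 * j)  # INFERRED edge confidence -> 0.79
    print(f"link: jaccard={j:.2f} confidence_score={score:.2f}")
```

Here three of five unique tokens overlap, so the pair passes both gates and would receive a `semantically_similar_to` edge with confidence 0.79; the `min(0.95, …)` cap keeps inferred confidences strictly below 1.0.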
package/scripts/regen_graphify_html.py (deleted)
@@ -1,46 +0,0 @@
-#!/usr/bin/env python3
-"""Write graphify-out/graph.html from existing graph.json (full graph, bypasses 5k default cap)."""
-from __future__ import annotations
-
-import json
-import sys
-from pathlib import Path
-
-from networkx.readwrite import json_graph
-
-from graphify.export import to_html
-
-ROOT = Path(__file__).resolve().parents[1]
-OUT = ROOT / "graphify-out"
-
-
-def main() -> None:
-    gj = OUT / "graph.json"
-    if not gj.exists():
-        print(f"Missing {gj}", file=sys.stderr)
-        sys.exit(1)
-    G = json_graph.node_link_graph(json.loads(gj.read_text(encoding="utf-8")), edges="links")
-    analysis_path = OUT / ".graphify_analysis.json"
-    if not analysis_path.exists():
-        print(f"Missing {analysis_path}", file=sys.stderr)
-        sys.exit(1)
-    analysis = json.loads(analysis_path.read_text(encoding="utf-8"))
-    communities = {int(k): v for k, v in analysis["communities"].items()}
-    labels_path = OUT / ".graphify_labels.json"
-    labels: dict[int, str] = {}
-    if labels_path.exists():
-        labels = {int(k): v for k, v in json.loads(labels_path.read_text(encoding="utf-8")).items()}
-    n = G.number_of_nodes()
-    # graphify skips full HTML when n > default limit; pass explicit limit for full-node viz.
-    to_html(
-        G,
-        communities,
-        str(OUT / "graph.html"),
-        community_labels=labels or None,
-        node_limit=n,
-    )
-    print(f"Wrote {OUT / 'graph.html'} ({n} nodes)")
-
-
-if __name__ == "__main__":
-    main()
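The `int(k)` conversions here mirror how `merge_graphify_corpora.py` writes the sidecar files: community ids become JSON object keys, and JSON keys are always strings, while both scripts pass integer-keyed dicts to graphify's exporters. A small sketch of that round-trip, with invented community data:

```python
# Hypothetical sidecar contents; json.dumps stringified the int community ids.
analysis = {"communities": {"0": ["node_a", "node_b"], "1": ["books__node_c"]}}
labels_raw = {"0": "Main hub", "1": "Book topics"}

# Mirror of the load path above: restore int keys before calling to_html.
communities = {int(k): v for k, v in analysis["communities"].items()}
labels = {int(k): v for k, v in labels_raw.items()}
assert communities[0] == ["node_a", "node_b"] and labels[1] == "Book topics"
```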
package/test/harness-verify.test.mjs (deleted)
@@ -1,33 +0,0 @@
-import { spawn } from "node:child_process";
-import { test } from "node:test";
-import assert from "node:assert/strict";
-import { join, dirname } from "node:path";
-import { fileURLToPath } from "node:url";
-
-const ROOT = join(dirname(fileURLToPath(import.meta.url)), "..");
-
-function runHarnessVerify() {
-  return new Promise((resolve, reject) => {
-    const child = spawn("node", ["scripts/harness-verify.mjs"], {
-      cwd: ROOT,
-      stdio: ["ignore", "pipe", "pipe"],
-    });
-    let stdout = "";
-    let stderr = "";
-    child.stdout.on("data", (d) => {
-      stdout += d.toString();
-    });
-    child.stderr.on("data", (d) => {
-      stderr += d.toString();
-    });
-    child.on("close", (code) => {
-      if (code === 0) resolve(stdout);
-      else reject(new Error(stderr || stdout || `exit ${code}`));
-    });
-  });
-}
-
-test("harness:verify passes", async () => {
-  const out = await runHarnessVerify();
-  assert.match(out, /harness:verify PASS/);
-});