npm - ultimate-pi - Versions diffs - 0.1.7 → 0.2.2 - Mend

ultimate-pi 0.1.7 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (524) hide show

package/scripts/merge_graphify_corpora.py ADDED Viewed

@@ -0,0 +1,398 @@
+#!/usr/bin/env python3
+"""
+Merge graphify-out with optional graphify-books-out and graphify-yt-transcripts-out into graphify-out.
+(Books/YouTube dirs were removed after a successful one-time merge; restore them from git to re-run.)
+- Prefixes all book and YouTube node IDs to avoid collisions and preserve provenance.
+- Merges hyperedges (normalizing books' member_nodes -> nodes).
+- Adds cross-corpus INFERRED edges via token overlap / Jaccard on normalized labels.
+- Re-clusters with graphify, writes graph.json, GRAPH_REPORT.md, analysis, labels, and graph.html (full viz via explicit node_limit).
+"""
+from __future__ import annotations
+import json
+import re
+import shutil
+import sys
+from collections import defaultdict
+from datetime import datetime, timezone
+from pathlib import Path
+import networkx as nx
+from networkx.readwrite import json_graph
+from graphify.analyze import god_nodes, surprising_connections, suggest_questions
+from graphify.cluster import cluster, score_all
+from graphify.export import to_html, to_json
+from graphify.report import generate
+# Project root: the parent of the directory that contains this script.
+ROOT = Path(__file__).resolve().parents[1]
+# Merge destination; it also holds the main corpus graph that the others are merged into.
+OUT = ROOT / "graphify-out"
+MAIN_JSON = OUT / "graph.json"
+# Secondary corpora (optional on disk; main() refuses to run when they are absent).
+BOOKS_JSON = ROOT.joinpath("graphify-books-out", "graph.json")
+YT_JSON = ROOT.joinpath("graphify-yt-transcripts-out", "graph.json")
+# Hyperedges for the YouTube corpus are read from a sibling file of its graph.json.
+YT_SEM = YT_JSON.with_name("semantic_extraction.json")
+# Node-id prefixes applied to the secondary corpora so ids cannot collide with the main graph.
+BOOK_PREFIX = "books__"
+YT_PREFIX = "yt__"
+def _norm_tokens(text: str) -> set[str]:
+    """Return the distinct lowercase ``[a-z0-9]`` tokens of *text* that are longer than two characters.
+    Falsy input (``None`` or ``""``) yields an empty set; every character outside ``[a-z0-9]`` separates tokens.
+    """
+    lowered = (text or "").lower()
+    words = re.findall(r"[a-z0-9]+", lowered)
+    return {word for word in words if len(word) > 2}
+def load_node_link(path: Path) -> nx.Graph:
+    """Parse the UTF-8 node-link JSON file at *path* (edge list stored under ``links``) into a networkx graph."""
+    with path.open(encoding="utf-8") as handle:
+        payload = json.load(handle)
+    return json_graph.node_link_graph(payload, edges="links")
+def load_youtube_nx(path: Path) -> nx.Graph:
+    """Build an undirected graph from a JSON file holding top-level ``nodes`` and ``edges`` lists.
+    Every node keeps all of its fields except ``id`` as attributes. A missing, ``None`` or empty
+    ``source_file`` and a missing ``file_type`` are filled with defaults. Edges whose endpoints are
+    not both present as nodes are dropped.
+    """
+    payload = json.loads(path.read_text(encoding="utf-8"))
+    graph = nx.Graph()
+    for node in payload.get("nodes", []):
+        attrs = dict(node)
+        node_id = attrs.pop("id")
+        if attrs.get("source_file") in (None, ""):
+            attrs["source_file"] = "graphify-yt-transcripts-out/transcripts"
+        attrs.setdefault("file_type", "document")
+        graph.add_node(node_id, **attrs)
+    for edge in payload.get("edges", []):
+        src, dst = edge["source"], edge["target"]
+        if src in graph and dst in graph:
+            extras = {key: val for key, val in edge.items() if key not in ("source", "target")}
+            graph.add_edge(src, dst, **extras)
+    return graph
+def prefix_graph(G: nx.Graph, prefix: str) -> tuple[nx.Graph, dict[str, str]]:
+    """Copy *G* with *prefix* prepended to every node id.
+    Returns:
+        ``(relabelled_copy, mapping)`` where ``mapping`` sends each original id to its prefixed id.
+    """
+    renamed: dict[str, str] = {}
+    for node in G:
+        renamed[node] = f"{prefix}{node}"
+    return nx.relabel_nodes(G, renamed, copy=True), renamed
+def strip_community(G: nx.Graph) -> None:
+    """Delete the ``community`` attribute from every node of *G* in place; nodes without it are left alone."""
+    for node in G.nodes:
+        G.nodes[node].pop("community", None)
+def collect_hyperedges_main(data: dict) -> list[dict]:
+    """Return a new list of the hyperedges stored under ``data["graph"]["hyperedges"]`` (empty if absent or falsy)."""
+    graph_attrs = data.get("graph")
+    if not graph_attrs:
+        return []
+    return list(graph_attrs.get("hyperedges") or [])
+def collect_hyperedges_books(data: dict, id_map: dict[str, str]) -> list[dict]:
+    """Return the books hyperedges from ``data["graph"]["hyperedges"]``, rewritten for the merged graph.
+    Members are taken from ``member_nodes`` (falling back to ``nodes``) and translated through *id_map*;
+    unknown members are dropped, and hyperedges left with fewer than two members are skipped. The result
+    always stores members under ``nodes``, never ``member_nodes``. A missing ``label`` is derived from the
+    first 200 characters of a truthy ``description``, and missing ``relation``, ``confidence`` and
+    ``confidence_score`` fields receive defaults. Input dicts are not modified.
+    """
+    fallback_fields = (
+        ("relation", "participate_in"),
+        ("confidence", "INFERRED"),
+        ("confidence_score", 0.7),
+    )
+    merged: list[dict] = []
+    graph_attrs = data.get("graph") or {}
+    for hyperedge in graph_attrs.get("hyperedges") or []:
+        members = hyperedge.get("member_nodes") or hyperedge.get("nodes") or []
+        renamed = [id_map[member] for member in members if member in id_map]
+        if len(renamed) < 2:
+            continue
+        entry = {**hyperedge, "nodes": renamed}
+        entry.pop("member_nodes", None)
+        if "label" not in entry and entry.get("description"):
+            entry["label"] = str(entry["description"])[:200]
+        for field, value in fallback_fields:
+            entry.setdefault(field, value)
+        merged.append(entry)
+    return merged
+def collect_hyperedges_yt(semantic: dict, id_map: dict[str, str]) -> list[dict]:
+    """Return copies of ``semantic["hyperedges"]`` with ``nodes`` translated through *id_map*.
+    Members missing from *id_map* are dropped; hyperedges left with fewer than two members are skipped.
+    """
+    merged: list[dict] = []
+    for hyperedge in semantic.get("hyperedges") or []:
+        renamed = [id_map[member] for member in hyperedge.get("nodes") or [] if member in id_map]
+        if len(renamed) >= 2:
+            merged.append({**hyperedge, "nodes": renamed})
+    return merged
+def build_token_index(G: nx.Graph) -> tuple[dict[str, set[str]], dict[str, str]]:
+    """Index every node of *G* by the tokens of its display label.
+    The display label is the first truthy value among the ``norm_label`` attribute, the ``label``
+    attribute and the node id itself, coerced to ``str``.
+    Returns:
+        ``(tokens, labels)``: node id -> token set, and node id -> the label string that was tokenised.
+    """
+    label_by_node: dict[str, str] = {}
+    for node_id, attrs in G.nodes(data=True):
+        raw = attrs.get("norm_label") or attrs.get("label") or str(node_id)
+        label_by_node[node_id] = raw if isinstance(raw, str) else str(raw)
+    tokens_by_node = {node_id: _norm_tokens(text) for node_id, text in label_by_node.items()}
+    return tokens_by_node, label_by_node
+def add_cross_corpus_edges(
+    G: nx.Graph,
+    parts: list[tuple[str, nx.Graph, dict[str, set[str]], dict[str, str]]],
+    *,
+    max_edges: int = 12000,
+    min_jaccard: float = 0.32,
+    min_shared: int = 2,
+    max_per_target_corpus: int = 2,
+) -> int:
+    """
+    Add INFERRED ``semantically_similar_to`` edges to G (in place) between nodes of different corpora.
+    parts: (name, subgraph, tokens_map, labels_map) for each corpus; only the name and tokens_map are read here.
+    Two nodes are linked when their token sets share at least ``min_shared`` tokens and their
+    Jaccard similarity (shared tokens / all tokens) is at least ``min_jaccard``.
+    For each source node, at most ``max_per_target_corpus`` edges are added per target corpus, best score first.
+    Pairs that already have an edge in G are skipped, and the function stops as soon as ``max_edges`` edges were added.
+    The corpus of a node is derived from its id prefix (BOOK_PREFIX / YT_PREFIX, anything else is "main").
+    Returns the number of edges added.
+    """
+    # Inverted index: token -> every (corpus name, node id) whose label contains that token.
+    inverted: dict[str, list[tuple[str, str]]] = defaultdict(list)
+    for corpus, _Sg, tok_map, _lab in parts:
+        for nid, toks in tok_map.items():
+            for t in toks:
+                inverted[t].append((corpus, nid))
+    # Corpus name -> its node-id -> token-set map, used below to look up a candidate's tokens.
+    token_maps = {name: tm for name, _Sg, tm, _ in parts}
+    def corpus_of(nid: str) -> str:
+        # Classify a node purely by its id prefix.
+        if nid.startswith(BOOK_PREFIX):
+            return "books"
+        if nid.startswith(YT_PREFIX):
+            return "yt"
+        return "main"
+    # Unordered endpoint pairs that already have an edge; extended as new edges are added so a
+    # pair linked from one side is not linked again when the other side is processed.
+    existing = {frozenset((u, v)) for u, v in G.edges()}
+    added = 0
+    for corpus_a, _Ga, tok_a, _lab_a in parts:
+        for u, tu in tok_a.items():
+            # Nodes whose label produced no tokens cannot match anything.
+            if not tu:
+                continue
+            # Candidates: nodes sharing at least one token with u that belong to another part
+            # and also differ from u by prefix-derived corpus.
+            cand: set[str] = set()
+            for t in tu:
+                for corp_b, v in inverted[t]:
+                    if corp_b == corpus_a:
+                        continue
+                    if corpus_of(u) == corpus_of(v):
+                        continue
+                    cand.add(v)
+            scored: list[tuple[float, str]] = []
+            for v in cand:
+                # Token set of v, taken from the first part whose token map contains it.
+                tv = None
+                for name in token_maps:
+                    if v in token_maps[name]:
+                        tv = token_maps[name][v]
+                        break
+                if not tv:
+                    continue
+                inter = len(tu & tv)
+                if inter < min_shared:
+                    continue
+                # "or 1" guards the division; both sets are non-empty here so the union is too.
+                union = len(tu | tv) or 1
+                j = inter / union
+                if j < min_jaccard:
+                    continue
+                scored.append((j, v))
+            # Best score first; ties are ordered by node id, descending.
+            scored.sort(reverse=True)
+            # Edges added from u so far, keyed by the target's corpus.
+            tgt_corpus_count: dict[str, int] = defaultdict(int)
+            for j, v in scored:
+                # Global budget: stop the whole pass once max_edges edges exist.
+                if added >= max_edges:
+                    return added
+                cb = corpus_of(v)
+                if tgt_corpus_count[cb] >= max_per_target_corpus:
+                    continue
+                pair = frozenset((u, v))
+                if pair in existing:
+                    continue
+                existing.add(pair)
+                tgt_corpus_count[cb] += 1
+                rationale = f"cross_corpus token overlap jaccard={j:.2f}"
+                G.add_edge(
+                    u,
+                    v,
+                    relation="semantically_similar_to",
+                    confidence="INFERRED",
+                    # Maps the Jaccard score linearly onto 0.55..0.95.
+                    confidence_score=min(0.95, 0.55 + 0.4 * j),
+                    source_file="graphify_merge/cross_corpus",
+                    source_location=f"{corpus_a}->{cb}",
+                    weight=1.0,
+                    rationale=rationale[:500],
+                )
+                added += 1
+    return added
+def auto_community_labels(
+    G: nx.Graph, communities: dict[int, list[str]]
+) -> dict[int, str]:
+    """Name each community after the labels of its highest-degree nodes.
+    For every community the members are ranked by degree in *G* (highest first) and the top 12 are
+    inspected. Up to three distinct labels are kept, each stripped and truncated to 42 characters;
+    labels that yield no tokens are ignored. The kept labels are joined with `` · `` and the result
+    is truncated to 90 characters. A community with no usable label is named ``Community <id>``.
+    Args:
+        G: graph providing node degrees and ``label`` attributes.
+        communities: community id -> member node ids.
+    Returns:
+        community id -> display name.
+    """
+    degree = dict(G.degree())
+    names: dict[int, str] = {}
+    for cid, members in communities.items():
+        by_degree = sorted(members, key=lambda n: degree.get(n, 0), reverse=True)
+        picked: list[str] = []
+        for nid in by_degree[:12]:
+            label = G.nodes[nid].get("label") or nid
+            short = str(label).strip()
+            if len(short) > 42:
+                short = short[:39] + "…"
+            # A label without any token (e.g. punctuation only) carries no naming value.
+            if not _norm_tokens(short):
+                continue
+            if short not in picked:
+                picked.append(short)
+            if len(picked) >= 3:
+                break
+        name = " · ".join(picked) if picked else f"Community {cid}"
+        if len(name) > 90:
+            name = name[:87] + "…"
+        names[cid] = name
+    return names
+def polish_labels(labels: dict[int, str], G: nx.Graph, communities: dict[int, list[str]]) -> dict[int, str]:
+    """Return a copy of *labels* with noisy names from ingested graph-report summary nodes replaced.
+    Matching is case-insensitive. *G* and *communities* are accepted but not used.
+    """
+    def _rewrite(name: str) -> str:
+        lowered = name.lower()
+        if "graph report" in lowered and "communities" in lowered:
+            return "Ingested graph-report hubs (books merge artifact)"
+        if "communities (" in lowered and "thin omitted" in lowered:
+            return "Book community index nodes (metadata)"
+        return name
+    return {cid: _rewrite(name) for cid, name in labels.items()}
+def main() -> None:
+    """Merge the books and YouTube graphs into graphify-out and regenerate all derived artifacts.
+    Steps: verify the inputs, back up the current graph.json, load the three graphs, prefix the
+    secondary corpora, compose them, merge hyperedges, add cross-corpus similarity edges, re-cluster,
+    then write GRAPH_REPORT.md, graph.json, .graphify_analysis.json, .graphify_labels.json and graph.html.
+    Raises:
+        SystemExit: if any input graph is missing, or if ``to_json`` reports that it did not write.
+    """
+    for p in (BOOKS_JSON, YT_JSON):
+        if not p.exists():
+            print(
+                f"Missing {p}. Books/YouTube graphs were merged into graphify-out and "
+                "the source dirs were removed; restore graphify-books-out/ and "
+                "graphify-yt-transcripts-out/ from git (or a backup) to re-run this merge.",
+                file=sys.stderr,
+            )
+            raise SystemExit(1)
+    # The main graph is read unconditionally below, so its absence is reported up front
+    # instead of surfacing later as an unhandled FileNotFoundError.
+    if not MAIN_JSON.exists():
+        print(
+            f"Missing {MAIN_JSON}; the main graph is required as the base of the merge.",
+            file=sys.stderr,
+        )
+        raise SystemExit(1)
+    # Keep a timestamped copy of the pre-merge graph next to it; graph.json is overwritten later.
+    ts = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
+    backup = OUT / f"graph.json.pre-merge-{ts}.bak"
+    shutil.copy2(MAIN_JSON, backup)
+    print(f"Backed up graph.json -> {backup.name}")
+    # The raw JSON is needed as well as the graphs: hyperedges are collected from the raw documents.
+    raw_main = json.loads(MAIN_JSON.read_text(encoding="utf-8"))
+    raw_books = json.loads(BOOKS_JSON.read_text(encoding="utf-8"))
+    G_main = load_node_link(MAIN_JSON)
+    G_books = load_node_link(BOOKS_JSON)
+    G_yt = load_youtube_nx(YT_JSON)
+    # Community assignments from the separate runs are discarded; the merged graph is re-clustered below.
+    for graph in (G_main, G_books, G_yt):
+        strip_community(graph)
+    G_books_p, map_b = prefix_graph(G_books, BOOK_PREFIX)
+    G_yt_p, map_y = prefix_graph(G_yt, YT_PREFIX)
+    G = nx.compose_all([G_main, G_books_p, G_yt_p])
+    hyper: list[dict] = []
+    hyper += collect_hyperedges_main(raw_main)
+    hyper += collect_hyperedges_books(raw_books, map_b)
+    # YouTube hyperedges come from a separate file that may be absent.
+    if YT_SEM.exists():
+        sem = json.loads(YT_SEM.read_text(encoding="utf-8"))
+        hyper += collect_hyperedges_yt(sem, map_y)
+    G.graph["hyperedges"] = hyper
+    print(f"Merged hyperedges: {len(hyper)}")
+    parts = []
+    for name, sub in (
+        ("main", G_main),
+        ("books", G_books_p),
+        ("yt", G_yt_p),
+    ):
+        tm, lm = build_token_index(sub)
+        parts.append((name, sub, tm, lm))
+    n_cross = add_cross_corpus_edges(G, parts)
+    print(f"Cross-corpus edges added: {n_cross}")
+    print(f"Combined graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
+    communities = cluster(G)
+    cohesion = score_all(G, communities)
+    gods = god_nodes(G)
+    surprises = surprising_connections(G, communities)
+    labels = polish_labels(auto_community_labels(G, communities), G, communities)
+    questions = suggest_questions(G, communities, labels)
+    # Zeroed/empty detection and token statistics are passed to generate() for this merge run.
+    detection = {
+        "total_files": 0,
+        "total_words": 0,
+        "needs_graph": True,
+        "warning": None,
+        "files": {"paper": [], "code": [], "document": [], "image": [], "video": []},
+        "skipped_sensitive": [],
+        "graphifyignore_patterns": 0,
+    }
+    tokens = {"input": 0, "output": 0}
+    report = generate(
+        G,
+        communities,
+        cohesion,
+        labels,
+        gods,
+        surprises,
+        detection,
+        tokens,
+        str(ROOT),
+        suggested_questions=questions,
+    )
+    OUT.mkdir(parents=True, exist_ok=True)
+    (OUT / "GRAPH_REPORT.md").write_text(report, encoding="utf-8")
+    ok = to_json(G, communities, str(OUT / "graph.json"), force=True)
+    if not ok:
+        raise SystemExit("to_json refused to write; check stderr")
+    # JSON object keys must be strings, so the integer community ids are stringified.
+    analysis = {
+        "communities": {str(k): v for k, v in communities.items()},
+        "cohesion": {str(k): v for k, v in cohesion.items()},
+        "gods": gods,
+        "surprises": surprises,
+        "questions": questions,
+        "merge_meta": {
+            "merged_at": datetime.now(timezone.utc).isoformat(),
+            "sources": ["graphify-out", "graphify-books-out", "graphify-yt-transcripts-out"],
+            "cross_corpus_edges": n_cross,
+            "hyperedges": len(hyper),
+        },
+    }
+    (OUT / ".graphify_analysis.json").write_text(
+        json.dumps(analysis, indent=2), encoding="utf-8"
+    )
+    (OUT / ".graphify_labels.json").write_text(
+        json.dumps({str(k): v for k, v in labels.items()}, indent=2),
+        encoding="utf-8",
+    )
+    n = G.number_of_nodes()
+    # node_limit is set to the node count so the HTML export covers the whole graph.
+    to_html(
+        G,
+        communities,
+        str(OUT / "graph.html"),
+        community_labels=labels,
+        node_limit=n,
+    )
+    print(f"Wrote graph.html ({n} nodes, node_limit=n for graphify viz cap)")
+if __name__ == "__main__":
+    main()

package/scripts/regen_graphify_html.py ADDED Viewed

@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+"""Write graphify-out/graph.html from existing graph.json (full graph, bypasses 5k default cap)."""
+from __future__ import annotations
+import json
+import sys
+from pathlib import Path
+from networkx.readwrite import json_graph
+from graphify.export import to_html
+ROOT = Path(__file__).resolve().parents[1]
+OUT = ROOT / "graphify-out"
+def main() -> None:
+    """Rebuild graphify-out/graph.html from graph.json plus the saved analysis and label files.
+    Exits with status 1 (after a message on stderr) when graph.json or .graphify_analysis.json is
+    missing; a missing .graphify_labels.json only means no community labels are passed on.
+    """
+    def _require(path: Path) -> Path:
+        # Abort the script when a mandatory input file is absent.
+        if not path.exists():
+            print(f"Missing {path}", file=sys.stderr)
+            sys.exit(1)
+        return path
+    def _read_json(path: Path):
+        return json.loads(path.read_text(encoding="utf-8"))
+    graph_payload = _read_json(_require(OUT / "graph.json"))
+    G = json_graph.node_link_graph(graph_payload, edges="links")
+    analysis = _read_json(_require(OUT / ".graphify_analysis.json"))
+    # Community ids were stored as JSON object keys (strings); turn them back into ints.
+    communities = {int(cid): members for cid, members in analysis["communities"].items()}
+    labels_path = OUT / ".graphify_labels.json"
+    labels: dict[int, str] = (
+        {int(cid): name for cid, name in _read_json(labels_path).items()}
+        if labels_path.exists()
+        else {}
+    )
+    node_count = G.number_of_nodes()
+    # graphify skips full HTML when n > default limit; pass explicit limit for full-node viz.
+    to_html(
+        G,
+        communities,
+        str(OUT / "graph.html"),
+        community_labels=labels or None,
+        node_limit=node_count,
+    )
+    print(f"Wrote {OUT / 'graph.html'} ({node_count} nodes)")
+if __name__ == "__main__":
+    main()

package/.agents/skills/defuddle/SKILL.md DELETED Viewed

@@ -1,90 +0,0 @@
----
-name: defuddle
-description: "Strip clutter from web pages before ingesting into the wiki. Removes ads, navigation, headers, footers, and boilerplate: leaving clean readable markdown that saves 40-60% tokens. Triggers on: defuddle, clean this page, strip this url, fetch and clean, clean web content before ingesting, strip ads, remove clutter, clean URL content, readable markdown from URL."
-allowed-tools: Read Bash
----
-# defuddle: Web Page Cleaner
-Defuddle extracts the meaningful content from a web page and drops everything else: ads, cookie banners, nav bars, related articles, footers, social sharing buttons. What remains is the article body as clean markdown.
-Use this before any URL ingestion. It is optional but strongly recommended. It cuts token usage by 40-60% on typical web articles and produces cleaner wiki pages.
----
-## Wiki Path Resolution
-This skill saves cleaned content to `.raw/` (relative to vault root). It does NOT write to `wiki/` directly. The vault root is the working directory. Other skills (wiki-ingest) handle wiki path resolution via `VAULT_WIKI_PATH` when reading from `.raw/` and writing to `wiki/`.
----
-## Install
-```bash
-npm install -g defuddle-cli
-```
-Verify: `defuddle --version`
----
-## Usage
-### Clean a URL directly
-```bash
-defuddle https://example.com/article
-```
-Outputs clean markdown to stdout.
-### Save to .raw/
-```bash
-defuddle https://example.com/article > .raw/articles/article-slug-$(date +%Y-%m-%d).md
-```
-### Add frontmatter header after saving
-After running defuddle, prepend the source URL and fetch date:
-```bash
-SLUG="article-slug-$(date +%Y-%m-%d)"
-{ echo "---"; echo "source_url: https://example.com/article"; echo "fetched: $(date +%Y-%m-%d)"; echo "---"; echo ""; defuddle https://example.com/article; } > .raw/articles/$SLUG.md
-```
-### Clean a local HTML file
-```bash
-defuddle page.html
-```
----
-## When to Use
-**Use defuddle when:**
-- Ingesting a news article, blog post, or documentation page from a URL
-- The page has a lot of surrounding content (most web pages do)
-- You want to stay within token budget on a long article
-**Skip defuddle when:**
-- The source is already a clean markdown or PDF file
-- The page is a dashboard, app, or structured data (defuddle expects article-style content)
-- defuddle is not installed and the article is short enough to process raw
----
-## Fallback
-If defuddle is not installed, check:
-```bash
-which defuddle 2>/dev/null || echo "not installed"
-```
-If not installed: use WebFetch directly. The content will be less clean but still workable.
----
-## Integration with /wiki-ingest
-The `/wiki-ingest` skill checks for defuddle automatically when a URL is passed. You do not need to run defuddle manually before ingesting a URL. The ingest skill will call it if available.
-To manually clean a page and save before ingesting:
-1. Run the save command above
-2. Then: `ingest .raw/articles/[slug].md`