knowledge-worker 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mygraph/merge.py ADDED
@@ -0,0 +1,133 @@
1
+ """
2
+ merge.py — Stage 4 of the v1 ingest pipeline.
3
+
4
+ Idempotent merge of approved candidates into the graph. Slug-based IDs already
5
+ make `add_node` / `add_edge` idempotent (per v0). We add:
6
+
7
+ - Source node: always merges cleanly (Stage 1 emits it).
8
+ - MENTIONED_IN edge from each new concept node to the Source (auto-injected if
9
+ the extractor didn't include it).
10
+ - Body-diff: if a candidate ID matches an existing node but the body differs,
11
+ we surface the diff and prompt keep_old / replace / append. Logged as a
12
+ review eval_record.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import difflib
18
+ from datetime import datetime, timezone
19
+ from pathlib import Path
20
+
21
+ from mygraph import Graph, Node, Edge
22
+ try:
23
+ from .eval_log import append as eval_append
24
+ except ImportError: # direct script execution
25
+ from eval_log import append as eval_append
26
+
27
+
28
def _now() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
30
+
31
+
32
def _diff(old: str, new: str) -> str:
    """Return a unified diff of *old* against *new* as a single string.

    Both texts are split into lines; the two sides are labelled "existing"
    and "candidate". The diff lines are joined with newlines, so the result
    is an empty string when the diff has no lines.
    """
    diff_lines = difflib.unified_diff(
        old.splitlines(),
        new.splitlines(),
        fromfile="existing",
        tofile="candidate",
        lineterm="",
    )
    return "\n".join(diff_lines)
36
+
37
+
38
def _resolve_body_conflict(existing: Node, cand: dict, interactive: bool) -> str:
    """Decide which body an existing node keeps when a candidate's body differs.

    Args:
        existing: The node already in the graph; only its ``body`` is read.
        cand: The candidate dict; ``cand["id"]`` is required, ``"body"`` is
            optional.
        interactive: When false, no prompt is shown and the existing body is
            kept.

    Returns:
        The body the caller should store: the existing body (keep), the
        candidate body (replace), or both joined by an "(appended)" separator.

    Every resolution is recorded through ``eval_append`` as a
    ``merge_body_conflict`` record.
    """
    if not interactive:
        # default to keep_old in non-interactive mode (safest)
        eval_append({"kind": "merge_body_conflict", "candidate_id": cand["id"],
                     "resolution": "keep_old_auto"})
        return existing.body

    # The caller treats a missing body as "" (`existing.body or ""`), so do the
    # same here; diffing or concatenating None would raise instead of prompting.
    old_body = existing.body or ""
    new_body = cand.get("body") or ""

    print(f"\n[merge] body conflict for {cand['id']}:")
    print(_diff(old_body, new_body))

    choice = ""
    while choice not in {"k", "r", "a"}:
        try:
            choice = input("  [k]eep old / [r]eplace / [a]ppend > ").strip().lower()
        except EOFError:
            # stdin closed (e.g. piped input ran out): fall back to keeping the old body
            choice = "k"

    resolutions = {"k": "keep_old", "r": "replace", "a": "append"}
    eval_append({"kind": "merge_body_conflict", "candidate_id": cand["id"],
                 "resolution": resolutions[choice]})

    if choice == "k":
        # Return the stored value untouched so "keep" never rewrites the node.
        return existing.body
    if choice == "r":
        return new_body
    return (old_body + "\n\n--- (appended) ---\n" + new_body).strip()
59
+
60
+
61
def merge(approved: dict, interactive: bool = True) -> tuple[int, int]:
    """
    Merge approved nodes/edges into the graph. Returns (nodes_added, edges_added).

    Steps, in order:
      1. Add the source node (``approved["source"]``).
      2. Add each concept node; for an id already in the graph, resolve a
         differing body via ``_resolve_body_conflict`` and refresh label and
         confidence from the candidate.
      3. Add each extractor-emitted edge; edges whose construction raises
         ``AssertionError``/``ValueError`` are logged and skipped, and an edge
         that does not grow ``g.edges`` gets its ``last_seen`` refreshed.
      4. Add a MENTIONED_IN edge to the source for every approved node that
         does not already have one.

    Args:
        approved: Dict with a required ``"source"`` entry (needs ``"id"`` and
            ``"label"``) and optional ``"nodes"`` / ``"edges"`` lists of dicts.
        interactive: Passed to ``_resolve_body_conflict``; when false, body
            conflicts keep the existing body without prompting.

    Returns:
        ``(nodes_added, edges_added)``: nodes whose id was not in the graph
        before, and edges whose insertion grew ``g.edges``.

    Raises:
        KeyError: If a required key is missing from ``approved``, a node
            candidate, or an edge candidate. It is raised before ``g.save()``
            is reached.
    """
    g = Graph.load()
    src = approved["source"]
    src_id = src["id"]
    # Treat an explicit None the same as a missing key.
    approved_nodes = approved.get("nodes") or []
    approved_edges = approved.get("edges") or []

    # 1. Source node always merges
    src_existed = src_id in g.nodes
    g.add_node(Node(id=src_id, type="source", label=src["label"],
                    body=src.get("body", ""), confidence="high"))
    nodes_added = 0 if src_existed else 1

    # 2. Concept nodes
    for cand in approved_nodes:
        nid = cand["id"]
        if nid in g.nodes:
            existing = g.nodes[nid]
            new_body = cand.get("body", "")
            # Only a non-empty candidate body that differs (ignoring outer
            # whitespace) counts as a conflict.
            if new_body and new_body.strip() != (existing.body or "").strip():
                existing.body = _resolve_body_conflict(existing, cand, interactive)
            existing.label = cand.get("label", existing.label)
            existing.confidence = cand.get("confidence", existing.confidence)
        else:
            g.add_node(Node(id=nid, type=cand["type"], label=cand["label"],
                            body=cand.get("body", ""),
                            confidence=cand.get("confidence", "medium")))
            nodes_added += 1

    # 3. Edges (extractor-emitted)
    edges_added = 0
    for cand in approved_edges:
        try:
            e = Edge(src=cand["src"], dst=cand["dst"], type=cand["type"],
                     source_id=src_id, excerpt=cand.get("excerpt", ""),
                     confidence=cand.get("confidence", "medium"))
        except (AssertionError, ValueError) as exc:
            eval_append({"kind": "merge_edge_skipped", "edge": cand, "reason": str(exc)})
            continue
        before = len(g.edges)
        g.add_edge(e)
        if len(g.edges) > before:
            edges_added += 1
            continue
        # de-duped: refresh last_seen on the existing edge
        wanted = (e.src, e.dst, e.type, e.source_id)
        for known in g.edges:
            if (known.src, known.dst, known.type, known.source_id) == wanted:
                known.last_seen = _now()
                break

    # 4. Auto-inject MENTIONED_IN for any approved node missing one to this source.
    # Index candidates by id once, keeping the first occurrence of each id. A dict
    # preserves the approved order, so injected edges are added in the same order
    # on every run (iterating a set of ids would follow randomised string hashes).
    first_by_id: dict = {}
    for cand in approved_nodes:
        first_by_id.setdefault(cand["id"], cand)

    has_mentioned = {(e.src, e.dst, e.type) for e in g.edges if e.type == "MENTIONED_IN"}
    for nid, cand in first_by_id.items():
        if nid == src_id:
            continue
        if (nid, src_id, "MENTIONED_IN") in has_mentioned:
            continue
        # the candidate supplies the excerpt and confidence for the injected edge
        e = Edge(src=nid, dst=src_id, type="MENTIONED_IN",
                 source_id=src_id, excerpt=cand.get("excerpt", ""),
                 confidence=cand.get("confidence", "medium"))
        before = len(g.edges)
        g.add_edge(e)
        if len(g.edges) > before:
            edges_added += 1

    g.save()
    return nodes_added, edges_added