knowledge-worker 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowledge_worker-0.6.0.dist-info/METADATA +365 -0
- knowledge_worker-0.6.0.dist-info/RECORD +27 -0
- knowledge_worker-0.6.0.dist-info/WHEEL +5 -0
- knowledge_worker-0.6.0.dist-info/entry_points.txt +3 -0
- knowledge_worker-0.6.0.dist-info/licenses/LICENSE +21 -0
- knowledge_worker-0.6.0.dist-info/top_level.txt +2 -0
- mygraph/__init__.py +23 -0
- mygraph/anthropic_client.py +199 -0
- mygraph/audit.py +137 -0
- mygraph/check.py +273 -0
- mygraph/discover.py +654 -0
- mygraph/eval_log.py +36 -0
- mygraph/export_context.py +124 -0
- mygraph/extractor.py +243 -0
- mygraph/extractor_openai.py +165 -0
- mygraph/ingest.py +170 -0
- mygraph/memory_audit.py +1094 -0
- mygraph/merge.py +133 -0
- mygraph/mygraph.py +773 -0
- mygraph/owl_io.py +202 -0
- mygraph/review.py +151 -0
- mygraph/validator.py +149 -0
- mygraph/viz.py +409 -0
- ollama_proxy/eval_compare.py +185 -0
- ollama_proxy/extractor_adapter.py +168 -0
- ollama_proxy/proxy.py +143 -0
- ollama_proxy/server.py +194 -0
mygraph/discover.py
ADDED
|
@@ -0,0 +1,654 @@
|
|
|
1
|
+
"""
|
|
2
|
+
discover.py - second-order network analytics and derived-edge proposals.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
mykg discover
|
|
6
|
+
mykg discover --out discovery.json --candidates discovery.candidates.json
|
|
7
|
+
|
|
8
|
+
Where `mykg audit` ranks what already exists, `mykg discover` infers what the
|
|
9
|
+
graph implies but does not yet say. It runs eight read-only analyses:
|
|
10
|
+
|
|
11
|
+
staleness_radar important nodes whose evidence has gone cold
|
|
12
|
+
co_mentions pairs that recur across sources with no direct edge
|
|
13
|
+
serves_candidates ideas/decisions structurally close to a goal they
|
|
14
|
+
do not yet SERVE
|
|
15
|
+
related_candidates Adamic-Adar link prediction over the semantic graph
|
|
16
|
+
question_debt open questions with no answering decision or evidence
|
|
17
|
+
corroboration claims that hang on a single source
|
|
18
|
+
bridges cross-community connectors after removing hub "spines"
|
|
19
|
+
tensions nodes that are both supported and challenged, and
|
|
20
|
+
conflicts between contributions to the same goal
|
|
21
|
+
|
|
22
|
+
Every result is a PROPOSAL. Discover never mutates the graph: derived edges
|
|
23
|
+
(CO_MENTIONED_WITH, SERVES_CANDIDATE, RELATES_TO, BRIDGES, TENSION_WITH) are
|
|
24
|
+
written to a candidates file for human review — AI proposes, provenance
|
|
25
|
+
verifies, the owner promotes.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import argparse
|
|
31
|
+
import json
|
|
32
|
+
import math
|
|
33
|
+
import sys
|
|
34
|
+
from collections import defaultdict
|
|
35
|
+
from datetime import datetime
|
|
36
|
+
from pathlib import Path
|
|
37
|
+
|
|
38
|
+
try:
|
|
39
|
+
from .mygraph import Edge, Graph
|
|
40
|
+
from .memory_audit import (
|
|
41
|
+
PROVENANCE_EDGE_TYPES,
|
|
42
|
+
_add_projection,
|
|
43
|
+
_betweenness,
|
|
44
|
+
_build_adjacency,
|
|
45
|
+
_community_partition,
|
|
46
|
+
_pagerank,
|
|
47
|
+
_semantic_edges,
|
|
48
|
+
_semantic_ids,
|
|
49
|
+
_source_projection_edges,
|
|
50
|
+
)
|
|
51
|
+
except ImportError: # direct script execution: python mygraph/discover.py
|
|
52
|
+
from mygraph import Edge, Graph
|
|
53
|
+
from memory_audit import (
|
|
54
|
+
PROVENANCE_EDGE_TYPES,
|
|
55
|
+
_add_projection,
|
|
56
|
+
_betweenness,
|
|
57
|
+
_build_adjacency,
|
|
58
|
+
_community_partition,
|
|
59
|
+
_pagerank,
|
|
60
|
+
_semantic_edges,
|
|
61
|
+
_semantic_ids,
|
|
62
|
+
_source_projection_edges,
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
SCHEMA_VERSION = 1
|
|
66
|
+
|
|
67
|
+
# Edge types that express goal contribution, used for serves-gap detection.
|
|
68
|
+
GOAL_EDGE_TYPES = {"SERVES", "HAS_IDEA", "ABOUT", "INVOLVES"}
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ---------- shared scaffolding -------------------------------------------------
|
|
72
|
+
|
|
73
|
+
def _parse_ts(raw: str | None) -> datetime | None:
    """Parse an ISO-8601 timestamp into a timezone-aware datetime.

    A literal "Z" is rewritten to "+00:00" before parsing, because
    ``datetime.fromisoformat`` only accepts the "Z" suffix on newer Python
    versions.

    Timestamps without an offset are interpreted as UTC. Returning a mix of
    naive and aware values would make the ``max(...)`` comparisons and the
    subtractions done by the analyses raise ``TypeError``.

    Args:
        raw: Timestamp text, or None/empty when the field is absent.

    Returns:
        An aware datetime, or None when ``raw`` is empty, is not a string, or
        cannot be parsed.
    """
    if not raw:
        return None
    # Function-scope import: the module header only imports `datetime`.
    from datetime import timezone

    try:
        parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError cover non-string values stored in the field.
        return None
    if parsed.tzinfo is None or parsed.utcoffset() is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _node_sources(g: Graph) -> dict[str, set[str]]:
    """Map each non-source node id to the ids of the source nodes backing it.

    Only edges whose type is in PROVENANCE_EDGE_TYPES are considered, and only
    when exactly one endpoint is a node of type "source". The edge direction
    does not matter. Edges with an endpoint missing from ``g.nodes`` are
    ignored.
    """
    provenance: dict[str, set[str]] = defaultdict(set)
    for link in g.edges:
        if link.type in PROVENANCE_EDGE_TYPES:
            head = g.nodes.get(link.src)
            tail = g.nodes.get(link.dst)
            if not (head and tail):
                continue
            head_is_source = head.type == "source"
            tail_is_source = tail.type == "source"
            if tail_is_source and not head_is_source:
                provenance[link.src].add(link.dst)
            elif head_is_source and not tail_is_source:
                provenance[link.dst].add(link.src)
    return provenance
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _excerpt_for(g: Graph, node_id: str, source_id: str) -> str:
    """Return the excerpt of the first provenance edge joining the two ids.

    Edges are scanned in ``g.edges`` order. An edge qualifies when its type is
    in PROVENANCE_EDGE_TYPES, it has a non-empty excerpt, and its endpoints are
    exactly ``node_id`` and ``source_id`` in either direction. Returns "" when
    no edge qualifies.
    """
    wanted = {node_id, source_id}
    for link in g.edges:
        if link.type in PROVENANCE_EDGE_TYPES and link.excerpt:
            if {link.src, link.dst} == wanted:
                return link.excerpt
    return ""
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _label(g: Graph, node_id: str) -> str:
    """Return the node's label, or the id itself when the node is unknown."""
    found = g.nodes.get(node_id)
    if found:
        return found.label
    return node_id
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _proposal(g: Graph, src: str, dst: str, type_: str, score: float,
              rationale: str, evidence: list[str]) -> dict:
    """Build one derived-edge proposal record.

    The score is rounded to six decimals, the evidence ids are sorted, the
    endpoint labels are resolved through ``_label``, and the status is always
    "proposed".
    """
    return dict(
        src=src,
        dst=dst,
        type=type_,
        score=round(score, 6),
        rationale=rationale,
        evidence_sources=sorted(evidence),
        src_label=_label(g, src),
        dst_label=_label(g, dst),
        status="proposed",
    )
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class _Workspace:
    """Semantic projection shared by all analyses, computed once.

    Attributes set by ``__init__``:
        g: the graph that was passed in.
        ids: result of ``_semantic_ids(g)``.
        semantic_edges: result of ``_semantic_edges`` restricted to ``ids``.
        directed / undirected: adjacency structures from ``_build_adjacency``,
            then extended by ``_add_projection`` with the source-projection
            edges (helper semantics live in memory_audit; not visible here).
        pagerank / betweenness: centrality scores computed on that adjacency.
        node_sources: provenance sources per node, from ``_node_sources``.
        direct_links: unordered endpoint pairs of the semantic edges only.
    """

    def __init__(self, g: Graph):
        semantic_ids = _semantic_ids(g)
        members = set(semantic_ids)
        edges = _semantic_edges(g, members)
        directed, undirected = _build_adjacency(semantic_ids, edges)
        # The projection is added before the centrality scores are computed,
        # so pagerank/betweenness see it.
        _add_projection(directed, undirected,
                        _source_projection_edges(g, members))

        self.g = g
        self.ids = semantic_ids
        self.semantic_edges = edges
        self.directed = directed
        self.undirected = undirected
        self.pagerank = _pagerank(semantic_ids, directed)
        self.betweenness = _betweenness(semantic_ids, undirected)
        self.node_sources = _node_sources(g)
        # Built from semantic edges only (no source projection), so it can
        # answer "is there already an edge?" when proposing new links.
        self.direct_links: set[frozenset[str]] = {
            frozenset((link.src, link.dst)) for link in edges
        }
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# ---------- 1. staleness radar -------------------------------------------------
|
|
148
|
+
|
|
149
|
+
def staleness_radar(ws: _Workspace, stale_days: int, limit: int) -> dict:
    """Important nodes whose evidence trail has gone cold.

    A node's recency is the newest of its own `created_at` and the timestamp
    of every incident edge (an edge's `last_seen`, falling back to its
    `created_at`). The clock reference is the newest timestamp found anywhere
    in the graph, not the wall clock, so results are deterministic for a
    committed graph file.

    Args:
        ws: Shared workspace; reads `g`, `ids` and `pagerank`.
        stale_days: Minimum age in days, relative to the reference time,
            for a node to be reported.
        limit: Maximum number of records returned.

    Returns:
        Dict with `reference_time` (ISO string, or None when the graph has no
        parseable timestamp), `stale_days_threshold`, and `stale`, a list of
        records sorted by descending `staleness_score`, then id.
    """
    g = ws.g
    latest: datetime | None = None
    recency: dict[str, datetime] = {}
    for node_id in ws.ids:
        ts = _parse_ts(g.nodes[node_id].created_at)
        if ts:
            recency[node_id] = ts
            latest = max(latest, ts) if latest else ts
    for edge in g.edges:
        ts = _parse_ts(edge.last_seen) or _parse_ts(edge.created_at)
        if not ts:
            continue
        latest = max(latest, ts) if latest else ts
        for endpoint in (edge.src, edge.dst):
            if endpoint in recency:
                recency[endpoint] = max(recency[endpoint], ts)
            elif endpoint in g.nodes and g.nodes[endpoint].type != "source":
                # Node had no usable created_at; the edge timestamp seeds it.
                recency[endpoint] = ts

    if not latest:
        return {"reference_time": None, "stale_days_threshold": stale_days, "stale": []}

    # `default` keeps an empty pagerank table from raising ValueError; the
    # `or 1.0` avoids dividing by zero when every rank is 0.
    max_rank = max(ws.pagerank.values(), default=0.0) or 1.0
    records = []
    for node_id, seen in recency.items():
        days = (latest - seen).total_seconds() / 86400.0
        if days < stale_days:
            continue
        importance = ws.pagerank.get(node_id, 0.0) / max_rank
        node = g.nodes[node_id]
        records.append({
            "id": node_id,
            "type": node.type,
            "label": node.label,
            "days_stale": round(days, 1),
            "importance": round(importance, 4),
            "staleness_score": round(importance * days, 4),
            "flag": "STALE",
        })
    records.sort(key=lambda r: (-r["staleness_score"], r["id"]))
    return {
        "reference_time": latest.isoformat(),
        "stale_days_threshold": stale_days,
        "stale": records[:limit],
    }
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# ---------- 2. co-mention inference --------------------------------------------
|
|
204
|
+
|
|
205
|
+
def co_mention_candidates(ws: _Workspace, min_sources: int, limit: int) -> list[dict]:
    """Propose CO_MENTIONED_WITH links for pairs that share enough sources.

    Two nodes are co-mentioned by a source when both list it in
    ``ws.node_sources``. A pair is proposed when at least ``min_sources``
    distinct sources co-mention it and the pair is not in
    ``ws.direct_links``. The score is the number of shared sources. Results
    are ordered by descending score, then src, then dst, and cut to ``limit``.
    """
    # Invert node -> sources into source -> nodes.
    members_by_source: dict[str, set[str]] = defaultdict(set)
    for node_id, source_ids in ws.node_sources.items():
        for source_id in source_ids:
            members_by_source[source_id].add(node_id)

    # Collect, per unordered pair, every source mentioning both members.
    shared: dict[frozenset[str], set[str]] = defaultdict(set)
    for source_id, members in members_by_source.items():
        ordered = sorted(members)
        for position, first in enumerate(ordered):
            for second in ordered[position + 1:]:
                shared[frozenset((first, second))].add(source_id)

    proposals = []
    for pair, backing in shared.items():
        if len(backing) >= min_sources and pair not in ws.direct_links:
            low, high = sorted(pair)
            count = len(backing)
            proposals.append(_proposal(
                ws.g, low, high, "CO_MENTIONED_WITH",
                score=float(count),
                rationale=f"co-mentioned in {count} distinct sources with no direct edge",
                evidence=list(backing),
            ))
    proposals.sort(key=lambda p: (-p["score"], p["src"], p["dst"]))
    return proposals[:limit]
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# ---------- 3+4. link prediction (serves gaps + related pairs) ------------------
|
|
241
|
+
|
|
242
|
+
def _adamic_adar(ws: _Workspace, a: str, b: str) -> tuple[float, list[str]]:
    """Adamic-Adar similarity of two nodes over ``ws.undirected``.

    Each common neighbor ``z`` with more than one neighbor contributes
    ``1 / log(degree(z))``. Degree-1 neighbors are skipped because
    ``log(1) == 0``.

    Common neighbors are visited in sorted order and the terms are summed
    with ``math.fsum``. The result therefore does not depend on set iteration
    order, which varies between runs for string ids.

    Returns:
        ``(score, witnesses)``, where ``witnesses`` is the sorted list of
        contributing neighbor ids. ``(0.0, [])`` when nothing is shared.
    """
    witnesses: list[str] = []
    terms: list[float] = []
    for z in sorted(ws.undirected[a] & ws.undirected[b]):
        degree = len(ws.undirected[z])
        if degree > 1:
            terms.append(1.0 / math.log(degree))
            witnesses.append(z)
    return math.fsum(terms), witnesses
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _has_directed_path(ws: _Workspace, start: str, target: str,
                       edge_types: set[str]) -> bool:
    """Tell whether ``target`` is reachable from ``start``.

    Only semantic edges whose type is in ``edge_types`` are followed, and
    only from src to dst. A node always reaches itself.
    """
    successors: dict[str, set[str]] = {}
    for link in ws.semantic_edges:
        if link.type not in edge_types:
            continue
        successors.setdefault(link.src, set()).add(link.dst)

    visited = {start}
    pending = [start]
    while pending:
        node = pending.pop()
        if node == target:
            return True
        unseen = successors.get(node, set()) - visited
        visited |= unseen
        pending.extend(unseen)
    return False
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def serves_candidates(ws: _Workspace, limit: int) -> list[dict]:
    """Propose SERVES_CANDIDATE links from contributors to goals.

    For every goal node and every idea/decision/project node, a proposal is
    made when the pair has no direct semantic edge, its Adamic-Adar score is
    positive, and no directed path over GOAL_EDGE_TYPES leads from the
    contributor to the goal. Results are ordered by descending score, then
    src, then dst, and cut to ``limit``.
    """
    g = ws.g
    contributor_types = {"idea", "decision", "project"}
    goal_ids = []
    contributor_ids = []
    for nid in ws.ids:
        kind = g.nodes[nid].type
        if kind == "goal":
            goal_ids.append(nid)
        if kind in contributor_types:
            contributor_ids.append(nid)

    proposals = []
    for goal in goal_ids:
        for candidate in contributor_ids:
            if frozenset((candidate, goal)) in ws.direct_links:
                continue
            score, witnesses = _adamic_adar(ws, candidate, goal)
            if score <= 0:
                continue
            if _has_directed_path(ws, candidate, goal, GOAL_EDGE_TYPES):
                continue
            shared = len(witnesses)
            proposals.append(_proposal(
                g, candidate, goal, "SERVES_CANDIDATE",
                score=score,
                rationale=f"shares {shared} neighbors with the goal but has no contribution path to it",
                evidence=witnesses,
            ))
    proposals.sort(key=lambda p: (-p["score"], p["src"], p["dst"]))
    return proposals[:limit]
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def related_candidates(ws: _Workspace, limit: int) -> list[dict]:
    """Propose RELATES_TO links via Adamic-Adar over every unlinked pair.

    Each unordered pair of ids in ``ws.ids`` without a direct semantic edge
    is scored. It is proposed only when the score is at least 1.0 and at
    least two shared neighbors contributed. Results are ordered by descending
    score, then src, then dst, and cut to ``limit``.
    """
    g = ws.g
    node_ids = ws.ids
    proposals = []
    for offset, first in enumerate(node_ids):
        for second in node_ids[offset + 1:]:
            if frozenset((first, second)) in ws.direct_links:
                continue
            score, witnesses = _adamic_adar(ws, first, second)
            strong_enough = not (score < 1.0 or len(witnesses) < 2)
            if strong_enough:
                summary = f"Adamic-Adar {round(score, 3)} via {len(witnesses)} shared neighbors"
                proposals.append(_proposal(
                    g, first, second, "RELATES_TO",
                    score=score,
                    rationale=summary,
                    evidence=witnesses,
                ))
    proposals.sort(key=lambda p: (-p["score"], p["src"], p["dst"]))
    return proposals[:limit]
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
# ---------- 5. question debt ----------------------------------------------------
|
|
326
|
+
|
|
327
|
+
def question_debt(ws: _Workspace, limit: int) -> dict:
    """Open questions ranked by how central and how old they are.

    A question counts as answered when a decision points at it via ABOUT; those
    detected pairs are reported as ANSWERS edges. The number of SUPPORTED_BY
    edges leaving a question is reported as `evidence_edges` but does not
    enter `debt_score`, which is `centrality * (1 + age_days)`.

    Age is measured against the newest node `created_at` in the workspace,
    not the wall clock. It is 0.0 when either timestamp is missing.

    Args:
        ws: Shared workspace; reads `g`, `ids`, `semantic_edges`, `pagerank`.
        limit: Maximum number of open questions returned.

    Returns:
        Dict with `open` (records sorted by descending `debt_score`, then id)
        and `answers_detected` (all decision -> question pairs, not limited).
    """
    g = ws.g
    answered_by: dict[str, list[str]] = defaultdict(list)
    evidence_count: dict[str, int] = defaultdict(int)
    for edge in ws.semantic_edges:
        if edge.type == "ABOUT" and g.nodes[edge.src].type == "decision" \
                and g.nodes[edge.dst].type == "question":
            answered_by[edge.dst].append(edge.src)
        if edge.type == "SUPPORTED_BY" and g.nodes[edge.src].type == "question":
            evidence_count[edge.src] += 1

    latest = None
    for node_id in ws.ids:
        ts = _parse_ts(g.nodes[node_id].created_at)
        if ts:
            latest = max(latest, ts) if latest else ts

    # `default` keeps an empty graph (no pagerank entries) from raising
    # ValueError; the `or 1.0` avoids dividing by zero when all ranks are 0.
    max_rank = max(ws.pagerank.values(), default=0.0) or 1.0
    open_questions, answers = [], []
    for node_id in ws.ids:
        node = g.nodes[node_id]
        if node.type != "question":
            continue
        deciders = sorted(answered_by.get(node_id, []))
        for decision_id in deciders:
            answers.append({"src": decision_id, "dst": node_id, "type": "ANSWERS"})
        if deciders:
            continue
        created = _parse_ts(node.created_at)
        age_days = (latest - created).total_seconds() / 86400.0 if latest and created else 0.0
        weight = ws.pagerank.get(node_id, 0.0) / max_rank
        open_questions.append({
            "id": node_id,
            "label": node.label,
            "age_days": round(age_days, 1),
            "evidence_edges": evidence_count.get(node_id, 0),
            "centrality": round(weight, 4),
            "debt_score": round(weight * (1.0 + age_days), 4),
            "flag": "UNANSWERED",
        })
    open_questions.sort(key=lambda r: (-r["debt_score"], r["id"]))
    answers.sort(key=lambda r: (r["src"], r["dst"]))
    return {"open": open_questions[:limit], "answers_detected": answers}
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
# ---------- 6. corroboration ----------------------------------------------------
|
|
377
|
+
|
|
378
|
+
def corroboration(ws: _Workspace, limit: int) -> dict:
    """How many independent sources back each claim. Single-source memories are
    one bad transcript away from being wrong.

    Args:
        ws: Shared workspace; reads `g`, `ids`, `node_sources`, `pagerank`.
        limit: Maximum number of single-source records returned.

    Returns:
        Dict with `source_count_distribution` (source count -> number of
        nodes, keys ascending) and `single_source` (nodes backed by exactly
        one source, sorted by descending centrality, then id).
    """
    g = ws.g
    # `default` keeps an empty graph (no pagerank entries) from raising
    # ValueError; the `or 1.0` avoids dividing by zero when all ranks are 0.
    max_rank = max(ws.pagerank.values(), default=0.0) or 1.0
    single, distribution = [], defaultdict(int)
    for node_id in ws.ids:
        backing = ws.node_sources.get(node_id, set())
        count = len(backing)
        distribution[count] += 1
        if count == 1:
            source_id = next(iter(backing))
            single.append({
                "id": node_id,
                "type": g.nodes[node_id].type,
                "label": g.nodes[node_id].label,
                "source": source_id,
                "excerpt": _excerpt_for(g, node_id, source_id),
                "centrality": round(ws.pagerank.get(node_id, 0.0) / max_rank, 4),
                "flag": "SINGLE_SOURCE",
            })
    single.sort(key=lambda r: (-r["centrality"], r["id"]))
    return {
        "source_count_distribution": dict(sorted(distribution.items())),
        "single_source": single[:limit],
    }
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
# ---------- 7. de-spined bridges ------------------------------------------------
|
|
406
|
+
|
|
407
|
+
def despined_bridges(ws: _Workspace, limit: int, max_communities: int = 12) -> dict:
    """Find cross-community edges after removing dominant hub nodes.

    The "spine" is made of up to two nodes: those among the two highest
    betweenness scores whose score exceeds twice the (upper) median of all
    nonzero scores. Nothing is removed when fewer than three nodes have a
    nonzero score. Betweenness and the community partition are then
    recomputed on the remaining nodes. Every semantic edge whose endpoints
    fall in two different communities is reported, scored by the sum of its
    endpoints' recomputed betweenness.

    Returns:
        Dict with `spine_removed`, `communities` (number of distinct
        partition values), `bridge_edges` and `bridge_nodes`, the last two
        cut to ``limit``.
    """
    by_centrality = sorted(ws.betweenness.items(), key=lambda item: (-item[1], item[0]))
    positive = sorted(score for _, score in by_centrality if score > 0)

    spine: list[str] = []
    if len(positive) >= 3:
        cutoff = 2 * positive[len(positive) // 2]
        spine = [
            nid for nid, score in by_centrality[:2]
            if score > cutoff and score > 0
        ]

    survivors = [nid for nid in ws.ids if nid not in spine]
    pruned = {}
    for nid in survivors:
        pruned[nid] = {peer for peer in ws.undirected[nid] if peer not in spine}
    centrality = _betweenness(survivors, pruned)
    membership = _community_partition(survivors, pruned, max_communities)

    crossings = []
    for link in ws.semantic_edges:
        if link.src in spine or link.dst in spine:
            continue
        side_a = membership.get(link.src)
        side_b = membership.get(link.dst)
        if side_a is None or side_b is None or side_a == side_b:
            continue
        combined = centrality.get(link.src, 0.0) + centrality.get(link.dst, 0.0)
        crossings.append({
            "src": link.src,
            "dst": link.dst,
            "type": "BRIDGES",
            "edge_type": link.type,
            "communities": sorted((side_a, side_b)),
            "score": round(combined, 6),
            "src_label": _label(ws.g, link.src),
            "dst_label": _label(ws.g, link.dst),
        })
    crossings.sort(key=lambda b: (-b["score"], b["src"], b["dst"]))

    connectors = [
        {"id": nid, "label": _label(ws.g, nid), "betweenness": round(val, 6)}
        for nid, val in centrality.items() if val > 0
    ]
    connectors.sort(key=lambda r: (-r["betweenness"], r["id"]))

    return {
        "spine_removed": spine,
        "communities": len(set(membership.values())),
        "bridge_edges": crossings[:limit],
        "bridge_nodes": connectors[:limit],
    }
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
# ---------- 8. tensions ---------------------------------------------------------
|
|
464
|
+
|
|
465
|
+
def tensions(ws: _Workspace, limit: int) -> list[dict]:
    """Propose TENSION_WITH links around contested nodes.

    Two patterns are reported:

    * a node that is the src of at least one SUPPORTED_BY edge and the dst of
      at least one CHALLENGES edge: one proposal per challenger, scored by
      the total number of supporting plus challenging edges;
    * a node that is the dst of both SERVES and CHALLENGES edges: one
      proposal (score 1.0) from each challenger to each other node serving it.

    Results are ordered by descending score, then src, then dst, and cut to
    ``limit``.
    """
    g = ws.g
    backers: dict[str, list[str]] = {}
    challengers_of: dict[str, list[str]] = {}
    servers_of: dict[str, list[str]] = {}
    for link in ws.semantic_edges:
        kind = link.type
        if kind == "SUPPORTED_BY":
            backers.setdefault(link.src, []).append(link.dst)
        elif kind == "CHALLENGES":
            challengers_of.setdefault(link.dst, []).append(link.src)
        elif kind == "SERVES":
            servers_of.setdefault(link.dst, []).append(link.src)

    proposals = []
    for node_id in ws.ids:
        if node_id not in backers or node_id not in challengers_of:
            continue
        support = backers[node_id]
        opposition = challengers_of[node_id]
        n_support, n_oppose = len(support), len(opposition)
        for challenger in sorted(opposition):
            proposals.append(_proposal(
                g, challenger, node_id, "TENSION_WITH",
                score=float(n_support + n_oppose),
                rationale=(
                    f"target has {n_support} supporting and "
                    f"{n_oppose} challenging edges — contested claim"
                ),
                evidence=sorted(support),
            ))

    for goal, contributors in servers_of.items():
        opposition = challengers_of.get(goal)
        if opposition is None:
            continue
        for challenger in sorted(opposition):
            for contributor in sorted(contributors):
                if contributor != challenger:
                    proposals.append(_proposal(
                        g, challenger, contributor, "TENSION_WITH",
                        score=1.0,
                        rationale=(
                            f"challenges {goal}, which this node SERVES — "
                            "the contribution inherits the risk"
                        ),
                        evidence=[goal],
                    ))
    proposals.sort(key=lambda p: (-p["score"], p["src"], p["dst"]))
    return proposals[:limit]
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
# ---------- assembly ------------------------------------------------------------
|
|
514
|
+
|
|
515
|
+
def build_discovery(g: Graph, *, limit: int = 10, stale_days: int = 30,
                    min_co_sources: int = 2) -> dict:
    """Run every analysis over one shared workspace and collect the report.

    Args:
        g: Graph to analyse.
        limit: Row cap passed to every analysis.
        stale_days: Threshold passed to ``staleness_radar``.
        min_co_sources: Threshold passed to ``co_mention_candidates``.

    Returns:
        Dict holding the schema version, node/edge counts under ``stats``,
        and one entry per analysis.
    """
    ws = _Workspace(g)
    report: dict = {
        "schema_version": SCHEMA_VERSION,
        "stats": {
            "semantic_nodes": len(ws.ids),
            "semantic_edges": len(ws.semantic_edges),
        },
    }
    report["staleness_radar"] = staleness_radar(ws, stale_days, limit)
    report["co_mentions"] = co_mention_candidates(ws, min_co_sources, limit)
    report["serves_candidates"] = serves_candidates(ws, limit)
    report["related_candidates"] = related_candidates(ws, limit)
    report["question_debt"] = question_debt(ws, limit)
    report["corroboration"] = corroboration(ws, limit)
    report["bridges"] = despined_bridges(ws, limit)
    report["tensions"] = tensions(ws, limit)
    return report
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
def extract_candidates(report: dict) -> dict:
    """Collect all derived-edge proposals of a report into one payload.

    Proposals from the co-mention, serves, related and tension sections are
    copied as they are. Each bridge edge is converted into a proposal-shaped
    record with no evidence sources and status "proposed". Missing sections
    are treated as empty.
    """
    proposal_sections = ("co_mentions", "serves_candidates", "related_candidates", "tensions")
    queue = []
    for name in proposal_sections:
        queue += report.get(name, [])

    bridge_edges = report.get("bridges", {}).get("bridge_edges", [])
    queue += [
        {
            "src": bridge["src"],
            "dst": bridge["dst"],
            "type": "BRIDGES",
            "score": bridge["score"],
            "rationale": f"connects communities {bridge['communities']}",
            "evidence_sources": [],
            "src_label": bridge["src_label"],
            "dst_label": bridge["dst_label"],
            "status": "proposed",
        }
        for bridge in bridge_edges
    ]
    return {
        "schema_version": SCHEMA_VERSION,
        "note": "Derived-edge proposals. Review before promoting; discover never mutates the graph.",
        "proposals": queue,
    }
|
|
558
|
+
|
|
559
|
+
|
|
560
|
+
# ---------- console report ------------------------------------------------------
|
|
561
|
+
|
|
562
|
+
def _print_pairs(title: str, rows: list[dict], empty: str) -> None:
    """Print a titled section of pair proposals to stdout.

    The title is preceded by a blank line. Each row prints its endpoints,
    type and score, followed by its rationale on a second line. When there
    are no rows, the ``empty`` message is printed instead.
    """
    print(f"\n{title}")
    if rows:
        for entry in rows:
            headline = f" {entry['src']} <-> {entry['dst']} [{entry['type']} {entry['score']}]"
            print(headline)
            print(f" {entry['rationale']}")
    else:
        print(f" {empty}")
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def print_report(report: dict) -> None:
    """Print a human-readable summary of a discovery report to stdout.

    Sections appear in a fixed order: header counts, staleness radar,
    co-mentions, goal-alignment candidates, link predictions, question debt,
    corroboration, bridges, tensions. Expects the dict layout produced by
    ``build_discovery``.
    """
    counts = report["stats"]
    print(f"discover: {counts['semantic_nodes']} semantic nodes, "
          f"{counts['semantic_edges']} semantic edges - proposals only, graph untouched")

    # --- staleness ---
    radar = report["staleness_radar"]
    stale_rows = radar["stale"]
    print(f"\nStaleness radar (>={radar['stale_days_threshold']} days behind latest activity)")
    if not stale_rows:
        print(" nothing stale - memory is warm")
    for entry in stale_rows:
        print(f" {entry['id']} {entry['days_stale']}d cold, importance {entry['importance']}")

    # --- pair proposals ---
    _print_pairs("Co-mention candidates (recur across sources, never linked)",
                 report["co_mentions"], "no multi-source co-mentions without edges")
    _print_pairs("Goal-alignment candidates (close to a goal, no contribution path)",
                 report["serves_candidates"], "every entangled node already has a path to its goals")
    _print_pairs("Link predictions (Adamic-Adar)",
                 report["related_candidates"], "no strong non-adjacent overlaps")

    # --- question debt ---
    debt = report["question_debt"]
    open_rows = debt["open"]
    print(f"\nQuestion debt ({len(open_rows)} open, "
          f"{len(debt['answers_detected'])} answered via decisions)")
    for entry in open_rows:
        print(f" {entry['id']} debt {entry['debt_score']} "
              f"(age {entry['age_days']}d, evidence edges {entry['evidence_edges']})")

    # --- corroboration ---
    corro = report["corroboration"]
    print(f"\nCorroboration (source-count distribution {corro['source_count_distribution']})")
    for entry in corro["single_source"]:
        print(f" {entry['id']} single source: {entry['source']}")

    # --- bridges ---
    bridges = report["bridges"]
    removed = ", ".join(bridges["spine_removed"])
    spine = removed if removed else "none"
    print(f"\nBridges after removing spine [{spine}] "
          f"({bridges['communities']} communities)")
    for entry in bridges["bridge_edges"]:
        print(f" {entry['src']} --{entry['edge_type']}-- {entry['dst']} "
              f"communities {entry['communities']}")

    _print_pairs("Tensions", report["tensions"], "no contested claims detected")
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
# ---------- CLI -----------------------------------------------------------------
|
|
615
|
+
|
|
616
|
+
def run_discover(args: list[str]) -> int:
    """CLI entry point for `mykg discover`.

    Loads the graph, builds the discovery report, then:

    * with `--out -`, writes the report JSON to stdout and nothing else;
    * otherwise prints the console report and, when `--out PATH` is given,
      also writes the report JSON to PATH;
    * with `--candidates PATH`, writes the flattened proposal queue to PATH.

    When the report is streamed to stdout, the candidates status line goes to
    stderr so that stdout remains a single valid JSON document.

    Args:
        args: Command-line arguments, without the program name.

    Returns:
        0 on success. argparse exits the process itself on invalid arguments.
    """
    parser = argparse.ArgumentParser(prog="mykg discover")
    parser.add_argument("--graph", default=None,
                        help="Graph JSON path. Defaults to MYGRAPH_PATH or local graph.")
    parser.add_argument("--out", default=None,
                        help="Write full discovery report JSON here ('-' for stdout).")
    parser.add_argument("--candidates", default=None,
                        help="Write derived-edge proposals (promotion queue) here.")
    parser.add_argument("--limit", type=int, default=10, help="Rows per section.")
    parser.add_argument("--stale-days", type=int, default=30,
                        help="Days behind latest graph activity before a node is stale.")
    parser.add_argument("--min-co-sources", type=int, default=2,
                        help="Distinct sources required for a co-mention proposal.")
    parsed = parser.parse_args(args)

    g = Graph.load(parsed.graph)
    report = build_discovery(g, limit=parsed.limit, stale_days=parsed.stale_days,
                             min_co_sources=parsed.min_co_sources)

    json_on_stdout = parsed.out == "-"
    if json_on_stdout:
        json.dump(report, sys.stdout, indent=2, sort_keys=True)
        print()
    else:
        print_report(report)
        if parsed.out:
            path = Path(parsed.out).expanduser().resolve()
            path.write_text(json.dumps(report, indent=2, sort_keys=True) + "\n", encoding="utf-8")
            print(f"\ndiscover: wrote {path}")

    if parsed.candidates:
        payload = extract_candidates(report)
        path = Path(parsed.candidates).expanduser().resolve()
        path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
        # Keep stdout machine-readable when it carries the JSON report.
        status_stream = sys.stderr if json_on_stdout else sys.stdout
        print(f"discover: wrote {len(payload['proposals'])} proposals -> {path}",
              file=status_stream)
    return 0
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
if __name__ == "__main__":
    # Direct script execution: forward the CLI arguments and use the returned
    # status as the process exit code.
    raise SystemExit(run_discover(sys.argv[1:]))
|
mygraph/eval_log.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""
|
|
2
|
+
eval_log.py — JSONL appender for eval_record.jsonl.
|
|
3
|
+
|
|
4
|
+
Every review action, provenance violation, stale-edge flag, relational probe,
|
|
5
|
+
and source-candidate suggestion writes one line here. This is the v1 corpus that
|
|
6
|
+
v2+ will use for prompt refinement / edge weighting / RL.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
from datetime import datetime, timezone
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
HERE = Path(__file__).parent
|
|
16
|
+
EVAL_LOG = HERE / "eval_record.jsonl"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def now() -> str:
    """Return the current UTC time as an ISO-8601 string with offset."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def append(record: dict, path: Path = EVAL_LOG) -> None:
    """Append one record to the JSONL log as a single line.

    A "ts" key holding the current time is added to ``record`` itself when
    it has none, so the caller's dict is modified. The file is opened in
    append mode as UTF-8, and non-ASCII characters are written unescaped.
    """
    if "ts" not in record:
        record["ts"] = now()
    with path.open("a", encoding="utf-8") as handle:
        line = json.dumps(record, ensure_ascii=False)
        handle.write(f"{line}\n")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def append_many(records: list[dict], path: Path = EVAL_LOG) -> None:
    """Append several records to the JSONL log, one line each.

    Records lacking a "ts" key all receive the same timestamp, taken once
    per call; it is set on the record dicts themselves. Nothing is written
    and the file is not touched when ``records`` is empty.

    Every record is serialized before the file is opened, so a record that
    cannot be encoded as JSON raises before anything is written and never
    leaves a partial batch in the log.

    Raises:
        TypeError: if a record is not JSON-serializable.
    """
    if not records:
        return
    ts = now()
    lines = []
    for r in records:
        r.setdefault("ts", ts)
        lines.append(json.dumps(r, ensure_ascii=False) + "\n")
    with path.open("a", encoding="utf-8") as f:
        f.writelines(lines)
|