knowledge-worker 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mygraph/discover.py ADDED
@@ -0,0 +1,654 @@
1
+ """
2
+ discover.py - second-order network analytics and derived-edge proposals.
3
+
4
+ Usage:
5
+ mykg discover
6
+ mykg discover --out discovery.json --candidates discovery.candidates.json
7
+
8
+ Where `mykg audit` ranks what already exists, `mykg discover` infers what the
9
+ graph implies but does not yet say. It runs eight read-only analyses:
10
+
11
+ staleness_radar important nodes whose evidence has gone cold
12
+ co_mentions pairs that recur across sources with no direct edge
13
+ serves_candidates ideas/decisions structurally close to a goal they
14
+ do not yet SERVE
15
+ related_candidates Adamic-Adar link prediction over the semantic graph
16
+ question_debt open questions with no answering decision or evidence
17
+ corroboration claims that hang on a single source
18
+ bridges cross-community connectors after removing hub "spines"
19
+ tensions nodes that are both supported and challenged, and
20
+ conflicts between contributions to the same goal
21
+
22
+ Every result is a PROPOSAL. Discover never mutates the graph: derived edges
23
+ (CO_MENTIONED_WITH, SERVES_CANDIDATE, RELATES_TO, BRIDGES, TENSION_WITH) are
24
+ written to a candidates file for human review — AI proposes, provenance
25
+ verifies, the owner promotes.
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ import argparse
31
+ import json
32
+ import math
33
+ import sys
34
+ from collections import defaultdict
35
+ from datetime import datetime
36
+ from pathlib import Path
37
+
38
+ try:
39
+ from .mygraph import Edge, Graph
40
+ from .memory_audit import (
41
+ PROVENANCE_EDGE_TYPES,
42
+ _add_projection,
43
+ _betweenness,
44
+ _build_adjacency,
45
+ _community_partition,
46
+ _pagerank,
47
+ _semantic_edges,
48
+ _semantic_ids,
49
+ _source_projection_edges,
50
+ )
51
+ except ImportError: # direct script execution: python mygraph/discover.py
52
+ from mygraph import Edge, Graph
53
+ from memory_audit import (
54
+ PROVENANCE_EDGE_TYPES,
55
+ _add_projection,
56
+ _betweenness,
57
+ _build_adjacency,
58
+ _community_partition,
59
+ _pagerank,
60
+ _semantic_edges,
61
+ _semantic_ids,
62
+ _source_projection_edges,
63
+ )
64
+
65
# Version stamp emitted under the "schema_version" key of both the discovery
# report and the candidates payload.
SCHEMA_VERSION = 1

# Edge types that express goal contribution, used for serves-gap detection.
GOAL_EDGE_TYPES = {"SERVES", "HAS_IDEA", "ABOUT", "INVOLVES"}
69
+
70
+
71
+ # ---------- shared scaffolding -------------------------------------------------
72
+
73
+ def _parse_ts(raw: str | None) -> datetime | None:
74
+ if not raw:
75
+ return None
76
+ try:
77
+ return datetime.fromisoformat(raw.replace("Z", "+00:00"))
78
+ except ValueError:
79
+ return None
80
+
81
+
82
+ def _node_sources(g: Graph) -> dict[str, set[str]]:
83
+ """Distinct provenance sources per non-source node."""
84
+ sources: dict[str, set[str]] = defaultdict(set)
85
+ for edge in g.edges:
86
+ if edge.type not in PROVENANCE_EDGE_TYPES:
87
+ continue
88
+ src = g.nodes.get(edge.src)
89
+ dst = g.nodes.get(edge.dst)
90
+ if src and dst and src.type != "source" and dst.type == "source":
91
+ sources[edge.src].add(edge.dst)
92
+ elif src and dst and src.type == "source" and dst.type != "source":
93
+ sources[edge.dst].add(edge.src)
94
+ return sources
95
+
96
+
97
+ def _excerpt_for(g: Graph, node_id: str, source_id: str) -> str:
98
+ for edge in g.edges:
99
+ if edge.type not in PROVENANCE_EDGE_TYPES or not edge.excerpt:
100
+ continue
101
+ if {edge.src, edge.dst} == {node_id, source_id}:
102
+ return edge.excerpt
103
+ return ""
104
+
105
+
106
+ def _label(g: Graph, node_id: str) -> str:
107
+ node = g.nodes.get(node_id)
108
+ return node.label if node else node_id
109
+
110
+
111
def _proposal(g: Graph, src: str, dst: str, type_: str, score: float,
              rationale: str, evidence: list[str]) -> dict:
    """Build one derived-edge proposal record.

    The score is rounded to six decimals, the evidence ids are sorted, both
    endpoint labels are resolved through ``_label``, and the status is always
    ``"proposed"``.
    """
    record: dict = {"src": src, "dst": dst, "type": type_}
    record["score"] = round(score, 6)
    record["rationale"] = rationale
    record["evidence_sources"] = sorted(evidence)
    record["src_label"] = _label(g, src)
    record["dst_label"] = _label(g, dst)
    record["status"] = "proposed"
    return record
124
+
125
+
126
class _Workspace:
    """Shared semantic projection, built once and handed to every analysis.

    Attributes set by ``__init__``: ``g``, ``ids``, ``semantic_edges``,
    ``directed``, ``undirected``, ``pagerank``, ``betweenness``,
    ``node_sources`` and ``direct_links``.
    """

    def __init__(self, g: Graph):
        node_ids = _semantic_ids(g)
        member_ids = set(node_ids)
        edges = _semantic_edges(g, member_ids)
        directed, undirected = _build_adjacency(node_ids, edges)
        # The return value is discarded, so this presumably updates both
        # adjacency maps in place with the source-projection edges.
        _add_projection(directed, undirected,
                        _source_projection_edges(g, member_ids))

        self.g = g
        self.ids = node_ids
        self.semantic_edges = edges
        self.directed = directed
        self.undirected = undirected
        self.pagerank = _pagerank(node_ids, directed)
        self.betweenness = _betweenness(node_ids, undirected)
        self.node_sources = _node_sources(g)

        # Unordered endpoint pairs of the semantic edges only (no source
        # projection); used for the "no existing edge" checks when proposing
        # new links.
        linked: set[frozenset[str]] = set()
        for edge in edges:
            linked.add(frozenset((edge.src, edge.dst)))
        self.direct_links: set[frozenset[str]] = linked
145
+
146
+
147
+ # ---------- 1. staleness radar -------------------------------------------------
148
+
149
def staleness_radar(ws: _Workspace, stale_days: int, limit: int) -> dict:
    """Important nodes whose evidence trail has gone cold.

    Recency is the newest ``last_seen``/``created_at`` on any incident edge
    (or the node's own ``created_at``). The clock reference is the newest
    timestamp in the whole graph, so results are deterministic for a
    committed graph file.

    Args:
        ws: Shared workspace; ``g``, ``ids`` and ``pagerank`` are read.
        stale_days: Minimum age in days, relative to the reference time, for
            a node to be reported.
        limit: Maximum number of rows returned.

    Returns:
        A dict with ``reference_time`` (ISO string, or ``None`` when the graph
        carries no parseable timestamp), ``stale_days_threshold`` and
        ``stale`` (rows sorted by descending ``staleness_score``, then id).
    """
    g = ws.g
    latest: datetime | None = None
    recency: dict[str, datetime] = {}
    for node_id in ws.ids:
        ts = _parse_ts(g.nodes[node_id].created_at)
        if ts:
            recency[node_id] = ts
            latest = max(latest, ts) if latest else ts
    for edge in g.edges:
        # Prefer the edge's last_seen; fall back to its creation time.
        ts = _parse_ts(edge.last_seen) or _parse_ts(edge.created_at)
        if not ts:
            continue
        latest = max(latest, ts) if latest else ts
        for endpoint in (edge.src, edge.dst):
            if endpoint in recency:
                recency[endpoint] = max(recency[endpoint], ts)
            elif endpoint in g.nodes and g.nodes[endpoint].type != "source":
                # First timestamp seen for a known non-source node.
                recency[endpoint] = ts

    if not latest:
        return {"reference_time": None, "stale_days_threshold": stale_days, "stale": []}

    # ``default`` keeps this from raising when the workspace has no semantic
    # nodes (empty pagerank) but edges still carried timestamps; ``or 1.0``
    # avoids dividing by zero when every rank is 0.
    max_rank = max(ws.pagerank.values(), default=0.0) or 1.0
    records = []
    for node_id, seen in recency.items():
        days = (latest - seen).total_seconds() / 86400.0
        if days < stale_days:
            continue
        importance = ws.pagerank.get(node_id, 0.0) / max_rank
        node = g.nodes[node_id]
        records.append({
            "id": node_id,
            "type": node.type,
            "label": node.label,
            "days_stale": round(days, 1),
            "importance": round(importance, 4),
            "staleness_score": round(importance * days, 4),
            "flag": "STALE",
        })
    records.sort(key=lambda r: (-r["staleness_score"], r["id"]))
    return {
        "reference_time": latest.isoformat(),
        "stale_days_threshold": stale_days,
        "stale": records[:limit],
    }
201
+
202
+
203
+ # ---------- 2. co-mention inference --------------------------------------------
204
+
205
def co_mention_candidates(ws: _Workspace, min_sources: int, limit: int) -> list[dict]:
    """Propose CO_MENTIONED_WITH edges for pairs of nodes that appear together
    in at least ``min_sources`` distinct sources yet have no direct semantic
    edge. The score is the number of shared sources; at most ``limit`` rows
    are returned, highest score first."""
    graph = ws.g

    # Invert node -> sources into source -> nodes.
    members_of: dict[str, set[str]] = defaultdict(set)
    for node_id, source_ids in ws.node_sources.items():
        for source_id in source_ids:
            members_of[source_id].add(node_id)

    # Every unordered pair inside one source is a co-mention in that source.
    shared: dict[frozenset[str], set[str]] = defaultdict(set)
    for source_id, members in members_of.items():
        ranked = sorted(members)
        for offset, left in enumerate(ranked, start=1):
            for right in ranked[offset:]:
                shared[frozenset((left, right))].add(source_id)

    found = []
    for pair, hits in shared.items():
        hit_count = len(hits)
        if hit_count >= min_sources and pair not in ws.direct_links:
            first, second = sorted(pair)
            found.append(_proposal(
                graph, first, second, "CO_MENTIONED_WITH",
                score=float(hit_count),
                rationale=(
                    f"co-mentioned in {hit_count} distinct sources "
                    "with no direct edge"
                ),
                evidence=list(hits),
            ))
    found.sort(key=lambda p: (-p["score"], p["src"], p["dst"]))
    return found[:limit]
238
+
239
+
240
+ # ---------- 3+4. link prediction (serves gaps + related pairs) ------------------
241
+
242
+ def _adamic_adar(ws: _Workspace, a: str, b: str) -> tuple[float, list[str]]:
243
+ shared = ws.undirected[a] & ws.undirected[b]
244
+ score = 0.0
245
+ witnesses = []
246
+ for z in shared:
247
+ degree = len(ws.undirected[z])
248
+ if degree > 1:
249
+ score += 1.0 / math.log(degree)
250
+ witnesses.append(z)
251
+ return score, sorted(witnesses)
252
+
253
+
254
+ def _has_directed_path(ws: _Workspace, start: str, target: str,
255
+ edge_types: set[str]) -> bool:
256
+ allowed: dict[str, set[str]] = defaultdict(set)
257
+ for edge in ws.semantic_edges:
258
+ if edge.type in edge_types:
259
+ allowed[edge.src].add(edge.dst)
260
+ frontier, seen = [start], {start}
261
+ while frontier:
262
+ current = frontier.pop()
263
+ if current == target:
264
+ return True
265
+ for nxt in allowed[current]:
266
+ if nxt not in seen:
267
+ seen.add(nxt)
268
+ frontier.append(nxt)
269
+ return False
270
+
271
+
272
def serves_candidates(ws: _Workspace, limit: int) -> list[dict]:
    """Ideas/decisions/projects structurally entangled with a goal they have
    no contribution path to. Surfaces the work the graph cannot yet explain.

    A (contributor, goal) pair is proposed as SERVES_CANDIDATE when it has no
    direct semantic edge, a positive Adamic-Adar score, and no directed path
    from contributor to goal over ``GOAL_EDGE_TYPES`` edges.

    Args:
        ws: Shared workspace.
        limit: Maximum number of proposals returned.

    Returns:
        Proposals sorted by descending score, then src, then dst.
    """
    g = ws.g
    goals = [nid for nid in ws.ids if g.nodes[nid].type == "goal"]
    contributors = [
        nid for nid in ws.ids
        if g.nodes[nid].type in {"idea", "decision", "project"}
    ]

    # Reverse adjacency over contribution edges, built once. Walking it
    # backwards from a goal yields every node that has a directed
    # contribution path to that goal, so reachability costs one traversal
    # per goal instead of rebuilding the adjacency and searching again for
    # every (goal, contributor) pair.
    predecessors: dict[str, set[str]] = defaultdict(set)
    for edge in ws.semantic_edges:
        if edge.type in GOAL_EDGE_TYPES:
            predecessors[edge.dst].add(edge.src)

    proposals = []
    for goal in goals:
        reaches_goal = {goal}
        frontier = [goal]
        while frontier:
            current = frontier.pop()
            for prev in predecessors.get(current, ()):
                if prev not in reaches_goal:
                    reaches_goal.add(prev)
                    frontier.append(prev)

        for node_id in contributors:
            if frozenset((node_id, goal)) in ws.direct_links:
                continue
            if node_id in reaches_goal:
                # Already contributes to the goal, directly or transitively.
                continue
            score, witnesses = _adamic_adar(ws, node_id, goal)
            if score <= 0:
                continue
            proposals.append(_proposal(
                g, node_id, goal, "SERVES_CANDIDATE",
                score=score,
                rationale=(
                    f"shares {len(witnesses)} neighbors with the goal "
                    "but has no contribution path to it"
                ),
                evidence=witnesses,
            ))
    proposals.sort(key=lambda p: (-p["score"], p["src"], p["dst"]))
    return proposals[:limit]
300
+
301
+
302
def related_candidates(ws: _Workspace, limit: int) -> list[dict]:
    """Propose RELATES_TO edges between non-adjacent semantic nodes whose
    neighborhoods overlap strongly: Adamic-Adar score of at least 1.0 backed
    by at least two shared neighbors. Returns the top ``limit`` proposals."""
    graph = ws.g
    node_ids = ws.ids
    found = []
    for offset, first in enumerate(node_ids, start=1):
        for second in node_ids[offset:]:
            if frozenset((first, second)) in ws.direct_links:
                continue
            strength, shared = _adamic_adar(ws, first, second)
            if not (strength < 1.0 or len(shared) < 2):
                found.append(_proposal(
                    graph, first, second, "RELATES_TO",
                    score=strength,
                    rationale=f"Adamic-Adar {round(strength, 3)} via {len(shared)} shared neighbors",
                    evidence=shared,
                ))
    found.sort(key=lambda p: (-p["score"], p["src"], p["dst"]))
    return found[:limit]
323
+
324
+
325
+ # ---------- 5. question debt ----------------------------------------------------
326
+
327
def question_debt(ws: _Workspace, limit: int) -> dict:
    """Open questions ranked by how central and how old they are.

    A question counts as answered when a decision points at it via ABOUT;
    those detected pairs are reported as ANSWERS edges. For each unanswered
    question ``debt_score = centrality * (1 + age_days)``, where centrality
    is PageRank normalized by the graph maximum and age is measured against
    the newest node ``created_at``. The number of SUPPORTED_BY edges leaving
    the question is reported as ``evidence_edges`` but does not enter the
    score.

    Args:
        ws: Shared workspace.
        limit: Maximum number of open questions returned.

    Returns:
        ``{"open": [...], "answers_detected": [...]}``.
    """
    g = ws.g
    answered_by: dict[str, list[str]] = defaultdict(list)
    evidence_count: dict[str, int] = defaultdict(int)
    for edge in ws.semantic_edges:
        if edge.type == "ABOUT" and g.nodes[edge.src].type == "decision" \
                and g.nodes[edge.dst].type == "question":
            answered_by[edge.dst].append(edge.src)
        if edge.type == "SUPPORTED_BY" and g.nodes[edge.src].type == "question":
            evidence_count[edge.src] += 1

    # Reference clock: the newest creation time among semantic nodes.
    latest = None
    for node_id in ws.ids:
        ts = _parse_ts(g.nodes[node_id].created_at)
        if ts:
            latest = max(latest, ts) if latest else ts

    # ``default`` keeps an empty graph (no pagerank entries) from raising
    # ValueError; ``or 1.0`` avoids dividing by zero when all ranks are 0.
    max_rank = max(ws.pagerank.values(), default=0.0) or 1.0
    open_questions, answers = [], []
    for node_id in ws.ids:
        node = g.nodes[node_id]
        if node.type != "question":
            continue
        deciders = sorted(answered_by.get(node_id, []))
        for decision_id in deciders:
            answers.append({"src": decision_id, "dst": node_id, "type": "ANSWERS"})
        if deciders:
            continue
        created = _parse_ts(node.created_at)
        age_days = (latest - created).total_seconds() / 86400.0 if latest and created else 0.0
        weight = ws.pagerank.get(node_id, 0.0) / max_rank
        open_questions.append({
            "id": node_id,
            "label": node.label,
            "age_days": round(age_days, 1),
            "evidence_edges": evidence_count.get(node_id, 0),
            "centrality": round(weight, 4),
            "debt_score": round(weight * (1.0 + age_days), 4),
            "flag": "UNANSWERED",
        })
    open_questions.sort(key=lambda r: (-r["debt_score"], r["id"]))
    answers.sort(key=lambda r: (r["src"], r["dst"]))
    return {"open": open_questions[:limit], "answers_detected": answers}
374
+
375
+
376
+ # ---------- 6. corroboration ----------------------------------------------------
377
+
378
def corroboration(ws: _Workspace, limit: int) -> dict:
    """How many independent sources back each claim. Single-source memories
    are one bad transcript away from being wrong.

    Args:
        ws: Shared workspace.
        limit: Maximum number of single-source rows returned.

    Returns:
        ``source_count_distribution`` (number of sources -> number of
        semantic nodes, sorted by key) and ``single_source`` (nodes backed by
        exactly one source, most central first).
    """
    g = ws.g
    # ``default`` keeps an empty graph (no pagerank entries) from raising
    # ValueError; ``or 1.0`` avoids dividing by zero when all ranks are 0.
    max_rank = max(ws.pagerank.values(), default=0.0) or 1.0
    single, distribution = [], defaultdict(int)
    for node_id in ws.ids:
        count = len(ws.node_sources.get(node_id, set()))
        distribution[count] += 1
        if count != 1:
            continue
        source_id = next(iter(ws.node_sources[node_id]))
        single.append({
            "id": node_id,
            "type": g.nodes[node_id].type,
            "label": g.nodes[node_id].label,
            "source": source_id,
            "excerpt": _excerpt_for(g, node_id, source_id),
            "centrality": round(ws.pagerank.get(node_id, 0.0) / max_rank, 4),
            "flag": "SINGLE_SOURCE",
        })
    single.sort(key=lambda r: (-r["centrality"], r["id"]))
    return {
        "source_count_distribution": dict(sorted(distribution.items())),
        "single_source": single[:limit],
    }
403
+
404
+
405
+ # ---------- 7. de-spined bridges ------------------------------------------------
406
+
407
def despined_bridges(ws: _Workspace, limit: int, max_communities: int = 12) -> dict:
    """Find cross-community connectors once the dominant hubs are taken out.

    Up to two "spine" nodes are removed: those ranked first or second by
    betweenness whose value exceeds twice the median of the nonzero
    betweenness values (only when at least three nonzero values exist).
    Betweenness and the community partition are then recomputed on what
    remains. A semantic edge is a bridge when its endpoints land in
    different communities; its score is the sum of both endpoints'
    recomputed betweenness.

    Returns a dict with ``spine_removed``, ``communities`` (count),
    ``bridge_edges`` and ``bridge_nodes`` (both capped at ``limit``).
    """
    graph = ws.g
    by_rank = sorted(ws.betweenness.items(), key=lambda kv: (-kv[1], kv[0]))
    positive = sorted(val for _, val in by_rank if val > 0)

    spine: list[str] = []
    if len(positive) >= 3:
        median = positive[len(positive) // 2]
        spine = [
            nid for nid, val in by_rank[:2]
            if val > 2 * median and val > 0
        ]

    survivors = [nid for nid in ws.ids if nid not in spine]
    pruned = {}
    for nid in survivors:
        pruned[nid] = {nbr for nbr in ws.undirected[nid] if nbr not in spine}
    rescored = _betweenness(survivors, pruned)
    membership = _community_partition(survivors, pruned, max_communities)

    crossing = []
    for link in ws.semantic_edges:
        if link.src in spine or link.dst in spine:
            continue
        side_a = membership.get(link.src)
        side_b = membership.get(link.dst)
        if side_a is None or side_b is None or side_a == side_b:
            continue
        combined = rescored.get(link.src, 0.0) + rescored.get(link.dst, 0.0)
        crossing.append({
            "src": link.src,
            "dst": link.dst,
            "type": "BRIDGES",
            "edge_type": link.type,
            "communities": sorted((side_a, side_b)),
            "score": round(combined, 6),
            "src_label": _label(graph, link.src),
            "dst_label": _label(graph, link.dst),
        })
    crossing.sort(key=lambda b: (-b["score"], b["src"], b["dst"]))

    connectors = [
        {"id": nid, "label": _label(graph, nid), "betweenness": round(val, 6)}
        for nid, val in rescored.items()
        if val > 0
    ]
    connectors.sort(key=lambda r: (-r["betweenness"], r["id"]))

    return {
        "spine_removed": spine,
        "communities": len(set(membership.values())),
        "bridge_edges": crossing[:limit],
        "bridge_nodes": connectors[:limit],
    }
461
+
462
+
463
+ # ---------- 8. tensions ---------------------------------------------------------
464
+
465
def tensions(ws: _Workspace, limit: int) -> list[dict]:
    """Propose TENSION_WITH edges from the contradiction structure.

    Two patterns are reported: (1) a node that has both SUPPORTED_BY and
    incoming CHALLENGES edges gets one proposal per challenger, scored by the
    total number of supporting plus challenging edges; (2) when a node that
    others SERVE is challenged, each challenger is paired with each other
    contributor at a fixed score of 1.0.
    """
    graph = ws.g
    backers: dict[str, list[str]] = defaultdict(list)
    critics: dict[str, list[str]] = defaultdict(list)
    servers: dict[str, list[str]] = defaultdict(list)
    for link in ws.semantic_edges:
        kind = link.type
        if kind == "SUPPORTED_BY":
            backers[link.src].append(link.dst)
        elif kind == "CHALLENGES":
            critics[link.dst].append(link.src)
        elif kind == "SERVES":
            servers[link.dst].append(link.src)

    found = []

    # Pattern 1: contested claims.
    for node_id in ws.ids:
        if not (node_id in backers and node_id in critics):
            continue
        pro = backers[node_id]
        con = critics[node_id]
        for challenger in sorted(con):
            found.append(_proposal(
                graph, challenger, node_id, "TENSION_WITH",
                score=float(len(pro) + len(con)),
                rationale=(
                    f"target has {len(pro)} supporting and "
                    f"{len(con)} challenging edges — contested claim"
                ),
                evidence=sorted(pro),
            ))

    # Pattern 2: contributors to a challenged target.
    for goal, contributors in servers.items():
        if goal not in critics:
            continue
        for challenger in sorted(critics[goal]):
            for contributor in sorted(contributors):
                if contributor != challenger:
                    found.append(_proposal(
                        graph, challenger, contributor, "TENSION_WITH",
                        score=1.0,
                        rationale=(
                            f"challenges {goal}, which this node SERVES — "
                            "the contribution inherits the risk"
                        ),
                        evidence=[goal],
                    ))

    found.sort(key=lambda p: (-p["score"], p["src"], p["dst"]))
    return found[:limit]
511
+
512
+
513
+ # ---------- assembly ------------------------------------------------------------
514
+
515
def build_discovery(g: Graph, *, limit: int = 10, stale_days: int = 30,
                    min_co_sources: int = 2) -> dict:
    """Run every analysis over one shared workspace and assemble the report.

    Args:
        g: Graph to analyze.
        limit: Row cap passed to every section.
        stale_days: Threshold passed to the staleness radar.
        min_co_sources: Distinct sources required for a co-mention proposal.

    Returns:
        The report dict keyed by section name, plus ``schema_version`` and
        ``stats``.
    """
    workspace = _Workspace(g)
    report: dict = {"schema_version": SCHEMA_VERSION}
    report["stats"] = {
        "semantic_nodes": len(workspace.ids),
        "semantic_edges": len(workspace.semantic_edges),
    }
    report["staleness_radar"] = staleness_radar(workspace, stale_days, limit)
    report["co_mentions"] = co_mention_candidates(workspace, min_co_sources, limit)
    report["serves_candidates"] = serves_candidates(workspace, limit)
    report["related_candidates"] = related_candidates(workspace, limit)
    report["question_debt"] = question_debt(workspace, limit)
    report["corroboration"] = corroboration(workspace, limit)
    report["bridges"] = despined_bridges(workspace, limit)
    report["tensions"] = tensions(workspace, limit)
    return report
534
+
535
+
536
def extract_candidates(report: dict) -> dict:
    """Collect every derived-edge proposal in ``report`` into a single
    promotion-queue payload.

    Proposal sections are copied through unchanged; bridge edges are
    re-shaped into the common proposal layout with an empty evidence list.
    """
    queue: list[dict] = []
    for key in ("co_mentions", "serves_candidates", "related_candidates", "tensions"):
        queue += report.get(key, [])

    bridge_rows = report.get("bridges", {}).get("bridge_edges", [])
    queue += [
        {
            "src": row["src"],
            "dst": row["dst"],
            "type": "BRIDGES",
            "score": row["score"],
            "rationale": f"connects communities {row['communities']}",
            "evidence_sources": [],
            "src_label": row["src_label"],
            "dst_label": row["dst_label"],
            "status": "proposed",
        }
        for row in bridge_rows
    ]

    payload = {
        "schema_version": SCHEMA_VERSION,
        "note": "Derived-edge proposals. Review before promoting; discover never mutates the graph.",
        "proposals": queue,
    }
    return payload
558
+
559
+
560
+ # ---------- console report ------------------------------------------------------
561
+
562
+ def _print_pairs(title: str, rows: list[dict], empty: str) -> None:
563
+ print(f"\n{title}")
564
+ if not rows:
565
+ print(f" {empty}")
566
+ return
567
+ for row in rows:
568
+ print(f" {row['src']} <-> {row['dst']} [{row['type']} {row['score']}]")
569
+ print(f" {row['rationale']}")
570
+
571
+
572
def print_report(report: dict) -> None:
    """Print a human-readable summary of every section of a discovery report
    to stdout, in the same order the sections appear in the report."""
    counts = report["stats"]
    print(f"discover: {counts['semantic_nodes']} semantic nodes, "
          f"{counts['semantic_edges']} semantic edges - proposals only, graph untouched")

    # Staleness radar
    radar = report["staleness_radar"]
    stale_rows = radar["stale"]
    print(f"\nStaleness radar (>={radar['stale_days_threshold']} days behind latest activity)")
    if not stale_rows:
        print(" nothing stale - memory is warm")
    for item in stale_rows:
        print(f" {item['id']} {item['days_stale']}d cold, importance {item['importance']}")

    # Pairwise proposal sections share one printer.
    _print_pairs("Co-mention candidates (recur across sources, never linked)",
                 report["co_mentions"], "no multi-source co-mentions without edges")
    _print_pairs("Goal-alignment candidates (close to a goal, no contribution path)",
                 report["serves_candidates"], "every entangled node already has a path to its goals")
    _print_pairs("Link predictions (Adamic-Adar)",
                 report["related_candidates"], "no strong non-adjacent overlaps")

    # Question debt
    debt = report["question_debt"]
    open_rows = debt["open"]
    print(f"\nQuestion debt ({len(open_rows)} open, "
          f"{len(debt['answers_detected'])} answered via decisions)")
    for item in open_rows:
        print(f" {item['id']} debt {item['debt_score']} "
              f"(age {item['age_days']}d, evidence edges {item['evidence_edges']})")

    # Corroboration
    corro = report["corroboration"]
    print(f"\nCorroboration (source-count distribution {corro['source_count_distribution']})")
    for item in corro["single_source"]:
        print(f" {item['id']} single source: {item['source']}")

    # Bridges
    bridges = report["bridges"]
    removed = ", ".join(bridges["spine_removed"])
    spine = removed or "none"
    print(f"\nBridges after removing spine [{spine}] "
          f"({bridges['communities']} communities)")
    for item in bridges["bridge_edges"]:
        print(f" {item['src']} --{item['edge_type']}-- {item['dst']} "
              f"communities {item['communities']}")

    _print_pairs("Tensions", report["tensions"], "no contested claims detected")
612
+
613
+
614
+ # ---------- CLI -----------------------------------------------------------------
615
+
616
def run_discover(args: list[str]) -> int:
    """CLI entry point for ``mykg discover``.

    Loads the graph, builds the discovery report and emits it: as JSON on
    stdout when ``--out -`` is given, otherwise as a console summary plus an
    optional JSON file. With ``--candidates`` the flattened proposals are
    written to a second file.

    When stdout carries the JSON report, the candidates status line goes to
    stderr so that stdout stays a single valid JSON document.

    Args:
        args: Command-line arguments, without the program name.

    Returns:
        Process exit code (always 0 on completion).
    """
    parser = argparse.ArgumentParser(prog="mykg discover")
    parser.add_argument("--graph", default=None,
                        help="Graph JSON path. Defaults to MYGRAPH_PATH or local graph.")
    parser.add_argument("--out", default=None,
                        help="Write full discovery report JSON here ('-' for stdout).")
    parser.add_argument("--candidates", default=None,
                        help="Write derived-edge proposals (promotion queue) here.")
    parser.add_argument("--limit", type=int, default=10, help="Rows per section.")
    parser.add_argument("--stale-days", type=int, default=30,
                        help="Days behind latest graph activity before a node is stale.")
    parser.add_argument("--min-co-sources", type=int, default=2,
                        help="Distinct sources required for a co-mention proposal.")
    parsed = parser.parse_args(args)

    g = Graph.load(parsed.graph)
    report = build_discovery(g, limit=parsed.limit, stale_days=parsed.stale_days,
                             min_co_sources=parsed.min_co_sources)

    json_on_stdout = parsed.out == "-"
    if json_on_stdout:
        json.dump(report, sys.stdout, indent=2, sort_keys=True)
        print()
    else:
        print_report(report)
        if parsed.out:
            path = Path(parsed.out).expanduser().resolve()
            path.write_text(json.dumps(report, indent=2, sort_keys=True) + "\n", encoding="utf-8")
            print(f"\ndiscover: wrote {path}")

    if parsed.candidates:
        payload = extract_candidates(report)
        path = Path(parsed.candidates).expanduser().resolve()
        path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
        # Keep stdout machine-readable when it already holds the JSON report.
        status_stream = sys.stderr if json_on_stdout else sys.stdout
        print(f"discover: wrote {len(payload['proposals'])} proposals -> {path}",
              file=status_stream)
    return 0
651
+
652
+
653
# Script entry point: forward the CLI arguments and exit with the returned code.
if __name__ == "__main__":
    raise SystemExit(run_discover(sys.argv[1:]))
mygraph/eval_log.py ADDED
@@ -0,0 +1,36 @@
1
+ """
2
+ eval_log.py — JSONL appender for eval_record.jsonl.
3
+
4
+ Every review action, provenance violation, stale-edge flag, relational probe,
5
+ and source-candidate suggestion writes one line here. This is the v1 corpus that
6
+ v2+ will use for prompt refinement / edge weighting / RL.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ from datetime import datetime, timezone
13
+ from pathlib import Path
14
+
15
# Directory containing this module.
HERE = Path(__file__).parent
# Default JSONL log location: eval_record.jsonl next to this module.
EVAL_LOG = HERE / "eval_record.jsonl"
17
+
18
+
19
def now() -> str:
    """Return the current UTC time as an ISO-8601 string with its offset."""
    stamp = datetime.now(tz=timezone.utc)
    return stamp.isoformat()
21
+
22
+
23
def append(record: dict, path: Path = EVAL_LOG) -> None:
    """Append one record to the JSONL log as a single line.

    A ``"ts"`` key holding the current time is added to ``record`` (in place)
    when it does not already have one.
    """
    stamp = now()
    record.setdefault("ts", stamp)
    with path.open("a", encoding="utf-8") as sink:
        line = json.dumps(record, ensure_ascii=False)
        sink.write(line + "\n")
27
+
28
+
29
def append_many(records: list[dict], path: Path = EVAL_LOG) -> None:
    """Append several records to the JSONL log, one line each.

    All records missing a ``"ts"`` key receive the same timestamp (added in
    place). Every record is serialized before the file is opened, so a record
    that cannot be serialized aborts the call without leaving a partial batch
    in the log; the lines are then written in one batched call.

    Does nothing when ``records`` is empty.
    """
    if not records:
        return
    ts = now()
    lines = []
    for r in records:
        r.setdefault("ts", ts)
        lines.append(json.dumps(r, ensure_ascii=False) + "\n")
    with path.open("a", encoding="utf-8") as f:
        f.writelines(lines)