@geravant/sinain 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/README.md +183 -0
  2. package/index.ts +2096 -0
  3. package/install.js +155 -0
  4. package/openclaw.plugin.json +59 -0
  5. package/package.json +21 -0
  6. package/sinain-memory/common.py +403 -0
  7. package/sinain-memory/demo_knowledge_transfer.sh +85 -0
  8. package/sinain-memory/embedder.py +268 -0
  9. package/sinain-memory/eval/__init__.py +0 -0
  10. package/sinain-memory/eval/assertions.py +288 -0
  11. package/sinain-memory/eval/judges/__init__.py +0 -0
  12. package/sinain-memory/eval/judges/base_judge.py +61 -0
  13. package/sinain-memory/eval/judges/curation_judge.py +46 -0
  14. package/sinain-memory/eval/judges/insight_judge.py +48 -0
  15. package/sinain-memory/eval/judges/mining_judge.py +42 -0
  16. package/sinain-memory/eval/judges/signal_judge.py +45 -0
  17. package/sinain-memory/eval/schemas.py +247 -0
  18. package/sinain-memory/eval_delta.py +109 -0
  19. package/sinain-memory/eval_reporter.py +642 -0
  20. package/sinain-memory/feedback_analyzer.py +221 -0
  21. package/sinain-memory/git_backup.sh +19 -0
  22. package/sinain-memory/insight_synthesizer.py +181 -0
  23. package/sinain-memory/memory/2026-03-01.md +11 -0
  24. package/sinain-memory/memory/playbook-archive/sinain-playbook-2026-03-01-1418.md +15 -0
  25. package/sinain-memory/memory/playbook-logs/2026-03-01.jsonl +1 -0
  26. package/sinain-memory/memory/sinain-playbook.md +21 -0
  27. package/sinain-memory/memory-config.json +39 -0
  28. package/sinain-memory/memory_miner.py +183 -0
  29. package/sinain-memory/module_manager.py +695 -0
  30. package/sinain-memory/playbook_curator.py +225 -0
  31. package/sinain-memory/requirements.txt +3 -0
  32. package/sinain-memory/signal_analyzer.py +141 -0
  33. package/sinain-memory/test_local.py +402 -0
  34. package/sinain-memory/tests/__init__.py +0 -0
  35. package/sinain-memory/tests/conftest.py +189 -0
  36. package/sinain-memory/tests/test_curator_helpers.py +94 -0
  37. package/sinain-memory/tests/test_embedder.py +210 -0
  38. package/sinain-memory/tests/test_extract_json.py +124 -0
  39. package/sinain-memory/tests/test_feedback_computation.py +121 -0
  40. package/sinain-memory/tests/test_miner_helpers.py +71 -0
  41. package/sinain-memory/tests/test_module_management.py +458 -0
  42. package/sinain-memory/tests/test_parsers.py +96 -0
  43. package/sinain-memory/tests/test_tick_evaluator.py +430 -0
  44. package/sinain-memory/tests/test_triple_extractor.py +255 -0
  45. package/sinain-memory/tests/test_triple_ingest.py +191 -0
  46. package/sinain-memory/tests/test_triple_migrate.py +138 -0
  47. package/sinain-memory/tests/test_triplestore.py +248 -0
  48. package/sinain-memory/tick_evaluator.py +392 -0
  49. package/sinain-memory/triple_extractor.py +402 -0
  50. package/sinain-memory/triple_ingest.py +290 -0
  51. package/sinain-memory/triple_migrate.py +275 -0
  52. package/sinain-memory/triple_query.py +184 -0
  53. package/sinain-memory/triplestore.py +498 -0
@@ -0,0 +1,642 @@
1
+ #!/usr/bin/env python3
2
+ """Tier 2 Evaluation: Daily report generator — runs as server cron job (daily 03:00).
3
+
4
+ Aggregates 24h of eval-logs, computes quality metrics, detects regressions,
5
+ uses LLM to interpret trends and write a daily report to memory/eval-reports/.
6
+
7
+ Invocation (cron):
8
+ uv run --with requests python3 sinain-koog/eval_reporter.py \
9
+ --memory-dir memory/ [--days 1]
10
+ """
11
+
12
+ import argparse
13
+ import json
14
+ import sys
15
+ from collections import Counter
16
+ from datetime import datetime, timedelta, timezone
17
+ from pathlib import Path
18
+
19
+ _koog_dir = str(Path(__file__).resolve().parent)
20
+ if _koog_dir not in sys.path:
21
+ sys.path.insert(0, _koog_dir)
22
+
23
+ from common import LLMError, _load_config, _read_jsonl, call_llm, extract_json, read_recent_logs
24
+
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Config (duplicated from tick_evaluator to avoid circular import)
28
+ # ---------------------------------------------------------------------------
29
+
30
# Fallback eval configuration, used when neither the global config's "eval"
# section nor the memory/eval-config.json runtime override provides a value.
_EVAL_DEFAULTS = {
    "level": "mechanical",
    "sampleRate": 0.2,
    "judges": {"model": "smart", "maxTokens": 200, "timeout": 30},
    "dailyReport": True,
    "regressionThresholds": {
        "assertionPassRate": 0.85,  # regress when pass rate drops below this
        "effectivenessRate": 0.4,   # NOTE(review): not referenced in this file — presumably read elsewhere
        "skipRate": 0.8,            # regress when skip rate rises above this
    },
}
41
+
42
+
43
def load_eval_config(memory_dir: str) -> dict:
    """Load eval config with runtime overrides from memory/eval-config.json.

    Precedence (lowest to highest): module defaults, the "eval" section of
    the global config, then the per-memory override file.
    """
    merged = dict(_EVAL_DEFAULTS)
    merged.update(_load_config().get("eval", {}))

    override_file = Path(memory_dir) / "eval-config.json"
    if override_file.exists():
        try:
            merged.update(json.loads(override_file.read_text(encoding="utf-8")))
        except (json.JSONDecodeError, OSError) as exc:
            # Best-effort: a broken override file must not kill the report run.
            print(f"[warn] eval-config.json override failed: {exc}", file=sys.stderr)

    return merged
57
+
58
+
59
+ # ---------------------------------------------------------------------------
60
+ # Aggregation
61
+ # ---------------------------------------------------------------------------
62
+
63
def load_eval_logs(memory_dir: str, days: int = 1) -> list[dict]:
    """Load eval-log entries from the last N days.

    Returns [] when the eval-logs directory is missing; files whose stem is
    not a YYYY-MM-DD date are skipped.
    """
    log_dir = Path(memory_dir) / "eval-logs"
    if not log_dir.is_dir():
        return []

    cutoff = datetime.now(timezone.utc) - timedelta(days=days)
    collected: list[dict] = []

    # Files are named YYYY-MM-DD.jsonl; walk newest-first and stop once a
    # file is more than one day older than the cutoff (the extra day of
    # slack covers entries near the day boundary).
    for path in sorted(log_dir.glob("*.jsonl"), reverse=True):
        try:
            stamp = datetime.strptime(path.stem, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        except ValueError:
            continue
        if stamp < cutoff - timedelta(days=1):
            break
        collected.extend(_read_jsonl(path))

    return collected
82
+
83
+
84
def extract_run_summaries(eval_entries: list[dict]) -> tuple[list[dict], list[dict]]:
    """Separate run_summary metadata entries from tick eval results.

    Returns (tick_entries, run_summaries), each preserving input order.
    """
    tick_entries = [e for e in eval_entries if e.get("_type") != "run_summary"]
    run_summaries = [e for e in eval_entries if e.get("_type") == "run_summary"]
    return tick_entries, run_summaries
97
+
98
+
99
def compute_aggregates(eval_entries: list[dict]) -> dict:
    """Compute daily aggregate metrics from eval-log entries.

    Returns a dict with schema validity, assertion pass rate, a top-10
    failure histogram, per-judge score distributions, per-dimension
    sub-score averages, and the mean per-tick pass rate. An empty input
    yields {"tickCount": 0} with no other keys.
    """
    if not eval_entries:
        return {"tickCount": 0}

    # Schema validity — ratio defaults to 1.0 when no checks were recorded
    schema_total = sum(e.get("schema", {}).get("total", 0) for e in eval_entries)
    schema_valid = sum(e.get("schema", {}).get("valid", 0) for e in eval_entries)
    schema_rate = round(schema_valid / schema_total, 3) if schema_total > 0 else 1.0

    # Assertion pass rate
    assert_total = sum(e.get("assertions", {}).get("total", 0) for e in eval_entries)
    assert_passed = sum(e.get("assertions", {}).get("passed", 0) for e in eval_entries)
    assert_rate = round(assert_passed / assert_total, 3) if assert_total > 0 else 1.0

    # Assertion failure histogram (keyed by assertion name)
    failure_counter: Counter = Counter()
    for e in eval_entries:
        for f in e.get("assertions", {}).get("failures", []):
            failure_counter[f.get("name", "unknown")] += 1

    # Judge score distribution + sub-score aggregation
    judge_scores: dict[str, list[int]] = {}
    sub_scores: dict[str, dict[str, list[int]]] = {}  # {judge: {dim: [scores]}}
    for e in eval_entries:
        judges = e.get("judges")
        if not judges:
            continue
        for judge_name, result in judges.items():
            if isinstance(result, dict) and "score" in result:
                judge_scores.setdefault(judge_name, []).append(result["score"])
                # Collect multi-dimensional sub-scores if present
                scores_dict = result.get("scores")
                if isinstance(scores_dict, dict):
                    judge_subs = sub_scores.setdefault(judge_name, {})
                    for dim, val in scores_dict.items():
                        if isinstance(val, (int, float)):
                            # int() truncates floats so distributions stay integer-keyed
                            judge_subs.setdefault(dim, []).append(int(val))

    # Overall mean across every judge score seen today (None when no judges ran)
    judge_avg = None
    if judge_scores:
        all_scores = [s for scores in judge_scores.values() for s in scores]
        judge_avg = round(sum(all_scores) / len(all_scores), 2) if all_scores else None

    # Pass rate trend — entries without a passRate count as a perfect 1.0
    pass_rates = [e.get("passRate", 1.0) for e in eval_entries]
    avg_pass_rate = round(sum(pass_rates) / len(pass_rates), 3)

    # Build sub-score summary: {judge: {dim: {count, avg}}}
    sub_score_summary: dict[str, dict[str, dict]] = {}
    for judge_name, dims in sub_scores.items():
        sub_score_summary[judge_name] = {
            dim: {"count": len(vals), "avg": round(sum(vals) / len(vals), 2)}
            for dim, vals in dims.items()
        }

    return {
        "tickCount": len(eval_entries),
        "schemaValidity": {"total": schema_total, "valid": schema_valid, "rate": schema_rate},
        "assertionPassRate": {"total": assert_total, "passed": assert_passed, "rate": assert_rate},
        "failureHistogram": dict(failure_counter.most_common(10)),
        "judgeScores": {k: {"count": len(v), "avg": round(sum(v) / len(v), 2), "dist": dict(Counter(v))}
                        for k, v in judge_scores.items()},
        "judgeAvg": judge_avg,
        "subScores": sub_score_summary,
        "avgPassRate": avg_pass_rate,
    }
166
+
167
+
168
def compute_playbook_health(playbook_logs: list[dict]) -> dict:
    """Compute playbook health metrics from heartbeat logs.

    Returns the last five observed line counts, average churn (adds +
    prunes) per tick, and the total added/pruned counts.
    """
    line_counts: list[int] = []
    added = pruned = 0

    for record in playbook_logs:
        changes = record.get("playbookChanges", {})
        if not isinstance(changes, dict):
            continue
        current_lines = changes.get("playbookLines")
        if isinstance(current_lines, int):
            line_counts.append(current_lines)
        added += len(changes.get("added", []))
        pruned += len(changes.get("pruned", []))

    # Avoid division by zero on an empty log window.
    denom = len(playbook_logs) or 1
    return {
        "lineCountTrend": line_counts[-5:],
        "avgChurnPerTick": round((added + pruned) / denom, 1),
        "totalAdded": added,
        "totalPruned": pruned,
    }
190
+
191
+
192
+ def _percentile(sorted_vals: list[float], p: float) -> float:
193
+ """Compute the p-th percentile (0-100) from a pre-sorted list."""
194
+ if not sorted_vals:
195
+ return 0.0
196
+ k = (len(sorted_vals) - 1) * p / 100.0
197
+ f = int(k)
198
+ c = f + 1 if f + 1 < len(sorted_vals) else f
199
+ return round(sorted_vals[f] + (k - f) * (sorted_vals[c] - sorted_vals[f]), 1)
200
+
201
+
202
def compute_latency_stats(playbook_logs: list[dict]) -> dict[str, dict]:
    """Aggregate per-script latency statistics from playbook-log entries.

    Returns {scriptName: {count, avg, p50, p95}} for each script key found
    in the latencyMs field, plus a "total" entry for totalLatencyMs.
    """
    samples: dict[str, list[float]] = {}

    def _record(key: str, value) -> None:
        # Only numeric samples are kept; anything else is ignored.
        if isinstance(value, (int, float)):
            samples.setdefault(key, []).append(float(value))

    for record in playbook_logs:
        per_script = record.get("latencyMs")
        if isinstance(per_script, dict):
            for script, ms in per_script.items():
                _record(script, ms)
        _record("total", record.get("totalLatencyMs"))

    stats: dict[str, dict] = {}
    for name, values in samples.items():
        ordered = sorted(values)
        stats[name] = {
            "count": len(ordered),
            "avg": round(sum(ordered) / len(ordered), 1),
            "p50": _percentile(ordered, 50),
            "p95": _percentile(ordered, 95),
        }
    return stats
229
+
230
+
231
def compute_skip_rate(playbook_logs: list[dict]) -> float:
    """Compute the insight synthesizer skip rate.

    Counts every entry with a non-None "output"; an entry is a skip when
    its output is a dict with a truthy "skip" flag. Returns 0.0 when no
    outputs were recorded.
    """
    total = 0
    skipped = 0
    for entry in playbook_logs:
        output = entry.get("output")
        if output is None:
            continue
        total += 1
        # Guard against malformed JSONL: output may be any JSON value, and
        # calling .get() on a non-dict (the original code's behavior) raises
        # AttributeError. Non-dict outputs count as produced, not skipped.
        if isinstance(output, dict) and output.get("skip", False):
            skipped += 1
    return round(skipped / total, 2) if total > 0 else 0.0
242
+
243
+
244
+ # ---------------------------------------------------------------------------
245
+ # Regression detection
246
+ # ---------------------------------------------------------------------------
247
+
248
def detect_regressions(aggregates: dict, thresholds: dict, skip_rate: float) -> list[str]:
    """Detect regressions based on thresholds.

    Checks assertion pass rate (too low), skip rate (too high), and any
    assertion that failed 3+ times. Returns a human-readable message per
    regression; an empty list means all gates passed.
    """
    regressions: list[str] = []

    # Resolve each threshold once so the comparison and the message always
    # use the same value. The original compared with thresholds.get(key,
    # default) but formatted thresholds[key] — a KeyError whenever a key
    # was missing from a partial config.
    assert_threshold = thresholds.get("assertionPassRate", 0.85)
    skip_threshold = thresholds.get("skipRate", 0.8)

    assert_rate = aggregates.get("assertionPassRate", {}).get("rate", 1.0)
    if assert_rate < assert_threshold:
        regressions.append(
            f"Assertion pass rate {assert_rate:.1%} below threshold {assert_threshold:.0%}"
        )

    if skip_rate > skip_threshold:
        regressions.append(
            f"Skip rate {skip_rate:.0%} above threshold {skip_threshold:.0%} — synthesizer rarely producing output"
        )

    # Repeated failures: the same assertion failing 3+ times is systemic
    histogram = aggregates.get("failureHistogram", {})
    for name, count in histogram.items():
        if count >= 3:
            regressions.append(f"Assertion '{name}' failed {count} times (systemic issue)")

    return regressions
270
+
271
+
272
+ # ---------------------------------------------------------------------------
273
+ # Report generation
274
+ # ---------------------------------------------------------------------------
275
+
276
def generate_report_markdown(
    date_str: str,
    aggregates: dict,
    playbook_health: dict,
    skip_rate: float,
    regressions: list[str],
    llm_interpretation: str = "",
    run_summaries: list[dict] | None = None,
    latency_stats: dict[str, dict] | None = None,
) -> str:
    """Generate the daily eval report as markdown.

    Sections in order: quality gates (always), assertion failures, judge
    scores, per-dimension sub-scores, playbook health (always), latency,
    regressions, and the LLM analysis — optional sections are omitted when
    their data is empty. Returns the full document ending in a newline.
    """
    lines: list[str] = []
    lines.append(f"# Eval Report — {date_str}\n")

    # Quality Gates — each line carries a pass (✓) or warn (⚠) icon
    lines.append("## Quality Gates")
    schema = aggregates.get("schemaValidity", {})
    s_rate = schema.get("rate", 1.0)
    s_icon = "✓" if s_rate >= 0.95 else "⚠"
    lines.append(f"- {s_icon} Schema validity: {s_rate:.0%} ({schema.get('valid', 0)}/{schema.get('total', 0)} checks)")

    a = aggregates.get("assertionPassRate", {})
    a_rate = a.get("rate", 1.0)
    a_icon = "✓" if a_rate >= 0.85 else "⚠"
    lines.append(f"- {a_icon} Assertion pass rate: {a_rate:.0%} ({a.get('passed', 0)}/{a.get('total', 0)} checks)")

    j_avg = aggregates.get("judgeAvg")
    if j_avg is not None:
        j_icon = "✓" if j_avg >= 3.0 else "⚠"
        judge_count = sum(v.get("count", 0) for v in aggregates.get("judgeScores", {}).values())
        lines.append(f"- {j_icon} Mean judge score: {j_avg}/4.0 ({judge_count} evaluations)")

    skip_icon = "✓" if skip_rate < 0.8 else "⚠"
    lines.append(f"- {skip_icon} Skip rate: {skip_rate:.0%}")
    lines.append(f"- Ticks evaluated: {aggregates.get('tickCount', 0)}")

    # Partial run warning — flags evaluation runs that did not finish cleanly
    if run_summaries:
        partial_runs = [s for s in run_summaries if s.get("isPartial")]
        if partial_runs:
            total_failed = sum(s.get("failed", 0) for s in partial_runs)
            total_attempted = sum(s.get("attempted", 0) for s in partial_runs)
            lines.append(f"- ⚠ PARTIAL: {total_failed}/{total_attempted} tick evaluations "
                         f"failed across {len(partial_runs)} run(s)")
    lines.append("")

    # Assertion Failures — top 5 by count, descending
    histogram = aggregates.get("failureHistogram", {})
    if histogram:
        lines.append("## Assertion Failures (top failures)")
        for i, (name, count) in enumerate(sorted(histogram.items(), key=lambda x: -x[1])[:5], 1):
            lines.append(f"{i}. {name} — {count} failures")
        lines.append("")

    # Judge Score Breakdown — per-judge average plus star-count distribution
    judge_scores = aggregates.get("judgeScores", {})
    if judge_scores:
        lines.append("## Judge Scores")
        for judge_name, info in judge_scores.items():
            dist = info.get("dist", {})
            dist_str = ", ".join(f"{k}★={v}" for k, v in sorted(dist.items()))
            lines.append(f"- {judge_name}: avg {info.get('avg', '?')}/4.0 ({dist_str})")
        lines.append("")

    # Sub-Score Breakdown (multi-dimensional rubrics)
    sub_scores = aggregates.get("subScores", {})
    if sub_scores:
        lines.append("## Sub-Scores (per dimension)")
        for judge_name, dims in sorted(sub_scores.items()):
            dim_parts = []
            for dim, info in sorted(dims.items()):
                dim_parts.append(f"{dim}={info['avg']}/4.0")
            lines.append(f"- {judge_name}: {', '.join(dim_parts)}")
        lines.append("")

    # Playbook Health
    lines.append("## Playbook Health")
    lines.append(f"- Line count trend: {playbook_health.get('lineCountTrend', [])}")
    lines.append(f"- Avg churn/tick: {playbook_health.get('avgChurnPerTick', 0)} changes")
    lines.append(f"- Total added: {playbook_health.get('totalAdded', 0)}, pruned: {playbook_health.get('totalPruned', 0)}")
    lines.append("")

    # Latency — per-script avg/p50/p95, sorted by script name
    if latency_stats:
        lines.append("## Latency")
        for script, info in sorted(latency_stats.items()):
            lines.append(f"- {script}: avg {info['avg']}ms, p50 {info['p50']}ms, "
                         f"p95 {info['p95']}ms ({info['count']} samples)")
        lines.append("")

    # Regressions
    if regressions:
        lines.append("## ⚠ Regressions Detected")
        for r in regressions:
            lines.append(f"- {r}")
        lines.append("")

    # LLM Interpretation — pre-formatted markdown from get_llm_interpretation
    if llm_interpretation:
        lines.append("## Analysis & Recommendations")
        lines.append(llm_interpretation)
        lines.append("")

    return "\n".join(lines) + "\n"
380
+
381
+
382
def build_snapshot(aggregates: dict, skip_rate: float, regressions: list[str]) -> dict:
    """Build a compact snapshot of key metrics for delta comparison.

    Captures the headline rates, per-judge averages, the top-3 failing
    assertions, and counts — everything compute_delta needs the next day.
    """
    per_judge = {
        name: info["avg"]
        for name, info in aggregates.get("judgeScores", {}).items()
        if info.get("avg") is not None
    }

    # Top 3 assertion failures, most frequent first
    ranked = sorted(aggregates.get("failureHistogram", {}).items(), key=lambda kv: -kv[1])
    top_failures = [name for name, _count in ranked[:3]]

    return {
        "assertionPassRate": aggregates.get("assertionPassRate", {}).get("rate"),
        "schemaRate": aggregates.get("schemaValidity", {}).get("rate"),
        "judgeAvg": aggregates.get("judgeAvg"),
        "skipRate": skip_rate,
        "perJudgeAvg": per_judge,
        "topFailures": top_failures,
        "regressionCount": len(regressions),
        "tickCount": aggregates.get("tickCount", 0),
    }
401
+
402
+
403
+ def load_previous_snapshot(report_dir: Path, current_date: str) -> tuple[str | None, dict | None]:
404
+ """Find the most recent snapshot before current_date.
405
+
406
+ Returns (date_str, snapshot_dict) or (None, None).
407
+ """
408
+ snapshots = sorted(report_dir.glob("*.snapshot.json"), reverse=True)
409
+ for snap_path in snapshots:
410
+ date_str = snap_path.stem.replace(".snapshot", "")
411
+ if date_str < current_date:
412
+ try:
413
+ return date_str, json.loads(snap_path.read_text(encoding="utf-8"))
414
+ except (json.JSONDecodeError, OSError):
415
+ continue
416
+ return None, None
417
+
418
+
419
def compute_delta(before: dict, after: dict) -> dict[str, dict]:
    """Compute per-metric deltas between two snapshots.

    Returns {metric: {before, after, delta, status}} where status is
    IMPROVED, REGRESSED, or SAME. Metrics missing from either snapshot
    are skipped. Per-judge averages appear under "judge:<name>" keys.
    """
    # Metrics where higher is better
    higher_better = {"assertionPassRate", "schemaRate", "judgeAvg"}
    # Metrics where lower is better
    lower_better = {"skipRate", "regressionCount"}

    result: dict[str, dict] = {}
    for key in higher_better | lower_better:
        b = before.get(key)
        a = after.get(key)
        if b is None or a is None:
            continue
        # Round float deltas to 4 places. Deciding on the subtraction result
        # (not just `a`, as the original did) also covers mixed int/float
        # pairs, which previously slipped through unrounded.
        raw = a - b
        delta = round(raw, 4) if isinstance(raw, float) else raw
        if key in higher_better:
            status = "IMPROVED" if delta > 0.001 else ("REGRESSED" if delta < -0.001 else "SAME")
        else:
            status = "IMPROVED" if delta < -0.001 else ("REGRESSED" if delta > 0.001 else "SAME")
        result[key] = {"before": b, "after": a, "delta": delta, "status": status}

    # Per-judge deltas — wider 0.05 dead-band (judge scores use a /4.0 scale)
    before_judges = before.get("perJudgeAvg", {})
    after_judges = after.get("perJudgeAvg", {})
    for judge in set(before_judges) | set(after_judges):
        b = before_judges.get(judge)
        a = after_judges.get(judge)
        if b is None or a is None:
            continue
        delta = round(a - b, 2)
        status = "IMPROVED" if delta > 0.05 else ("REGRESSED" if delta < -0.05 else "SAME")
        result[f"judge:{judge}"] = {"before": b, "after": a, "delta": delta, "status": status}

    return result
456
+
457
+
458
def format_delta_section(prev_date: str, delta: dict[str, dict]) -> str:
    """Format delta comparison as a markdown section.

    One bullet per metric, sorted by metric name, ending with a blank line.
    """
    arrows = {"IMPROVED": "↑", "REGRESSED": "↓", "SAME": "→"}
    out = [f"## Delta vs Previous ({prev_date})"]
    for metric in sorted(delta):
        info = delta[metric]
        arrow = arrows.get(info["status"], "?")
        # Prefix positive deltas with an explicit "+"
        shown = f"+{info['delta']}" if info["delta"] > 0 else f"{info['delta']}"
        out.append(f"- {arrow} {metric}: {info['before']} → {info['after']} ({shown}) [{info['status']}]")
    out.append("")
    return "\n".join(out)
467
+
468
+
469
+ def _sample_judge_details(eval_entries: list[dict], max_entries: int = 8) -> str:
470
+ """Extract sampled judge reasonings + assertion failures for cross-tick synthesis."""
471
+ # Sample evenly across the day
472
+ step = max(1, len(eval_entries) // max_entries)
473
+ sampled = eval_entries[::step][:max_entries]
474
+
475
+ parts: list[str] = []
476
+ for i, entry in enumerate(sampled):
477
+ tick_ts = entry.get("tickTs", "?")
478
+ section = [f"### Tick {i+1} ({tick_ts})"]
479
+
480
+ # Judge reasonings (truncated)
481
+ judges = entry.get("judges")
482
+ if judges:
483
+ for judge_name, result in judges.items():
484
+ if isinstance(result, dict) and "reasoning" in result:
485
+ reasoning = str(result["reasoning"])[:150]
486
+ score = result.get("score", "?")
487
+ section.append(f" {judge_name} ({score}/4): {reasoning}")
488
+
489
+ # Assertion failures
490
+ failures = entry.get("assertions", {}).get("failures", [])
491
+ if failures:
492
+ for f in failures[:3]:
493
+ section.append(f" FAIL: {f.get('name', '?')} — {str(f.get('detail', ''))[:100]}")
494
+
495
+ parts.append("\n".join(section))
496
+
497
+ return "\n\n".join(parts)
498
+
499
+
500
def get_llm_interpretation(
    aggregates: dict,
    regressions: list[str],
    playbook_health: dict,
    eval_entries: list[dict] | None = None,
) -> str:
    """Use LLM to interpret trends and write recommendations.

    When eval_entries is provided, includes sampled judge reasonings for
    cross-tick pattern synthesis.

    Returns markdown sections (bottleneck / patterns / recommendations),
    or "" when the LLM call or JSON parsing fails — the caller then emits
    the report without an analysis section.
    """
    system_prompt = (
        "You are an evaluation analyst for a personal AI assistant pipeline. "
        "Analyze the metrics AND individual tick evaluations to identify cross-cutting patterns. "
        "Respond with ONLY a JSON object:\n"
        '{"patterns": ["pattern 1", ...], '
        '"bottleneck": "detection|generation|both|none", '
        '"recommendations": ["rec 1", ...]}\n\n'
        "- patterns: 2-4 recurring themes across individual ticks (reference specific judges/assertions)\n"
        "- bottleneck: whether issues stem from signal detection (input), insight generation (output), both, or none\n"
        "- recommendations: 3-5 actionable next steps"
    )

    user_prompt = (
        f"## Aggregates\n{json.dumps(aggregates, indent=2)}\n\n"
        f"## Regressions\n{regressions}\n\n"
        f"## Playbook Health\n{json.dumps(playbook_health, indent=2)}"
    )

    # Attach sampled per-tick evidence so the LLM can ground its patterns
    if eval_entries:
        details = _sample_judge_details(eval_entries)
        user_prompt += f"\n\n## Individual Tick Evaluations (sampled)\n{details}"

    try:
        raw = call_llm(system_prompt, user_prompt, script="eval_reporter", json_mode=True)
        result = extract_json(raw)

        sections: list[str] = []

        # Bottleneck — omitted entirely when the model reports "none"
        bottleneck = result.get("bottleneck", "none")
        if bottleneck != "none":
            sections.append(f"**Bottleneck**: {bottleneck}")

        # Patterns
        patterns = result.get("patterns", [])
        if patterns:
            sections.append("**Patterns**:")
            sections.extend(f"- {p}" for p in patterns)

        # Recommendations
        recs = result.get("recommendations", [])
        if recs:
            sections.append("\n**Recommendations**:")
            sections.extend(f"- {r}" for r in recs)

        return "\n".join(sections) if sections else ""
    except (ValueError, LLMError) as e:
        # Best-effort: log and fall through so the report is still written
        print(f"[eval-reporter] LLM interpretation failed: {e}", file=sys.stderr)

    return ""
561
+
562
+
563
+ # ---------------------------------------------------------------------------
564
+ # Main
565
+ # ---------------------------------------------------------------------------
566
+
567
def main():
    """Entry point: aggregate logs, detect regressions, write report + snapshot.

    Reads eval-logs and playbook-logs under --memory-dir, computes metrics,
    optionally asks the LLM for an interpretation, then writes
    eval-reports/<date>.md and eval-reports/<date>.snapshot.json. Exits
    early (without writing) when no eval-log entries exist for the window.
    """
    parser = argparse.ArgumentParser(description="Sinain Koog Daily Eval Reporter (Tier 2)")
    parser.add_argument("--memory-dir", required=True, help="Path to memory/ directory")
    parser.add_argument("--days", type=int, default=1, help="Number of days to aggregate (default: 1)")
    args = parser.parse_args()

    memory_dir = args.memory_dir
    eval_config = load_eval_config(memory_dir)
    thresholds = eval_config.get("regressionThresholds", _EVAL_DEFAULTS["regressionThresholds"])

    # Load eval logs
    raw_eval_entries = load_eval_logs(memory_dir, days=args.days)
    if not raw_eval_entries:
        print("[eval-reporter] no eval-log entries found", file=sys.stderr)
        return

    # Separate tick results from run summary metadata
    eval_entries, run_summaries = extract_run_summaries(raw_eval_entries)
    if not eval_entries:
        print("[eval-reporter] no tick eval entries (only run summaries)", file=sys.stderr)
        return

    # Load playbook logs for health metrics
    playbook_logs = read_recent_logs(memory_dir, days=args.days)

    # Compute metrics
    aggregates = compute_aggregates(eval_entries)
    playbook_health = compute_playbook_health(playbook_logs)
    skip_rate = compute_skip_rate(playbook_logs)
    latency_stats = compute_latency_stats(playbook_logs)
    regressions = detect_regressions(aggregates, thresholds, skip_rate)

    # Progress line for the cron log (all diagnostics go to stderr)
    print(f"[eval-reporter] {aggregates['tickCount']} ticks, "
          f"schema={aggregates.get('schemaValidity', {}).get('rate', '?')}, "
          f"assertions={aggregates.get('assertionPassRate', {}).get('rate', '?')}, "
          f"regressions={len(regressions)}", file=sys.stderr)

    # LLM interpretation (if report feature is on and we have enough data)
    llm_interpretation = ""
    if eval_config.get("dailyReport", True) and aggregates["tickCount"] >= 2:
        llm_interpretation = get_llm_interpretation(
            aggregates, regressions, playbook_health, eval_entries=eval_entries,
        )

    # Generate report
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    report = generate_report_markdown(
        today, aggregates, playbook_health, skip_rate, regressions, llm_interpretation,
        run_summaries=run_summaries,
        latency_stats=latency_stats,
    )

    # Write report
    report_dir = Path(memory_dir) / "eval-reports"
    report_dir.mkdir(parents=True, exist_ok=True)

    # Write snapshot for delta comparison (read back by tomorrow's run)
    snapshot = build_snapshot(aggregates, skip_rate, regressions)
    snapshot_file = report_dir / f"{today}.snapshot.json"
    snapshot_file.write_text(json.dumps(snapshot, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")

    # Compute delta vs previous snapshot and append to report
    prev_date, prev_snapshot = load_previous_snapshot(report_dir, today)
    if prev_snapshot:
        delta = compute_delta(prev_snapshot, snapshot)
        if delta:
            report += "\n" + format_delta_section(prev_date, delta) + "\n"

    report_file = report_dir / f"{today}.md"
    report_file.write_text(report, encoding="utf-8")

    print(f"[eval-reporter] report + snapshot written to {report_dir}", file=sys.stderr)
639
+
640
+
641
# Script entry point — the module docstring documents the cron invocation.
if __name__ == "__main__":
    main()