@geravant/sinain 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +183 -0
- package/index.ts +2096 -0
- package/install.js +155 -0
- package/openclaw.plugin.json +59 -0
- package/package.json +21 -0
- package/sinain-memory/common.py +403 -0
- package/sinain-memory/demo_knowledge_transfer.sh +85 -0
- package/sinain-memory/embedder.py +268 -0
- package/sinain-memory/eval/__init__.py +0 -0
- package/sinain-memory/eval/assertions.py +288 -0
- package/sinain-memory/eval/judges/__init__.py +0 -0
- package/sinain-memory/eval/judges/base_judge.py +61 -0
- package/sinain-memory/eval/judges/curation_judge.py +46 -0
- package/sinain-memory/eval/judges/insight_judge.py +48 -0
- package/sinain-memory/eval/judges/mining_judge.py +42 -0
- package/sinain-memory/eval/judges/signal_judge.py +45 -0
- package/sinain-memory/eval/schemas.py +247 -0
- package/sinain-memory/eval_delta.py +109 -0
- package/sinain-memory/eval_reporter.py +642 -0
- package/sinain-memory/feedback_analyzer.py +221 -0
- package/sinain-memory/git_backup.sh +19 -0
- package/sinain-memory/insight_synthesizer.py +181 -0
- package/sinain-memory/memory/2026-03-01.md +11 -0
- package/sinain-memory/memory/playbook-archive/sinain-playbook-2026-03-01-1418.md +15 -0
- package/sinain-memory/memory/playbook-logs/2026-03-01.jsonl +1 -0
- package/sinain-memory/memory/sinain-playbook.md +21 -0
- package/sinain-memory/memory-config.json +39 -0
- package/sinain-memory/memory_miner.py +183 -0
- package/sinain-memory/module_manager.py +695 -0
- package/sinain-memory/playbook_curator.py +225 -0
- package/sinain-memory/requirements.txt +3 -0
- package/sinain-memory/signal_analyzer.py +141 -0
- package/sinain-memory/test_local.py +402 -0
- package/sinain-memory/tests/__init__.py +0 -0
- package/sinain-memory/tests/conftest.py +189 -0
- package/sinain-memory/tests/test_curator_helpers.py +94 -0
- package/sinain-memory/tests/test_embedder.py +210 -0
- package/sinain-memory/tests/test_extract_json.py +124 -0
- package/sinain-memory/tests/test_feedback_computation.py +121 -0
- package/sinain-memory/tests/test_miner_helpers.py +71 -0
- package/sinain-memory/tests/test_module_management.py +458 -0
- package/sinain-memory/tests/test_parsers.py +96 -0
- package/sinain-memory/tests/test_tick_evaluator.py +430 -0
- package/sinain-memory/tests/test_triple_extractor.py +255 -0
- package/sinain-memory/tests/test_triple_ingest.py +191 -0
- package/sinain-memory/tests/test_triple_migrate.py +138 -0
- package/sinain-memory/tests/test_triplestore.py +248 -0
- package/sinain-memory/tick_evaluator.py +392 -0
- package/sinain-memory/triple_extractor.py +402 -0
- package/sinain-memory/triple_ingest.py +290 -0
- package/sinain-memory/triple_migrate.py +275 -0
- package/sinain-memory/triple_query.py +184 -0
- package/sinain-memory/triplestore.py +498 -0
|
@@ -0,0 +1,642 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Tier 2 Evaluation: Daily report generator — runs as server cron job (daily 03:00).
|
|
3
|
+
|
|
4
|
+
Aggregates 24h of eval-logs, computes quality metrics, detects regressions,
|
|
5
|
+
uses LLM to interpret trends and write a daily report to memory/eval-reports/.
|
|
6
|
+
|
|
7
|
+
Invocation (cron):
|
|
8
|
+
uv run --with requests python3 sinain-koog/eval_reporter.py \
|
|
9
|
+
--memory-dir memory/ [--days 1]
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import json
|
|
14
|
+
import sys
|
|
15
|
+
from collections import Counter
|
|
16
|
+
from datetime import datetime, timedelta, timezone
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
_koog_dir = str(Path(__file__).resolve().parent)
|
|
20
|
+
if _koog_dir not in sys.path:
|
|
21
|
+
sys.path.insert(0, _koog_dir)
|
|
22
|
+
|
|
23
|
+
from common import LLMError, _load_config, _read_jsonl, call_llm, extract_json, read_recent_logs
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
# Config (duplicated from tick_evaluator to avoid circular import)
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
# Default eval configuration, used when the global config has no "eval"
# section and no runtime override file exists (see load_eval_config).
_EVAL_DEFAULTS = {
    "level": "mechanical",
    # Fraction of ticks sampled for judge-based evaluation.
    "sampleRate": 0.2,
    "judges": {"model": "smart", "maxTokens": 200, "timeout": 30},
    # Whether the daily report (LLM interpretation) feature is enabled.
    "dailyReport": True,
    # Thresholds consumed by detect_regressions(). NOTE(review):
    # "effectivenessRate" is not referenced by detect_regressions in this
    # file — confirm whether it is used elsewhere or should be wired in.
    "regressionThresholds": {
        "assertionPassRate": 0.85,
        "effectivenessRate": 0.4,
        "skipRate": 0.8,
    },
}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def load_eval_config(memory_dir: str) -> dict:
    """Load eval config with runtime overrides from memory/eval-config.json.

    Precedence, lowest to highest: _EVAL_DEFAULTS, the "eval" section of
    the global config, then memory/eval-config.json when present. A
    corrupt or unreadable override file is reported and ignored.
    """
    merged = dict(_EVAL_DEFAULTS)
    merged.update(_load_config().get("eval", {}))

    runtime_override = Path(memory_dir) / "eval-config.json"
    if runtime_override.exists():
        try:
            merged.update(json.loads(runtime_override.read_text(encoding="utf-8")))
        except (json.JSONDecodeError, OSError) as e:
            print(f"[warn] eval-config.json override failed: {e}", file=sys.stderr)

    return merged
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
# Aggregation
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
def load_eval_logs(memory_dir: str, days: int = 1) -> list[dict]:
    """Load eval-log entries from the last N days of memory/eval-logs/."""
    log_dir = Path(memory_dir) / "eval-logs"
    if not log_dir.is_dir():
        return []

    cutoff = datetime.now(timezone.utc) - timedelta(days=days)
    collected: list[dict] = []

    # Files are named YYYY-MM-DD.jsonl; walk newest-first and stop once a
    # file's date falls more than one day before the cutoff (the extra day
    # keeps entries near the midnight boundary).
    for path in sorted(log_dir.glob("*.jsonl"), reverse=True):
        try:
            stamped = datetime.strptime(path.stem, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        except ValueError:
            continue  # not a date-named log file; skip it
        if stamped < cutoff - timedelta(days=1):
            break
        collected.extend(_read_jsonl(path))

    return collected
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def extract_run_summaries(eval_entries: list[dict]) -> tuple[list[dict], list[dict]]:
    """Split eval-log entries into tick results and run_summary metadata.

    Returns (tick_entries, run_summaries), each preserving input order.
    """
    run_summaries = [e for e in eval_entries if e.get("_type") == "run_summary"]
    tick_entries = [e for e in eval_entries if e.get("_type") != "run_summary"]
    return tick_entries, run_summaries
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def compute_aggregates(eval_entries: list[dict]) -> dict:
    """Compute daily aggregate metrics from eval-log entries.

    Returns a dict with: tick count, schema validity, assertion pass rate,
    a top-10 assertion-failure histogram, per-judge score distributions,
    the overall mean judge score, per-dimension sub-score averages, and
    the mean per-tick passRate. For empty input returns {"tickCount": 0}.
    """
    if not eval_entries:
        return {"tickCount": 0}

    # Schema validity — rate defaults to 1.0 when no schema checks ran.
    schema_total = sum(e.get("schema", {}).get("total", 0) for e in eval_entries)
    schema_valid = sum(e.get("schema", {}).get("valid", 0) for e in eval_entries)
    schema_rate = round(schema_valid / schema_total, 3) if schema_total > 0 else 1.0

    # Assertion pass rate — same "vacuously perfect" default as above.
    assert_total = sum(e.get("assertions", {}).get("total", 0) for e in eval_entries)
    assert_passed = sum(e.get("assertions", {}).get("passed", 0) for e in eval_entries)
    assert_rate = round(assert_passed / assert_total, 3) if assert_total > 0 else 1.0

    # Assertion failure histogram, keyed by assertion name.
    failure_counter: Counter = Counter()
    for e in eval_entries:
        for f in e.get("assertions", {}).get("failures", []):
            failure_counter[f.get("name", "unknown")] += 1

    # Judge score distribution + sub-score aggregation.
    # Scores are rendered as "/4.0" downstream — presumably a 0-4 rubric.
    judge_scores: dict[str, list[int]] = {}
    sub_scores: dict[str, dict[str, list[int]]] = {}  # {judge: {dim: [scores]}}
    for e in eval_entries:
        judges = e.get("judges")
        if not judges:
            continue
        for judge_name, result in judges.items():
            if isinstance(result, dict) and "score" in result:
                judge_scores.setdefault(judge_name, []).append(result["score"])
                # Collect multi-dimensional sub-scores if present
                scores_dict = result.get("scores")
                if isinstance(scores_dict, dict):
                    judge_subs = sub_scores.setdefault(judge_name, {})
                    for dim, val in scores_dict.items():
                        if isinstance(val, (int, float)):
                            judge_subs.setdefault(dim, []).append(int(val))

    # Overall mean across every score from every judge (not mean of means).
    judge_avg = None
    if judge_scores:
        all_scores = [s for scores in judge_scores.values() for s in scores]
        judge_avg = round(sum(all_scores) / len(all_scores), 2) if all_scores else None

    # Pass rate trend — a tick with no passRate counts as a perfect 1.0.
    pass_rates = [e.get("passRate", 1.0) for e in eval_entries]
    avg_pass_rate = round(sum(pass_rates) / len(pass_rates), 3)

    # Build sub-score summary: {judge: {dim: {count, avg}}}
    sub_score_summary: dict[str, dict[str, dict]] = {}
    for judge_name, dims in sub_scores.items():
        sub_score_summary[judge_name] = {
            dim: {"count": len(vals), "avg": round(sum(vals) / len(vals), 2)}
            for dim, vals in dims.items()
        }

    return {
        "tickCount": len(eval_entries),
        "schemaValidity": {"total": schema_total, "valid": schema_valid, "rate": schema_rate},
        "assertionPassRate": {"total": assert_total, "passed": assert_passed, "rate": assert_rate},
        "failureHistogram": dict(failure_counter.most_common(10)),
        "judgeScores": {k: {"count": len(v), "avg": round(sum(v) / len(v), 2), "dist": dict(Counter(v))}
                        for k, v in judge_scores.items()},
        "judgeAvg": judge_avg,
        "subScores": sub_score_summary,
        "avgPassRate": avg_pass_rate,
    }
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def compute_playbook_health(playbook_logs: list[dict]) -> dict:
    """Summarize playbook size trend and churn from heartbeat log entries."""
    trend: list[int] = []
    added = 0
    pruned = 0

    for record in playbook_logs:
        changes = record.get("playbookChanges", {})
        if not isinstance(changes, dict):
            continue
        lines_now = changes.get("playbookLines")
        if isinstance(lines_now, int):
            trend.append(lines_now)
        added += len(changes.get("added", []))
        pruned += len(changes.get("pruned", []))

    # Guard against division by zero for an empty log window.
    denominator = len(playbook_logs) or 1
    return {
        "lineCountTrend": trend[-5:] if trend else [],
        "avgChurnPerTick": round((added + pruned) / denominator, 1),
        "totalAdded": added,
        "totalPruned": pruned,
    }
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _percentile(sorted_vals: list[float], p: float) -> float:
|
|
193
|
+
"""Compute the p-th percentile (0-100) from a pre-sorted list."""
|
|
194
|
+
if not sorted_vals:
|
|
195
|
+
return 0.0
|
|
196
|
+
k = (len(sorted_vals) - 1) * p / 100.0
|
|
197
|
+
f = int(k)
|
|
198
|
+
c = f + 1 if f + 1 < len(sorted_vals) else f
|
|
199
|
+
return round(sorted_vals[f] + (k - f) * (sorted_vals[c] - sorted_vals[f]), 1)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def compute_latency_stats(playbook_logs: list[dict]) -> dict[str, dict]:
    """Aggregate per-script latency statistics from playbook-log entries.

    Returns {scriptName: {count, avg, p50, p95}} for every script key seen
    in latencyMs, plus a "total" bucket fed by totalLatencyMs.
    """
    samples: dict[str, list[float]] = {}

    def _record(name: str, value) -> None:
        # Silently ignore non-numeric values; log entries are loosely typed.
        if isinstance(value, (int, float)):
            samples.setdefault(name, []).append(float(value))

    for entry in playbook_logs:
        per_script = entry.get("latencyMs")
        if isinstance(per_script, dict):
            for script, ms in per_script.items():
                _record(script, ms)
        _record("total", entry.get("totalLatencyMs"))

    summary: dict[str, dict] = {}
    for name, vals in samples.items():
        vals.sort()
        summary[name] = {
            "count": len(vals),
            "avg": round(sum(vals) / len(vals), 1),
            "p50": _percentile(vals, 50),
            "p95": _percentile(vals, 95),
        }
    return summary
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def compute_skip_rate(playbook_logs: list[dict]) -> float:
    """Compute the insight synthesizer skip rate.

    An entry counts toward the denominator when it carries a non-None
    "output"; it counts as skipped when that output is a dict whose "skip"
    key is truthy. Returns 0.0 when no entries carry an output.
    """
    total = 0
    skipped = 0
    for entry in playbook_logs:
        output = entry.get("output")
        if output is None:
            continue
        total += 1
        # Guard the .get(): entries are loosely typed and "output" may be a
        # non-dict JSON value (the original raised AttributeError on those).
        if isinstance(output, dict) and output.get("skip", False):
            skipped += 1
    return round(skipped / total, 2) if total > 0 else 0.0
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
# ---------------------------------------------------------------------------
|
|
245
|
+
# Regression detection
|
|
246
|
+
# ---------------------------------------------------------------------------
|
|
247
|
+
|
|
248
|
+
def detect_regressions(aggregates: dict, thresholds: dict, skip_rate: float) -> list[str]:
    """Detect regressions based on thresholds.

    Args:
        aggregates: Output of compute_aggregates().
        thresholds: regressionThresholds config; missing keys fall back to
            the built-in defaults.
        skip_rate: Output of compute_skip_rate().

    Returns a list of human-readable regression descriptions (empty if none).
    """
    regressions: list[str] = []

    # Bind each threshold once so the message always reports the value that
    # was actually compared. (The original used thresholds.get(key, default)
    # in the condition but thresholds[key] in the message, raising KeyError
    # when the key was absent and the default threshold tripped.)
    assert_threshold = thresholds.get("assertionPassRate", 0.85)
    assert_rate = aggregates.get("assertionPassRate", {}).get("rate", 1.0)
    if assert_rate < assert_threshold:
        regressions.append(
            f"Assertion pass rate {assert_rate:.1%} below threshold {assert_threshold:.0%}"
        )

    skip_threshold = thresholds.get("skipRate", 0.8)
    if skip_rate > skip_threshold:
        regressions.append(
            f"Skip rate {skip_rate:.0%} above threshold {skip_threshold:.0%} — synthesizer rarely producing output"
        )

    # Repeated failures of the same assertion suggest a systemic issue.
    for name, count in aggregates.get("failureHistogram", {}).items():
        if count >= 3:
            regressions.append(f"Assertion '{name}' failed {count} times (systemic issue)")

    return regressions
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
# ---------------------------------------------------------------------------
|
|
273
|
+
# Report generation
|
|
274
|
+
# ---------------------------------------------------------------------------
|
|
275
|
+
|
|
276
|
+
def generate_report_markdown(
    date_str: str,
    aggregates: dict,
    playbook_health: dict,
    skip_rate: float,
    regressions: list[str],
    llm_interpretation: str = "",
    run_summaries: list[dict] | None = None,
    latency_stats: dict[str, dict] | None = None,
) -> str:
    """Generate the daily eval report as markdown.

    Args:
        date_str: Report date (YYYY-MM-DD), used in the title.
        aggregates: Output of compute_aggregates().
        playbook_health: Output of compute_playbook_health().
        skip_rate: Output of compute_skip_rate().
        regressions: Output of detect_regressions().
        llm_interpretation: Pre-rendered markdown from get_llm_interpretation().
        run_summaries: run_summary entries (for the partial-run warning).
        latency_stats: Output of compute_latency_stats().

    Returns the full report text, newline-terminated. Sections with no
    data (failures, judge scores, latency, regressions, analysis) are
    omitted entirely.
    """
    lines: list[str] = []
    lines.append(f"# Eval Report — {date_str}\n")

    # Quality Gates — icons flip to ⚠ at the same thresholds used elsewhere
    # (schema 0.95, assertions 0.85, judge mean 3.0/4, skip rate 0.8).
    lines.append("## Quality Gates")
    schema = aggregates.get("schemaValidity", {})
    s_rate = schema.get("rate", 1.0)
    s_icon = "✓" if s_rate >= 0.95 else "⚠"
    lines.append(f"- {s_icon} Schema validity: {s_rate:.0%} ({schema.get('valid', 0)}/{schema.get('total', 0)} checks)")

    a = aggregates.get("assertionPassRate", {})
    a_rate = a.get("rate", 1.0)
    a_icon = "✓" if a_rate >= 0.85 else "⚠"
    lines.append(f"- {a_icon} Assertion pass rate: {a_rate:.0%} ({a.get('passed', 0)}/{a.get('total', 0)} checks)")

    j_avg = aggregates.get("judgeAvg")
    if j_avg is not None:
        j_icon = "✓" if j_avg >= 3.0 else "⚠"
        judge_count = sum(v.get("count", 0) for v in aggregates.get("judgeScores", {}).values())
        lines.append(f"- {j_icon} Mean judge score: {j_avg}/4.0 ({judge_count} evaluations)")

    skip_icon = "✓" if skip_rate < 0.8 else "⚠"
    lines.append(f"- {skip_icon} Skip rate: {skip_rate:.0%}")
    lines.append(f"- Ticks evaluated: {aggregates.get('tickCount', 0)}")

    # Partial run warning — surfaces tick evaluations that errored out.
    if run_summaries:
        partial_runs = [s for s in run_summaries if s.get("isPartial")]
        if partial_runs:
            total_failed = sum(s.get("failed", 0) for s in partial_runs)
            total_attempted = sum(s.get("attempted", 0) for s in partial_runs)
            lines.append(f"- ⚠ PARTIAL: {total_failed}/{total_attempted} tick evaluations "
                         f"failed across {len(partial_runs)} run(s)")
    lines.append("")

    # Assertion Failures — top 5 by count, descending.
    histogram = aggregates.get("failureHistogram", {})
    if histogram:
        lines.append("## Assertion Failures (top failures)")
        for i, (name, count) in enumerate(sorted(histogram.items(), key=lambda x: -x[1])[:5], 1):
            lines.append(f"{i}. {name} — {count} failures")
        lines.append("")

    # Judge Score Breakdown — per-judge average plus star distribution.
    judge_scores = aggregates.get("judgeScores", {})
    if judge_scores:
        lines.append("## Judge Scores")
        for judge_name, info in judge_scores.items():
            dist = info.get("dist", {})
            dist_str = ", ".join(f"{k}★={v}" for k, v in sorted(dist.items()))
            lines.append(f"- {judge_name}: avg {info.get('avg', '?')}/4.0 ({dist_str})")
        lines.append("")

    # Sub-Score Breakdown (multi-dimensional rubrics)
    sub_scores = aggregates.get("subScores", {})
    if sub_scores:
        lines.append("## Sub-Scores (per dimension)")
        for judge_name, dims in sorted(sub_scores.items()):
            dim_parts = []
            for dim, info in sorted(dims.items()):
                dim_parts.append(f"{dim}={info['avg']}/4.0")
            lines.append(f"- {judge_name}: {', '.join(dim_parts)}")
        lines.append("")

    # Playbook Health
    lines.append("## Playbook Health")
    lines.append(f"- Line count trend: {playbook_health.get('lineCountTrend', [])}")
    lines.append(f"- Avg churn/tick: {playbook_health.get('avgChurnPerTick', 0)} changes")
    lines.append(f"- Total added: {playbook_health.get('totalAdded', 0)}, pruned: {playbook_health.get('totalPruned', 0)}")
    lines.append("")

    # Latency
    if latency_stats:
        lines.append("## Latency")
        for script, info in sorted(latency_stats.items()):
            lines.append(f"- {script}: avg {info['avg']}ms, p50 {info['p50']}ms, "
                         f"p95 {info['p95']}ms ({info['count']} samples)")
        lines.append("")

    # Regressions
    if regressions:
        lines.append("## ⚠ Regressions Detected")
        for r in regressions:
            lines.append(f"- {r}")
        lines.append("")

    # LLM Interpretation
    if llm_interpretation:
        lines.append("## Analysis & Recommendations")
        lines.append(llm_interpretation)
        lines.append("")

    return "\n".join(lines) + "\n"
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def build_snapshot(aggregates: dict, skip_rate: float, regressions: list[str]) -> dict:
    """Build a compact snapshot of key metrics for day-over-day deltas."""
    per_judge = {
        name: info.get("avg")
        for name, info in aggregates.get("judgeScores", {}).items()
        if info.get("avg") is not None
    }

    # Keep only the three most frequent assertion failures.
    ranked = sorted(aggregates.get("failureHistogram", {}).items(), key=lambda kv: -kv[1])
    top_failures = [name for name, _ in ranked[:3]]

    return {
        "assertionPassRate": aggregates.get("assertionPassRate", {}).get("rate"),
        "schemaRate": aggregates.get("schemaValidity", {}).get("rate"),
        "judgeAvg": aggregates.get("judgeAvg"),
        "skipRate": skip_rate,
        "perJudgeAvg": per_judge,
        "topFailures": top_failures,
        "regressionCount": len(regressions),
        "tickCount": aggregates.get("tickCount", 0),
    }
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def load_previous_snapshot(report_dir: Path, current_date: str) -> tuple[str | None, dict | None]:
|
|
404
|
+
"""Find the most recent snapshot before current_date.
|
|
405
|
+
|
|
406
|
+
Returns (date_str, snapshot_dict) or (None, None).
|
|
407
|
+
"""
|
|
408
|
+
snapshots = sorted(report_dir.glob("*.snapshot.json"), reverse=True)
|
|
409
|
+
for snap_path in snapshots:
|
|
410
|
+
date_str = snap_path.stem.replace(".snapshot", "")
|
|
411
|
+
if date_str < current_date:
|
|
412
|
+
try:
|
|
413
|
+
return date_str, json.loads(snap_path.read_text(encoding="utf-8"))
|
|
414
|
+
except (json.JSONDecodeError, OSError):
|
|
415
|
+
continue
|
|
416
|
+
return None, None
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def compute_delta(before: dict, after: dict) -> dict[str, dict]:
    """Compute per-metric deltas between two snapshots.

    Returns {metric: {before, after, delta, status}} where status is
    IMPROVED, REGRESSED, or SAME. Metrics missing on either side are
    skipped. Per-judge averages are emitted under "judge:<name>" keys.
    """
    # Metrics where higher is better
    higher_better = {"assertionPassRate", "schemaRate", "judgeAvg"}
    # Metrics where lower is better
    lower_better = {"skipRate", "regressionCount"}

    result: dict[str, dict] = {}
    for key in higher_better | lower_better:
        b = before.get(key)
        a = after.get(key)
        if b is None or a is None:
            continue
        delta = a - b
        # Round float deltas regardless of which operand was the float.
        # (The original only checked `after`, so an int-after minus a
        # float-before leaked unrounded float noise into the report.)
        if isinstance(delta, float):
            delta = round(delta, 4)
        if key in higher_better:
            status = "IMPROVED" if delta > 0.001 else ("REGRESSED" if delta < -0.001 else "SAME")
        else:
            status = "IMPROVED" if delta < -0.001 else ("REGRESSED" if delta > 0.001 else "SAME")
        result[key] = {"before": b, "after": a, "delta": delta, "status": status}

    # Per-judge deltas use a looser 0.05 band on the 0-4 score scale.
    before_judges = before.get("perJudgeAvg", {})
    after_judges = after.get("perJudgeAvg", {})
    for judge in set(before_judges) | set(after_judges):
        b = before_judges.get(judge)
        a = after_judges.get(judge)
        if b is None or a is None:
            continue
        delta = round(a - b, 2)
        status = "IMPROVED" if delta > 0.05 else ("REGRESSED" if delta < -0.05 else "SAME")
        result[f"judge:{judge}"] = {"before": b, "after": a, "delta": delta, "status": status}

    return result
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def format_delta_section(prev_date: str, delta: dict[str, dict]) -> str:
    """Render the snapshot delta comparison as a markdown section."""
    arrows = {"IMPROVED": "↑", "REGRESSED": "↓", "SAME": "→"}
    rendered = [f"## Delta vs Previous ({prev_date})"]
    for metric in sorted(delta):
        info = delta[metric]
        arrow = arrows.get(info["status"], "?")
        # Prefix positive deltas with an explicit plus sign.
        prefix = "+" if info["delta"] > 0 else ""
        rendered.append(
            f"- {arrow} {metric}: {info['before']} → {info['after']} ({prefix}{info['delta']}) [{info['status']}]"
        )
    rendered.append("")
    return "\n".join(rendered)
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def _sample_judge_details(eval_entries: list[dict], max_entries: int = 8) -> str:
|
|
470
|
+
"""Extract sampled judge reasonings + assertion failures for cross-tick synthesis."""
|
|
471
|
+
# Sample evenly across the day
|
|
472
|
+
step = max(1, len(eval_entries) // max_entries)
|
|
473
|
+
sampled = eval_entries[::step][:max_entries]
|
|
474
|
+
|
|
475
|
+
parts: list[str] = []
|
|
476
|
+
for i, entry in enumerate(sampled):
|
|
477
|
+
tick_ts = entry.get("tickTs", "?")
|
|
478
|
+
section = [f"### Tick {i+1} ({tick_ts})"]
|
|
479
|
+
|
|
480
|
+
# Judge reasonings (truncated)
|
|
481
|
+
judges = entry.get("judges")
|
|
482
|
+
if judges:
|
|
483
|
+
for judge_name, result in judges.items():
|
|
484
|
+
if isinstance(result, dict) and "reasoning" in result:
|
|
485
|
+
reasoning = str(result["reasoning"])[:150]
|
|
486
|
+
score = result.get("score", "?")
|
|
487
|
+
section.append(f" {judge_name} ({score}/4): {reasoning}")
|
|
488
|
+
|
|
489
|
+
# Assertion failures
|
|
490
|
+
failures = entry.get("assertions", {}).get("failures", [])
|
|
491
|
+
if failures:
|
|
492
|
+
for f in failures[:3]:
|
|
493
|
+
section.append(f" FAIL: {f.get('name', '?')} — {str(f.get('detail', ''))[:100]}")
|
|
494
|
+
|
|
495
|
+
parts.append("\n".join(section))
|
|
496
|
+
|
|
497
|
+
return "\n\n".join(parts)
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def get_llm_interpretation(
    aggregates: dict,
    regressions: list[str],
    playbook_health: dict,
    eval_entries: list[dict] | None = None,
) -> str:
    """Use LLM to interpret trends and write recommendations.

    When eval_entries is provided, includes sampled judge reasonings for
    cross-tick pattern synthesis.

    Returns a markdown fragment (bottleneck / patterns / recommendations),
    or "" when the LLM call or JSON extraction fails, or when the response
    carries no usable sections.
    """
    system_prompt = (
        "You are an evaluation analyst for a personal AI assistant pipeline. "
        "Analyze the metrics AND individual tick evaluations to identify cross-cutting patterns. "
        "Respond with ONLY a JSON object:\n"
        '{"patterns": ["pattern 1", ...], '
        '"bottleneck": "detection|generation|both|none", '
        '"recommendations": ["rec 1", ...]}\n\n'
        "- patterns: 2-4 recurring themes across individual ticks (reference specific judges/assertions)\n"
        "- bottleneck: whether issues stem from signal detection (input), insight generation (output), both, or none\n"
        "- recommendations: 3-5 actionable next steps"
    )

    user_prompt = (
        f"## Aggregates\n{json.dumps(aggregates, indent=2)}\n\n"
        f"## Regressions\n{regressions}\n\n"
        f"## Playbook Health\n{json.dumps(playbook_health, indent=2)}"
    )

    # Append sampled per-tick evidence so the LLM can cite concrete cases.
    if eval_entries:
        details = _sample_judge_details(eval_entries)
        user_prompt += f"\n\n## Individual Tick Evaluations (sampled)\n{details}"

    try:
        raw = call_llm(system_prompt, user_prompt, script="eval_reporter", json_mode=True)
        result = extract_json(raw)

        sections: list[str] = []

        # Bottleneck — omitted entirely when the model reports "none".
        bottleneck = result.get("bottleneck", "none")
        if bottleneck != "none":
            sections.append(f"**Bottleneck**: {bottleneck}")

        # Patterns
        patterns = result.get("patterns", [])
        if patterns:
            sections.append("**Patterns**:")
            sections.extend(f"- {p}" for p in patterns)

        # Recommendations
        recs = result.get("recommendations", [])
        if recs:
            sections.append("\n**Recommendations**:")
            sections.extend(f"- {r}" for r in recs)

        return "\n".join(sections) if sections else ""
    except (ValueError, LLMError) as e:
        # Interpretation is best-effort: log and fall through to "" so the
        # daily report still gets written without the analysis section.
        print(f"[eval-reporter] LLM interpretation failed: {e}", file=sys.stderr)

    return ""
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
# ---------------------------------------------------------------------------
|
|
564
|
+
# Main
|
|
565
|
+
# ---------------------------------------------------------------------------
|
|
566
|
+
|
|
567
|
+
def main():
    """CLI entry point: aggregate eval logs, detect regressions, write the report.

    Reads memory/eval-logs/*.jsonl and playbook logs for the requested
    window, then writes memory/eval-reports/<date>.md plus a
    <date>.snapshot.json used for day-over-day delta comparison.
    Exits early (without writing) when there are no tick eval entries.
    """
    parser = argparse.ArgumentParser(description="Sinain Koog Daily Eval Reporter (Tier 2)")
    parser.add_argument("--memory-dir", required=True, help="Path to memory/ directory")
    parser.add_argument("--days", type=int, default=1, help="Number of days to aggregate (default: 1)")
    args = parser.parse_args()

    memory_dir = args.memory_dir
    eval_config = load_eval_config(memory_dir)
    thresholds = eval_config.get("regressionThresholds", _EVAL_DEFAULTS["regressionThresholds"])

    # Load eval logs
    raw_eval_entries = load_eval_logs(memory_dir, days=args.days)
    if not raw_eval_entries:
        print("[eval-reporter] no eval-log entries found", file=sys.stderr)
        return

    # Separate tick results from run summary metadata
    eval_entries, run_summaries = extract_run_summaries(raw_eval_entries)
    if not eval_entries:
        print("[eval-reporter] no tick eval entries (only run summaries)", file=sys.stderr)
        return

    # Load playbook logs for health metrics
    playbook_logs = read_recent_logs(memory_dir, days=args.days)

    # Compute metrics
    aggregates = compute_aggregates(eval_entries)
    playbook_health = compute_playbook_health(playbook_logs)
    skip_rate = compute_skip_rate(playbook_logs)
    latency_stats = compute_latency_stats(playbook_logs)
    regressions = detect_regressions(aggregates, thresholds, skip_rate)

    print(f"[eval-reporter] {aggregates['tickCount']} ticks, "
          f"schema={aggregates.get('schemaValidity', {}).get('rate', '?')}, "
          f"assertions={aggregates.get('assertionPassRate', {}).get('rate', '?')}, "
          f"regressions={len(regressions)}", file=sys.stderr)

    # LLM interpretation (if report feature is on and we have enough data)
    llm_interpretation = ""
    if eval_config.get("dailyReport", True) and aggregates["tickCount"] >= 2:
        llm_interpretation = get_llm_interpretation(
            aggregates, regressions, playbook_health, eval_entries=eval_entries,
        )

    # Generate report
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    report = generate_report_markdown(
        today, aggregates, playbook_health, skip_rate, regressions, llm_interpretation,
        run_summaries=run_summaries,
        latency_stats=latency_stats,
    )

    # Write report
    report_dir = Path(memory_dir) / "eval-reports"
    report_dir.mkdir(parents=True, exist_ok=True)

    # Write snapshot for delta comparison. Safe to do before loading the
    # previous one: load_previous_snapshot only considers dates strictly
    # before today, so today's fresh snapshot is never its own baseline.
    snapshot = build_snapshot(aggregates, skip_rate, regressions)
    snapshot_file = report_dir / f"{today}.snapshot.json"
    snapshot_file.write_text(json.dumps(snapshot, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")

    # Compute delta vs previous snapshot and append to report
    prev_date, prev_snapshot = load_previous_snapshot(report_dir, today)
    if prev_snapshot:
        delta = compute_delta(prev_snapshot, snapshot)
        if delta:
            report += "\n" + format_delta_section(prev_date, delta) + "\n"

    report_file = report_dir / f"{today}.md"
    report_file.write_text(report, encoding="utf-8")

    print(f"[eval-reporter] report + snapshot written to {report_dir}", file=sys.stderr)
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
# Script entry point: run the daily reporter when executed directly (cron).
if __name__ == "__main__":
    main()
|