@geravant/sinain 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +183 -0
- package/index.ts +2096 -0
- package/install.js +155 -0
- package/openclaw.plugin.json +59 -0
- package/package.json +21 -0
- package/sinain-memory/common.py +403 -0
- package/sinain-memory/demo_knowledge_transfer.sh +85 -0
- package/sinain-memory/embedder.py +268 -0
- package/sinain-memory/eval/__init__.py +0 -0
- package/sinain-memory/eval/assertions.py +288 -0
- package/sinain-memory/eval/judges/__init__.py +0 -0
- package/sinain-memory/eval/judges/base_judge.py +61 -0
- package/sinain-memory/eval/judges/curation_judge.py +46 -0
- package/sinain-memory/eval/judges/insight_judge.py +48 -0
- package/sinain-memory/eval/judges/mining_judge.py +42 -0
- package/sinain-memory/eval/judges/signal_judge.py +45 -0
- package/sinain-memory/eval/schemas.py +247 -0
- package/sinain-memory/eval_delta.py +109 -0
- package/sinain-memory/eval_reporter.py +642 -0
- package/sinain-memory/feedback_analyzer.py +221 -0
- package/sinain-memory/git_backup.sh +19 -0
- package/sinain-memory/insight_synthesizer.py +181 -0
- package/sinain-memory/memory/2026-03-01.md +11 -0
- package/sinain-memory/memory/playbook-archive/sinain-playbook-2026-03-01-1418.md +15 -0
- package/sinain-memory/memory/playbook-logs/2026-03-01.jsonl +1 -0
- package/sinain-memory/memory/sinain-playbook.md +21 -0
- package/sinain-memory/memory-config.json +39 -0
- package/sinain-memory/memory_miner.py +183 -0
- package/sinain-memory/module_manager.py +695 -0
- package/sinain-memory/playbook_curator.py +225 -0
- package/sinain-memory/requirements.txt +3 -0
- package/sinain-memory/signal_analyzer.py +141 -0
- package/sinain-memory/test_local.py +402 -0
- package/sinain-memory/tests/__init__.py +0 -0
- package/sinain-memory/tests/conftest.py +189 -0
- package/sinain-memory/tests/test_curator_helpers.py +94 -0
- package/sinain-memory/tests/test_embedder.py +210 -0
- package/sinain-memory/tests/test_extract_json.py +124 -0
- package/sinain-memory/tests/test_feedback_computation.py +121 -0
- package/sinain-memory/tests/test_miner_helpers.py +71 -0
- package/sinain-memory/tests/test_module_management.py +458 -0
- package/sinain-memory/tests/test_parsers.py +96 -0
- package/sinain-memory/tests/test_tick_evaluator.py +430 -0
- package/sinain-memory/tests/test_triple_extractor.py +255 -0
- package/sinain-memory/tests/test_triple_ingest.py +191 -0
- package/sinain-memory/tests/test_triple_migrate.py +138 -0
- package/sinain-memory/tests/test_triplestore.py +248 -0
- package/sinain-memory/tick_evaluator.py +392 -0
- package/sinain-memory/triple_extractor.py +402 -0
- package/sinain-memory/triple_ingest.py +290 -0
- package/sinain-memory/triple_migrate.py +275 -0
- package/sinain-memory/triple_query.py +184 -0
- package/sinain-memory/triplestore.py +498 -0
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Tier 1 Evaluation: Per-tick evaluator — runs as independent server cron job.
|
|
3
|
+
|
|
4
|
+
Reads playbook-logs written by the heartbeat, validates outputs against
|
|
5
|
+
JSON schemas, runs behavioral assertions, and optionally invokes LLM-as-Judge
|
|
6
|
+
evaluators. Writes results to memory/eval-logs/YYYY-MM-DD.jsonl.
|
|
7
|
+
|
|
8
|
+
Invocation (cron, every 30 min offset from heartbeat):
|
|
9
|
+
uv run --with requests python3 sinain-koog/tick_evaluator.py \
|
|
10
|
+
--memory-dir memory/
|
|
11
|
+
|
|
12
|
+
Config-driven eval levels:
|
|
13
|
+
mechanical — schema + assertions only (zero LLM cost)
|
|
14
|
+
sampled — mechanical + random LLM judges at sampleRate probability
|
|
15
|
+
full — mechanical + LLM judges on every tick output
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import json
|
|
20
|
+
import random
|
|
21
|
+
import sys
|
|
22
|
+
from datetime import datetime, timezone
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
# Ensure sinain-koog is on sys.path for imports
|
|
26
|
+
_koog_dir = str(Path(__file__).resolve().parent)
|
|
27
|
+
if _koog_dir not in sys.path:
|
|
28
|
+
sys.path.insert(0, _koog_dir)
|
|
29
|
+
|
|
30
|
+
from common import (
|
|
31
|
+
_load_config,
|
|
32
|
+
_read_jsonl,
|
|
33
|
+
list_daily_memory_files,
|
|
34
|
+
read_playbook,
|
|
35
|
+
)
|
|
36
|
+
from eval.assertions import run_tick_assertions
|
|
37
|
+
from eval.schemas import SCHEMA_REGISTRY, validate
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
# Config
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
|
|
44
|
+
# Built-in eval configuration defaults. Overridden (in order) by the "eval"
# section of the main script config and by memory/eval-config.json — see
# load_eval_config().
_EVAL_DEFAULTS = {
    # Eval level: "mechanical" (schema + assertions only, zero LLM cost),
    # "sampled" (adds random LLM judges at sampleRate probability), or
    # "full" (LLM judges on every tick). See module docstring.
    "level": "mechanical",
    # Probability that a tick gets LLM judges when level == "sampled".
    "sampleRate": 0.2,
    # LLM judge settings; only maxTokens is forwarded to judge calls here
    # (run_judges) — "model"/"timeout" are presumably consumed by the judge
    # layer itself; verify against eval/judges.
    "judges": {"model": "smart", "maxTokens": 200, "timeout": 30},
    # NOTE(review): flag name suggests it gates daily report generation in
    # the reporter — not consumed in this file; confirm in eval_reporter.
    "dailyReport": True,
    # Thresholds presumably used by the reporter to flag regressions — not
    # consumed in this file; confirm in eval_reporter.
    "regressionThresholds": {
        "assertionPassRate": 0.85,
        "effectivenessRate": 0.4,
        "skipRate": 0.8,
    },
}
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def load_eval_config(memory_dir: str) -> dict:
    """Load the eval config, applying runtime overrides from memory/eval-config.json.

    Precedence (lowest to highest): built-in ``_EVAL_DEFAULTS``, the "eval"
    section of the main script config, then ``memory/eval-config.json``.

    Fix over the previous version: overrides are merged key-by-key for nested
    dict values (e.g. "judges", "regressionThresholds"), so a partial override
    like ``{"judges": {"model": "fast"}}`` no longer silently drops the
    sibling defaults (maxTokens, timeout). Non-dict values still replace
    wholesale.

    Args:
        memory_dir: Path to the memory/ directory holding eval-config.json.

    Returns:
        The fully merged eval config dict.
    """
    def _merge(base: dict, override: dict) -> dict:
        # Shallow copy of base, with dict-valued keys merged one level deep.
        merged = dict(base)
        for key, value in override.items():
            if isinstance(value, dict) and isinstance(merged.get(key), dict):
                merged[key] = {**merged[key], **value}
            else:
                merged[key] = value
        return merged

    cfg = _merge(_EVAL_DEFAULTS, _load_config().get("eval", {}))

    override_path = Path(memory_dir) / "eval-config.json"
    if override_path.exists():
        try:
            override = json.loads(override_path.read_text(encoding="utf-8"))
            cfg = _merge(cfg, override)
        except (json.JSONDecodeError, OSError) as e:
            # Best-effort: a malformed override file must not break evaluation.
            print(f"[warn] eval-config.json override failed: {e}", file=sys.stderr)

    return cfg
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
# Log readers
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
def read_today_playbook_logs(memory_dir: str) -> list[dict]:
    """Return the playbook-log entries recorded today (UTC date)."""
    date_stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    path = Path(memory_dir) / "playbook-logs" / f"{date_stamp}.jsonl"
    return _read_jsonl(path)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def read_today_eval_logs(memory_dir: str) -> list[dict]:
    """Return today's eval-log entries, used to find already-evaluated ticks."""
    date_stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    path = Path(memory_dir) / "eval-logs" / f"{date_stamp}.jsonl"
    return _read_jsonl(path)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def get_evaluated_timestamps(eval_logs: list[dict]) -> set[str]:
    """Collect the tick timestamps that already have an eval entry.

    Entries without a "tickTs" field (or with an empty one) are ignored.
    """
    seen: set[str] = set()
    for entry in eval_logs:
        tick_ts = entry.get("tickTs")
        if tick_ts:
            seen.add(tick_ts)
    return seen
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# ---------------------------------------------------------------------------
|
|
97
|
+
# Schema validation
|
|
98
|
+
# ---------------------------------------------------------------------------
|
|
99
|
+
|
|
100
|
+
def validate_tick_schemas(log_entry: dict) -> dict:
    """Validate script outputs reconstructed from a heartbeat log entry.

    The heartbeat flattens/reshapes script outputs when writing to the JSONL
    log. This function rebuilds each script's original output shape from the
    log fields and validates it against the canonical schemas.

    Returns:
        {"total": int, "valid": int, "failures": [{"script": str, "errors": [...]}]}
    """
    # (script_name, reconstructed_payload) pairs to validate, in pipeline order.
    candidates: list[tuple[str, dict]] = []

    # --- Signal Analyzer ---
    # Log stores: signals (list), recommendedAction (obj|null), idle (bool)
    if "signals" in log_entry:
        candidates.append(("signal_analyzer", {
            "signals": log_entry.get("signals", []),
            "recommendedAction": log_entry.get("recommendedAction"),
            "idle": log_entry.get("idle", False),
        }))

    # --- Feedback Analyzer ---
    # Log stores: feedbackScores ({avg}), effectivenessRate (float),
    # curateDirective (str). The full effectiveness dict is NOT in the log —
    # the heartbeat only writes effectivenessRate — so reconstruct a minimal
    # valid shape with zeroed counters.
    if "feedbackScores" in log_entry and "curateDirective" in log_entry:
        eff_rate = log_entry.get("effectivenessRate", 0)
        candidates.append(("feedback_analyzer", {
            "feedbackScores": log_entry.get("feedbackScores", {}),
            "effectiveness": {
                "outputs": 0, "positive": 0, "negative": 0, "neutral": 0,
                "rate": eff_rate if isinstance(eff_rate, (int, float)) else 0,
            },
            "curateDirective": log_entry.get("curateDirective", "normal"),
            "interpretation": log_entry.get("interpretation", ""),
        }))

    # --- Memory Miner ---
    # Log stores: miningFindings (str|null), minedSources (list)
    if log_entry.get("miningFindings") is not None:
        candidates.append(("memory_miner", {
            "findings": log_entry.get("miningFindings", ""),
            "newPatterns": log_entry.get("newPatterns", []),
            "minedSources": log_entry.get("minedSources", []),
        }))

    # --- Playbook Curator ---
    # playbookChanges can be {"note": "skipped"} or the full curator output;
    # only the full shape (carrying "changes") is schema-checked.
    pc = log_entry.get("playbookChanges")
    if isinstance(pc, dict) and "changes" in pc:
        candidates.append(("playbook_curator", pc))

    # --- Insight Synthesizer ---
    # Log stores: output (dict|null). When null, the synthesizer was skipped
    # at the pipeline level (before it ran), which is different from skip=true.
    output = log_entry.get("output")
    if isinstance(output, dict):
        candidates.append(("insight_synthesizer", output))

    total = 0
    valid = 0
    failures: list[dict] = []
    for script_name, payload in candidates:
        schema = SCHEMA_REGISTRY.get(script_name)
        if schema is None:
            # No schema registered for this script — not counted either way.
            continue
        total += 1
        errors = validate(payload, schema)
        if errors:
            failures.append({"script": script_name, "errors": errors})
        else:
            valid += 1

    return {"total": total, "valid": valid, "failures": failures}
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# ---------------------------------------------------------------------------
|
|
178
|
+
# LLM judge runner
|
|
179
|
+
# ---------------------------------------------------------------------------
|
|
180
|
+
|
|
181
|
+
def run_judges(log_entry: dict, playbook_text: str, eval_config: dict) -> dict | None:
    """Run LLM-as-Judge evaluators on a tick's outputs.

    Each judge runs only when the log entry carries the corresponding output
    field. Only ``maxTokens`` is forwarded from ``eval_config["judges"]``;
    the model is resolved by common.call_llm via the script config, so the
    previous dead ``if judge_cfg.get("model"): pass`` branch was removed.

    Args:
        log_entry: One heartbeat playbook-log entry.
        playbook_text: Current playbook contents, used as judge context.
        eval_config: Merged eval config (see load_eval_config).

    Returns:
        Dict of judge results keyed by judge name, or None if no judge ran.
    """
    # Late import to avoid loading LLM deps in mechanical mode
    from eval.judges.signal_judge import judge_signal
    from eval.judges.curation_judge import judge_curation
    from eval.judges.insight_judge import judge_insight
    from eval.judges.mining_judge import judge_mining

    judge_cfg = eval_config.get("judges", {})
    kwargs = {}
    if judge_cfg.get("maxTokens"):
        kwargs["max_tokens"] = judge_cfg["maxTokens"]

    results: dict = {}

    # Signal judge — runs whenever the tick produced a signals list.
    signals = log_entry.get("signals")
    if signals is not None:
        signal_data = {
            "signals": signals,
            "recommendedAction": log_entry.get("recommendedAction"),
            "idle": log_entry.get("idle", False),
        }
        session_summary = log_entry.get("sessionSummary", "")
        result = judge_signal(signal_data, session_summary, **kwargs)
        if result:
            results["signal"] = result

    # Curation judge — evaluates playbook changes against the directive.
    curator = log_entry.get("playbookChanges")
    if curator is not None:
        directive = log_entry.get("curateDirective", "normal")
        result = judge_curation(curator, directive, playbook_text, **kwargs)
        if result:
            results["curation"] = result

    # Insight judge — only the playbook head (1000 chars) is passed as context.
    output = log_entry.get("output")
    if output is not None:
        result = judge_insight(output, playbook_text[:1000] if playbook_text else "", **kwargs)
        if result:
            results["insight"] = result

    # Mining judge.
    mining = log_entry.get("miningResult")
    if mining is not None:
        result = judge_mining(mining, **kwargs)
        if result:
            results["mining"] = result

    return results if results else None
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# ---------------------------------------------------------------------------
|
|
241
|
+
# Main evaluation loop
|
|
242
|
+
# ---------------------------------------------------------------------------
|
|
243
|
+
|
|
244
|
+
def evaluate_tick(
    log_entry: dict,
    recent_logs: list[dict],
    playbook_text: str,
    daily_files: list[str],
    eval_config: dict,
) -> dict:
    """Evaluate a single tick's log entry.

    Runs schema validation, behavioral assertions, and — depending on the
    configured eval level — LLM judges, then folds everything into one
    eval-log record.

    Returns:
        The eval result dict to be written to eval-logs.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    level = eval_config.get("level", "mechanical")

    # 1. Schema validation of reconstructed script outputs.
    schema_result = validate_tick_schemas(log_entry)

    # 2. Behavioral assertions.
    assertion_results = run_tick_assertions(log_entry, recent_logs, playbook_text, daily_files)
    assertion_failures = [a for a in assertion_results if not a["passed"]]
    passed = len(assertion_results) - len(assertion_failures)

    # 3. LLM judges: always in "full" mode, probabilistically in "sampled".
    judges = None
    if level == "full" or (
        level == "sampled" and random.random() < eval_config.get("sampleRate", 0.2)
    ):
        judges = run_judges(log_entry, playbook_text, eval_config)

    # 4. Overall pass rate across schema checks + assertions.
    total_checks = schema_result["total"] + len(assertion_results)
    passed_checks = schema_result["valid"] + passed
    pass_rate = round(passed_checks / total_checks, 3) if total_checks > 0 else 1.0

    result = {
        "ts": now,
        "tickTs": log_entry.get("ts", "unknown"),
        "evalLevel": level,
        "schema": {
            "total": schema_result["total"],
            "valid": schema_result["valid"],
            "failures": schema_result["failures"],
        },
        "assertions": {
            "total": len(assertion_results),
            "passed": passed,
            "failures": [{"name": a["name"], "detail": a["detail"]} for a in assertion_failures],
        },
        "judges": judges,
        "passRate": pass_rate,
    }

    # Average judge score, when at least one judge returned a numeric score.
    if judges:
        scores = [v["score"] for v in judges.values() if isinstance(v, dict) and "score" in v]
        if scores:
            result["judgeAvg"] = round(sum(scores) / len(scores), 2)

    return result
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def main():
    """Entry point: evaluate all of today's not-yet-evaluated ticks.

    Reads today's playbook-log entries, skips ticks that already have an
    eval-log record (matched by the "ts" field), evaluates the rest, and
    appends one JSONL line per tick plus a final run-summary line to
    memory/eval-logs/YYYY-MM-DD.jsonl. All progress output goes to stderr.
    """
    parser = argparse.ArgumentParser(description="Sinain Koog Tick Evaluator (Tier 1)")
    parser.add_argument("--memory-dir", required=True, help="Path to memory/ directory")
    args = parser.parse_args()

    memory_dir = args.memory_dir
    eval_config = load_eval_config(memory_dir)

    print(f"[tick-eval] level={eval_config.get('level')} sampleRate={eval_config.get('sampleRate')}",
          file=sys.stderr)

    # Read today's logs
    playbook_logs = read_today_playbook_logs(memory_dir)
    if not playbook_logs:
        print("[tick-eval] no playbook-log entries for today", file=sys.stderr)
        return

    # Find unevaluated ticks
    # NOTE(review): entries missing "ts" compare as "" and are re-evaluated
    # on every run unless "" is in evaluated_ts — confirm heartbeat always
    # writes a ts.
    eval_logs = read_today_eval_logs(memory_dir)
    evaluated_ts = get_evaluated_timestamps(eval_logs)
    unevaluated = [e for e in playbook_logs if e.get("ts", "") not in evaluated_ts]

    if not unevaluated:
        print("[tick-eval] all ticks already evaluated", file=sys.stderr)
        return

    print(f"[tick-eval] {len(unevaluated)} unevaluated ticks found", file=sys.stderr)

    # Shared context
    playbook_text = read_playbook(memory_dir)
    daily_files = [Path(f).name for f in list_daily_memory_files(memory_dir)]

    # Evaluate each unevaluated tick
    eval_log_dir = Path(memory_dir) / "eval-logs"
    eval_log_dir.mkdir(parents=True, exist_ok=True)
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    eval_log_file = eval_log_dir / f"{today}.jsonl"

    attempted = 0
    succeeded = 0
    failed = 0
    fail_ticks: list[str] = []

    for entry in unevaluated:
        tick_ts = entry.get("ts", "")
        attempted += 1

        try:
            # Recent logs for assertion context (logs before this tick)
            recent = [e for e in playbook_logs if e.get("ts", "") < tick_ts]

            result = evaluate_tick(entry, recent, playbook_text, daily_files, eval_config)

            # Append each result immediately so a later crash keeps earlier ones.
            with open(eval_log_file, "a", encoding="utf-8") as f:
                f.write(json.dumps(result, ensure_ascii=False) + "\n")
            succeeded += 1

            status = "PASS" if result["passRate"] >= 0.85 else "WARN"
            judge_info = f" judgeAvg={result.get('judgeAvg', '-')}" if result.get("judges") else ""
            print(f"[tick-eval] {status} tick={tick_ts} passRate={result['passRate']}{judge_info}",
                  file=sys.stderr)
        except Exception as exc:
            # One bad tick must not abort the whole run; record it for the summary.
            failed += 1
            fail_ticks.append(tick_ts)
            print(f"[tick-eval] ERROR tick={tick_ts}: {exc}", file=sys.stderr)

    # Write run summary so the reporter can detect partial runs
    run_summary = {
        "_type": "run_summary",
        "ts": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "attempted": attempted,
        "succeeded": succeeded,
        "failed": failed,
        "isPartial": failed > 0,
        "failedTicks": fail_ticks,
    }
    with open(eval_log_file, "a", encoding="utf-8") as f:
        f.write(json.dumps(run_summary, ensure_ascii=False) + "\n")

    print(f"[tick-eval] wrote {succeeded} eval entries to {eval_log_file} "
          f"(attempted={attempted}, failed={failed})", file=sys.stderr)
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
# Standard entry guard: run only when invoked directly (e.g. by the cron job).
if __name__ == "__main__":
    main()
|