@geravant/sinain 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/README.md +183 -0
  2. package/index.ts +2096 -0
  3. package/install.js +155 -0
  4. package/openclaw.plugin.json +59 -0
  5. package/package.json +21 -0
  6. package/sinain-memory/common.py +403 -0
  7. package/sinain-memory/demo_knowledge_transfer.sh +85 -0
  8. package/sinain-memory/embedder.py +268 -0
  9. package/sinain-memory/eval/__init__.py +0 -0
  10. package/sinain-memory/eval/assertions.py +288 -0
  11. package/sinain-memory/eval/judges/__init__.py +0 -0
  12. package/sinain-memory/eval/judges/base_judge.py +61 -0
  13. package/sinain-memory/eval/judges/curation_judge.py +46 -0
  14. package/sinain-memory/eval/judges/insight_judge.py +48 -0
  15. package/sinain-memory/eval/judges/mining_judge.py +42 -0
  16. package/sinain-memory/eval/judges/signal_judge.py +45 -0
  17. package/sinain-memory/eval/schemas.py +247 -0
  18. package/sinain-memory/eval_delta.py +109 -0
  19. package/sinain-memory/eval_reporter.py +642 -0
  20. package/sinain-memory/feedback_analyzer.py +221 -0
  21. package/sinain-memory/git_backup.sh +19 -0
  22. package/sinain-memory/insight_synthesizer.py +181 -0
  23. package/sinain-memory/memory/2026-03-01.md +11 -0
  24. package/sinain-memory/memory/playbook-archive/sinain-playbook-2026-03-01-1418.md +15 -0
  25. package/sinain-memory/memory/playbook-logs/2026-03-01.jsonl +1 -0
  26. package/sinain-memory/memory/sinain-playbook.md +21 -0
  27. package/sinain-memory/memory-config.json +39 -0
  28. package/sinain-memory/memory_miner.py +183 -0
  29. package/sinain-memory/module_manager.py +695 -0
  30. package/sinain-memory/playbook_curator.py +225 -0
  31. package/sinain-memory/requirements.txt +3 -0
  32. package/sinain-memory/signal_analyzer.py +141 -0
  33. package/sinain-memory/test_local.py +402 -0
  34. package/sinain-memory/tests/__init__.py +0 -0
  35. package/sinain-memory/tests/conftest.py +189 -0
  36. package/sinain-memory/tests/test_curator_helpers.py +94 -0
  37. package/sinain-memory/tests/test_embedder.py +210 -0
  38. package/sinain-memory/tests/test_extract_json.py +124 -0
  39. package/sinain-memory/tests/test_feedback_computation.py +121 -0
  40. package/sinain-memory/tests/test_miner_helpers.py +71 -0
  41. package/sinain-memory/tests/test_module_management.py +458 -0
  42. package/sinain-memory/tests/test_parsers.py +96 -0
  43. package/sinain-memory/tests/test_tick_evaluator.py +430 -0
  44. package/sinain-memory/tests/test_triple_extractor.py +255 -0
  45. package/sinain-memory/tests/test_triple_ingest.py +191 -0
  46. package/sinain-memory/tests/test_triple_migrate.py +138 -0
  47. package/sinain-memory/tests/test_triplestore.py +248 -0
  48. package/sinain-memory/tick_evaluator.py +392 -0
  49. package/sinain-memory/triple_extractor.py +402 -0
  50. package/sinain-memory/triple_ingest.py +290 -0
  51. package/sinain-memory/triple_migrate.py +275 -0
  52. package/sinain-memory/triple_query.py +184 -0
  53. package/sinain-memory/triplestore.py +498 -0
@@ -0,0 +1,392 @@
1
+ #!/usr/bin/env python3
2
+ """Tier 1 Evaluation: Per-tick evaluator — runs as independent server cron job.
3
+
4
+ Reads playbook-logs written by the heartbeat, validates outputs against
5
+ JSON schemas, runs behavioral assertions, and optionally invokes LLM-as-Judge
6
+ evaluators. Writes results to memory/eval-logs/YYYY-MM-DD.jsonl.
7
+
8
+ Invocation (cron, every 30 min offset from heartbeat):
9
+ uv run --with requests python3 sinain-koog/tick_evaluator.py \
10
+ --memory-dir memory/
11
+
12
+ Config-driven eval levels:
13
+ mechanical — schema + assertions only (zero LLM cost)
14
+ sampled — mechanical + random LLM judges at sampleRate probability
15
+ full — mechanical + LLM judges on every tick output
16
+ """
17
+
18
+ import argparse
19
+ import json
20
+ import random
21
+ import sys
22
+ from datetime import datetime, timezone
23
+ from pathlib import Path
24
+
25
+ # Ensure sinain-koog is on sys.path for imports
26
+ _koog_dir = str(Path(__file__).resolve().parent)
27
+ if _koog_dir not in sys.path:
28
+ sys.path.insert(0, _koog_dir)
29
+
30
+ from common import (
31
+ _load_config,
32
+ _read_jsonl,
33
+ list_daily_memory_files,
34
+ read_playbook,
35
+ )
36
+ from eval.assertions import run_tick_assertions
37
+ from eval.schemas import SCHEMA_REGISTRY, validate
38
+
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Config
42
+ # ---------------------------------------------------------------------------
43
+
44
+ _EVAL_DEFAULTS = {
45
+ "level": "mechanical",
46
+ "sampleRate": 0.2,
47
+ "judges": {"model": "smart", "maxTokens": 200, "timeout": 30},
48
+ "dailyReport": True,
49
+ "regressionThresholds": {
50
+ "assertionPassRate": 0.85,
51
+ "effectivenessRate": 0.4,
52
+ "skipRate": 0.8,
53
+ },
54
+ }
55
+
56
+
57
+ def load_eval_config(memory_dir: str) -> dict:
58
+ """Load eval config with runtime overrides from memory/eval-config.json."""
59
+ base = _load_config().get("eval", {})
60
+ cfg = {**_EVAL_DEFAULTS, **base}
61
+
62
+ override_path = Path(memory_dir) / "eval-config.json"
63
+ if override_path.exists():
64
+ try:
65
+ override = json.loads(override_path.read_text(encoding="utf-8"))
66
+ cfg.update(override)
67
+ except (json.JSONDecodeError, OSError) as e:
68
+ print(f"[warn] eval-config.json override failed: {e}", file=sys.stderr)
69
+
70
+ return cfg
71
+
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Log readers
75
+ # ---------------------------------------------------------------------------
76
+
77
+ def read_today_playbook_logs(memory_dir: str) -> list[dict]:
78
+ """Read today's playbook-log entries."""
79
+ today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
80
+ log_file = Path(memory_dir) / "playbook-logs" / f"{today}.jsonl"
81
+ return _read_jsonl(log_file)
82
+
83
+
84
+ def read_today_eval_logs(memory_dir: str) -> list[dict]:
85
+ """Read today's eval-log entries to find already-evaluated ticks."""
86
+ today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
87
+ log_file = Path(memory_dir) / "eval-logs" / f"{today}.jsonl"
88
+ return _read_jsonl(log_file)
89
+
90
+
91
+ def get_evaluated_timestamps(eval_logs: list[dict]) -> set[str]:
92
+ """Extract the set of tick timestamps that have already been evaluated."""
93
+ return {e.get("tickTs", "") for e in eval_logs if e.get("tickTs")}
94
+
95
+
96
+ # ---------------------------------------------------------------------------
97
+ # Schema validation
98
+ # ---------------------------------------------------------------------------
99
+
100
+ def validate_tick_schemas(log_entry: dict) -> dict:
101
+ """Validate script outputs reconstructed from a heartbeat log entry.
102
+
103
+ The heartbeat flattens/reshapes script outputs when writing to the JSONL log.
104
+ This function reconstructs the original script output shapes from log fields
105
+ and validates them against the canonical schemas.
106
+
107
+ Returns {"total": int, "valid": int, "failures": [{"script": str, "errors": [...]}]}.
108
+ """
109
+ total = 0
110
+ valid = 0
111
+ failures: list[dict] = []
112
+
113
+ def _check(script_name: str, data: dict) -> None:
114
+ nonlocal total, valid
115
+ schema = SCHEMA_REGISTRY.get(script_name)
116
+ if schema is None:
117
+ return
118
+ total += 1
119
+ errors = validate(data, schema)
120
+ if errors:
121
+ failures.append({"script": script_name, "errors": errors})
122
+ else:
123
+ valid += 1
124
+
125
+ # --- Signal Analyzer ---
126
+ # Log stores: signals (list), recommendedAction (obj|null), idle (bool)
127
+ if "signals" in log_entry:
128
+ _check("signal_analyzer", {
129
+ "signals": log_entry.get("signals", []),
130
+ "recommendedAction": log_entry.get("recommendedAction"),
131
+ "idle": log_entry.get("idle", False),
132
+ })
133
+
134
+ # --- Feedback Analyzer ---
135
+ # Log stores: feedbackScores ({avg}), effectivenessRate (float),
136
+ # curateDirective (str). The full effectiveness dict is NOT in the log —
137
+ # the heartbeat only writes effectivenessRate. Reconstruct a minimal valid
138
+ # shape; skip required-field checks we know the log doesn't carry.
139
+ if "feedbackScores" in log_entry and "curateDirective" in log_entry:
140
+ eff_rate = log_entry.get("effectivenessRate", 0)
141
+ _check("feedback_analyzer", {
142
+ "feedbackScores": log_entry.get("feedbackScores", {}),
143
+ "effectiveness": {
144
+ "outputs": 0, "positive": 0, "negative": 0, "neutral": 0,
145
+ "rate": eff_rate if isinstance(eff_rate, (int, float)) else 0,
146
+ },
147
+ "curateDirective": log_entry.get("curateDirective", "normal"),
148
+ "interpretation": log_entry.get("interpretation", ""),
149
+ })
150
+
151
+ # --- Memory Miner ---
152
+ # Log stores: miningFindings (str|null), minedSources (list)
153
+ if log_entry.get("miningFindings") is not None:
154
+ _check("memory_miner", {
155
+ "findings": log_entry.get("miningFindings", ""),
156
+ "newPatterns": log_entry.get("newPatterns", []),
157
+ "minedSources": log_entry.get("minedSources", []),
158
+ })
159
+
160
+ # --- Playbook Curator ---
161
+ # Log stores: playbookChanges (dict) — can be {"note": "skipped"} or full
162
+ # curator output with changes.added/pruned/promoted and playbookLines.
163
+ pc = log_entry.get("playbookChanges")
164
+ if isinstance(pc, dict) and "changes" in pc:
165
+ _check("playbook_curator", pc)
166
+
167
+ # --- Insight Synthesizer ---
168
+ # Log stores: output (dict|null). When null, the synthesizer was skipped
169
+ # at the pipeline level (before it ran), which is different from skip=true.
170
+ output = log_entry.get("output")
171
+ if isinstance(output, dict):
172
+ _check("insight_synthesizer", output)
173
+
174
+ return {"total": total, "valid": valid, "failures": failures}
175
+
176
+
177
+ # ---------------------------------------------------------------------------
178
+ # LLM judge runner
179
+ # ---------------------------------------------------------------------------
180
+
181
+ def run_judges(log_entry: dict, playbook_text: str, eval_config: dict) -> dict | None:
182
+ """Run LLM-as-Judge evaluators on a tick's outputs.
183
+
184
+ Returns dict of judge results, or None if judges are not applicable.
185
+ """
186
+ # Late import to avoid loading LLM deps in mechanical mode
187
+ from eval.judges.signal_judge import judge_signal
188
+ from eval.judges.curation_judge import judge_curation
189
+ from eval.judges.insight_judge import judge_insight
190
+ from eval.judges.mining_judge import judge_mining
191
+
192
+ judge_cfg = eval_config.get("judges", {})
193
+ kwargs = {}
194
+ if judge_cfg.get("model"):
195
+ # Model will be resolved by common.call_llm via script config
196
+ pass
197
+ if judge_cfg.get("maxTokens"):
198
+ kwargs["max_tokens"] = judge_cfg["maxTokens"]
199
+
200
+ results: dict = {}
201
+
202
+ # Signal judge
203
+ signals = log_entry.get("signals")
204
+ if signals is not None:
205
+ signal_data = {
206
+ "signals": signals,
207
+ "recommendedAction": log_entry.get("recommendedAction"),
208
+ "idle": log_entry.get("idle", False),
209
+ }
210
+ session_summary = log_entry.get("sessionSummary", "")
211
+ result = judge_signal(signal_data, session_summary, **kwargs)
212
+ if result:
213
+ results["signal"] = result
214
+
215
+ # Curation judge
216
+ curator = log_entry.get("playbookChanges")
217
+ if curator is not None:
218
+ directive = log_entry.get("curateDirective", "normal")
219
+ result = judge_curation(curator, directive, playbook_text, **kwargs)
220
+ if result:
221
+ results["curation"] = result
222
+
223
+ # Insight judge
224
+ output = log_entry.get("output")
225
+ if output is not None:
226
+ result = judge_insight(output, playbook_text[:1000] if playbook_text else "", **kwargs)
227
+ if result:
228
+ results["insight"] = result
229
+
230
+ # Mining judge
231
+ mining = log_entry.get("miningResult")
232
+ if mining is not None:
233
+ result = judge_mining(mining, **kwargs)
234
+ if result:
235
+ results["mining"] = result
236
+
237
+ return results if results else None
238
+
239
+
240
+ # ---------------------------------------------------------------------------
241
+ # Main evaluation loop
242
+ # ---------------------------------------------------------------------------
243
+
244
+ def evaluate_tick(
245
+ log_entry: dict,
246
+ recent_logs: list[dict],
247
+ playbook_text: str,
248
+ daily_files: list[str],
249
+ eval_config: dict,
250
+ ) -> dict:
251
+ """Evaluate a single tick's log entry.
252
+
253
+ Returns the eval result dict to be written to eval-logs.
254
+ """
255
+ now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
256
+ tick_ts = log_entry.get("ts", "unknown")
257
+ level = eval_config.get("level", "mechanical")
258
+
259
+ # 1. Schema validation
260
+ schema_result = validate_tick_schemas(log_entry)
261
+
262
+ # 2. Behavioral assertions
263
+ assertion_results = run_tick_assertions(log_entry, recent_logs, playbook_text, daily_files)
264
+ passed = sum(1 for a in assertion_results if a["passed"])
265
+ assertion_failures = [a for a in assertion_results if not a["passed"]]
266
+
267
+ # 3. LLM judges (if level warrants it)
268
+ judges = None
269
+ if level == "full":
270
+ judges = run_judges(log_entry, playbook_text, eval_config)
271
+ elif level == "sampled":
272
+ sample_rate = eval_config.get("sampleRate", 0.2)
273
+ if random.random() < sample_rate:
274
+ judges = run_judges(log_entry, playbook_text, eval_config)
275
+
276
+ # 4. Compute pass rate
277
+ total_checks = schema_result["total"] + len(assertion_results)
278
+ passed_checks = schema_result["valid"] + passed
279
+ pass_rate = round(passed_checks / total_checks, 3) if total_checks > 0 else 1.0
280
+
281
+ result = {
282
+ "ts": now,
283
+ "tickTs": tick_ts,
284
+ "evalLevel": level,
285
+ "schema": {
286
+ "total": schema_result["total"],
287
+ "valid": schema_result["valid"],
288
+ "failures": schema_result["failures"],
289
+ },
290
+ "assertions": {
291
+ "total": len(assertion_results),
292
+ "passed": passed,
293
+ "failures": [{"name": a["name"], "detail": a["detail"]} for a in assertion_failures],
294
+ },
295
+ "judges": judges,
296
+ "passRate": pass_rate,
297
+ }
298
+
299
+ # Add judge average if available
300
+ if judges:
301
+ scores = [v["score"] for v in judges.values() if isinstance(v, dict) and "score" in v]
302
+ if scores:
303
+ result["judgeAvg"] = round(sum(scores) / len(scores), 2)
304
+
305
+ return result
306
+
307
+
308
+ def main():
309
+ parser = argparse.ArgumentParser(description="Sinain Koog Tick Evaluator (Tier 1)")
310
+ parser.add_argument("--memory-dir", required=True, help="Path to memory/ directory")
311
+ args = parser.parse_args()
312
+
313
+ memory_dir = args.memory_dir
314
+ eval_config = load_eval_config(memory_dir)
315
+
316
+ print(f"[tick-eval] level={eval_config.get('level')} sampleRate={eval_config.get('sampleRate')}",
317
+ file=sys.stderr)
318
+
319
+ # Read today's logs
320
+ playbook_logs = read_today_playbook_logs(memory_dir)
321
+ if not playbook_logs:
322
+ print("[tick-eval] no playbook-log entries for today", file=sys.stderr)
323
+ return
324
+
325
+ # Find unevaluated ticks
326
+ eval_logs = read_today_eval_logs(memory_dir)
327
+ evaluated_ts = get_evaluated_timestamps(eval_logs)
328
+ unevaluated = [e for e in playbook_logs if e.get("ts", "") not in evaluated_ts]
329
+
330
+ if not unevaluated:
331
+ print("[tick-eval] all ticks already evaluated", file=sys.stderr)
332
+ return
333
+
334
+ print(f"[tick-eval] {len(unevaluated)} unevaluated ticks found", file=sys.stderr)
335
+
336
+ # Shared context
337
+ playbook_text = read_playbook(memory_dir)
338
+ daily_files = [Path(f).name for f in list_daily_memory_files(memory_dir)]
339
+
340
+ # Evaluate each unevaluated tick
341
+ eval_log_dir = Path(memory_dir) / "eval-logs"
342
+ eval_log_dir.mkdir(parents=True, exist_ok=True)
343
+ today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
344
+ eval_log_file = eval_log_dir / f"{today}.jsonl"
345
+
346
+ attempted = 0
347
+ succeeded = 0
348
+ failed = 0
349
+ fail_ticks: list[str] = []
350
+
351
+ for entry in unevaluated:
352
+ tick_ts = entry.get("ts", "")
353
+ attempted += 1
354
+
355
+ try:
356
+ # Recent logs for assertion context (logs before this tick)
357
+ recent = [e for e in playbook_logs if e.get("ts", "") < tick_ts]
358
+
359
+ result = evaluate_tick(entry, recent, playbook_text, daily_files, eval_config)
360
+
361
+ with open(eval_log_file, "a", encoding="utf-8") as f:
362
+ f.write(json.dumps(result, ensure_ascii=False) + "\n")
363
+ succeeded += 1
364
+
365
+ status = "PASS" if result["passRate"] >= 0.85 else "WARN"
366
+ judge_info = f" judgeAvg={result.get('judgeAvg', '-')}" if result.get("judges") else ""
367
+ print(f"[tick-eval] {status} tick={tick_ts} passRate={result['passRate']}{judge_info}",
368
+ file=sys.stderr)
369
+ except Exception as exc:
370
+ failed += 1
371
+ fail_ticks.append(tick_ts)
372
+ print(f"[tick-eval] ERROR tick={tick_ts}: {exc}", file=sys.stderr)
373
+
374
+ # Write run summary so the reporter can detect partial runs
375
+ run_summary = {
376
+ "_type": "run_summary",
377
+ "ts": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
378
+ "attempted": attempted,
379
+ "succeeded": succeeded,
380
+ "failed": failed,
381
+ "isPartial": failed > 0,
382
+ "failedTicks": fail_ticks,
383
+ }
384
+ with open(eval_log_file, "a", encoding="utf-8") as f:
385
+ f.write(json.dumps(run_summary, ensure_ascii=False) + "\n")
386
+
387
+ print(f"[tick-eval] wrote {succeeded} eval entries to {eval_log_file} "
388
+ f"(attempted={attempted}, failed={failed})", file=sys.stderr)
389
+
390
+
391
+ if __name__ == "__main__":
392
+ main()