@geravant/sinain 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/README.md +183 -0
  2. package/index.ts +2096 -0
  3. package/install.js +155 -0
  4. package/openclaw.plugin.json +59 -0
  5. package/package.json +21 -0
  6. package/sinain-memory/common.py +403 -0
  7. package/sinain-memory/demo_knowledge_transfer.sh +85 -0
  8. package/sinain-memory/embedder.py +268 -0
  9. package/sinain-memory/eval/__init__.py +0 -0
  10. package/sinain-memory/eval/assertions.py +288 -0
  11. package/sinain-memory/eval/judges/__init__.py +0 -0
  12. package/sinain-memory/eval/judges/base_judge.py +61 -0
  13. package/sinain-memory/eval/judges/curation_judge.py +46 -0
  14. package/sinain-memory/eval/judges/insight_judge.py +48 -0
  15. package/sinain-memory/eval/judges/mining_judge.py +42 -0
  16. package/sinain-memory/eval/judges/signal_judge.py +45 -0
  17. package/sinain-memory/eval/schemas.py +247 -0
  18. package/sinain-memory/eval_delta.py +109 -0
  19. package/sinain-memory/eval_reporter.py +642 -0
  20. package/sinain-memory/feedback_analyzer.py +221 -0
  21. package/sinain-memory/git_backup.sh +19 -0
  22. package/sinain-memory/insight_synthesizer.py +181 -0
  23. package/sinain-memory/memory/2026-03-01.md +11 -0
  24. package/sinain-memory/memory/playbook-archive/sinain-playbook-2026-03-01-1418.md +15 -0
  25. package/sinain-memory/memory/playbook-logs/2026-03-01.jsonl +1 -0
  26. package/sinain-memory/memory/sinain-playbook.md +21 -0
  27. package/sinain-memory/memory-config.json +39 -0
  28. package/sinain-memory/memory_miner.py +183 -0
  29. package/sinain-memory/module_manager.py +695 -0
  30. package/sinain-memory/playbook_curator.py +225 -0
  31. package/sinain-memory/requirements.txt +3 -0
  32. package/sinain-memory/signal_analyzer.py +141 -0
  33. package/sinain-memory/test_local.py +402 -0
  34. package/sinain-memory/tests/__init__.py +0 -0
  35. package/sinain-memory/tests/conftest.py +189 -0
  36. package/sinain-memory/tests/test_curator_helpers.py +94 -0
  37. package/sinain-memory/tests/test_embedder.py +210 -0
  38. package/sinain-memory/tests/test_extract_json.py +124 -0
  39. package/sinain-memory/tests/test_feedback_computation.py +121 -0
  40. package/sinain-memory/tests/test_miner_helpers.py +71 -0
  41. package/sinain-memory/tests/test_module_management.py +458 -0
  42. package/sinain-memory/tests/test_parsers.py +96 -0
  43. package/sinain-memory/tests/test_tick_evaluator.py +430 -0
  44. package/sinain-memory/tests/test_triple_extractor.py +255 -0
  45. package/sinain-memory/tests/test_triple_ingest.py +191 -0
  46. package/sinain-memory/tests/test_triple_migrate.py +138 -0
  47. package/sinain-memory/tests/test_triplestore.py +248 -0
  48. package/sinain-memory/tick_evaluator.py +392 -0
  49. package/sinain-memory/triple_extractor.py +402 -0
  50. package/sinain-memory/triple_ingest.py +290 -0
  51. package/sinain-memory/triple_migrate.py +275 -0
  52. package/sinain-memory/triple_query.py +184 -0
  53. package/sinain-memory/triplestore.py +498 -0
@@ -0,0 +1,61 @@
1
+ """Shared infrastructure for LLM-as-Judge evaluators.
2
+
3
+ Provides ``run_judge()`` which calls the LLM with a rubric prompt and
4
+ extracts a ``{"score": 1-4, "reasoning": "..."}`` response.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import sys
11
+ from pathlib import Path
12
+
13
+ # Add parent dirs so ``common`` is importable when running from anywhere.
14
+ _koog_dir = str(Path(__file__).resolve().parent.parent.parent)
15
+ if _koog_dir not in sys.path:
16
+ sys.path.insert(0, _koog_dir)
17
+
18
+ from common import LLMError, call_llm, extract_json # noqa: E402
19
+
20
+
21
def run_judge(
    system_prompt: str,
    user_prompt: str,
    *,
    model: str | None = None,
    max_tokens: int = 200,
    timeout: int = 30,
) -> dict | None:
    """Call the LLM with a judge prompt and return ``{"score": int, "reasoning": str}``.

    Returns None on any failure: LLM call error, malformed/non-object JSON,
    or a score outside the 1-4 rubric range.

    *model* defaults to the ``eval.judges.model`` setting resolved externally.
    When None, falls back to ``common.call_llm`` defaults (which reads koog-config).
    """
    # NOTE(review): `timeout` is accepted for interface stability but never
    # forwarded to call_llm — presumably call_llm applies its own default;
    # confirm and wire it through if call_llm supports a timeout kwarg.
    try:
        kwargs: dict = {
            "system_prompt": system_prompt,
            "user_prompt": user_prompt,
            "max_tokens": max_tokens,
            "json_mode": True,
        }
        # Use script-based config resolution if no explicit model was given.
        if model:
            kwargs["model"] = model
        else:
            kwargs["script"] = "tick_evaluator"

        raw = call_llm(**kwargs)
        result = extract_json(raw)

        # extract_json may yield a non-object (e.g. a bare JSON array); guard
        # before calling .get() so we warn instead of raising AttributeError.
        if not isinstance(result, dict):
            print(f"[warn] judge returned non-object JSON: {result!r}", file=sys.stderr)
            return None

        score = result.get("score")
        reasoning = result.get("reasoning", "")

        # Reject bools explicitly: bool is an int subclass, so True would
        # otherwise pass the numeric check and become score 1.
        if isinstance(score, bool) or not isinstance(score, (int, float)) or not (1 <= score <= 4):
            print(f"[warn] judge returned invalid score: {score}", file=sys.stderr)
            return None

        # Truncate reasoning to keep downstream logs/reports compact.
        return {"score": int(score), "reasoning": str(reasoning)[:300]}

    except (ValueError, LLMError, KeyError, TypeError) as e:
        print(f"[warn] judge call failed: {e}", file=sys.stderr)
        return None
@@ -0,0 +1,46 @@
1
+ """LLM-as-Judge: Playbook curation quality evaluator."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .base_judge import run_judge
6
+
7
SYSTEM_PROMPT = """\
You are an evaluator scoring the quality of playbook curation changes.

The curator follows a directive and three laws:
Law 1: Don't remove error-prevention patterns
Law 2: Preserve high-scoring approaches
Law 3: Then evolve

Rate the curation on a 1-4 scale:
4: Changes perfectly match directive + evidence, three laws respected
3: Good changes, minor alignment issues with directive
2: Changes misaligned with directive or weak evidence
1: Destructive changes, violated three laws, or ignored directive entirely

Respond with ONLY a JSON object: {"score": <1-4>, "reasoning": "brief explanation"}"""


def judge_curation(
    curator_result: dict,
    directive: str,
    playbook_before: str = "",
    **kwargs,
) -> dict | None:
    """Score a curation run against the directive and the three laws.

    Returns {"score": 1-4, "reasoning": str} from the judge, or None on failure.
    """
    edits = curator_result.get("changes", {})
    stale = curator_result.get("staleItemActions", [])
    line_count = curator_result.get("playbookLines", "?")

    # Assemble the user prompt section by section.
    sections: list[str] = [f"## Curate Directive\n{directive}"]
    sections.append(
        f"\n## Changes Made\nAdded: {edits.get('added', [])}\nPruned: {edits.get('pruned', [])}\nPromoted: {edits.get('promoted', [])}"
    )
    sections.append(f"\n## Stale Item Actions\n{stale}")
    sections.append(f"\n## Playbook Lines After: {line_count}")

    if playbook_before:
        # Truncate to keep prompt manageable
        sections.append(f"\n## Playbook Before (excerpt)\n{playbook_before[:1500]}")

    return run_judge(SYSTEM_PROMPT, "\n".join(sections), **kwargs)
@@ -0,0 +1,48 @@
1
+ """LLM-as-Judge: Insight synthesis quality evaluator."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .base_judge import run_judge
6
+
7
SYSTEM_PROMPT = """\
You are an evaluator scoring the quality of an insight synthesizer's output.

The synthesizer produces two parts:
- Suggestion: actionable recommendation grounded in playbook/data
- Insight: surprising cross-domain connection from accumulated observations

Rate the output on a 1-4 scale:
4: Suggestion is actionable with specific reference, insight connects 2+ distinct observations
3: One component is excellent, the other adequate
2: Generic suggestion or obvious insight
1: Hallucinated content, not grounded in playbook/logs

If the output was skipped, rate the skip decision:
4: Skip is well-justified with specific references to what was checked
3: Skip is reasonable
2: Should not have skipped — there was material to work with
1: Skip reason is generic/lazy

Respond with ONLY a JSON object: {"score": <1-4>, "reasoning": "brief explanation"}"""


def judge_insight(
    synth_result: dict,
    playbook_excerpt: str = "",
    **kwargs,
) -> dict | None:
    """Score a synthesizer output (or its skip decision).

    Returns {"score": 1-4, "reasoning": str} from the judge, or None on failure.
    """
    # A skipped run is judged on the skip rationale; otherwise on the
    # suggestion/insight pair.
    if synth_result.get("skip", False):
        sections = [
            f"## Status: SKIPPED\nReason: {synth_result.get('skipReason', 'none given')}"
        ]
    else:
        sections = [
            f"## Suggestion\n{synth_result.get('suggestion', '')}",
            f"\n## Insight\n{synth_result.get('insight', '')}",
            f"\n## Total Chars: {synth_result.get('totalChars', '?')}",
        ]

    if playbook_excerpt:
        sections.append(f"\n## Playbook Context (excerpt)\n{playbook_excerpt[:1000]}")

    return run_judge(SYSTEM_PROMPT, "\n".join(sections), **kwargs)
@@ -0,0 +1,42 @@
1
+ """LLM-as-Judge: Memory mining quality evaluator."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .base_judge import run_judge
6
+
7
SYSTEM_PROMPT = """\
You are an evaluator scoring the quality of a memory mining agent's findings.

The miner reads daily memory files and extracts patterns, preferences, and insights
that should be added to the evolving playbook.

Rate the mining output on a 1-4 scale:
4: Found non-obvious cross-day patterns, all grounded in source files
3: Valid patterns found, properly grounded in provided daily files
2: Only surface-level observations from source files
1: Hallucinated patterns not present in provided daily files

Respond with ONLY a JSON object: {"score": <1-4>, "reasoning": "brief explanation"}"""


def judge_mining(
    miner_result: dict,
    mined_file_excerpts: dict[str, str] | None = None,
    **kwargs,
) -> dict | None:
    """Score a mining run, optionally grounding the judge with source excerpts.

    Returns {"score": 1-4, "reasoning": str} from the judge, or None on failure.
    """
    prompt_parts = [
        f"## Findings\n{miner_result.get('findings', '')}",
        f"\n## New Patterns\n{miner_result.get('newPatterns', [])}",
        f"\n## Contradictions\n{miner_result.get('contradictions', [])}",
        f"\n## Preferences\n{miner_result.get('preferences', [])}",
        f"\n## Mined Sources\n{miner_result.get('minedSources', [])}",
    ]

    # Append each source file, truncated so large files don't blow up the prompt.
    for fname, body in (mined_file_excerpts or {}).items():
        prompt_parts.append(f"\n## Source File: {fname}\n{body[:1500]}")

    return run_judge(SYSTEM_PROMPT, "\n".join(prompt_parts), **kwargs)
@@ -0,0 +1,45 @@
1
+ """LLM-as-Judge: Signal detection quality evaluator."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .base_judge import run_judge
6
+
7
SYSTEM_PROMPT = """\
You are an evaluator scoring the quality of a signal detection system.

Rate the signal analysis on a 1-4 scale:
4: All real signals detected, action is highly relevant and specific
3: Key signals detected, action is reasonable
2: Missed important signals or action is vague
1: Hallucinated signals or inappropriate action

Respond with ONLY a JSON object: {"score": <1-4>, "reasoning": "brief explanation"}"""


def judge_signal(
    signal_result: dict,
    session_summary: str,
    recent_logs: list[dict] | None = None,
    **kwargs,
) -> dict | None:
    """Score a signal-analysis result against the session it came from.

    Returns {"score": 1-4, "reasoning": str} from the judge, or None on failure.
    """
    sections = [f"## Session Summary\n{session_summary}"]
    sections.append(f"\n## Detected Signals\n{signal_result.get('signals', [])}")
    sections.append(f"\n## Recommended Action\n{signal_result.get('recommendedAction')}")
    sections.append(f"\n## Idle: {signal_result.get('idle', False)}")

    if recent_logs:
        # Surface actions already chosen in the last few ticks so the judge
        # can penalize repeats.
        chosen = [
            a
            for log in recent_logs[:3]
            for a in log.get("actionsConsidered", [])
            if a.get("chosen")
        ]
        if chosen:
            sections.append(f"\n## Recent Actions (should not repeat)\n{chosen}")

    return run_judge(SYSTEM_PROMPT, "\n".join(sections), **kwargs)
@@ -0,0 +1,247 @@
1
+ """JSON Schema definitions for all sinain-koog script outputs.
2
+
3
+ Each schema corresponds to the JSON printed by output_json() in its respective
4
+ script. Used by tick_evaluator.py for mechanical validation (Tier 1 eval).
5
+ """
6
+
7
+ import json
8
+ from typing import Any
9
+
10
+
11
# ---------------------------------------------------------------------------
# signal_analyzer.py output
# ---------------------------------------------------------------------------

SIGNAL_ANALYZER_SCHEMA: dict = {
    "type": "object",
    "required": ["signals", "recommendedAction", "idle"],
    "properties": {
        "signals": {
            "type": "array",
            "items": {"type": "string"},
        },
        # Either null (nothing to do) or an action object with at least "action".
        "recommendedAction": {
            "oneOf": [
                {"type": "null"},
                {
                    "type": "object",
                    "required": ["action"],
                    "properties": {
                        "action": {"enum": ["sessions_spawn", "telegram_tip", "skip"]},
                        "task": {"type": "string"},
                        "confidence": {"type": "number", "minimum": 0, "maximum": 1},
                    },
                },
            ],
        },
        "idle": {"type": "boolean"},
    },
}

# ---------------------------------------------------------------------------
# feedback_analyzer.py output
# ---------------------------------------------------------------------------

FEEDBACK_ANALYZER_SCHEMA: dict = {
    "type": "object",
    "required": ["feedbackScores", "effectiveness", "curateDirective"],
    "properties": {
        "feedbackScores": {
            "type": "object",
            "required": ["avg"],
            "properties": {
                "avg": {"type": "number"},
                "high": {"type": "array", "items": {"type": "string"}},
                "low": {"type": "array", "items": {"type": "string"}},
            },
        },
        "effectiveness": {
            "type": "object",
            "required": ["outputs", "positive", "negative", "neutral", "rate"],
            "properties": {
                "outputs": {"type": "integer", "minimum": 0},
                "positive": {"type": "integer", "minimum": 0},
                "negative": {"type": "integer", "minimum": 0},
                "neutral": {"type": "integer", "minimum": 0},
                "rate": {"type": "number", "minimum": 0, "maximum": 1},
            },
        },
        # Closed set of directives the curator understands.
        "curateDirective": {
            "enum": ["aggressive_prune", "normal", "stability", "insufficient_data"],
        },
        "interpretation": {"type": "string"},
    },
}

# ---------------------------------------------------------------------------
# memory_miner.py output
# ---------------------------------------------------------------------------

MEMORY_MINER_SCHEMA: dict = {
    "type": "object",
    "required": ["findings", "newPatterns"],
    "properties": {
        "findings": {"type": "string"},
        "newPatterns": {"type": "array", "items": {"type": "string"}},
        "contradictions": {"type": "array", "items": {"type": "string"}},
        "preferences": {"type": "array", "items": {"type": "string"}},
        "minedSources": {"type": "array", "items": {"type": "string"}},
    },
}

# ---------------------------------------------------------------------------
# playbook_curator.py output
# ---------------------------------------------------------------------------

PLAYBOOK_CURATOR_SCHEMA: dict = {
    "type": "object",
    "required": ["changes", "playbookLines"],
    "properties": {
        "changes": {
            "type": "object",
            "required": ["added", "pruned", "promoted"],
            "properties": {
                "added": {"type": "array", "items": {"type": "string"}},
                "pruned": {"type": "array", "items": {"type": "string"}},
                "promoted": {"type": "array", "items": {"type": "string"}},
            },
        },
        "staleItemActions": {"type": "array", "items": {"type": "string"}},
        "playbookLines": {"type": "integer", "minimum": 0},
        "error": {"type": "string"},
    },
}

# ---------------------------------------------------------------------------
# insight_synthesizer.py output (non-skip case)
# ---------------------------------------------------------------------------

# Only "skip" is required: suggestion/insight/totalChars appear on real runs,
# skipReason on skipped runs.
INSIGHT_SYNTHESIZER_SCHEMA: dict = {
    "type": "object",
    "required": ["skip"],
    "properties": {
        "skip": {"type": "boolean"},
        "suggestion": {"type": "string"},
        "insight": {"type": "string"},
        "totalChars": {"type": "integer", "minimum": 0},
        "skipReason": {"type": "string"},
    },
}

# ---------------------------------------------------------------------------
# module_manager.py extract output
# ---------------------------------------------------------------------------

MODULE_EXTRACT_SCHEMA: dict = {
    "type": "object",
    "required": ["extracted", "domain", "status"],
    "properties": {
        "extracted": {"type": "string"},
        "domain": {"type": "string"},
        "patternsEstablished": {"type": "integer", "minimum": 0},
        "patternsEmerging": {"type": "integer", "minimum": 0},
        "vocabularyTerms": {"type": "integer", "minimum": 0},
        "modulePath": {"type": "string"},
        "status": {"enum": ["suspended", "active"]},
        "activateWith": {"type": "string"},
    },
}


# ---------------------------------------------------------------------------
# Registry: script name → schema
# ---------------------------------------------------------------------------

# Lookup table used by tick_evaluator.py (Tier 1 mechanical validation) to
# pick the schema matching a script's JSON output.
SCHEMA_REGISTRY: dict[str, dict] = {
    "signal_analyzer": SIGNAL_ANALYZER_SCHEMA,
    "feedback_analyzer": FEEDBACK_ANALYZER_SCHEMA,
    "memory_miner": MEMORY_MINER_SCHEMA,
    "playbook_curator": PLAYBOOK_CURATOR_SCHEMA,
    "insight_synthesizer": INSIGHT_SYNTHESIZER_SCHEMA,
    "module_extract": MODULE_EXTRACT_SCHEMA,
}
163
+
164
+
165
+ # ---------------------------------------------------------------------------
166
+ # Lightweight JSON Schema validator (no external dependency)
167
+ # ---------------------------------------------------------------------------
168
+
169
def validate(instance: Any, schema: dict) -> list[str]:
    """Check *instance* against a JSON Schema subset and return the problems found.

    An empty list means the instance is valid. Supported keywords:
    type, required, properties, items, enum, oneOf, minimum, maximum.
    """
    problems: list[str] = []
    _validate(instance, schema, "", problems)
    return problems
178
+
179
+
180
def _validate(instance: Any, schema: dict, path: str, errors: list[str]) -> None:
    """Recursive worker for validate(); appends error strings to *errors*."""
    where = path or "."

    # oneOf: accept as soon as any variant validates cleanly (lenient — we do
    # not reject multiple matches).
    if "oneOf" in schema:
        for variant in schema["oneOf"]:
            scratch: list[str] = []
            _validate(instance, variant, path, scratch)
            if not scratch:
                break
        else:
            errors.append(f"{where}: does not match any oneOf variant")
        return

    # enum: membership test ends validation for this node either way.
    if "enum" in schema:
        if instance not in schema["enum"]:
            errors.append(f"{where}: {instance!r} not in {schema['enum']}")
        return

    # type: a mismatch stops descent — child checks would be meaningless.
    declared = schema.get("type")
    if declared and not _type_check(instance, declared):
        errors.append(f"{where}: expected {declared}, got {type(instance).__name__}")
        return

    if isinstance(instance, dict):
        # required keys
        for key in schema.get("required", []):
            if key not in instance:
                errors.append(f"{path}.{key}: required field missing")
        # recurse into declared properties that are present
        for key, child_schema in schema.get("properties", {}).items():
            if key in instance:
                _validate(instance[key], child_schema, f"{path}.{key}", errors)

    # items: validate every element of an array
    if isinstance(instance, list) and "items" in schema:
        for idx, element in enumerate(instance):
            _validate(element, schema["items"], f"{path}[{idx}]", errors)

    # minimum / maximum bounds on numeric values
    if isinstance(instance, (int, float)):
        if "minimum" in schema and instance < schema["minimum"]:
            errors.append(f"{where}: {instance} < minimum {schema['minimum']}")
        if "maximum" in schema and instance > schema["maximum"]:
            errors.append(f"{where}: {instance} > maximum {schema['maximum']}")
230
+
231
+
232
+ def _type_check(instance: Any, expected: str) -> bool:
233
+ if expected == "object":
234
+ return isinstance(instance, dict)
235
+ if expected == "array":
236
+ return isinstance(instance, list)
237
+ if expected == "string":
238
+ return isinstance(instance, str)
239
+ if expected == "number":
240
+ return isinstance(instance, (int, float))
241
+ if expected == "integer":
242
+ return isinstance(instance, int) and not isinstance(instance, bool)
243
+ if expected == "boolean":
244
+ return isinstance(instance, bool)
245
+ if expected == "null":
246
+ return instance is None
247
+ return True
@@ -0,0 +1,109 @@
1
+ #!/usr/bin/env python3
2
+ """Before/after benchmark delta tool.
3
+
4
+ Loads two daily snapshots and computes per-metric deltas to determine
5
+ whether a change (e.g. prompt tuning) helped or hurt.
6
+
7
+ Usage:
8
+ python3 sinain-koog/eval_delta.py --memory-dir memory/ --after 2026-03-08
9
+ python3 sinain-koog/eval_delta.py --memory-dir memory/ --before 2026-03-06 --after 2026-03-08 --label "tuned insight prompt"
10
+ """
11
+
12
+ import argparse
13
+ import json
14
+ import sys
15
+ from pathlib import Path
16
+
17
+ _koog_dir = str(Path(__file__).resolve().parent)
18
+ if _koog_dir not in sys.path:
19
+ sys.path.insert(0, _koog_dir)
20
+
21
+ from eval_reporter import compute_delta, load_previous_snapshot
22
+
23
+
24
+ def load_snapshot(report_dir: Path, date_str: str) -> dict | None:
25
+ """Load a snapshot for the given date."""
26
+ snap_path = report_dir / f"{date_str}.snapshot.json"
27
+ if not snap_path.exists():
28
+ return None
29
+ try:
30
+ return json.loads(snap_path.read_text(encoding="utf-8"))
31
+ except (json.JSONDecodeError, OSError):
32
+ return None
33
+
34
+
35
+ def format_human(delta: dict[str, dict], label: str = "") -> str:
36
+ """Format delta as a human-readable summary."""
37
+ lines = []
38
+ if label:
39
+ lines.append(f"Label: {label}")
40
+ lines.append("")
41
+
42
+ improved = [k for k, v in delta.items() if v["status"] == "IMPROVED"]
43
+ regressed = [k for k, v in delta.items() if v["status"] == "REGRESSED"]
44
+ same = [k for k, v in delta.items() if v["status"] == "SAME"]
45
+
46
+ for metric, info in sorted(delta.items()):
47
+ marker = {"IMPROVED": "↑", "REGRESSED": "↓", "SAME": "→"}[info["status"]]
48
+ sign = "+" if info["delta"] > 0 else ""
49
+ lines.append(f" {marker} {metric}: {info['before']} → {info['after']} ({sign}{info['delta']})")
50
+
51
+ lines.append("")
52
+ lines.append(f"Summary: {len(improved)} improved, {len(regressed)} regressed, {len(same)} same")
53
+
54
+ if regressed:
55
+ lines.append(f"Regressions: {', '.join(regressed)}")
56
+
57
+ return "\n".join(lines)
58
+
59
+
60
+ def main():
61
+ parser = argparse.ArgumentParser(description="Before/after benchmark delta comparison")
62
+ parser.add_argument("--memory-dir", required=True, help="Path to memory/ directory")
63
+ parser.add_argument("--before", default=None, help="Date of 'before' snapshot (YYYY-MM-DD). Auto-detected if omitted")
64
+ parser.add_argument("--after", required=True, help="Date of 'after' snapshot (YYYY-MM-DD)")
65
+ parser.add_argument("--label", default="", help="Optional label for the change being measured")
66
+ parser.add_argument("--json", action="store_true", help="Output raw JSON instead of human-readable text")
67
+ args = parser.parse_args()
68
+
69
+ report_dir = Path(args.memory_dir) / "eval-reports"
70
+ if not report_dir.is_dir():
71
+ print(f"[eval-delta] report directory not found: {report_dir}", file=sys.stderr)
72
+ sys.exit(1)
73
+
74
+ # Load "after" snapshot
75
+ after_snap = load_snapshot(report_dir, args.after)
76
+ if not after_snap:
77
+ print(f"[eval-delta] no snapshot found for --after {args.after}", file=sys.stderr)
78
+ sys.exit(1)
79
+
80
+ # Load "before" snapshot
81
+ if args.before:
82
+ before_date = args.before
83
+ before_snap = load_snapshot(report_dir, before_date)
84
+ if not before_snap:
85
+ print(f"[eval-delta] no snapshot found for --before {before_date}", file=sys.stderr)
86
+ sys.exit(1)
87
+ else:
88
+ before_date, before_snap = load_previous_snapshot(report_dir, args.after)
89
+ if not before_snap:
90
+ print(f"[eval-delta] no previous snapshot found before {args.after}", file=sys.stderr)
91
+ sys.exit(1)
92
+
93
+ delta = compute_delta(before_snap, after_snap)
94
+
95
+ if args.json:
96
+ output = {
97
+ "before": before_date,
98
+ "after": args.after,
99
+ "label": args.label,
100
+ "delta": delta,
101
+ }
102
+ print(json.dumps(output, indent=2))
103
+ else:
104
+ print(f"Delta: {before_date} → {args.after}")
105
+ print(format_human(delta, args.label))
106
+
107
+
108
+ if __name__ == "__main__":
109
+ main()