@geravant/sinain 1.13.0 → 1.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +4 -2
- package/config-shared.js +1 -0
- package/package.json +4 -1
- package/sinain-agent/run.sh +36 -4
- package/sinain-core/src/buffers/feed-buffer.ts +6 -4
- package/sinain-core/src/index.ts +50 -19
- package/sinain-memory/graph_query.py +12 -3
- package/sinain-memory/knowledge_integrator.py +194 -10
- package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
- package/sinain-memory/eval/__init__.py +0 -0
- package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/assertions.py +0 -267
- package/sinain-memory/eval/benchmarks/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
- package/sinain-memory/eval/benchmarks/config.py +0 -23
- package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
- package/sinain-memory/eval/benchmarks/ingest.py +0 -152
- package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
- package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
- package/sinain-memory/eval/benchmarks/meeting_adapter.py +0 -81
- package/sinain-memory/eval/benchmarks/meeting_runner.py +0 -230
- package/sinain-memory/eval/benchmarks/query.py +0 -193
- package/sinain-memory/eval/benchmarks/report.py +0 -87
- package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +0 -318
- package/sinain-memory/eval/benchmarks/runner.py +0 -283
- package/sinain-memory/eval/judges/__init__.py +0 -0
- package/sinain-memory/eval/judges/base_judge.py +0 -61
- package/sinain-memory/eval/judges/curation_judge.py +0 -46
- package/sinain-memory/eval/judges/insight_judge.py +0 -48
- package/sinain-memory/eval/judges/mining_judge.py +0 -42
- package/sinain-memory/eval/judges/signal_judge.py +0 -45
- package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
- package/sinain-memory/eval/retrieval_evaluator.py +0 -186
- package/sinain-memory/eval/schemas.py +0 -247
- package/sinain-memory/tests/__init__.py +0 -0
- package/sinain-memory/tests/conftest.py +0 -189
- package/sinain-memory/tests/test_curator_helpers.py +0 -94
- package/sinain-memory/tests/test_embedder.py +0 -210
- package/sinain-memory/tests/test_extract_json.py +0 -124
- package/sinain-memory/tests/test_feedback_computation.py +0 -121
- package/sinain-memory/tests/test_miner_helpers.py +0 -71
- package/sinain-memory/tests/test_module_management.py +0 -458
- package/sinain-memory/tests/test_parsers.py +0 -96
- package/sinain-memory/tests/test_tick_evaluator.py +0 -430
- package/sinain-memory/tests/test_triple_extractor.py +0 -255
- package/sinain-memory/tests/test_triple_ingest.py +0 -191
- package/sinain-memory/tests/test_triple_migrate.py +0 -138
- package/sinain-memory/tests/test_triplestore.py +0 -248
|
@@ -1,267 +0,0 @@
|
|
|
1
|
-
"""Behavioral assertion library for sinain-koog tick evaluation.
|
|
2
|
-
|
|
3
|
-
Each assertion function validates a runtime invariant of the pipeline.
|
|
4
|
-
Returns ``{"name": str, "passed": bool, "detail": str}``.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from __future__ import annotations
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def _result(name: str, passed: bool, detail: str) -> dict:
|
|
11
|
-
return {"name": name, "passed": passed, "detail": detail}
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
# ---------------------------------------------------------------------------
|
|
15
|
-
# Playbook curator assertions
|
|
16
|
-
# ---------------------------------------------------------------------------
|
|
17
|
-
|
|
18
|
-
def assert_playbook_under_limit(curator_result: dict, limit: int = 50) -> dict:
    """Check that the reported playbook line count does not exceed ``limit``.

    The count is read from ``curator_result["playbookLines"]`` and defaults to
    0 when the key is absent. A count exactly equal to ``limit`` passes.
    """
    line_count = curator_result.get("playbookLines", 0)
    assertion = "playbook_under_limit"
    if not line_count <= limit:
        return _result(assertion, False, f"body has {line_count} lines, exceeds limit of {limit}")
    return _result(assertion, True, f"body has {line_count} lines (limit {limit})")
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def assert_curator_respected_directive(curator_result: dict, directive: str) -> dict:
    """Check that curator changes align with the curate directive.

    Counts the entries under ``changes.added`` and ``changes.pruned`` and
    judges them per directive:

    * ``aggressive_prune`` passes when anything was pruned, or when nothing
      changed at all; it fails when items were added without any pruning.
    * ``stability`` fails when pruned items outnumber added items by more
      than two.
    * any other directive always passes.

    Missing *or null* ``changes`` / ``added`` / ``pruned`` values count as
    empty, so a log entry with explicit nulls yields a result, not an exception.
    """
    # `or` (not a .get default) so an explicit None is treated like "absent".
    changes = curator_result.get("changes") or {}
    added = len(changes.get("added") or [])
    pruned = len(changes.get("pruned") or [])

    if directive == "aggressive_prune":
        # Should have pruned items
        if pruned > 0:
            return _result("curator_respected_directive", True,
                           f"aggressive_prune: pruned {pruned} items")
        if added == 0 and pruned == 0:
            return _result("curator_respected_directive", True,
                           "aggressive_prune: no changes (acceptable if playbook already lean)")
        return _result("curator_respected_directive", False,
                       f"aggressive_prune: added {added} but pruned {pruned} — expected pruning")

    if directive == "stability":
        # Should not aggressively prune established patterns
        if pruned > added + 2:
            return _result("curator_respected_directive", False,
                           f"stability: pruned {pruned} items (only added {added}) — too aggressive for stability mode")
        return _result("curator_respected_directive", True,
                       f"stability: added {added}, pruned {pruned} — conservative")

    # normal / insufficient_data — any reasonable mix is fine
    return _result("curator_respected_directive", True,
                   f"{directive}: added {added}, pruned {pruned}")
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
# ---------------------------------------------------------------------------
|
|
57
|
-
# Signal analyzer assertions
|
|
58
|
-
# ---------------------------------------------------------------------------
|
|
59
|
-
|
|
60
|
-
def assert_no_repeat_action(signal_result: dict, recent_logs: list[dict], window: int = 3) -> dict:
    """Verify recommendedAction doesn't repeat the last N ticks' actions.

    The recommended action's ``task`` text is compared, as a lower-cased set
    of whitespace-separated words, against the text of every *chosen* entry in
    ``actionsConsidered`` of the first ``window`` items of ``recent_logs``
    (its ``reason``, falling back to its ``task``). The assertion fails when
    the shared words exceed 60% of the larger of the two word sets.

    Passes trivially when there is no recommended action, the action is
    ``skip``, or the action has no task text.
    """
    action = signal_result.get("recommendedAction")
    if action is None or action.get("action") == "skip":
        return _result("no_repeat_action", True, "no action recommended (skip/null)")

    task = (action.get("task") or "").lower().strip()
    if not task:
        return _result("no_repeat_action", True, "no task description to compare")

    # Collect recent action tasks.
    # NOTE(review): this slices the *first* `window` entries, so it presumably
    # expects recent_logs ordered newest-first — confirm against the caller.
    recent_tasks: list[str] = []
    for log in recent_logs[:window]:
        # `or []` so an explicit null in the log is treated like a missing key.
        for candidate in log.get("actionsConsidered") or []:
            if candidate.get("chosen"):
                recent_tasks.append(
                    (candidate.get("reason") or candidate.get("task") or "").lower().strip()
                )

    # Word-overlap check to catch rephrasing. The current task's word set does
    # not depend on the loop, so build it once; it is never empty because
    # `task` is non-empty after strip().
    task_words = set(task.split())
    for prev_task in recent_tasks:
        if not prev_task:
            continue
        prev_words = set(prev_task.split())
        # If >60% of words overlap, consider it a repeat
        overlap = len(task_words & prev_words) / max(len(task_words), len(prev_words))
        if overlap > 0.6:
            return _result("no_repeat_action", False,
                           f"action task '{task[:60]}' overlaps with recent '{prev_task[:60]}' ({overlap:.0%} word overlap)")

    return _result("no_repeat_action", True,
                   f"action task is distinct from last {window} ticks")
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def assert_signal_confidence_threshold(signal_result: dict, threshold: float = 0.5) -> dict:
    """Verify actions are only recommended above the confidence threshold.

    Passes when no action is recommended (``None`` or ``skip``) or when the
    action's ``confidence`` is at least ``threshold``. Fails when confidence
    is missing, non-numeric, or below ``threshold``.
    """
    action = signal_result.get("recommendedAction")
    if action is None or action.get("action") == "skip":
        return _result("signal_confidence_threshold", True, "no action recommended")

    confidence = action.get("confidence")
    if confidence is None:
        return _result("signal_confidence_threshold", False,
                       "action recommended but no confidence value provided")

    # A malformed value (e.g. a string) would raise in the comparison or the
    # ``:.2f`` format below; report it as a failed assertion instead.
    if not isinstance(confidence, (int, float)):
        return _result("signal_confidence_threshold", False,
                       f"confidence value is not numeric: {confidence!r}")

    if confidence >= threshold:
        return _result("signal_confidence_threshold", True,
                       f"confidence {confidence:.2f} >= threshold {threshold}")
    return _result("signal_confidence_threshold", False,
                   f"confidence {confidence:.2f} < threshold {threshold}")
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
# ---------------------------------------------------------------------------
|
|
115
|
-
# Insight synthesizer assertions
|
|
116
|
-
# ---------------------------------------------------------------------------
|
|
117
|
-
|
|
118
|
-
def assert_insight_char_limit(synth_result: dict, limit: int = 500) -> dict:
    """Verify suggestion+insight stays under the character limit.

    Passes immediately when ``skip`` is truthy. Otherwise the lengths of
    ``suggestion`` and ``insight`` are summed and compared with ``limit``; a
    total equal to ``limit`` passes. Missing or null fields count as empty.
    """
    if synth_result.get("skip", False):
        return _result("insight_char_limit", True, "output skipped")

    # `or ""` mirrors the skipReason handling in assert_skip_reason_specific:
    # an explicit None must not reach len().
    suggestion = synth_result.get("suggestion") or ""
    insight = synth_result.get("insight") or ""
    total = len(suggestion) + len(insight)

    if total <= limit:
        return _result("insight_char_limit", True, f"total {total} chars (limit {limit})")
    return _result("insight_char_limit", False, f"total {total} chars exceeds limit of {limit}")
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
def assert_skip_reason_specific(synth_result: dict) -> dict:
    """When the output was skipped, require a non-boilerplate ``skipReason``.

    Passes if ``skip`` is falsy. Fails if the stripped reason is empty, is
    exactly one of the known generic phrases, or is shorter than 30 characters
    and contains one of them (comparison is case-insensitive).
    """
    if not synth_result.get("skip", False):
        return _result("skip_reason_specific", True, "output not skipped")

    reason = (synth_result.get("skipReason") or "").strip()
    if not reason:
        return _result("skip_reason_specific", False, "skip=true but no skipReason provided")

    # Phrases that say nothing about *why* the tick was skipped.
    boilerplate = (
        "no new data",
        "nothing new",
        "no updates",
        "insufficient data",
        "not enough information",
        "no changes",
    )
    lowered = reason.lower()
    is_short = len(lowered) < 30
    too_generic = any(
        lowered == phrase or (is_short and phrase in lowered)
        for phrase in boilerplate
    )
    if too_generic:
        return _result("skip_reason_specific", False,
                       f"skipReason is too generic: '{reason}'")

    return _result("skip_reason_specific", True, f"skipReason is specific ({len(reason)} chars)")
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
# ---------------------------------------------------------------------------
|
|
160
|
-
# Memory miner assertions
|
|
161
|
-
# ---------------------------------------------------------------------------
|
|
162
|
-
|
|
163
|
-
def assert_miner_references_sources(miner_result: dict, daily_files: list[str]) -> dict:
    """Check that every entry of ``minedSources`` is one of ``daily_files``.

    Passes when ``minedSources`` is missing or empty. Otherwise each mined
    source must match an entry of ``daily_files`` exactly; the failure detail
    lists the entries that do not.
    """
    mined = miner_result.get("minedSources", [])
    if not mined:
        return _result("miner_references_sources", True, "no sources mined (early return)")

    # daily_files contains basenames like "2026-02-21.md"
    provided = set(daily_files)
    unknown = [source for source in mined if source not in provided]

    if not unknown:
        return _result("miner_references_sources", True,
                       f"all {len(mined)} mined sources are valid")
    return _result("miner_references_sources", False,
                   f"minedSources references unknown files: {unknown}")
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
# ---------------------------------------------------------------------------
|
|
181
|
-
# Cross-script / structural assertions
|
|
182
|
-
# ---------------------------------------------------------------------------
|
|
183
|
-
|
|
184
|
-
def assert_schema_valid(script_name: str, output: dict, schema_errors: list[str]) -> dict:
    """Turn a list of schema errors into an assertion result.

    The assertion is named ``schema_valid_<script_name>``. It passes when
    ``schema_errors`` is empty; otherwise the detail reports the error count
    and the first three messages. ``output`` is accepted but not inspected.
    """
    assertion = f"schema_valid_{script_name}"
    if schema_errors:
        preview = "; ".join(schema_errors[:3])
        return _result(assertion, False, f"{len(schema_errors)} schema errors: {preview}")
    return _result(assertion, True, "output matches schema")
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
def assert_playbook_header_footer_intact(playbook_text: str) -> dict:
    """Check that the playbook text still contains both marker comments.

    Looks for the substrings ``<!-- mining-index:`` and ``<!-- effectiveness:``
    anywhere in ``playbook_text`` and names the missing ones on failure.
    """
    # (label used in the failure message, substring that must be present)
    required = (
        ("mining-index", "<!-- mining-index:"),
        ("effectiveness", "<!-- effectiveness:"),
    )
    missing = [label for label, marker in required if marker not in playbook_text]

    if not missing:
        return _result("playbook_header_footer_intact", True,
                       "both mining-index and effectiveness comments present")
    return _result("playbook_header_footer_intact", False,
                   f"missing playbook comments: {', '.join(missing)}")
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
# ---------------------------------------------------------------------------
|
|
210
|
-
# Runner: execute all applicable assertions for a tick
|
|
211
|
-
# ---------------------------------------------------------------------------
|
|
212
|
-
|
|
213
|
-
def run_tick_assertions(
    log_entry: dict,
    recent_logs: list[dict],
    playbook_text: str,
    daily_files: list[str],
) -> list[dict]:
    """Run all applicable assertions against a single tick's log entry.

    Which assertions run depends on what the log entry contains:

    * ``signals`` present: confidence-threshold and no-repeat checks.
    * ``playbookChanges`` is a dict with a ``changes`` key: line-limit and
      directive checks (``curateDirective`` defaults to ``"normal"``).
    * ``output`` is a dict: character-limit and skip-reason checks.
    * ``miningResult`` present, or else a non-empty ``minedSources``:
      mined-source check.
    * ``playbook_text`` non-empty: header/footer check.

    Returns a list of assertion result dicts, in the order above.
    """
    results: list[dict] = []

    # Signal analyzer assertions — both checks read the same view of the entry.
    signals = log_entry.get("signals")
    if signals is not None:
        signal_view = {
            "signals": signals,
            "recommendedAction": log_entry.get("recommendedAction"),
        }
        results.append(assert_signal_confidence_threshold(signal_view))
        results.append(assert_no_repeat_action(signal_view, recent_logs))

    # Curator assertions — playbookChanges can be {"note": "skipped"} or full output
    curator = log_entry.get("playbookChanges")
    if isinstance(curator, dict) and "changes" in curator:
        curator_with_lines = {**curator}
        # When the key is absent the line count falls back to 0 (the previous
        # code looked the missing key up again, which could only yield 0).
        curator_with_lines.setdefault("playbookLines", 0)
        results.append(assert_playbook_under_limit(curator_with_lines))

        directive = log_entry.get("curateDirective", "normal")
        results.append(assert_curator_respected_directive(curator_with_lines, directive))

    # Insight synthesizer assertions — output can be null (pipeline-level skip)
    output = log_entry.get("output")
    if isinstance(output, dict):
        results.append(assert_insight_char_limit(output))
        results.append(assert_skip_reason_specific(output))

    # Mining assertions — log uses miningFindings (str) and minedSources (list)
    mining = log_entry.get("miningResult")
    if mining is not None:
        results.append(assert_miner_references_sources(mining, daily_files))
    elif log_entry.get("minedSources"):
        # Reconstruct mining result from flat log fields
        results.append(assert_miner_references_sources(
            {"minedSources": log_entry.get("minedSources", [])}, daily_files
        ))

    # Playbook health (if we have playbook text)
    if playbook_text:
        results.append(assert_playbook_header_footer_intact(playbook_text))

    return results
|
|
File without changes
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
"""Base adapter and data classes for benchmark evaluation."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
from abc import ABC, abstractmethod
|
|
6
|
-
from dataclasses import dataclass, field
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
@dataclass
class BenchmarkQuestion:
    """One benchmark question together with its reference answer."""

    id: str
    text: str
    gold_answer: str
    # Question category: single-session, multi-session, temporal, etc.
    category: str
    # Both containers default to a fresh, empty object per instance.
    evidence_session_ids: list[str] = field(default_factory=list)
    metadata: dict = field(default_factory=dict)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@dataclass
class BenchmarkInstance:
    """A group of conversations and the questions asked against them."""

    id: str
    # List of sessions; each session is a list of feed items {source, text, ts}.
    sessions: list[list[dict]]
    questions: list[BenchmarkQuestion] = field(default_factory=list)
    # Original benchmark format (for full-context condition).
    raw_sessions: list[dict] = field(default_factory=list)
    metadata: dict = field(default_factory=dict)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class BenchmarkAdapter(ABC):
    """Interface for turning a published benchmark into sinain's format.

    Subclasses must provide ``name``, ``load_dataset`` and
    ``format_full_context``; the base class cannot be instantiated.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the benchmark's name (e.g. 'longmemeval', 'locomo')."""

    @abstractmethod
    def load_dataset(self, data_dir: str) -> list[BenchmarkInstance]:
        """Download the dataset if needed, parse it, and return its instances."""

    @abstractmethod
    def format_full_context(self, instance: BenchmarkInstance) -> str:
        """Return the whole conversation history as text for the baseline condition."""
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
"""Benchmark configuration — models, paths, thresholds."""
|
|
2
|
-
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
# Filesystem layout: everything is anchored at the directory holding this module.
BENCHMARKS_DIR = Path(__file__).resolve().parent
DATA_DIR = BENCHMARKS_DIR.joinpath("data")
RESULTS_DIR = BENCHMARKS_DIR.joinpath("results")

# LLM model identifiers (via OpenRouter)
QA_MODEL = "google/gemini-2.5-flash"
JUDGE_MODEL = "openai/gpt-4o"

# Retrieval: cut-offs for the @k metrics, and the per-query fact cap
K_VALUES = [1, 3, 5, 10]
MAX_FACTS_PER_QUERY = 10

# Ingestion: subprocess timeouts, in seconds
DISTILLER_TIMEOUT_S = 30
INTEGRATOR_TIMEOUT_S = 60

# Dataset locations
LONGMEMEVAL_HF = "xiaowu0162/longmemeval-cleaned"
LOCOMO_GITHUB = "https://raw.githubusercontent.com/snap-research/locomo/main/data/locomo10.json"
|
|
@@ -1,146 +0,0 @@
|
|
|
1
|
-
"""Evaluation pipeline — score answers and compute aggregate metrics.
|
|
2
|
-
|
|
3
|
-
Combines:
|
|
4
|
-
- LLM-as-Judge (QA scoring, 1-5 scale)
|
|
5
|
-
- Retrieval metrics (Recall@k, NDCG@k)
|
|
6
|
-
- Token F1 overlap (mechanical, free)
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
from __future__ import annotations
|
|
10
|
-
|
|
11
|
-
import math
|
|
12
|
-
import re
|
|
13
|
-
from collections import defaultdict
|
|
14
|
-
|
|
15
|
-
from .base_adapter import BenchmarkQuestion
|
|
16
|
-
from .config import K_VALUES
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
# ── Token F1 (mechanical, no LLM needed) ─────────────────────────────────────
|
|
20
|
-
|
|
21
|
-
def _tokenize(text: str) -> list[str]:
|
|
22
|
-
"""Simple whitespace + punctuation tokenizer."""
|
|
23
|
-
return re.findall(r"\w+", text.lower())
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def token_f1(predicted: str, gold: str | int) -> float:
    """Return the F1 score between the token sets of two answers.

    Both arguments are passed through ``str()`` and tokenised; duplicate
    tokens are collapsed before comparison. The result is 0.0 when the two
    sets share no token, which includes the case where either side is empty.
    """
    predicted_set = set(_tokenize(str(predicted)))
    gold_set = set(_tokenize(str(gold)))

    shared = predicted_set & gold_set
    if not shared:
        # Covers an empty prediction or gold answer as well as disjoint sets.
        return 0.0

    precision = len(shared) / len(predicted_set)
    recall = len(shared) / len(gold_set)
    return 2 * precision * recall / (precision + recall)
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
# ── Retrieval metrics (reuse logic from retrieval_evaluator.py) ───────────────
|
|
41
|
-
|
|
42
|
-
def dcg_at_k(relevant_positions: list[int], k: int) -> float:
    """Return the Discounted Cumulative Gain at cut-off ``k``.

    Every position below ``k`` contributes ``1 / log2(position + 2)``, so
    position 0 contributes exactly 1.0; positions at or beyond ``k`` are
    ignored.
    """
    total = 0.0
    for position in relevant_positions:
        if position >= k:
            continue
        total += 1.0 / math.log2(position + 2)
    return total
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def ndcg_at_k(relevant_positions: list[int], num_relevant: int, k: int) -> float:
    """Return DCG@k divided by the ideal DCG@k.

    The ideal ranking places ``min(num_relevant, k)`` relevant items at the
    top positions. Returns 0.0 when the ideal DCG is not positive.
    """
    actual = dcg_at_k(relevant_positions, k)
    ideal_count = min(num_relevant, k)
    ideal = dcg_at_k(list(range(ideal_count)), k)
    if ideal > 0:
        return actual / ideal
    return 0.0
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
def compute_retrieval_metrics(
    retrieved_ids: list[str],
    expected_ids: list[str],
    k_values: list[int] | None = None,
) -> dict:
    """Compute the ``recall@k`` and ``ndcg@k`` entries for a single question.

    ``recall@k`` here is hit-based: 1.0 when at least one expected id appears
    among the first ``k`` retrieved ids, otherwise 0.0. ``K_VALUES`` is used
    when ``k_values`` is None or empty.
    """
    cutoffs = k_values or K_VALUES
    wanted = set(expected_ids)
    # Zero-based ranks at which an expected id was retrieved.
    hit_ranks = [rank for rank, rid in enumerate(retrieved_ids) if rid in wanted]

    metrics = {}
    for k in cutoffs:
        found_in_top_k = any(rank < k for rank in hit_ranks)
        metrics[f"recall@{k}"] = 1.0 if found_in_top_k else 0.0
        metrics[f"ndcg@{k}"] = ndcg_at_k(hit_ranks, len(wanted), k)
    return metrics
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
# ── Aggregate metrics ─────────────────────────────────────────────────────────
|
|
78
|
-
|
|
79
|
-
def aggregate_results(per_question: list[dict]) -> dict:
    """Compute aggregate metrics from per-question results.

    Each per_question entry has:
    {id, category, retrieval: {recall@k, ndcg@k}, answers: {condition: {score, f1}}}

    Returns ``{"error": "no results"}`` for an empty input. Otherwise returns
    a dict with:

    * ``total_questions`` — number of entries.
    * ``conditions`` — per condition: mean score, mean F1 and the number of
      scored answers (only conditions with at least one score are listed).
    * ``ipr`` — mean "sinain-memory" score divided by mean "full-context"
      score, or None when either condition has no scores or the full-context
      mean is not positive. A ratio of exactly 0.0 is reported as 0.0.
    * ``categories`` — per category and condition: mean score and count.
    * ``retrieval`` — mean of every numeric retrieval metric.

    All means are rounded to 4 decimal places.
    """
    if not per_question:
        return {"error": "no results"}

    # Per-condition scores
    condition_scores: dict[str, list[float]] = defaultdict(list)
    condition_f1s: dict[str, list[float]] = defaultdict(list)
    # Per-category per-condition
    cat_scores: dict[str, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
    # Retrieval
    retrieval_metrics: dict[str, list[float]] = defaultdict(list)

    for q in per_question:
        cat = q.get("category", "unknown")

        for cond, data in q.get("answers", {}).items():
            score = data.get("score")
            if score is not None:
                condition_scores[cond].append(score)
                cat_scores[cat][cond].append(score)
            f1 = data.get("f1")
            if f1 is not None:
                condition_f1s[cond].append(f1)

        for metric, val in q.get("retrieval", {}).items():
            if isinstance(val, (int, float)):
                retrieval_metrics[metric].append(val)

    def _mean(lst: list[float]) -> float:
        return round(sum(lst) / len(lst), 4) if lst else 0.0

    # Build summary
    conditions = {
        cond: {
            "mean_score": _mean(condition_scores[cond]),
            "mean_f1": _mean(condition_f1s.get(cond, [])),
            "n": len(condition_scores[cond]),
        }
        for cond in sorted(condition_scores)
    }

    # IPR: sinain-memory vs full-context.
    # Decide "not computable" explicitly rather than via truthiness of the
    # ratio, so that a genuine ratio of 0.0 is not reported as None.
    sm_scores = condition_scores.get("sinain-memory", [])
    fc_scores = condition_scores.get("full-context", [])
    ipr = None
    if sm_scores and fc_scores:
        fc_mean = _mean(fc_scores)
        if fc_mean > 0:
            ipr = round(_mean(sm_scores) / fc_mean, 4)

    # Category breakdown
    categories = {
        cat: {
            cond: {
                "mean_score": _mean(cat_scores[cat][cond]),
                "n": len(cat_scores[cat][cond]),
            }
            for cond in sorted(cat_scores[cat])
        }
        for cat in sorted(cat_scores)
    }

    # Retrieval summary
    retrieval = {k: _mean(v) for k, v in sorted(retrieval_metrics.items())}

    return {
        "total_questions": len(per_question),
        "conditions": conditions,
        "ipr": ipr,
        "categories": categories,
        "retrieval": retrieval,
    }
|
|
@@ -1,152 +0,0 @@
|
|
|
1
|
-
"""Ingestion pipeline — benchmark conversations → sinain triplestore.
|
|
2
|
-
|
|
3
|
-
Runs session_distiller.py + knowledge_integrator.py via subprocess (exact production path).
|
|
4
|
-
Caches results aggressively to avoid repeated LLM calls.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from __future__ import annotations
|
|
8
|
-
|
|
9
|
-
import hashlib
|
|
10
|
-
import json
|
|
11
|
-
import os
|
|
12
|
-
import shutil
|
|
13
|
-
import tempfile
|
|
14
|
-
from pathlib import Path
|
|
15
|
-
from subprocess import run, PIPE, TimeoutExpired
|
|
16
|
-
|
|
17
|
-
from .base_adapter import BenchmarkInstance
|
|
18
|
-
from .config import DISTILLER_TIMEOUT_S, INTEGRATOR_TIMEOUT_S
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def _scripts_dir() -> Path:
|
|
22
|
-
"""Locate sinain-memory scripts directory."""
|
|
23
|
-
return Path(__file__).resolve().parent.parent.parent
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def _content_hash(sessions: list[list[dict]]) -> str:
|
|
27
|
-
"""Hash session content for caching."""
|
|
28
|
-
raw = json.dumps(sessions, sort_keys=True, ensure_ascii=False)
|
|
29
|
-
return hashlib.sha256(raw.encode()).hexdigest()[:16]
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def _run_script(script_name: str, args: list[str], timeout: int) -> str | None:
    """Run a Python script from sinain-memory, return stdout or None on failure.

    Args:
        script_name: File name of the script inside ``_scripts_dir()``.
        args: Command-line arguments appended after the script path.
        timeout: Maximum run time in seconds.

    Returns:
        The script's stripped stdout, or None when the script is missing,
        cannot be started, exits non-zero, or times out. Each failure is
        reported with a ``[ingest]`` line on stdout.
    """
    scripts_dir = _scripts_dir()
    script_path = scripts_dir / script_name
    if not script_path.exists():
        print(f"[ingest] {script_name} not found at {script_path}")
        return None

    env = dict(os.environ)
    # Put the scripts directory first on PYTHONPATH while keeping whatever the
    # parent process already had there, so the child still sees those paths.
    inherited_pythonpath = env.get("PYTHONPATH")
    if inherited_pythonpath:
        env["PYTHONPATH"] = os.pathsep.join([str(scripts_dir), inherited_pythonpath])
    else:
        env["PYTHONPATH"] = str(scripts_dir)
    # Ensure a working model is available (common.py defaults may reference unreleased models)
    if "SINAIN_BENCH_MODEL" in os.environ:
        env["SINAIN_FAST_MODEL"] = os.environ["SINAIN_BENCH_MODEL"]

    try:
        result = run(
            ["python3", str(script_path)] + args,
            capture_output=True, text=True, timeout=timeout, env=env,
        )
    except TimeoutExpired:
        print(f"[ingest] {script_name} timed out ({timeout}s)")
        return None
    except OSError as exc:
        # e.g. no `python3` on PATH — honour the "None on failure" contract.
        print(f"[ingest] {script_name} could not be started: {exc}")
        return None

    if result.returncode != 0:
        print(f"[ingest] {script_name} failed: {result.stderr[:200]}")
        return None
    return result.stdout.strip()
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
def ingest_instance(
    instance: BenchmarkInstance,
    cache_dir: Path,
) -> Path | None:
    """Ingest a benchmark instance into a triplestore. Returns db_path or None.

    Uses caching: if the same haystack was already ingested, returns the cached DB.

    Sessions are processed in batches of 10. Each batch is flattened into one
    transcript, distilled with ``session_distiller.py`` and, when the digest is
    usable, integrated with ``knowledge_integrator.py``. All work happens in a
    temporary memory directory that is removed afterwards.

    Args:
        instance: Benchmark instance whose ``sessions`` are ingested.
        cache_dir: Root of the cache; databases live in ``<cache_dir>/stores``
            under a name derived from the session content.

    Returns:
        Path of the cached database, or None when no non-empty
        ``knowledge-graph.db`` was produced.
    """
    ch = _content_hash(instance.sessions)
    cache_path = cache_dir / "stores" / f"{ch}.db"

    if cache_path.exists():
        return cache_path

    cache_path.parent.mkdir(parents=True, exist_ok=True)

    # Create temp memory directory
    tmp = tempfile.mkdtemp(prefix="sinain-bench-")
    mem_dir = Path(tmp) / "memory"
    for subdir in ["", "playbook-logs", "playbook-archive"]:
        (mem_dir / subdir).mkdir(parents=True, exist_ok=True)

    # Write a minimal playbook so integrator doesn't fail
    (mem_dir / "sinain-playbook.md").write_text("# Sinain Playbook\n\n(benchmark run)\n")

    success = False
    try:
        # Batch sessions into chunks of ~10 for fewer LLM calls.
        # Each chunk becomes one distiller call with a combined transcript.
        BATCH_SIZE = 10
        num_sessions = len(instance.sessions)
        batch_idx = 0

        for start in range(0, num_sessions, BATCH_SIZE):
            batch = instance.sessions[start:start + BATCH_SIZE]
            # Flatten batch into one transcript
            combined: list[dict] = []
            for session in batch:
                combined.extend(session)
            if len(combined) < 3:
                continue

            first_ts = combined[0].get("ts", "2025-01-01T10:00:00Z")
            meta = json.dumps({
                "ts": first_ts,
                "sessionKey": f"benchmark-batch-{batch_idx}",
                "durationMs": len(combined) * 30000,
            })
            # Only batches that reach the distiller consume an index.
            batch_idx += 1

            # Step 1: Distill the batch
            # NOTE(review): the whole transcript travels as one command-line
            # argument; very large batches may hit the OS argument-size limit.
            digest_json = _run_script("session_distiller.py", [
                "--memory-dir", str(mem_dir),
                "--transcript", json.dumps(combined),
                "--session-meta", meta,
            ], DISTILLER_TIMEOUT_S)

            if not digest_json:
                continue

            try:
                digest = json.loads(digest_json)
            except json.JSONDecodeError:
                continue

            if digest.get("isEmpty") or digest.get("error"):
                continue

            # Step 2: Integrate into knowledge graph
            _run_script("knowledge_integrator.py", [
                "--memory-dir", str(mem_dir),
                "--digest", json.dumps(digest),
            ], INTEGRATOR_TIMEOUT_S)

        # Copy the resulting DB to cache
        db_path = mem_dir / "knowledge-graph.db"
        if db_path.exists() and db_path.stat().st_size > 0:
            # Copy to a sibling staging file and rename it into place, so an
            # interrupted copy can never leave a truncated file at cache_path
            # that a later call would accept as a cache hit.
            staging = cache_path.with_name(f"{cache_path.name}.{os.getpid()}.tmp")
            try:
                shutil.copy2(db_path, staging)
                os.replace(staging, cache_path)
            finally:
                # No-op after a successful rename; removes leftovers on failure.
                staging.unlink(missing_ok=True)
            success = True

    finally:
        shutil.rmtree(tmp, ignore_errors=True)

    return cache_path if success else None
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
def get_knowledge_doc(db_path: Path) -> str:
    """Render a sinain-knowledge.md style document from a triplestore.

    Queries up to 30 top facts from the database at ``db_path`` through
    ``graph_query`` and formats them as text capped at 6000 characters.
    Returns the placeholder ``"(no knowledge available)"`` when the query
    yields nothing.
    """
    import sys

    # graph_query lives in the scripts directory; make it importable. Only
    # insert when it is not already at the front, so repeated calls do not
    # keep growing sys.path with duplicate entries.
    scripts_dir = str(_scripts_dir())
    if not sys.path or sys.path[0] != scripts_dir:
        sys.path.insert(0, scripts_dir)
    from graph_query import query_top_facts, format_facts_text

    facts = query_top_facts(str(db_path), limit=30)
    if not facts:
        return "(no knowledge available)"
    return format_facts_text(facts, max_chars=6000)
|
|
File without changes
|
|
Binary file
|