@geravant/sinain 1.11.0 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/sinain-core/src/escalation/escalator.ts +1 -0
- package/sinain-core/src/escalation/message-builder.ts +45 -118
- package/sinain-core/src/overlay/commands.ts +16 -3
- package/sinain-core/src/overlay/ws-handler.ts +4 -1
- package/sinain-core/src/types.ts +3 -0
- package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
- package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/base_adapter.py +43 -0
- package/sinain-memory/eval/benchmarks/config.py +23 -0
- package/sinain-memory/eval/benchmarks/evaluate.py +146 -0
- package/sinain-memory/eval/benchmarks/ingest.py +152 -0
- package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/qa_judge.py +81 -0
- package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +177 -0
- package/sinain-memory/eval/benchmarks/query.py +172 -0
- package/sinain-memory/eval/benchmarks/report.py +87 -0
- package/sinain-memory/eval/benchmarks/runner.py +276 -0
- package/sinain-memory/koog-config.json +11 -0
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Benchmark runner — evaluates sinain's knowledge graph against published benchmarks.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python3 eval/benchmarks/runner.py --benchmarks longmemeval --subset 5
|
|
6
|
+
python3 eval/benchmarks/runner.py --benchmarks longmemeval --conditions sinain-memory,full-context
|
|
7
|
+
python3 eval/benchmarks/runner.py --benchmarks longmemeval --format markdown --resume
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import json
|
|
14
|
+
import sys
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
# Add sinain-memory to path
# This file lives at <sinain-memory>/eval/benchmarks/runner.py, so three
# parents up from the resolved file path is the sinain-memory directory.
# Putting it at the front of sys.path lets the absolute `eval.benchmarks.*`
# imports below resolve when the file is run directly as a script.
_koog_dir = str(Path(__file__).resolve().parent.parent.parent)
if _koog_dir not in sys.path:
    sys.path.insert(0, _koog_dir)
|
|
21
|
+
|
|
22
|
+
from eval.benchmarks.config import DATA_DIR, RESULTS_DIR, QA_MODEL, JUDGE_MODEL
|
|
23
|
+
from eval.benchmarks.base_adapter import BenchmarkAdapter, BenchmarkInstance
|
|
24
|
+
from eval.benchmarks.longmemeval_adapter import LongMemEvalAdapter
|
|
25
|
+
from eval.benchmarks.ingest import ingest_instance, get_knowledge_doc
|
|
26
|
+
from eval.benchmarks.query import answer_question, _get_retrieved_facts, compute_content_recall
|
|
27
|
+
from eval.benchmarks.evaluate import (
|
|
28
|
+
token_f1, aggregate_results,
|
|
29
|
+
)
|
|
30
|
+
from eval.benchmarks.judges.qa_judge import judge_qa
|
|
31
|
+
from eval.benchmarks.report import generate_markdown, generate_json
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _get_adapter(name: str) -> BenchmarkAdapter:
    """Return a fresh adapter instance for the benchmark called *name*.

    Only ``"longmemeval"`` is recognised; any other name raises
    ``ValueError`` listing the available benchmark.
    """
    # Guard clause: reject anything that is not the single supported name.
    if name != "longmemeval":
        raise ValueError(f"Unknown benchmark: {name}. Available: longmemeval")
    return LongMemEvalAdapter()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _load_resume(resume_path: Path) -> dict[str, dict]:
    """Load previously computed results for resume support.

    The file is read as JSON Lines: one JSON object per line, each carrying
    an ``"id"`` key. Blank lines are ignored. When the same id appears more
    than once, the last occurrence wins.

    Lines that are not valid JSON, or that decode to something without a
    usable ``"id"``, are skipped with a warning on stderr instead of aborting
    the whole resume. A run interrupted mid-write leaves exactly such a
    truncated line behind, and the affected question is simply re-evaluated.

    Args:
        resume_path: Path of the progress file; it does not have to exist.

    Returns:
        Mapping of entry id to the decoded entry (empty if the file is
        missing or holds no usable lines).
    """
    results: dict[str, dict] = {}
    if not resume_path.exists():
        return results

    # Split on "\n" only (not str.splitlines): JSON text written with
    # ensure_ascii=False may contain raw U+2028/U+0085 inside string values,
    # which splitlines() would treat as line breaks.
    for lineno, raw in enumerate(resume_path.read_text().split("\n"), start=1):
        line = raw.strip()
        if not line:
            continue
        try:
            entry = json.loads(line)
            results[entry["id"]] = entry
        except (json.JSONDecodeError, KeyError, TypeError) as exc:
            print(
                f"[runner] ignoring malformed progress entry at "
                f"{resume_path}:{lineno}: {exc}",
                file=sys.stderr,
            )
    return results
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def run_benchmark(
    benchmark_name: str,
    conditions: list[str],
    *,
    subset: int | None = None,
    qa_model: str = QA_MODEL,
    judge_model: str = JUDGE_MODEL,
    output_dir: Path = RESULTS_DIR,
    cache_dir: Path = DATA_DIR,
    resume: bool = False,
    skip_llm: bool = False,
    stratified: bool = False,
) -> tuple[dict, list[dict]]:
    """Run a benchmark end-to-end. Returns (summary, details).

    For every selected question the runner ingests the owning instance once
    (only when the "sinain-memory" or "knowledge-doc" condition is requested),
    optionally computes retrieval metrics, generates and scores one answer
    per condition, and appends the resulting entry to
    ``<output_dir>/<benchmark_name>_progress.jsonl``.

    Args:
        benchmark_name: Benchmark to run; resolved through ``_get_adapter``
            and also used to name the progress file and the ingestion cache
            sub-directory.
        conditions: Condition names to evaluate for each question.
        subset: When truthy, evaluate at most this many questions.
        qa_model: Model name passed to ``answer_question``.
        judge_model: Model name passed to ``judge_qa``.
        output_dir: Directory for the progress file (created if missing).
        cache_dir: Directory passed to the adapter's ``load_dataset``;
            ingestion uses ``cache_dir / benchmark_name``.
        resume: Reuse entries already present in the progress file, matched
            by question id, instead of recomputing them.
        skip_llm: Record a "(skipped)" placeholder for every condition
            instead of generating and judging answers.
        stratified: With ``subset``, take ``subset // <number of categories>``
            questions (at least one) from each category, visiting categories
            in sorted order, then truncate to ``subset``.

    Returns:
        ``(summary, details)`` where ``details`` is the list of per-question
        entries and ``summary`` is ``aggregate_results(details)``.

    Raises:
        ValueError: If ``benchmark_name`` is not a known benchmark.
    """

    adapter = _get_adapter(benchmark_name)

    # Load dataset
    print(f"\n{'='*60}")
    print(f" Benchmark: {benchmark_name}")
    print(f" Conditions: {', '.join(conditions)}")
    print(f" QA model: {qa_model}")
    print(f" Judge model: {judge_model}")
    print(f"{'='*60}\n")

    instances = adapter.load_dataset(str(cache_dir))

    # Flatten questions
    all_questions = []
    for inst in instances:
        for q in inst.questions:
            all_questions.append((inst, q))

    if subset:
        if stratified:
            # Take equal samples from each question category
            from collections import defaultdict
            by_cat: dict[str, list] = defaultdict(list)
            for pair in all_questions:
                by_cat[pair[1].category].append(pair)
            # Guard the divisor: with no questions at all there are no
            # categories, and the unguarded division raised ZeroDivisionError.
            per_cat = max(1, subset // max(1, len(by_cat)))
            sampled = []
            for cat in sorted(by_cat):
                sampled.extend(by_cat[cat][:per_cat])
            all_questions = sampled[:subset]
        else:
            all_questions = all_questions[:subset]

    total = len(all_questions)
    print(f"[runner] evaluating {total} questions\n")

    # Resume support
    resume_path = output_dir / f"{benchmark_name}_progress.jsonl"
    completed = _load_resume(resume_path) if resume else {}
    output_dir.mkdir(parents=True, exist_ok=True)

    # Track ingested instances
    # instance id -> value returned by ingest_instance (falsy on failure), or
    # None when no requested condition needs ingestion.
    instance_dbs: dict[str, Path | None] = {}
    # instance id -> knowledge document text (or a placeholder string).
    instance_docs: dict[str, str] = {}

    details: list[dict] = []

    for idx, (inst, question) in enumerate(all_questions):
        qid = question.id

        # Skip if already done
        if qid in completed:
            details.append(completed[qid])
            continue

        print(f"[{idx+1}/{total}] {qid} [{question.category}]")

        # Ingest instance if not done yet
        if inst.id not in instance_dbs:
            if "sinain-memory" in conditions or "knowledge-doc" in conditions:
                print(f" ingesting {inst.id} ({len(inst.sessions)} sessions)...")
                instance_dbs[inst.id] = ingest_instance(inst, cache_dir / benchmark_name)
                db = instance_dbs[inst.id]
                if db:
                    instance_docs[inst.id] = get_knowledge_doc(db)
                    print(f" -> ingested ({db.stat().st_size} bytes)")
                else:
                    instance_docs[inst.id] = "(ingestion failed)"
                    print(f" -> ingestion failed")
            else:
                instance_dbs[inst.id] = None
                instance_docs[inst.id] = ""

        db_path = instance_dbs.get(inst.id)
        knowledge_doc = instance_docs.get(inst.id, "")
        full_context = adapter.format_full_context(inst)

        # Retrieval metrics (content-based: do retrieved facts contain the answer?)
        retrieval = {}
        if db_path and "sinain-memory" in conditions:
            retrieved_facts = _get_retrieved_facts(str(db_path), question.text)
            retrieval = compute_content_recall(
                retrieved_facts, question.gold_answer,
            )

        # Generate answers per condition
        answers = {}
        for cond in conditions:
            if skip_llm:
                answers[cond] = {"text": "(skipped)", "score": None, "f1": None}
                continue

            # Skip sinain-memory/knowledge-doc if ingestion failed
            if cond in ("sinain-memory", "knowledge-doc") and not db_path:
                # NOTE(review): a failed ingestion is recorded as score 1 /
                # f1 0.0 rather than excluded — presumably the bottom of the
                # judge's scale; confirm against qa_judge.
                answers[cond] = {"text": "(ingestion failed)", "score": 1, "f1": 0.0, "reasoning": "ingestion failed"}
                print(f" [{cond}] skipped (ingestion failed)")
                continue

            print(f" [{cond}] generating answer...")
            answer_text = answer_question(
                question, cond,
                db_path=str(db_path) if db_path else None,
                full_context=full_context,
                knowledge_doc=knowledge_doc,
                model=qa_model,
            )

            # Score
            f1 = token_f1(answer_text, question.gold_answer)

            judge_result = judge_qa(
                question.text, question.gold_answer, answer_text,
                condition=cond, model=judge_model,
            )
            # A falsy judge result leaves score and reasoning as None.
            score = judge_result["score"] if judge_result else None
            reasoning = judge_result["reasoning"] if judge_result else None

            answers[cond] = {
                # Only the first 500 characters of the answer are stored.
                "text": answer_text[:500],
                "score": score,
                "f1": round(f1, 4),
                "reasoning": reasoning,
            }
            print(f" score={score}/5 f1={f1:.2f}")

        entry = {
            "id": qid,
            "question": question.text,
            "gold_answer": question.gold_answer,
            "category": question.category,
            "retrieval": retrieval,
            "answers": answers,
        }
        details.append(entry)

        # Save progress incrementally
        # The file is always opened in append mode, whether or not `resume`
        # was requested, so entries from earlier runs stay in the file.
        with open(resume_path, "a") as f:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    # Aggregate
    summary = aggregate_results(details)
    return summary, details
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def main() -> None:
    """Command-line entry point for the benchmark runner.

    Parses the CLI options, runs each requested benchmark through
    ``run_benchmark``, writes ``<bench>_results.json`` and/or
    ``<bench>_results.md`` into the output directory depending on
    ``--format``, and prints a short summary to stdout.

    Empty items in the comma-separated options (for example a trailing
    comma) are ignored.
    """
    parser = argparse.ArgumentParser(description="Sinain Knowledge Graph Benchmark Runner")
    # NOTE(review): the help text mentions "locomo", but _get_adapter in this
    # file only accepts "longmemeval" — confirm whether locomo is supported.
    parser.add_argument("--benchmarks", default="longmemeval",
                        help="Comma-separated benchmark names (longmemeval, locomo)")
    parser.add_argument("--conditions", default="sinain-memory,full-context,knowledge-doc",
                        help="Comma-separated conditions to evaluate")
    parser.add_argument("--subset", type=int, default=None,
                        help="Run only first N questions (for dev iteration)")
    parser.add_argument("--qa-model", default=QA_MODEL, help="Model for QA generation")
    parser.add_argument("--judge-model", default=JUDGE_MODEL, help="Model for QA judging")
    parser.add_argument("--output-dir", type=Path, default=RESULTS_DIR)
    parser.add_argument("--cache-dir", type=Path, default=DATA_DIR)
    parser.add_argument("--format", default="json,markdown",
                        help="Output formats (json, markdown)")
    parser.add_argument("--resume", action="store_true", help="Resume from partial results")
    parser.add_argument("--skip-llm", action="store_true",
                        help="Skip LLM calls (retrieval + mechanical metrics only)")
    parser.add_argument("--stratified", action="store_true",
                        help="Sample equally from each question category (with --subset)")
    args = parser.parse_args()

    # Drop empty items so "a,b," or "a,,b" do not yield a "" condition/format.
    conditions = [c.strip() for c in args.conditions.split(",") if c.strip()]
    formats = [f.strip() for f in args.format.split(",") if f.strip()]

    for bench_name in args.benchmarks.split(","):
        bench_name = bench_name.strip()
        if not bench_name:
            # An empty name would only fail later with "Unknown benchmark".
            continue
        summary, details = run_benchmark(
            bench_name, conditions,
            subset=args.subset,
            qa_model=args.qa_model,
            judge_model=args.judge_model,
            output_dir=args.output_dir,
            cache_dir=args.cache_dir,
            resume=args.resume,
            skip_llm=args.skip_llm,
            stratified=args.stratified,
        )

        # Write outputs
        args.output_dir.mkdir(parents=True, exist_ok=True)

        if "json" in formats:
            json_path = args.output_dir / f"{bench_name}_results.json"
            json_path.write_text(generate_json(bench_name, summary, details))
            print(f"\n[output] JSON: {json_path}")

        if "markdown" in formats:
            md_path = args.output_dir / f"{bench_name}_results.md"
            md_path.write_text(generate_markdown(bench_name, summary, details))
            print(f"[output] Markdown: {md_path}")

        # Print summary
        print(f"\n{'='*60}")
        print(f" {bench_name} — Summary")
        print(f"{'='*60}")
        ipr = summary.get("ipr")
        # Compare against None so an IPR of exactly 0 is still reported.
        if ipr is not None:
            print(f" IPR: {ipr:.1%}")
        for cond, data in summary.get("conditions", {}).items():
            print(f" {cond}: {data['mean_score']:.2f}/5 (n={data['n']})")
        for k, v in summary.get("retrieval", {}).items():
            print(f" {k}: {v:.1%}")
        print()
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
{
|
|
2
|
+
"models": {
|
|
3
|
+
"fast": "google/gemini-2.5-flash-lite",
|
|
4
|
+
"smart": "google/gemini-2.5-flash"
|
|
5
|
+
},
|
|
6
|
+
"scripts": {
|
|
7
|
+
"session_distiller": { "model": "fast", "maxTokens": 2000 },
|
|
8
|
+
"knowledge_integrator": { "model": "fast", "maxTokens": 2000 },
|
|
9
|
+
"meeting_benchmark": { "model": "smart", "maxTokens": 200 }
|
|
10
|
+
}
|
|
11
|
+
}
|