@geravant/sinain 1.10.1 → 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/package.json +1 -1
  2. package/sinain-agent/CLAUDE.md +1 -1
  3. package/sinain-agent/run.sh +66 -7
  4. package/sinain-core/src/agent/analyzer.ts +4 -27
  5. package/sinain-core/src/agent/loop.ts +10 -40
  6. package/sinain-core/src/agent/situation-writer.ts +0 -16
  7. package/sinain-core/src/config.ts +1 -9
  8. package/sinain-core/src/escalation/escalator.ts +44 -16
  9. package/sinain-core/src/escalation/message-builder.ts +45 -118
  10. package/sinain-core/src/index.ts +20 -36
  11. package/sinain-core/src/learning/local-curation.ts +4 -4
  12. package/sinain-core/src/overlay/commands.ts +46 -13
  13. package/sinain-core/src/overlay/ws-handler.ts +13 -1
  14. package/sinain-core/src/server.ts +121 -0
  15. package/sinain-core/src/types.ts +25 -28
  16. package/sinain-mcp-server/index.ts +28 -0
  17. package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
  18. package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
  19. package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
  20. package/sinain-memory/eval/assertions.py +0 -21
  21. package/sinain-memory/eval/benchmarks/__init__.py +0 -0
  22. package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
  23. package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
  24. package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
  25. package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
  26. package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
  27. package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
  28. package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
  29. package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
  30. package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
  31. package/sinain-memory/eval/benchmarks/base_adapter.py +43 -0
  32. package/sinain-memory/eval/benchmarks/config.py +23 -0
  33. package/sinain-memory/eval/benchmarks/evaluate.py +146 -0
  34. package/sinain-memory/eval/benchmarks/ingest.py +152 -0
  35. package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
  36. package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
  37. package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
  38. package/sinain-memory/eval/benchmarks/judges/qa_judge.py +81 -0
  39. package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +177 -0
  40. package/sinain-memory/eval/benchmarks/query.py +172 -0
  41. package/sinain-memory/eval/benchmarks/report.py +87 -0
  42. package/sinain-memory/eval/benchmarks/runner.py +276 -0
  43. package/sinain-memory/koog-config.json +11 -0
  44. package/sinain-core/src/agent/traits.ts +0 -520
@@ -0,0 +1,276 @@
1
+ #!/usr/bin/env python3
2
+ """Benchmark runner — evaluates sinain's knowledge graph against published benchmarks.
3
+
4
+ Usage:
5
+ python3 eval/benchmarks/runner.py --benchmarks longmemeval --subset 5
6
+ python3 eval/benchmarks/runner.py --benchmarks longmemeval --conditions sinain-memory,full-context
7
+ python3 eval/benchmarks/runner.py --benchmarks longmemeval --format markdown --resume
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import json
14
+ import sys
15
+ from pathlib import Path
16
+
17
+ # Add sinain-memory to path
18
+ _koog_dir = str(Path(__file__).resolve().parent.parent.parent)
19
+ if _koog_dir not in sys.path:
20
+ sys.path.insert(0, _koog_dir)
21
+
22
+ from eval.benchmarks.config import DATA_DIR, RESULTS_DIR, QA_MODEL, JUDGE_MODEL
23
+ from eval.benchmarks.base_adapter import BenchmarkAdapter, BenchmarkInstance
24
+ from eval.benchmarks.longmemeval_adapter import LongMemEvalAdapter
25
+ from eval.benchmarks.ingest import ingest_instance, get_knowledge_doc
26
+ from eval.benchmarks.query import answer_question, _get_retrieved_facts, compute_content_recall
27
+ from eval.benchmarks.evaluate import (
28
+ token_f1, aggregate_results,
29
+ )
30
+ from eval.benchmarks.judges.qa_judge import judge_qa
31
+ from eval.benchmarks.report import generate_markdown, generate_json
32
+
33
+
34
def _get_adapter(name: str) -> BenchmarkAdapter:
    """Return the adapter instance for the benchmark called ``name``.

    ``"longmemeval"`` is the only recognised name; anything else raises
    ``ValueError`` listing the available benchmarks.
    """
    # Guard clause first: reject unknown names before constructing anything.
    if name != "longmemeval":
        raise ValueError(f"Unknown benchmark: {name}. Available: longmemeval")
    return LongMemEvalAdapter()
38
+
39
+
40
def _load_resume(resume_path: Path) -> dict[str, dict]:
    """Load previously computed results for resume support.

    The progress file is JSON Lines: one result entry per line, each entry
    carrying an ``"id"`` key.  If an id occurs more than once, the last
    occurrence wins.

    Args:
        resume_path: Location of the progress file.  A missing file yields
            an empty mapping.

    Returns:
        Mapping from entry id to the decoded entry.

    Raises:
        KeyError: If a well-formed JSON line has no ``"id"`` key.
    """
    results: dict[str, dict] = {}
    if not resume_path.exists():
        return results

    # Entries are written with ensure_ascii=False, so decode as UTF-8
    # explicitly rather than relying on the platform default encoding.
    with resume_path.open(encoding="utf-8") as fh:
        for lineno, raw in enumerate(fh, start=1):
            line = raw.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                # An interrupted run can leave a truncated record behind.
                # Skip it so that question is simply evaluated again instead
                # of making the whole resume fail.
                print(
                    f"[runner] ignoring malformed progress line {lineno} in {resume_path}",
                    file=sys.stderr,
                )
                continue
            results[entry["id"]] = entry
    return results
49
+
50
+
51
def _select_questions(all_questions: list, subset: int | None, stratified: bool) -> list:
    """Trim ``all_questions`` to at most ``subset`` (instance, question) pairs."""
    if not subset:
        return all_questions
    if not stratified:
        return all_questions[:subset]

    from collections import defaultdict

    # Take equal samples from each question category
    by_cat: dict[str, list] = defaultdict(list)
    for pair in all_questions:
        by_cat[pair[1].category].append(pair)
    if not by_cat:
        # Empty dataset: nothing to sample, and dividing by len(by_cat)
        # below would raise ZeroDivisionError.
        return []

    per_cat = max(1, subset // len(by_cat))
    sampled: list = []
    for cat in sorted(by_cat):
        sampled.extend(by_cat[cat][:per_cat])
    # per_cat is floored at 1, so the sample can overshoot a small subset.
    return sampled[:subset]


def run_benchmark(
    benchmark_name: str,
    conditions: list[str],
    *,
    subset: int | None = None,
    qa_model: str = QA_MODEL,
    judge_model: str = JUDGE_MODEL,
    output_dir: Path = RESULTS_DIR,
    cache_dir: Path = DATA_DIR,
    resume: bool = False,
    skip_llm: bool = False,
    stratified: bool = False,
) -> tuple[dict, list[dict]]:
    """Run a benchmark end-to-end. Returns (summary, details).

    Args:
        benchmark_name: Name passed to ``_get_adapter``.
        conditions: Condition names to answer each question under.
            ``"sinain-memory"`` and ``"knowledge-doc"`` trigger ingestion.
        subset: If truthy, evaluate at most this many questions.
        qa_model: Model passed to ``answer_question``.
        judge_model: Model passed to ``judge_qa``.
        output_dir: Directory holding ``<benchmark>_progress.jsonl``.
        cache_dir: Directory passed to the adapter's ``load_dataset``;
            ingestion uses ``cache_dir / benchmark_name``.
        resume: Reuse entries already present in the progress file.
        skip_llm: Record ``"(skipped)"`` answers instead of calling the QA
            and judge models.
        stratified: With ``subset``, sample equally from each category.

    Returns:
        ``(summary, details)`` where ``details`` holds one entry per
        evaluated question and ``summary`` is ``aggregate_results(details)``.
    """

    adapter = _get_adapter(benchmark_name)

    # Load dataset
    print(f"\n{'='*60}")
    print(f" Benchmark: {benchmark_name}")
    print(f" Conditions: {', '.join(conditions)}")
    print(f" QA model: {qa_model}")
    print(f" Judge model: {judge_model}")
    print(f"{'='*60}\n")

    instances = adapter.load_dataset(str(cache_dir))

    # Flatten questions into (instance, question) pairs, then apply --subset.
    all_questions = [(inst, q) for inst in instances for q in inst.questions]
    all_questions = _select_questions(all_questions, subset, stratified)

    total = len(all_questions)
    print(f"[runner] evaluating {total} questions\n")

    # Resume support
    resume_path = output_dir / f"{benchmark_name}_progress.jsonl"
    completed = _load_resume(resume_path) if resume else {}
    output_dir.mkdir(parents=True, exist_ok=True)

    # Track ingested instances
    instance_dbs: dict[str, Path | None] = {}
    instance_docs: dict[str, str] = {}

    details: list[dict] = []

    for idx, (inst, question) in enumerate(all_questions):
        qid = question.id

        # Skip if already done
        if qid in completed:
            details.append(completed[qid])
            continue

        print(f"[{idx+1}/{total}] {qid} [{question.category}]")

        # Ingest instance if not done yet
        if inst.id not in instance_dbs:
            if "sinain-memory" in conditions or "knowledge-doc" in conditions:
                print(f" ingesting {inst.id} ({len(inst.sessions)} sessions)...")
                instance_dbs[inst.id] = ingest_instance(inst, cache_dir / benchmark_name)
                db = instance_dbs[inst.id]
                if db:
                    instance_docs[inst.id] = get_knowledge_doc(db)
                    print(f" -> ingested ({db.stat().st_size} bytes)")
                else:
                    instance_docs[inst.id] = "(ingestion failed)"
                    print(" -> ingestion failed")
            else:
                # No condition needs the graph; record that so we do not
                # re-check this instance for every question.
                instance_dbs[inst.id] = None
                instance_docs[inst.id] = ""

        db_path = instance_dbs.get(inst.id)
        knowledge_doc = instance_docs.get(inst.id, "")
        full_context = adapter.format_full_context(inst)

        # Retrieval metrics (content-based: do retrieved facts contain the answer?)
        retrieval = {}
        if db_path and "sinain-memory" in conditions:
            retrieved_facts = _get_retrieved_facts(str(db_path), question.text)
            retrieval = compute_content_recall(
                retrieved_facts, question.gold_answer,
            )

        # Generate answers per condition
        answers = {}
        for cond in conditions:
            if skip_llm:
                answers[cond] = {"text": "(skipped)", "score": None, "f1": None}
                continue

            # Skip sinain-memory/knowledge-doc if ingestion failed
            if cond in ("sinain-memory", "knowledge-doc") and not db_path:
                answers[cond] = {"text": "(ingestion failed)", "score": 1, "f1": 0.0, "reasoning": "ingestion failed"}
                print(f" [{cond}] skipped (ingestion failed)")
                continue

            print(f" [{cond}] generating answer...")
            answer_text = answer_question(
                question, cond,
                db_path=str(db_path) if db_path else None,
                full_context=full_context,
                knowledge_doc=knowledge_doc,
                model=qa_model,
            )

            # Score
            f1 = token_f1(answer_text, question.gold_answer)

            judge_result = judge_qa(
                question.text, question.gold_answer, answer_text,
                condition=cond, model=judge_model,
            )
            score = judge_result["score"] if judge_result else None
            reasoning = judge_result["reasoning"] if judge_result else None

            answers[cond] = {
                "text": answer_text[:500],
                "score": score,
                "f1": round(f1, 4),
                "reasoning": reasoning,
            }
            print(f" score={score}/5 f1={f1:.2f}")

        entry = {
            "id": qid,
            "question": question.text,
            "gold_answer": question.gold_answer,
            "category": question.category,
            "retrieval": retrieval,
            "answers": answers,
        }
        details.append(entry)

        # Save progress incrementally. ensure_ascii=False emits raw non-ASCII
        # text, so pin UTF-8 instead of depending on the locale encoding.
        with open(resume_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    # Aggregate
    summary = aggregate_results(details)
    return summary, details
208
+
209
+
210
def main() -> None:
    """Parse CLI arguments, run each requested benchmark and write reports.

    For every benchmark named in ``--benchmarks`` this calls
    ``run_benchmark``, writes ``<name>_results.json`` and/or
    ``<name>_results.md`` into ``--output-dir`` according to ``--format``,
    and prints a short summary.
    """
    parser = argparse.ArgumentParser(description="Sinain Knowledge Graph Benchmark Runner")
    # The help lists only names that _get_adapter actually accepts.
    parser.add_argument("--benchmarks", default="longmemeval",
                        help="Comma-separated benchmark names (available: longmemeval)")
    parser.add_argument("--conditions", default="sinain-memory,full-context,knowledge-doc",
                        help="Comma-separated conditions to evaluate")
    parser.add_argument("--subset", type=int, default=None,
                        help="Run only first N questions (for dev iteration)")
    parser.add_argument("--qa-model", default=QA_MODEL, help="Model for QA generation")
    parser.add_argument("--judge-model", default=JUDGE_MODEL, help="Model for QA judging")
    parser.add_argument("--output-dir", type=Path, default=RESULTS_DIR)
    parser.add_argument("--cache-dir", type=Path, default=DATA_DIR)
    parser.add_argument("--format", default="json,markdown",
                        help="Output formats (json, markdown)")
    parser.add_argument("--resume", action="store_true", help="Resume from partial results")
    parser.add_argument("--skip-llm", action="store_true",
                        help="Skip LLM calls (retrieval + mechanical metrics only)")
    parser.add_argument("--stratified", action="store_true",
                        help="Sample equally from each question category (with --subset)")
    args = parser.parse_args()

    # Drop empty items so a trailing or doubled comma ("a,,b", "a,") does not
    # turn into an empty condition / format / benchmark name.
    conditions = [c.strip() for c in args.conditions.split(",") if c.strip()]
    formats = [f.strip() for f in args.format.split(",") if f.strip()]
    bench_names = [b.strip() for b in args.benchmarks.split(",") if b.strip()]

    for bench_name in bench_names:
        summary, details = run_benchmark(
            bench_name, conditions,
            subset=args.subset,
            qa_model=args.qa_model,
            judge_model=args.judge_model,
            output_dir=args.output_dir,
            cache_dir=args.cache_dir,
            resume=args.resume,
            skip_llm=args.skip_llm,
            stratified=args.stratified,
        )

        # Write outputs
        args.output_dir.mkdir(parents=True, exist_ok=True)

        # Reports can contain non-ASCII question/answer text; write UTF-8
        # explicitly rather than using the locale's default encoding.
        if "json" in formats:
            json_path = args.output_dir / f"{bench_name}_results.json"
            json_path.write_text(generate_json(bench_name, summary, details), encoding="utf-8")
            print(f"\n[output] JSON: {json_path}")

        if "markdown" in formats:
            md_path = args.output_dir / f"{bench_name}_results.md"
            md_path.write_text(generate_markdown(bench_name, summary, details), encoding="utf-8")
            print(f"[output] Markdown: {md_path}")

        # Print summary
        print(f"\n{'='*60}")
        print(f" {bench_name} — Summary")
        print(f"{'='*60}")
        ipr = summary.get("ipr")
        # Compare against None so a legitimate 0% IPR is still reported.
        if ipr is not None:
            print(f" IPR: {ipr:.1%}")
        for cond, data in summary.get("conditions", {}).items():
            mean_score = data["mean_score"]
            if mean_score is None:
                # NOTE(review): presumably reachable with --skip-llm, where
                # every score is None; confirm against aggregate_results.
                print(f" {cond}: n/a (n={data['n']})")
            else:
                print(f" {cond}: {mean_score:.2f}/5 (n={data['n']})")
        for k, v in summary.get("retrieval", {}).items():
            print(f" {k}: {v:.1%}")
        print()


if __name__ == "__main__":
    main()
@@ -0,0 +1,11 @@
1
+ {
2
+ "models": {
3
+ "fast": "google/gemini-2.5-flash-lite",
4
+ "smart": "google/gemini-2.5-flash"
5
+ },
6
+ "scripts": {
7
+ "session_distiller": { "model": "fast", "maxTokens": 2000 },
8
+ "knowledge_integrator": { "model": "fast", "maxTokens": 2000 },
9
+ "meeting_benchmark": { "model": "smart", "maxTokens": 200 }
10
+ }
11
+ }