@geravant/sinain 1.11.0 → 1.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/package.json +1 -1
  2. package/sinain-core/package-lock.json +963 -0
  3. package/sinain-core/package.json +1 -0
  4. package/sinain-core/src/buffers/feed-buffer.ts +32 -0
  5. package/sinain-core/src/embedding/service.ts +66 -0
  6. package/sinain-core/src/escalation/escalator.ts +1 -0
  7. package/sinain-core/src/escalation/message-builder.ts +45 -118
  8. package/sinain-core/src/index.ts +19 -2
  9. package/sinain-core/src/learning/local-curation.ts +137 -7
  10. package/sinain-core/src/overlay/commands.ts +16 -3
  11. package/sinain-core/src/overlay/ws-handler.ts +4 -1
  12. package/sinain-core/src/server.ts +31 -0
  13. package/sinain-core/src/types.ts +3 -0
  14. package/sinain-memory/README.md +105 -0
  15. package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
  16. package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
  17. package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
  18. package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
  19. package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
  20. package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
  21. package/sinain-memory/embed_client.py +117 -0
  22. package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
  23. package/sinain-memory/eval/benchmarks/__init__.py +0 -0
  24. package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
  25. package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
  26. package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
  27. package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
  28. package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
  29. package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
  30. package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
  31. package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
  32. package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
  33. package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
  34. package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
  35. package/sinain-memory/eval/benchmarks/base_adapter.py +43 -0
  36. package/sinain-memory/eval/benchmarks/config.py +23 -0
  37. package/sinain-memory/eval/benchmarks/evaluate.py +146 -0
  38. package/sinain-memory/eval/benchmarks/ingest.py +152 -0
  39. package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
  40. package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
  41. package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
  42. package/sinain-memory/eval/benchmarks/judges/qa_judge.py +81 -0
  43. package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +177 -0
  44. package/sinain-memory/eval/benchmarks/meeting_adapter.py +81 -0
  45. package/sinain-memory/eval/benchmarks/meeting_runner.py +230 -0
  46. package/sinain-memory/eval/benchmarks/query.py +193 -0
  47. package/sinain-memory/eval/benchmarks/report.py +87 -0
  48. package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +318 -0
  49. package/sinain-memory/eval/benchmarks/runner.py +283 -0
  50. package/sinain-memory/graph_query.py +257 -15
  51. package/sinain-memory/knowledge_integrator.py +365 -72
  52. package/sinain-memory/koog-config.json +11 -0
  53. package/sinain-memory/memory-config.json +1 -1
  54. package/sinain-memory/session_distiller.py +43 -19
  55. package/sinain-memory/triplestore.py +60 -0
@@ -0,0 +1,230 @@
1
+ #!/usr/bin/env python3
2
+ """Meeting Memory Benchmark runner — evaluates a captured knowledge-graph.db.
3
+
4
+ Standalone script: does NOT modify or import runner.py.
5
+ Reuses shared infrastructure: query, judge, evaluate, report.
6
+
7
+ Usage:
8
+ python3 eval/benchmarks/meeting_runner.py \
9
+ --db /tmp/sinain-bench-XXXX/knowledge-graph.db \
10
+ --conditions sinain-memory,full-context
11
+
12
+ # Quick test (3 questions):
13
+ python3 eval/benchmarks/meeting_runner.py \
14
+ --db /tmp/sinain-bench-XXXX/knowledge-graph.db \
15
+ --subset 3
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import sys
23
+ from pathlib import Path
24
+
25
+ # Add sinain-memory to path
26
+ _koog_dir = str(Path(__file__).resolve().parent.parent.parent)
27
+ if _koog_dir not in sys.path:
28
+ sys.path.insert(0, _koog_dir)
29
+
30
+ from eval.benchmarks.config import DATA_DIR, RESULTS_DIR, QA_MODEL, JUDGE_MODEL
31
+ from eval.benchmarks.meeting_adapter import MeetingMemoryAdapter
32
+ from eval.benchmarks.query import answer_question, _get_retrieved_facts, compute_content_recall
33
+ from eval.benchmarks.evaluate import token_f1, aggregate_results
34
+ from eval.benchmarks.judges.qa_judge import judge_qa
35
+ from eval.benchmarks.report import generate_markdown, generate_json
36
+
37
+
38
def run_meeting_benchmark(
    db_path: str,
    conditions: list[str],
    *,
    subset: int | None = None,
    meeting_filter: str | None = None,
    qa_model: str = QA_MODEL,
    judge_model: str = JUDGE_MODEL,
    output_dir: Path = RESULTS_DIR,
    data_dir: Path = DATA_DIR,
    resume: bool = False,
) -> tuple[dict, list[dict]]:
    """Run meeting benchmark against a captured knowledge-graph.db.

    Args:
        db_path: Path to the captured knowledge-graph database. Must exist
            when "sinain-memory" is one of the conditions.
        conditions: Condition names to evaluate for every question.
        subset: If truthy, evaluate only the first N flattened questions.
        meeting_filter: If set, keep only instances whose ``id`` contains
            this substring.
        qa_model: Model passed to ``answer_question``.
        judge_model: Model passed to ``judge_qa``.
        output_dir: Directory that receives ``meeting_progress.jsonl``.
        data_dir: Directory handed to ``MeetingMemoryAdapter.load_dataset``.
        resume: If true, entries already present in the progress file are
            reused instead of being re-evaluated.

    Returns:
        ``(summary, details)`` where ``details`` holds one dict per question
        and ``summary`` is ``aggregate_results(details)``. When no instances
        are loaded the result is ``({"error": "no data"}, [])``.

    Note:
        Calls ``sys.exit(1)`` when "sinain-memory" is requested but the
        database path is missing or does not exist.
    """
    adapter = MeetingMemoryAdapter()
    instances = adapter.load_dataset(str(data_dir))

    if meeting_filter:
        instances = [i for i in instances if meeting_filter in i.id]
        print(f"[meeting] filtered to {len(instances)} meeting(s) matching '{meeting_filter}'")

    if not instances:
        print("[meeting] no meeting data found — check data/meeting/ directory")
        return {"error": "no data"}, []

    # Validate DB exists for sinain-memory condition
    use_memory = "sinain-memory" in conditions
    if use_memory and (not db_path or not Path(db_path).exists()):
        print(f"[meeting] ERROR: --db path does not exist: {db_path}")
        print(" Run the capture pipeline first (see plan Part 1)")
        sys.exit(1)

    print(f"\n{'='*60}")
    print(f" Meeting Memory Benchmark")
    print(f" Conditions: {', '.join(conditions)}")
    print(f" DB: {db_path or '(none)'}")
    print(f" QA model: {qa_model}")
    print(f" Judge model: {judge_model}")
    print(f"{'='*60}\n")

    # Flatten questions, keeping each one paired with its meeting instance
    all_questions = [(inst, q) for inst in instances for q in inst.questions]

    if subset:
        all_questions = all_questions[:subset]

    total = len(all_questions)
    print(f"[meeting] evaluating {total} questions\n")

    # Resume support. The progress file is written with ensure_ascii=False,
    # so it must be read and written as UTF-8 regardless of the locale.
    resume_path = output_dir / "meeting_progress.jsonl"
    completed: dict[str, dict] = {}
    if resume and resume_path.exists():
        # Split on "\n" only: str.splitlines() would also break on characters
        # such as U+2028 that json.dumps(ensure_ascii=False) leaves unescaped.
        progress_lines = resume_path.read_text(encoding="utf-8").split("\n")
        for line_no, line in enumerate(progress_lines, 1):
            if not line.strip():
                continue
            try:
                entry = json.loads(line)
                completed[entry["id"]] = entry
            except (json.JSONDecodeError, KeyError, TypeError):
                # An interrupted run can leave a truncated last line; that
                # question is simply evaluated again.
                print(f"[meeting] skipping malformed progress line {line_no} in {resume_path}")

    output_dir.mkdir(parents=True, exist_ok=True)
    details: list[dict] = []

    for idx, (inst, question) in enumerate(all_questions):
        qid = question.id

        if qid in completed:
            details.append(completed[qid])
            continue

        print(f"[{idx+1}/{total}] {qid} [{question.category}]")
        print(f" Q: {question.text[:80]}...")

        full_context = adapter.format_full_context(inst)

        # Retrieval metrics
        retrieval = {}
        if db_path and use_memory:
            retrieved_facts = _get_retrieved_facts(db_path, question.text)
            retrieval = compute_content_recall(retrieved_facts, question.gold_answer)

        # Generate answers per condition
        answers = {}
        for cond in conditions:
            # Defensive: the validation above already exits in this case.
            if cond == "sinain-memory" and not db_path:
                answers[cond] = {"text": "(no DB)", "score": 1, "f1": 0.0}
                continue

            print(f" [{cond}] generating answer...")
            answer_text = answer_question(
                question, cond,
                db_path=db_path,
                full_context=full_context,
                model=qa_model,
            )

            f1 = token_f1(answer_text, question.gold_answer)

            judge_result = judge_qa(
                question.text, question.gold_answer, answer_text,
                condition=cond, model=judge_model,
            )
            score = judge_result["score"] if judge_result else None
            reasoning = judge_result["reasoning"] if judge_result else None

            answers[cond] = {
                "text": answer_text[:500],
                "score": score,
                "f1": round(f1, 4),
                "reasoning": reasoning,
            }
            print(f" score={score}/5 f1={f1:.2f}")

        entry = {
            "id": qid,
            "question": question.text,
            "gold_answer": question.gold_answer,
            "category": question.category,
            "retrieval": retrieval,
            "answers": answers,
        }
        details.append(entry)

        # Save progress after every question so an interrupted run can resume
        with open(resume_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    summary = aggregate_results(details)
    return summary, details
168
+
169
+
170
def main() -> None:
    """CLI entry point: parse arguments, run the benchmark, write reports.

    Writes ``meeting_results.json`` and/or ``meeting_results.md`` into
    ``--output-dir`` depending on ``--format``, then prints a short summary.
    """
    parser = argparse.ArgumentParser(description="Meeting Memory Benchmark")
    parser.add_argument("--db", required=False, default=None,
                        help="Path to knowledge-graph.db from capture run")
    parser.add_argument("--conditions", default="sinain-memory,full-context",
                        help="Comma-separated conditions (sinain-memory, full-context)")
    parser.add_argument("--subset", type=int, default=None,
                        help="Run only first N questions")
    parser.add_argument("--meeting", default=None,
                        help="Filter to specific meeting by stem (e.g. al-futaim-prep-5min)")
    parser.add_argument("--qa-model", default=QA_MODEL)
    parser.add_argument("--judge-model", default=JUDGE_MODEL)
    parser.add_argument("--output-dir", type=Path, default=RESULTS_DIR)
    parser.add_argument("--data-dir", type=Path, default=DATA_DIR)
    parser.add_argument("--format", default="json,markdown")
    parser.add_argument("--resume", action="store_true")
    args = parser.parse_args()

    # Drop empty tokens so a trailing or doubled comma does not produce an
    # empty condition name that would only fail later, mid-run.
    conditions = [c.strip() for c in args.conditions.split(",") if c.strip()]
    formats = [f.strip() for f in args.format.split(",") if f.strip()]

    summary, details = run_meeting_benchmark(
        args.db, conditions,
        subset=args.subset,
        meeting_filter=args.meeting,
        qa_model=args.qa_model,
        judge_model=args.judge_model,
        output_dir=args.output_dir,
        data_dir=args.data_dir,
        resume=args.resume,
    )

    # Write outputs. The reports may contain non-ASCII text, so the encoding
    # is pinned to UTF-8 rather than left to the platform default.
    args.output_dir.mkdir(parents=True, exist_ok=True)

    if "json" in formats:
        json_path = args.output_dir / "meeting_results.json"
        json_path.write_text(generate_json("meeting", summary, details), encoding="utf-8")
        print(f"\n[output] JSON: {json_path}")

    if "markdown" in formats:
        md_path = args.output_dir / "meeting_results.md"
        md_path.write_text(generate_markdown("meeting", summary, details), encoding="utf-8")
        print(f"[output] Markdown: {md_path}")

    # Print summary
    print(f"\n{'='*60}")
    print(f" Meeting Memory Benchmark — Summary")
    print(f"{'='*60}")
    ipr = summary.get("ipr")
    # An IPR of exactly 0.0 is a real result and must still be shown.
    if ipr is not None:
        print(f" IPR: {ipr:.1%}")
    for cond, data in summary.get("conditions", {}).items():
        print(f" {cond}: {data['mean_score']:.2f}/5 (n={data['n']})")
    for k, v in summary.get("retrieval", {}).items():
        print(f" {k}: {v:.1%}")
    print()


if __name__ == "__main__":
    main()
@@ -0,0 +1,193 @@
1
+ """Query pipeline — benchmark questions → LLM answers under 3 conditions.
2
+
3
+ Condition A (sinain-memory): answer from knowledge graph facts
4
+ Condition B (full-context): answer from full conversation history
5
+ Condition C (knowledge-doc): answer from portable knowledge document
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import sys
11
+ from pathlib import Path
12
+
13
+ # Add sinain-memory to path
14
+ _koog_dir = str(Path(__file__).resolve().parent.parent.parent)
15
+ if _koog_dir not in sys.path:
16
+ sys.path.insert(0, _koog_dir)
17
+
18
+ from common import call_llm # noqa: E402
19
+
20
+ from .base_adapter import BenchmarkQuestion
21
+ from .config import QA_MODEL, MAX_FACTS_PER_QUERY
22
+
23
+
24
+ def _extract_keywords(query: str) -> list[str]:
25
+ """Extract search keywords (reuses logic from retrieval_evaluator)."""
26
+ import re
27
+ words = re.findall(r"[a-zA-Z][a-zA-Z0-9-]+", query.lower())
28
+ stopwords = {
29
+ "the", "is", "in", "on", "for", "and", "or", "of", "to", "a", "an",
30
+ "it", "was", "not", "how", "what", "when", "does", "did", "do", "my",
31
+ "your", "their", "have", "has", "had", "are", "were", "been", "being",
32
+ "about", "from", "with", "that", "this", "which", "who", "whom",
33
+ "where", "why", "can", "could", "would", "should",
34
+ }
35
+ return [w for w in words if len(w) > 2 and w not in stopwords]
36
+
37
+
38
def _get_all_facts_text(db_path: str, *, limit: int = 50, max_chars: int = 6000) -> str:
    """Dump the top facts from the knowledge graph as formatted text.

    Sinain triplestores are small (10-50 facts per session), so the default
    limit normally covers the whole store and avoids tag-matching failures.
    Stores with more than ``limit`` facts are truncated to the top ``limit``.

    Args:
        db_path: Path to the knowledge-graph database.
        limit: Maximum number of facts requested from ``query_top_facts``.
        max_chars: Character budget passed to ``format_facts_text``.

    Returns:
        The formatted facts, or "(no knowledge available)" when the query
        returns nothing.
    """
    from graph_query import query_top_facts, format_facts_text

    facts = query_top_facts(db_path, limit=limit)
    if not facts:
        return "(no knowledge available)"
    return format_facts_text(facts, max_chars=max_chars)
50
+
51
+
52
def _query_knowledge(db_path: str, question: str) -> str:
    """Query sinain knowledge graph for facts relevant to a question.

    Strategy: retrieve broadly, then re-rank so that specific facts (CTO
    background) beat generic ones (meeting schedule) when the question asks
    about the CTO. Re-ranking uses embedding similarity when ``embed_client``
    is usable and falls back to keyword overlap with the question otherwise.

    Returns:
        The top ``MAX_FACTS_PER_QUERY`` facts as formatted text, or
        "(no knowledge available)" when the graph yields no candidates.
    """
    from graph_query import query_facts_hybrid, query_top_facts, format_facts_text

    # Retrieve a broad candidate set
    candidates = query_facts_hybrid(db_path, question, max_facts=30)
    if not candidates:
        candidates = query_top_facts(db_path, limit=30)
    if not candidates:
        return "(no knowledge available)"

    # Preferred path: re-rank by embedding similarity. This stays best-effort
    # (any failure falls through to keyword ranking), but the reason is
    # reported instead of being swallowed silently.
    ranked = None
    try:
        from embed_client import rank_by_similarity
        fact_texts = [str(f.get("value", "")) for f in candidates]
        ranked_indices = rank_by_similarity(question, fact_texts)
        if ranked_indices is not None:
            ranked = [candidates[i] for i, _ in ranked_indices[:MAX_FACTS_PER_QUERY]]
    except Exception as exc:
        print(
            f"[query] embedding re-rank unavailable, using keyword overlap: {exc}",
            file=sys.stderr,
        )
        ranked = None

    if ranked is None:
        # Fallback: keyword overlap ranking
        q_keywords = set(_extract_keywords(question))

        def _relevance(fact: dict) -> float:
            """Fraction of question keywords found in the fact's value/entity."""
            if not q_keywords:
                return 0.0
            value = str(fact.get("value", "")).lower()
            entity = str(fact.get("entity", "")).lower()
            fact_words = set(_extract_keywords(value + " " + entity))
            return len(q_keywords & fact_words) / len(q_keywords)

        # sorted() is stable, so ties keep the retrieval order.
        ranked = sorted(candidates, key=_relevance, reverse=True)[:MAX_FACTS_PER_QUERY]

    return format_facts_text(ranked, max_chars=3000)
91
+
92
+
93
def _get_retrieved_facts(db_path: str, question: str, k: int = 10) -> list[dict]:
    """Return up to *k* facts retrieved for *question* (for retrieval evaluation).

    Hybrid retrieval is tried first; when it yields nothing, the top facts
    from ``query_top_facts`` are returned instead.
    """
    from graph_query import query_facts_hybrid, query_top_facts

    hybrid_hits = query_facts_hybrid(db_path, question, max_facts=k)
    # `or` only evaluates the fallback query when the hybrid result is empty.
    return hybrid_hits or query_top_facts(db_path, limit=k)
103
+
104
+
105
def compute_content_recall(
    retrieved_facts: list[dict],
    gold_answer: str,
    k_values: list[int] | None = None,
) -> dict:
    """Content-based retrieval metric: do retrieved facts contain the answer?

    Instead of matching entity IDs (which don't align between LongMemEval
    session IDs and sinain entity IDs), we check whether the gold answer's
    key terms appear in any retrieved fact's content.

    A fact counts as a hit when it contains at least half of the gold
    answer's keywords, rounded up. Rounding up matters: with three gold terms
    a floor division would accept a single matching term (33%).

    Args:
        retrieved_facts: Ranked facts; each is read via ``entity``/``value``.
        gold_answer: Reference answer whose keywords are searched for.
        k_values: Cut-offs to report; defaults to ``K_VALUES`` from config.

    Returns:
        ``{"content_recall@k": 1.0 or 0.0}`` for every k. All values are 0.0
        when the gold answer yields no keywords.
    """
    if k_values:
        ks = k_values
    else:
        from .config import K_VALUES
        ks = K_VALUES

    gold_terms = set(_extract_keywords(str(gold_answer)))
    if not gold_terms:
        return {f"content_recall@{k}": 0.0 for k in ks}

    # Ceiling of half the gold terms, i.e. >= 50% of them.
    required = (len(gold_terms) + 1) // 2

    # Decide once per fact whether it is a hit, then reuse that for every k
    # instead of re-extracting keywords for each cut-off.
    hits = []
    for fact in retrieved_facts:
        fact_text = f"{fact.get('entity', '')} {fact.get('value', '')}".lower()
        fact_terms = set(_extract_keywords(fact_text))
        hits.append(len(gold_terms & fact_terms) >= required)

    return {f"content_recall@{k}": 1.0 if any(hits[:k]) else 0.0 for k in ks}
138
+
139
+
140
def answer_question(
    question: BenchmarkQuestion,
    condition: str,
    *,
    db_path: str | None = None,
    full_context: str | None = None,
    knowledge_doc: str | None = None,
    model: str | None = None,
) -> str:
    """Generate an answer for a benchmark question under a specific condition.

    Args:
        question: The question; only its ``text`` is read here.
        condition: One of "sinain-memory", "full-context", "knowledge-doc".
        db_path: Knowledge-graph database, required for "sinain-memory".
        full_context: Conversation history, required for "full-context".
        knowledge_doc: Knowledge document, required for "knowledge-doc".
        model: Model to query; defaults to ``QA_MODEL``.

    Returns:
        The LLM's answer text, stripped. If the LLM call raises, the string
        "(error: <exception>)" is returned instead of propagating.

    Raises:
        ValueError: If the condition is unknown, or the input required by
            the condition is missing or empty.
    """
    qa_model = model or QA_MODEL

    # Inputs are validated with explicit raises rather than `assert`, which
    # is stripped when Python runs with -O.
    if condition == "sinain-memory":
        if not db_path:
            raise ValueError("db_path required for sinain-memory condition")
        facts = _query_knowledge(db_path, question.text)
        system = (
            "Answer the question using ONLY the provided knowledge facts. "
            "If the facts don't contain enough information to answer, say \"I don't know.\""
        )
        user = f"## Knowledge Facts\n{facts}\n\n## Question\n{question.text}"

    elif condition == "full-context":
        if not full_context:
            raise ValueError("full_context required for full-context condition")
        system = (
            "Answer the question based on the conversation history below. "
            "Be concise and specific."
        )
        # Truncate context if too large (some models have limits); slicing
        # is a no-op for shorter contexts.
        ctx = full_context[:100_000]
        user = f"## Conversation History\n{ctx}\n\n## Question\n{question.text}"

    elif condition == "knowledge-doc":
        if not knowledge_doc:
            raise ValueError("knowledge_doc required for knowledge-doc condition")
        system = (
            "Answer the question using ONLY the knowledge document provided. "
            "If the document doesn't contain enough information, say \"I don't know.\""
        )
        user = f"## Knowledge Document\n{knowledge_doc}\n\n## Question\n{question.text}"

    else:
        raise ValueError(f"Unknown condition: {condition}")

    # Best-effort: an LLM failure is turned into an answer string so one bad
    # call does not abort a whole benchmark run.
    try:
        return call_llm(
            system_prompt=system,
            user_prompt=user,
            model=qa_model,
            max_tokens=300,
        ).strip()
    except Exception as e:
        return f"(error: {e})"
@@ -0,0 +1,87 @@
1
+ """Report generation — markdown and JSON output."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from datetime import datetime, timezone
7
+
8
+
9
def generate_markdown(benchmark_name: str, summary: dict, details: list[dict]) -> str:
    """Generate a publishable markdown report.

    Args:
        benchmark_name: Name shown in the report title.
        summary: Aggregated results. The optional keys read are ``ipr``,
            ``conditions``, ``retrieval`` and ``categories``; a section is
            omitted when its key is missing or empty.
        details: Per-question entries, used to list the five questions with
            the lowest "sinain-memory" score.

    Returns:
        The report as a single newline-joined string.
    """
    lines = [
        f"# Sinain Knowledge Graph — {benchmark_name} Results",
        f"\nGenerated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}",
        "",
    ]

    # Headline IPR. Compared against None so that a genuine 0.0 is reported.
    ipr = summary.get("ipr")
    if ipr is not None:
        lines.append(f"**Information Preservation Rate (IPR)**: {ipr:.1%}")
        lines.append("")

    # Condition scores table
    conditions = summary.get("conditions", {})
    if conditions:
        lines.append("| Condition | Mean Score (1-5) | Mean F1 | N |")
        lines.append("|-----------|------------------|---------|---|")
        for cond in sorted(conditions):
            c = conditions[cond]
            lines.append(f"| {cond} | {c['mean_score']:.2f} | {c.get('mean_f1', 0):.2f} | {c['n']} |")
        lines.append("")

    # Retrieval metrics
    retrieval = summary.get("retrieval", {})
    if retrieval:
        lines.append("## Retrieval Quality")
        lines.append("| Metric | Score |")
        lines.append("|--------|-------|")
        for k, v in sorted(retrieval.items()):
            lines.append(f"| {k} | {v:.1%} |")
        lines.append("")

    # Category breakdown: one column per condition seen in any category
    categories = summary.get("categories", {})
    if categories:
        lines.append("## By Category")
        cond_names = sorted({c for cat in categories.values() for c in cat})
        lines.append("| Category | " + " | ".join(cond_names) + " |")
        lines.append("|----------|" + "|".join(["------"] * len(cond_names)) + "|")
        for cat in sorted(categories):
            cells = []
            for cond in cond_names:
                stats = categories[cat].get(cond)
                if stats is None:
                    cells.append("-")
                else:
                    cells.append(f"{stats['mean_score']:.2f} (n={stats['n']})")
            lines.append(f"| {cat} | " + " | ".join(cells) + " |")
        lines.append("")

    # Failures (worst questions); entries without a sinain-memory score are
    # excluded so the sort never compares None.
    if details:
        scored = [
            d for d in details
            if d.get("answers", {}).get("sinain-memory", {}).get("score") is not None
        ]
        scored.sort(key=lambda d: d["answers"]["sinain-memory"]["score"])
        if scored:
            lines.append("## Hardest Questions for sinain-memory (bottom 5)")
            for d in scored[:5]:
                sm = d["answers"]["sinain-memory"]
                # A missing answer and an unjudged (None) score both show "?".
                fc_score = d["answers"].get("full-context", {}).get("score")
                fc_label = "?" if fc_score is None else fc_score
                lines.append(f"- **{d['id']}** [{d['category']}]: score={sm['score']}/5 "
                             f"(full-context: {fc_label}/5)")
                lines.append(f" Q: {d['question'][:100]}...")
            lines.append("")

    return "\n".join(lines)
78
+
79
+
80
def generate_json(benchmark_name: str, summary: dict, details: list[dict]) -> str:
    """Serialize a benchmark run as a pretty-printed JSON document.

    The document holds the benchmark name, the current UTC time in ISO-8601
    form, the summary and the per-question details, in that key order.
    Non-ASCII characters are written as-is rather than escaped.
    """
    generated_at = datetime.now(timezone.utc).isoformat()
    report = {
        "benchmark": benchmark_name,
        "timestamp": generated_at,
        "summary": summary,
        "details": details,
    }
    return json.dumps(report, indent=2, ensure_ascii=False)