@geravant/sinain 1.13.0 → 1.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/.env.example +33 -27
  2. package/cli.js +30 -14
  3. package/config-shared.js +173 -30
  4. package/launcher.js +38 -21
  5. package/onboard.js +36 -20
  6. package/package.json +4 -1
  7. package/sinain-agent/run.sh +600 -127
  8. package/sinain-core/src/agents-loader.ts +254 -0
  9. package/sinain-core/src/buffers/feed-buffer.ts +6 -4
  10. package/sinain-core/src/config.ts +77 -15
  11. package/sinain-core/src/escalation/escalator.ts +178 -18
  12. package/sinain-core/src/index.ts +218 -31
  13. package/sinain-core/src/learning/local-curation.ts +81 -27
  14. package/sinain-core/src/overlay/commands.ts +25 -0
  15. package/sinain-core/src/overlay/ws-handler.ts +3 -0
  16. package/sinain-core/src/server.ts +101 -10
  17. package/sinain-core/src/types.ts +29 -3
  18. package/sinain-memory/graph_query.py +12 -3
  19. package/sinain-memory/knowledge_integrator.py +194 -10
  20. package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
  21. package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
  22. package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
  23. package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
  24. package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
  25. package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
  26. package/sinain-memory/eval/__init__.py +0 -0
  27. package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
  28. package/sinain-memory/eval/assertions.py +0 -267
  29. package/sinain-memory/eval/benchmarks/__init__.py +0 -0
  30. package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
  31. package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
  32. package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
  33. package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
  34. package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
  35. package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
  36. package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
  37. package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
  38. package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
  39. package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
  40. package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
  41. package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
  42. package/sinain-memory/eval/benchmarks/config.py +0 -23
  43. package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
  44. package/sinain-memory/eval/benchmarks/ingest.py +0 -152
  45. package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
  46. package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
  47. package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
  48. package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
  49. package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
  50. package/sinain-memory/eval/benchmarks/meeting_adapter.py +0 -81
  51. package/sinain-memory/eval/benchmarks/meeting_runner.py +0 -230
  52. package/sinain-memory/eval/benchmarks/query.py +0 -193
  53. package/sinain-memory/eval/benchmarks/report.py +0 -87
  54. package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +0 -318
  55. package/sinain-memory/eval/benchmarks/runner.py +0 -283
  56. package/sinain-memory/eval/judges/__init__.py +0 -0
  57. package/sinain-memory/eval/judges/base_judge.py +0 -61
  58. package/sinain-memory/eval/judges/curation_judge.py +0 -46
  59. package/sinain-memory/eval/judges/insight_judge.py +0 -48
  60. package/sinain-memory/eval/judges/mining_judge.py +0 -42
  61. package/sinain-memory/eval/judges/signal_judge.py +0 -45
  62. package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
  63. package/sinain-memory/eval/retrieval_evaluator.py +0 -186
  64. package/sinain-memory/eval/schemas.py +0 -247
  65. package/sinain-memory/tests/__init__.py +0 -0
  66. package/sinain-memory/tests/conftest.py +0 -189
  67. package/sinain-memory/tests/test_curator_helpers.py +0 -94
  68. package/sinain-memory/tests/test_embedder.py +0 -210
  69. package/sinain-memory/tests/test_extract_json.py +0 -124
  70. package/sinain-memory/tests/test_feedback_computation.py +0 -121
  71. package/sinain-memory/tests/test_miner_helpers.py +0 -71
  72. package/sinain-memory/tests/test_module_management.py +0 -458
  73. package/sinain-memory/tests/test_parsers.py +0 -96
  74. package/sinain-memory/tests/test_tick_evaluator.py +0 -430
  75. package/sinain-memory/tests/test_triple_extractor.py +0 -255
  76. package/sinain-memory/tests/test_triple_ingest.py +0 -191
  77. package/sinain-memory/tests/test_triple_migrate.py +0 -138
  78. package/sinain-memory/tests/test_triplestore.py +0 -248
@@ -1,146 +0,0 @@
1
- """Evaluation pipeline — score answers and compute aggregate metrics.
2
-
3
- Combines:
4
- - LLM-as-Judge (QA scoring, 1-5 scale)
5
- - Retrieval metrics (Recall@k, NDCG@k)
6
- - Token F1 overlap (mechanical, free)
7
- """
8
-
9
- from __future__ import annotations
10
-
11
- import math
12
- import re
13
- from collections import defaultdict
14
-
15
- from .base_adapter import BenchmarkQuestion
16
- from .config import K_VALUES
17
-
18
-
19
- # ── Token F1 (mechanical, no LLM needed) ─────────────────────────────────────
20
-
21
- def _tokenize(text: str) -> list[str]:
22
- """Simple whitespace + punctuation tokenizer."""
23
- return re.findall(r"\w+", text.lower())
24
-
25
-
26
def token_f1(predicted: str, gold: str | int) -> float:
    """Return the F1 overlap between the unique tokens of two answers.

    Both arguments are stringified and tokenized with ``_tokenize``. The
    comparison is done on sets, so a repeated token counts once. The result
    is 0.0 when either side has no tokens or when no token is shared.
    """
    predicted_vocab = set(_tokenize(str(predicted)))
    gold_vocab = set(_tokenize(str(gold)))
    if not (gold_vocab and predicted_vocab):
        return 0.0

    shared = len(predicted_vocab & gold_vocab)
    if shared == 0:
        return 0.0

    precision = shared / len(predicted_vocab)
    recall = shared / len(gold_vocab)
    return 2 * precision * recall / (precision + recall)
38
-
39
-
40
- # ── Retrieval metrics (reuse logic from retrieval_evaluator.py) ───────────────
41
-
42
def dcg_at_k(relevant_positions: list[int], k: int) -> float:
    """Return the binary-gain Discounted Cumulative Gain over the top *k* ranks.

    ``relevant_positions`` holds 0-based ranks of relevant hits; every rank
    below *k* contributes ``1 / log2(rank + 2)``.
    """
    total = 0.0
    for rank in relevant_positions:
        if rank >= k:
            # Outside the cutoff: contributes nothing.
            continue
        total += 1.0 / math.log2(rank + 2)
    return total
49
-
50
-
51
def ndcg_at_k(relevant_positions: list[int], num_relevant: int, k: int) -> float:
    """Return DCG@k divided by the ideal DCG@k, or 0.0 when the ideal is zero.

    The ideal ranking places ``min(num_relevant, k)`` relevant items at the
    first ranks.
    """
    best_case = dcg_at_k(list(range(min(num_relevant, k))), k)
    if best_case > 0:
        return dcg_at_k(relevant_positions, k) / best_case
    return 0.0
57
-
58
-
59
def compute_retrieval_metrics(
    retrieved_ids: list[str],
    expected_ids: list[str],
    k_values: list[int] | None = None,
) -> dict:
    """Return ``recall@k`` and ``ndcg@k`` for one question at each cutoff.

    Cutoffs come from ``k_values`` when given (and non-empty), otherwise from
    ``K_VALUES``. Note that ``recall@k`` here is a hit indicator: 1.0 if at
    least one expected id appears in the top *k*, else 0.0.
    """
    cutoffs = k_values or K_VALUES
    wanted = set(expected_ids)
    # 0-based ranks at which an expected id was retrieved.
    hit_ranks = [rank for rank, doc_id in enumerate(retrieved_ids) if doc_id in wanted]

    metrics = {}
    for cutoff in cutoffs:
        found = any(rank < cutoff for rank in hit_ranks)
        metrics[f"recall@{cutoff}"] = 1.0 if found else 0.0
        metrics[f"ndcg@{cutoff}"] = ndcg_at_k(hit_ranks, len(wanted), cutoff)
    return metrics
75
-
76
-
77
- # ── Aggregate metrics ─────────────────────────────────────────────────────────
78
-
79
def aggregate_results(per_question: list[dict]) -> dict:
    """Roll per-question results up into benchmark-level summary metrics.

    Each ``per_question`` entry is expected to look like::

        {id, category, retrieval: {recall@k, ndcg@k}, answers: {condition: {score, f1}}}

    Returns ``{"error": "no results"}`` for empty input. Otherwise returns a
    dict with:

    - ``total_questions``: number of entries.
    - ``conditions``: per-condition ``mean_score``, ``mean_f1`` and ``n``
      (number of non-None scores). A condition is listed when it has at
      least one score *or* at least one F1 value.
    - ``ipr``: mean "sinain-memory" score divided by mean "full-context"
      score, or None when either side has no scores or the full-context
      mean is not positive.
    - ``categories``: per-category, per-condition ``mean_score`` and ``n``.
    - ``retrieval``: mean of every numeric retrieval metric.

    All means are rounded to 4 decimals; the mean of an empty list is 0.0.
    """
    if not per_question:
        return {"error": "no results"}

    # Per-condition scores
    condition_scores: dict[str, list[float]] = defaultdict(list)
    condition_f1s: dict[str, list[float]] = defaultdict(list)
    # Per-category per-condition
    cat_scores: dict[str, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
    # Retrieval
    retrieval_metrics: dict[str, list[float]] = defaultdict(list)

    for q in per_question:
        cat = q.get("category", "unknown")

        for cond, data in q.get("answers", {}).items():
            score = data.get("score")
            if score is not None:
                condition_scores[cond].append(score)
                cat_scores[cat][cond].append(score)
            f1 = data.get("f1")
            if f1 is not None:
                condition_f1s[cond].append(f1)

        for metric, val in q.get("retrieval", {}).items():
            # Non-numeric retrieval entries are ignored.
            if isinstance(val, (int, float)):
                retrieval_metrics[metric].append(val)

    def _mean(lst: list[float]) -> float:
        return round(sum(lst) / len(lst), 4) if lst else 0.0

    # Build summary. Iterate over the union of keys so a condition whose
    # judge scores are all missing still reports its F1 (with n == 0).
    conditions = {}
    for cond in sorted(set(condition_scores) | set(condition_f1s)):
        scores = condition_scores.get(cond, [])
        conditions[cond] = {
            "mean_score": _mean(scores),
            "mean_f1": _mean(condition_f1s.get(cond, [])),
            "n": len(scores),
        }

    # IPR: sinain-memory vs full-context
    sm_scores = condition_scores.get("sinain-memory", [])
    fc_scores = condition_scores.get("full-context", [])
    fc_mean = _mean(fc_scores)
    ipr = None
    if sm_scores and fc_scores and fc_mean > 0:
        # Explicit None handling: a genuine 0.0 ratio must not become None.
        ipr = round(_mean(sm_scores) / fc_mean, 4)

    # Category breakdown
    categories = {}
    for cat in sorted(cat_scores):
        categories[cat] = {
            cond: {
                "mean_score": _mean(cat_scores[cat][cond]),
                "n": len(cat_scores[cat][cond]),
            }
            for cond in sorted(cat_scores[cat])
        }

    # Retrieval summary
    retrieval = {k: _mean(v) for k, v in sorted(retrieval_metrics.items())}

    return {
        "total_questions": len(per_question),
        "conditions": conditions,
        "ipr": ipr,
        "categories": categories,
        "retrieval": retrieval,
    }
@@ -1,152 +0,0 @@
1
- """Ingestion pipeline — benchmark conversations → sinain triplestore.
2
-
3
- Runs session_distiller.py + knowledge_integrator.py via subprocess (exact production path).
4
- Caches results aggressively to avoid repeated LLM calls.
5
- """
6
-
7
- from __future__ import annotations
8
-
9
- import hashlib
10
- import json
11
- import os
12
- import shutil
13
- import tempfile
14
- from pathlib import Path
15
- from subprocess import run, PIPE, TimeoutExpired
16
-
17
- from .base_adapter import BenchmarkInstance
18
- from .config import DISTILLER_TIMEOUT_S, INTEGRATOR_TIMEOUT_S
19
-
20
-
21
- def _scripts_dir() -> Path:
22
- """Locate sinain-memory scripts directory."""
23
- return Path(__file__).resolve().parent.parent.parent
24
-
25
-
26
- def _content_hash(sessions: list[list[dict]]) -> str:
27
- """Hash session content for caching."""
28
- raw = json.dumps(sessions, sort_keys=True, ensure_ascii=False)
29
- return hashlib.sha256(raw.encode()).hexdigest()[:16]
30
-
31
-
32
def _run_script(script_name: str, args: list[str], timeout: int) -> str | None:
    """Run a Python script from sinain-memory and return its stripped stdout.

    Args:
        script_name: File name of the script inside ``_scripts_dir()``.
        args: Command-line arguments passed to the script.
        timeout: Seconds to wait before giving up.

    Returns:
        The script's stdout with surrounding whitespace removed, or None when
        the script is missing, cannot be launched, exits non-zero, or times
        out. A one-line reason is printed in every failure case.
    """
    scripts_dir = _scripts_dir()
    script_path = scripts_dir / script_name
    if not script_path.exists():
        print(f"[ingest] {script_name} not found at {script_path}")
        return None

    # Prepend the scripts directory instead of replacing PYTHONPATH, so any
    # search path the caller already relies on keeps working in the child.
    inherited = os.environ.get("PYTHONPATH")
    python_path = str(scripts_dir)
    if inherited:
        python_path = os.pathsep.join([python_path, inherited])
    env = {**os.environ, "PYTHONPATH": python_path}
    # Ensure a working model is available (common.py defaults may reference unreleased models)
    if "SINAIN_BENCH_MODEL" in os.environ:
        env["SINAIN_FAST_MODEL"] = os.environ["SINAIN_BENCH_MODEL"]

    try:
        result = run(
            ["python3", str(script_path), *args],
            capture_output=True, text=True, timeout=timeout, env=env,
        )
    except TimeoutExpired:
        print(f"[ingest] {script_name} timed out ({timeout}s)")
        return None
    except OSError as exc:
        # Launch failures (interpreter not found, argument list too long for
        # the OS, ...) are reported like any other failure, not raised.
        print(f"[ingest] {script_name} could not be started: {exc}")
        return None

    if result.returncode != 0:
        print(f"[ingest] {script_name} failed: {result.stderr[:200]}")
        return None
    return result.stdout.strip()
55
-
56
-
57
def ingest_instance(
    instance: BenchmarkInstance,
    cache_dir: Path,
) -> Path | None:
    """Ingest a benchmark instance into a triplestore. Returns db_path or None.

    Uses caching: the DB is stored under ``cache_dir / "stores"`` keyed by a
    content hash of ``instance.sessions``; if that file already exists it is
    returned without running anything.

    Otherwise sessions are distilled in batches of 10 via
    ``session_distiller.py`` and each non-empty digest is fed to
    ``knowledge_integrator.py``, both against a throw-away memory directory.
    Batches with fewer than 3 feed items, or whose distillation fails or
    yields an empty/error digest, are skipped.

    Returns:
        The cached DB path, or None when no non-empty ``knowledge-graph.db``
        was produced.
    """
    ch = _content_hash(instance.sessions)
    cache_path = cache_dir / "stores" / f"{ch}.db"

    if cache_path.exists():
        return cache_path

    cache_path.parent.mkdir(parents=True, exist_ok=True)

    # Batch sessions into chunks of ~10 for fewer LLM calls.
    # Each chunk becomes one distiller call with a combined transcript.
    batch_size = 10

    tmp = tempfile.mkdtemp(prefix="sinain-bench-")
    success = False
    try:
        # Directory setup lives inside the try so the temp dir is removed
        # even if creating the layout fails.
        mem_dir = Path(tmp) / "memory"
        for subdir in ["", "playbook-logs", "playbook-archive"]:
            (mem_dir / subdir).mkdir(parents=True, exist_ok=True)

        # Write a minimal playbook so integrator doesn't fail
        (mem_dir / "sinain-playbook.md").write_text("# Sinain Playbook\n\n(benchmark run)\n")

        batch_idx = 0
        for start in range(0, len(instance.sessions), batch_size):
            batch = instance.sessions[start:start + batch_size]
            # Flatten batch into one transcript
            combined: list[dict] = [item for session in batch for item in session]
            if len(combined) < 3:
                continue

            first_ts = combined[0].get("ts", "2025-01-01T10:00:00Z")
            meta = json.dumps({
                "ts": first_ts,
                "sessionKey": f"benchmark-batch-{batch_idx}",
                "durationMs": len(combined) * 30000,
            })
            batch_idx += 1

            # Step 1: Distill the batch
            digest_json = _run_script("session_distiller.py", [
                "--memory-dir", str(mem_dir),
                "--transcript", json.dumps(combined),
                "--session-meta", meta,
            ], DISTILLER_TIMEOUT_S)

            if not digest_json:
                continue

            try:
                digest = json.loads(digest_json)
            except json.JSONDecodeError:
                continue

            # Valid JSON that is not an object cannot be a digest; skip it
            # rather than crash on .get().
            if not isinstance(digest, dict):
                continue
            if digest.get("isEmpty") or digest.get("error"):
                continue

            # Step 2: Integrate into knowledge graph
            _run_script("knowledge_integrator.py", [
                "--memory-dir", str(mem_dir),
                "--digest", json.dumps(digest),
            ], INTEGRATOR_TIMEOUT_S)

        # Copy the resulting DB to cache
        db_path = mem_dir / "knowledge-graph.db"
        if db_path.exists() and db_path.stat().st_size > 0:
            # Copy next to the final path and rename into place, so an
            # interrupted copy never leaves a truncated file that a later
            # run would accept as a cache hit.
            partial_path = cache_path.with_name(cache_path.name + ".tmp")
            shutil.copy2(db_path, partial_path)
            os.replace(partial_path, cache_path)
            success = True

    finally:
        shutil.rmtree(tmp, ignore_errors=True)

    return cache_path if success else None
141
-
142
-
143
def get_knowledge_doc(db_path: Path) -> str:
    """Render a sinain-knowledge.md style document from a triplestore.

    Queries up to 30 top facts from the DB at *db_path* via ``graph_query``
    and formats them with a 6000-character cap. Returns the literal string
    ``"(no knowledge available)"`` when the query yields nothing.
    """
    import sys

    # Make graph_query importable; add the directory only once so repeated
    # calls do not keep growing sys.path.
    scripts_dir = str(_scripts_dir())
    if scripts_dir not in sys.path:
        sys.path.insert(0, scripts_dir)
    from graph_query import query_top_facts, format_facts_text

    facts = query_top_facts(str(db_path), limit=30)
    if not facts:
        return "(no knowledge available)"
    return format_facts_text(facts, max_chars=6000)
File without changes
@@ -1,81 +0,0 @@
1
- """LLM-as-Judge: QA answer quality evaluator (LongMemEval-compatible, 1-5 scale).
2
-
3
- Uses GPT-4o via OpenRouter for comparability with published results.
4
- """
5
-
6
- from __future__ import annotations
7
-
8
- import sys
9
- from pathlib import Path
10
-
11
- # Add sinain-memory to path for common imports
12
- _koog_dir = str(Path(__file__).resolve().parent.parent.parent.parent)
13
- if _koog_dir not in sys.path:
14
- sys.path.insert(0, _koog_dir)
15
-
16
- from common import LLMError, call_llm, extract_json # noqa: E402
17
-
18
- SYSTEM_PROMPT = """\
19
- You are evaluating whether a predicted answer correctly answers a question.
20
- The gold (reference) answer is provided.
21
-
22
- Score on a scale of 1-5:
23
- 5: Perfect — captures all key information from the gold answer, no errors
24
- 4: Mostly correct — minor omissions or imprecision, main point is right
25
- 3: Partially correct — captures some key points but misses important details
26
- 2: Related but mostly wrong — touches the topic but answer is largely incorrect
27
- 1: Completely wrong, contradicts the gold answer, or says "I don't know" when the answer exists
28
-
29
- Special cases:
30
- - If the gold answer indicates abstention is correct (e.g. "I don't know" or "not mentioned"),
31
- then a predicted "I don't know" scores 5.
32
- - Numeric answers within 10% of gold = full credit.
33
- - Getting the gist right but missing specifics = 3-4 depending on importance.
34
-
35
- Respond with ONLY a JSON object: {"score": <1-5>, "reasoning": "brief explanation"}"""
36
-
37
-
38
def judge_qa(
    question: str,
    gold_answer: str,
    predicted_answer: str,
    *,
    condition: str = "",
    model: str | None = None,
) -> dict | None:
    """Score a QA answer with the LLM judge.

    Args:
        question: The question that was asked.
        gold_answer: Reference answer.
        predicted_answer: Answer to be judged.
        condition: Optional context-condition label appended to the prompt.
        model: Explicit model passed to ``call_llm``; when omitted,
            ``script="meeting_benchmark"`` is passed instead (presumably so
            ``call_llm`` picks its own model for that script — confirm in
            ``common``).

    Returns:
        ``{"score": int in 1..5, "reasoning": str (max 300 chars)}``, or None
        when the LLM call/JSON extraction fails or the reply does not contain
        a valid score. A warning is written to stderr in each failure case.
    """
    user_parts = [
        f"## Question\n{question}",
        f"\n## Gold Answer\n{gold_answer}",
        f"\n## Predicted Answer\n{predicted_answer}",
    ]
    if condition:
        user_parts.append(f"\n## Context Condition: {condition}")

    kwargs: dict = {
        "system_prompt": SYSTEM_PROMPT,
        "user_prompt": "\n".join(user_parts),
        "max_tokens": 200,
        "json_mode": True,
    }
    if model:
        kwargs["model"] = model
    else:
        kwargs["script"] = "meeting_benchmark"

    try:
        raw = call_llm(**kwargs)
        result = extract_json(raw)
    except (ValueError, LLMError, KeyError) as e:
        print(f"[warn] qa_judge call failed: {e}", file=sys.stderr)
        return None

    # The judge is asked for a JSON object; anything else (list, string,
    # number) has no "score" field and must not crash on .get().
    if not isinstance(result, dict):
        print(
            f"[warn] qa_judge returned non-object JSON: {type(result).__name__}",
            file=sys.stderr,
        )
        return None

    score = result.get("score")
    reasoning = result.get("reasoning", "")

    # bool is a subclass of int, so reject it explicitly: `true` is not a score.
    is_number = isinstance(score, (int, float)) and not isinstance(score, bool)
    if not is_number or not (1 <= score <= 5):
        print(f"[warn] qa_judge returned invalid score: {score}", file=sys.stderr)
        return None

    return {"score": int(score), "reasoning": str(reasoning)[:300]}
@@ -1,177 +0,0 @@
1
- """LongMemEval (ICLR 2025) adapter — download + parse into sinain format.
2
-
3
- Dataset: https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned
4
- Paper: https://arxiv.org/abs/2410.10813
5
-
6
- Fields per item:
7
- question_id, question_type, question, answer, question_date,
8
- haystack_session_ids, haystack_dates, haystack_sessions, answer_session_ids
9
-
10
- haystack_sessions entries: {"role": "user"/"assistant", "content": "...", "has_answer": bool}
11
- """
12
-
13
- from __future__ import annotations
14
-
15
- import hashlib
16
- import json
17
- from datetime import datetime, timedelta, timezone
18
- from pathlib import Path
19
-
20
- from .base_adapter import BenchmarkAdapter, BenchmarkInstance, BenchmarkQuestion
21
-
22
-
23
- def _download_dataset(data_dir: Path) -> Path:
24
- """Download LongMemEval from HuggingFace if not cached."""
25
- cache_path = data_dir / "longmemeval" / "longmemeval_s_cleaned.json"
26
- if cache_path.exists():
27
- return cache_path
28
-
29
- cache_path.parent.mkdir(parents=True, exist_ok=True)
30
-
31
- url = "https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned/resolve/main/longmemeval_s_cleaned.json"
32
- print(f"[longmemeval] downloading from {url} ...")
33
-
34
- # Use curl to avoid macOS Python SSL cert issues
35
- import subprocess
36
- result = subprocess.run(
37
- ["curl", "-fSL", "-o", str(cache_path), url],
38
- capture_output=True, text=True, timeout=120,
39
- )
40
- if result.returncode != 0:
41
- raise RuntimeError(f"Download failed: {result.stderr[:200]}")
42
- print(f"[longmemeval] saved to {cache_path} ({cache_path.stat().st_size} bytes)")
43
- return cache_path
44
-
45
-
46
- def _session_hash(sessions: list[dict]) -> str:
47
- """Content hash for a haystack (for grouping questions with shared context)."""
48
- raw = json.dumps(sessions, sort_keys=True, ensure_ascii=False)
49
- return hashlib.sha256(raw.encode()).hexdigest()[:16]
50
-
51
-
52
def _sessions_to_feed_items(
    haystack_sessions: list[list[dict]],
    haystack_session_ids: list[str],
    haystack_dates: list[str],
) -> list[list[dict]]:
    """Convert a LongMemEval haystack into sessions of sinain feed items.

    ``haystack_sessions[i][j]`` is a turn dict such as
    ``{"role": "user"/"assistant", "content": "..."}``. Each non-empty
    session becomes a list of feed items whose timestamps start at the
    session's date (``haystack_dates[i]``, or a fixed default when missing)
    and advance 30 seconds per turn. User turns map to source ``"audio"``,
    every other role to ``"agent"``. Empty sessions are dropped.

    ``haystack_session_ids`` is accepted but not used.
    """
    feed_sessions: list[list[dict]] = []

    for idx, turns in enumerate(haystack_sessions):
        if not turns:
            continue

        if idx < len(haystack_dates):
            raw_date = haystack_dates[idx]
        else:
            raw_date = "2025-01-01T10:00:00Z"
        started_at = _parse_date(raw_date)

        feed_sessions.append([
            {
                "source": "audio" if turn.get("role") == "user" else "agent",
                "text": turn.get("content", ""),
                "ts": (started_at + timedelta(seconds=30 * offset)).isoformat(),
                "channel": "benchmark",
            }
            for offset, turn in enumerate(turns)
        ])

    return feed_sessions
88
-
89
-
90
- def _parse_date(s: str) -> datetime:
91
- """Best-effort date parsing."""
92
- for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d", "%m/%d/%Y"):
93
- try:
94
- return datetime.strptime(s, fmt).replace(tzinfo=timezone.utc)
95
- except (ValueError, TypeError):
96
- continue
97
- return datetime(2025, 1, 1, 10, 0, tzinfo=timezone.utc)
98
-
99
-
100
class LongMemEvalAdapter(BenchmarkAdapter):
    """Adapter for LongMemEval (ICLR 2025) benchmark."""

    @property
    def name(self) -> str:
        """Short identifier of this benchmark."""
        return "longmemeval"

    def load_dataset(self, data_dir: str) -> list[BenchmarkInstance]:
        """Download and parse LongMemEval, grouping questions by shared haystack.

        Questions whose ``haystack_sessions`` serialize identically are put
        into one ``BenchmarkInstance`` with id ``lme-<hash>``. The instance
        carries the converted feed-item sessions, the untouched raw sessions,
        and metadata with the haystack hash plus session and turn counts.
        """
        path = _download_dataset(Path(data_dir))
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(path, encoding="utf-8") as f:
            raw_items = json.load(f)

        # Group questions by haystack content hash
        groups: dict[str, dict] = {}
        for item in raw_items:
            # Read the haystack once, with one default, for both the hash
            # and the stored group.
            sessions = item.get("haystack_sessions", [])
            h = _session_hash(sessions)
            if h not in groups:
                groups[h] = {
                    "haystack_sessions": sessions,
                    "haystack_session_ids": item.get("haystack_session_ids", []),
                    "haystack_dates": item.get("haystack_dates", []),
                    "questions": [],
                }
            groups[h]["questions"].append(item)

        instances = []
        for h, group in groups.items():
            raw_sessions = group["haystack_sessions"]
            feed_sessions = _sessions_to_feed_items(
                raw_sessions,
                group["haystack_session_ids"],
                group["haystack_dates"],
            )

            questions = [
                BenchmarkQuestion(
                    id=item["question_id"],
                    text=item["question"],
                    gold_answer=str(item["answer"]),
                    category=item.get("question_type", "unknown"),
                    evidence_session_ids=item.get("answer_session_ids", []),
                    metadata={
                        "question_date": item.get("question_date", ""),
                    },
                )
                for item in group["questions"]
            ]

            # Count individual turns, not sessions; an entry that is a bare
            # turn dict (also accepted by format_full_context) counts as one.
            num_turns = sum(
                len(session) if isinstance(session, list) else 1
                for session in raw_sessions
            )

            instances.append(BenchmarkInstance(
                id=f"lme-{h}",
                sessions=feed_sessions,
                questions=questions,
                raw_sessions=raw_sessions,
                metadata={
                    "haystack_hash": h,
                    "num_sessions": len(feed_sessions),
                    "num_turns": num_turns,
                },
            ))

        print(f"[longmemeval] loaded {sum(len(i.questions) for i in instances)} questions "
              f"across {len(instances)} unique haystacks")
        return instances

    def format_full_context(self, instance: BenchmarkInstance) -> str:
        """Render the full conversation history for the baseline condition.

        Each turn becomes ``"<Role>: <content>"``. Sessions given as lists
        are followed by a ``---`` separator line; an entry that is itself a
        turn dict is rendered as a single line. Lines are joined with blank
        lines between them.
        """
        lines = []
        for session in instance.raw_sessions:
            if isinstance(session, list):
                for turn in session:
                    role = turn.get("role", "unknown").capitalize()
                    content = turn.get("content", "")
                    lines.append(f"{role}: {content}")
                lines.append("---")  # session separator
            elif isinstance(session, dict):
                role = session.get("role", "unknown").capitalize()
                content = session.get("content", "")
                lines.append(f"{role}: {content}")
        return "\n\n".join(lines)
@@ -1,81 +0,0 @@
1
- """Meeting Memory adapter — loads QA pairs + ground-truth transcript.
2
-
3
- Unlike LongMemEval, this adapter does NOT ingest. The real sinain pipeline
4
- (start-local.sh) produces the knowledge-graph.db during a live capture run.
5
- This adapter only loads QA pairs and provides the full-context baseline.
6
-
7
- Data layout:
8
- eval/benchmarks/data/meeting/
9
- <name>.txt — ground-truth transcript
10
- <name>_qa.json — gold QA pairs
11
- """
12
-
13
- from __future__ import annotations
14
-
15
- import json
16
- from pathlib import Path
17
-
18
- from .base_adapter import BenchmarkAdapter, BenchmarkInstance, BenchmarkQuestion
19
-
20
-
21
class MeetingMemoryAdapter(BenchmarkAdapter):
    """Adapter for meeting memory benchmarks (real pipeline capture)."""

    @property
    def name(self) -> str:
        """Short identifier of this benchmark."""
        return "meeting"

    def load_dataset(self, data_dir: str) -> list[BenchmarkInstance]:
        """Load meeting QA pairs and transcripts from data/meeting/.

        Every ``<name>_qa.json`` in ``data_dir / "meeting"`` is paired with
        ``<name>.txt``; QA files without a transcript are skipped with a
        warning. The transcript text is stored in the instance metadata
        under ``raw_transcript``; ``sessions`` and ``raw_sessions`` stay
        empty.

        Raises:
            FileNotFoundError: the ``meeting`` directory does not exist.
        """
        meeting_dir = Path(data_dir) / "meeting"
        if not meeting_dir.exists():
            raise FileNotFoundError(f"Meeting data not found: {meeting_dir}")

        qa_suffix = "_qa"
        instances = []
        for qa_path in sorted(meeting_dir.glob("*_qa.json")):
            # Derive transcript path: foo_qa.json → foo.txt
            # Only the trailing "_qa" is removed (the glob guarantees it is
            # there); an "_qa" elsewhere in the name must be kept.
            stem = qa_path.stem[: -len(qa_suffix)]
            transcript_path = qa_path.parent / f"{stem}.txt"
            if not transcript_path.exists():
                print(f"[meeting] warning: no transcript for {qa_path.name}, skipping")
                continue

            # Load QA pairs (explicit UTF-8, matching the transcript read below)
            with open(qa_path, encoding="utf-8") as f:
                raw_questions = json.load(f)

            questions = [
                BenchmarkQuestion(
                    id=item["id"],
                    text=item["question"],
                    gold_answer=item["gold_answer"],
                    category=item.get("category", "unknown"),
                    evidence_session_ids=item.get("evidence_timestamps", []),
                    metadata={
                        "evidence_timestamps": item.get("evidence_timestamps", []),
                    },
                )
                for item in raw_questions
            ]

            # Load transcript
            transcript_text = transcript_path.read_text(encoding="utf-8")

            instances.append(BenchmarkInstance(
                id=f"meeting-{stem}",
                sessions=[],  # Not used — real pipeline does ingestion
                questions=questions,
                raw_sessions=[],
                metadata={
                    "transcript_path": str(transcript_path),
                    "raw_transcript": transcript_text,
                    "stem": stem,
                },
            ))

        total_q = sum(len(i.questions) for i in instances)
        print(f"[meeting] loaded {total_q} questions across {len(instances)} meetings")
        return instances

    def format_full_context(self, instance: BenchmarkInstance) -> str:
        """Return the raw transcript verbatim for the full-context baseline.

        Returns an empty string when the instance metadata has no
        ``raw_transcript`` entry.
        """
        return instance.metadata.get("raw_transcript", "")