@geravant/sinain 1.11.0 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/sinain-core/src/escalation/escalator.ts +1 -0
- package/sinain-core/src/escalation/message-builder.ts +45 -118
- package/sinain-core/src/overlay/commands.ts +16 -3
- package/sinain-core/src/overlay/ws-handler.ts +4 -1
- package/sinain-core/src/types.ts +3 -0
- package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
- package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/base_adapter.py +43 -0
- package/sinain-memory/eval/benchmarks/config.py +23 -0
- package/sinain-memory/eval/benchmarks/evaluate.py +146 -0
- package/sinain-memory/eval/benchmarks/ingest.py +152 -0
- package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/qa_judge.py +81 -0
- package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +177 -0
- package/sinain-memory/eval/benchmarks/query.py +172 -0
- package/sinain-memory/eval/benchmarks/report.py +87 -0
- package/sinain-memory/eval/benchmarks/runner.py +276 -0
- package/sinain-memory/koog-config.json +11 -0
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Ingestion pipeline — benchmark conversations → sinain triplestore.
|
|
2
|
+
|
|
3
|
+
Runs session_distiller.py + knowledge_integrator.py via subprocess (exact production path).
|
|
4
|
+
Caches results aggressively to avoid repeated LLM calls.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
import shutil
|
|
13
|
+
import tempfile
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from subprocess import run, PIPE, TimeoutExpired
|
|
16
|
+
|
|
17
|
+
from .base_adapter import BenchmarkInstance
|
|
18
|
+
from .config import DISTILLER_TIMEOUT_S, INTEGRATOR_TIMEOUT_S
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _scripts_dir() -> Path:
    """Return the directory three levels above this file's own location.

    For a module stored at ``sinain-memory/eval/benchmarks/ingest.py`` that is the
    ``sinain-memory`` directory, where the pipeline scripts are looked up.
    """
    location = Path(__file__).resolve()
    # Step up: file -> containing package -> its parent -> scripts root.
    for _ in range(3):
        location = location.parent
    return location
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _content_hash(sessions: list[list[dict]]) -> str:
    """Return a short, stable fingerprint of ``sessions`` for use as a cache key.

    The sessions are serialised to JSON with sorted keys, so dict key order does
    not influence the result. The fingerprint is the first 16 hex digits of the
    SHA-256 digest of that serialisation.
    """
    canonical = json.dumps(sessions, sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256(canonical.encode())
    return digest.hexdigest()[:16]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _run_script(script_name: str, args: list[str], timeout: int) -> str | None:
    """Run a Python script from sinain-memory, return stdout or None on failure.

    Args:
        script_name: File name of the script, resolved against ``_scripts_dir()``.
        args: Command-line arguments appended after the script path.
        timeout: Maximum run time in seconds.

    Returns:
        The script's stdout with surrounding whitespace stripped, or ``None`` when
        the script is missing, cannot be launched, exits non-zero, or times out.
        Every failure is reported on stdout with an ``[ingest]`` prefix.
    """
    scripts_dir = _scripts_dir()
    script_path = scripts_dir / script_name
    if not script_path.exists():
        print(f"[ingest] {script_name} not found at {script_path}")
        return None

    # Put the scripts directory first on PYTHONPATH, but keep whatever the caller
    # already had so modules reachable only through it still resolve in the child.
    python_path = str(scripts_dir)
    inherited = os.environ.get("PYTHONPATH")
    if inherited:
        python_path = os.pathsep.join([python_path, inherited])
    env = {**os.environ, "PYTHONPATH": python_path}
    # Ensure a working model is available (common.py defaults may reference unreleased models)
    if "SINAIN_BENCH_MODEL" in os.environ:
        env["SINAIN_FAST_MODEL"] = os.environ["SINAIN_BENCH_MODEL"]

    try:
        result = run(
            ["python3", str(script_path)] + args,
            capture_output=True, text=True, timeout=timeout, env=env,
        )
    except TimeoutExpired:
        print(f"[ingest] {script_name} timed out ({timeout}s)")
        return None
    except OSError as exc:
        # Launch failures (e.g. no "python3" on PATH, or an argument list the OS
        # refuses as too long) are failures like any other: report, return None.
        print(f"[ingest] {script_name} could not be started: {exc}")
        return None

    if result.returncode != 0:
        print(f"[ingest] {script_name} failed: {result.stderr[:200]}")
        return None
    return result.stdout.strip()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def ingest_instance(
    instance: BenchmarkInstance,
    cache_dir: Path,
) -> Path | None:
    """Ingest a benchmark instance into a triplestore. Returns db_path or None.

    Uses caching: if the same haystack was already ingested, returns the cached DB.

    Sessions are processed in batches of ``BATCH_SIZE``. Each batch is flattened
    into one transcript, distilled by ``session_distiller.py``, and the digest is
    passed to ``knowledge_integrator.py``. A batch that fails at any step is
    skipped (best effort), so the resulting DB may cover only part of the haystack.

    Args:
        instance: Benchmark instance; its ``sessions`` (lists of feed-item dicts)
            are ingested.
        cache_dir: Cache root; databases are stored under ``<cache_dir>/stores``.

    Returns:
        Path of the cached ``<hash>.db`` file, or ``None`` when the run produced
        no non-empty ``knowledge-graph.db``.
    """
    ch = _content_hash(instance.sessions)
    cache_path = cache_dir / "stores" / f"{ch}.db"

    if cache_path.exists():
        return cache_path

    cache_path.parent.mkdir(parents=True, exist_ok=True)

    tmp = tempfile.mkdtemp(prefix="sinain-bench-")
    success = False
    try:
        # Build the temp memory directory inside the try block so that a failure
        # during setup still removes the temp directory.
        mem_dir = Path(tmp) / "memory"
        for subdir in ["", "playbook-logs", "playbook-archive"]:
            (mem_dir / subdir).mkdir(parents=True, exist_ok=True)

        # Write a minimal playbook so integrator doesn't fail
        (mem_dir / "sinain-playbook.md").write_text(
            "# Sinain Playbook\n\n(benchmark run)\n", encoding="utf-8"
        )

        # Batch sessions into chunks of ~10 for fewer LLM calls.
        # Each chunk becomes one distiller call with a combined transcript.
        BATCH_SIZE = 10
        num_sessions = len(instance.sessions)
        batch_idx = 0  # counts only batches that are actually sent to the distiller

        for start in range(0, num_sessions, BATCH_SIZE):
            batch = instance.sessions[start:start + BATCH_SIZE]
            # Flatten batch into one transcript
            combined: list[dict] = []
            for session in batch:
                combined.extend(session)
            if len(combined) < 3:
                continue

            first_ts = combined[0].get("ts", "2025-01-01T10:00:00Z")
            meta = json.dumps({
                "ts": first_ts,
                "sessionKey": f"benchmark-batch-{batch_idx}",
                # Synthetic duration: 30 s per feed item.
                "durationMs": len(combined) * 30000,
            })
            batch_idx += 1

            # Step 1: Distill the batch
            digest_json = _run_script("session_distiller.py", [
                "--memory-dir", str(mem_dir),
                "--transcript", json.dumps(combined),
                "--session-meta", meta,
            ], DISTILLER_TIMEOUT_S)

            if not digest_json:
                continue

            try:
                digest = json.loads(digest_json)
            except json.JSONDecodeError:
                continue

            # Anything other than a JSON object cannot be a usable digest.
            if not isinstance(digest, dict):
                continue
            if digest.get("isEmpty") or digest.get("error"):
                continue

            # Step 2: Integrate into knowledge graph
            _run_script("knowledge_integrator.py", [
                "--memory-dir", str(mem_dir),
                "--digest", json.dumps(digest),
            ], INTEGRATOR_TIMEOUT_S)

        # Copy the resulting DB to cache
        db_path = mem_dir / "knowledge-graph.db"
        if db_path.exists() and db_path.stat().st_size > 0:
            # Copy to a sibling staging file, then rename into place. An
            # interrupted copy must never leave a truncated file at cache_path,
            # because later runs accept any existing cache file as valid.
            staging = cache_path.with_name(f"{cache_path.name}.{os.getpid()}.tmp")
            try:
                shutil.copy2(db_path, staging)
                os.replace(staging, cache_path)
            finally:
                staging.unlink(missing_ok=True)
            success = True

    finally:
        shutil.rmtree(tmp, ignore_errors=True)

    return cache_path if success else None
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def get_knowledge_doc(db_path: Path) -> str:
    """Render a sinain-knowledge.md style document from a triplestore.

    Args:
        db_path: Path of the triplestore database to read.

    Returns:
        The text produced by ``format_facts_text`` for up to 30 top facts
        (``max_chars=6000``), or ``"(no knowledge available)"`` when the query
        returns nothing.
    """
    import sys

    scripts_dir = str(_scripts_dir())
    # Register the scripts directory only once; inserting on every call would
    # keep growing sys.path with duplicates.
    if scripts_dir not in sys.path:
        sys.path.insert(0, scripts_dir)
    from graph_query import query_top_facts, format_facts_text

    facts = query_top_facts(str(db_path), limit=30)
    if not facts:
        return "(no knowledge available)"
    return format_facts_text(facts, max_chars=6000)
|
|
File without changes
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""LLM-as-Judge: QA answer quality evaluator (LongMemEval-compatible, 1-5 scale).
|
|
2
|
+
|
|
3
|
+
Uses GPT-4o via OpenRouter for comparability with published results.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
# Add sinain-memory to path for common imports
|
|
12
|
+
_koog_dir = str(Path(__file__).resolve().parent.parent.parent.parent)
|
|
13
|
+
if _koog_dir not in sys.path:
|
|
14
|
+
sys.path.insert(0, _koog_dir)
|
|
15
|
+
|
|
16
|
+
from common import LLMError, call_llm, extract_json # noqa: E402
|
|
17
|
+
|
|
18
|
+
SYSTEM_PROMPT = """\
|
|
19
|
+
You are evaluating whether a predicted answer correctly answers a question.
|
|
20
|
+
The gold (reference) answer is provided.
|
|
21
|
+
|
|
22
|
+
Score on a scale of 1-5:
|
|
23
|
+
5: Perfect — captures all key information from the gold answer, no errors
|
|
24
|
+
4: Mostly correct — minor omissions or imprecision, main point is right
|
|
25
|
+
3: Partially correct — captures some key points but misses important details
|
|
26
|
+
2: Related but mostly wrong — touches the topic but answer is largely incorrect
|
|
27
|
+
1: Completely wrong, contradicts the gold answer, or says "I don't know" when the answer exists
|
|
28
|
+
|
|
29
|
+
Special cases:
|
|
30
|
+
- If the gold answer indicates abstention is correct (e.g. "I don't know" or "not mentioned"),
|
|
31
|
+
then a predicted "I don't know" scores 5.
|
|
32
|
+
- Numeric answers within 10% of gold = full credit.
|
|
33
|
+
- Getting the gist right but missing specifics = 3-4 depending on importance.
|
|
34
|
+
|
|
35
|
+
Respond with ONLY a JSON object: {"score": <1-5>, "reasoning": "brief explanation"}"""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def judge_qa(
    question: str,
    gold_answer: str,
    predicted_answer: str,
    *,
    condition: str = "",
    model: str | None = None,
) -> dict | None:
    """Score a QA answer. Returns {"score": 1-5, "reasoning": str} or None on failure.

    Args:
        question: The benchmark question.
        gold_answer: Reference answer the prediction is compared against.
        predicted_answer: Answer to be scored.
        condition: Optional label appended to the prompt as "Context Condition".
        model: Model passed to ``call_llm``; when omitted, ``call_llm`` is given
            ``script="meeting_benchmark"`` instead (presumably a per-script model
            lookup in ``common`` — confirm there).

    Returns:
        ``{"score": int, "reasoning": str}`` with the reasoning capped at 300
        characters, or ``None`` when the LLM call fails, the reply is not a JSON
        object, or the score is not a number in the range 1-5.
    """
    user_parts = [
        f"## Question\n{question}",
        f"\n## Gold Answer\n{gold_answer}",
        f"\n## Predicted Answer\n{predicted_answer}",
    ]
    if condition:
        user_parts.append(f"\n## Context Condition: {condition}")

    kwargs: dict = {
        "system_prompt": SYSTEM_PROMPT,
        "user_prompt": "\n".join(user_parts),
        "max_tokens": 200,
        "json_mode": True,
    }
    if model:
        kwargs["model"] = model
    else:
        kwargs["script"] = "meeting_benchmark"

    try:
        raw = call_llm(**kwargs)
        result = extract_json(raw)
    except (ValueError, LLMError, KeyError) as e:
        print(f"[warn] qa_judge call failed: {e}", file=sys.stderr)
        return None

    # The judge may answer with valid JSON that is not an object (list, string...);
    # treat that as a failure instead of crashing on `.get`.
    if not isinstance(result, dict):
        print(f"[warn] qa_judge call failed: unexpected JSON payload {result!r}", file=sys.stderr)
        return None

    score = result.get("score")
    reasoning = result.get("reasoning", "")

    # bool is a subclass of int, so `true` would otherwise be accepted as score 1.
    is_number = isinstance(score, (int, float)) and not isinstance(score, bool)
    if not is_number or not (1 <= score <= 5):
        print(f"[warn] qa_judge returned invalid score: {score}", file=sys.stderr)
        return None

    return {"score": int(score), "reasoning": str(reasoning)[:300]}
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""LongMemEval (ICLR 2025) adapter — download + parse into sinain format.
|
|
2
|
+
|
|
3
|
+
Dataset: https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned
|
|
4
|
+
Paper: https://arxiv.org/abs/2410.10813
|
|
5
|
+
|
|
6
|
+
Fields per item:
|
|
7
|
+
question_id, question_type, question, answer, question_date,
|
|
8
|
+
haystack_session_ids, haystack_dates, haystack_sessions, answer_session_ids
|
|
9
|
+
|
|
10
|
+
haystack_sessions entries: {"role": "user"/"assistant", "content": "...", "has_answer": bool}
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import hashlib
|
|
16
|
+
import json
|
|
17
|
+
from datetime import datetime, timedelta, timezone
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
from .base_adapter import BenchmarkAdapter, BenchmarkInstance, BenchmarkQuestion
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _download_dataset(data_dir: Path) -> Path:
    """Download LongMemEval from HuggingFace if not cached.

    Args:
        data_dir: Directory under which ``longmemeval/longmemeval_s_cleaned.json``
            is stored.

    Returns:
        Path of the local dataset file.

    Raises:
        RuntimeError: If curl exits with a non-zero status.
        subprocess.TimeoutExpired: If the download exceeds 120 seconds.
    """
    cache_path = data_dir / "longmemeval" / "longmemeval_s_cleaned.json"
    if cache_path.exists():
        return cache_path

    cache_path.parent.mkdir(parents=True, exist_ok=True)

    url = "https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned/resolve/main/longmemeval_s_cleaned.json"
    print(f"[longmemeval] downloading from {url} ...")

    # Use curl to avoid macOS Python SSL cert issues
    import subprocess

    # Download to a sibling ".part" file and rename on success. A timed-out or
    # aborted transfer must not leave a truncated file at cache_path, because
    # any existing file there is treated as a complete cached dataset.
    partial = cache_path.with_name(cache_path.name + ".part")
    try:
        result = subprocess.run(
            ["curl", "-fSL", "-o", str(partial), url],
            capture_output=True, text=True, timeout=120,
        )
        if result.returncode != 0:
            raise RuntimeError(f"Download failed: {result.stderr[:200]}")
        partial.replace(cache_path)
    finally:
        # No-op after a successful rename; removes leftovers after a failure.
        partial.unlink(missing_ok=True)

    print(f"[longmemeval] saved to {cache_path} ({cache_path.stat().st_size} bytes)")
    return cache_path
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _session_hash(sessions: list[dict]) -> str:
    """Fingerprint a haystack so questions sharing the same context can be grouped.

    The value is the first 16 hex digits of the SHA-256 digest of the haystack's
    JSON serialisation (keys sorted, non-ASCII characters kept as-is).
    """
    serialised = json.dumps(sessions, sort_keys=True, ensure_ascii=False)
    hasher = hashlib.sha256()
    hasher.update(serialised.encode())
    return hasher.hexdigest()[:16]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _sessions_to_feed_items(
    haystack_sessions: list[list[dict]],
    haystack_session_ids: list[str],
    haystack_dates: list[str],
) -> list[list[dict]]:
    """Turn a LongMemEval haystack into sessions of sinain feed items.

    ``haystack_sessions[i]`` is a list of turn dicts with ``role`` and ``content``
    keys. Empty sessions are dropped. Within a session, turn ``j`` is stamped
    ``30 * j`` seconds after the session date taken from ``haystack_dates[i]``
    (a fixed default date is used when no date exists for that index).

    Turns whose role is ``"user"`` get source ``"audio"``; all other turns get
    source ``"agent"``. ``haystack_session_ids`` is accepted but not used.
    """
    default_ts = "2025-01-01T10:00:00Z"
    feed_sessions: list[list[dict]] = []

    for idx, turns in enumerate(haystack_sessions):
        if not turns:
            continue

        raw_date = haystack_dates[idx] if idx < len(haystack_dates) else default_ts
        session_start = _parse_date(raw_date)

        feed = [
            {
                "source": "audio" if turn.get("role") == "user" else "agent",
                "text": turn.get("content", ""),
                "ts": (session_start + timedelta(seconds=30 * pos)).isoformat(),
                "channel": "benchmark",
            }
            for pos, turn in enumerate(turns)
        ]
        if feed:
            feed_sessions.append(feed)

    return feed_sessions
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _parse_date(s: str) -> datetime:
    """Best-effort date parsing; the result is always tagged as UTC.

    Accepted inputs (surrounding whitespace is ignored):
      * ``2025-01-01T10:00:00Z`` and ``2025-01-01T10:00:00``
      * ``2025-01-01``
      * ``01/31/2025`` (month/day/year)
      * ``2023/05/20 (Sat) 02:21``, ``2023/05/20 02:21`` and ``2023/05/20`` —
        the year-first slash style that LongMemEval is understood to use for
        session dates (TODO confirm against the dataset card)

    Args:
        s: Date string; non-string values are tolerated.

    Returns:
        The parsed datetime with ``tzinfo`` set to UTC, or 2025-01-01 10:00 UTC
        when ``s`` is not a string or matches none of the formats.
    """
    fallback = datetime(2025, 1, 1, 10, 0, tzinfo=timezone.utc)
    if not isinstance(s, str):
        return fallback

    text = s.strip()
    # Drop a parenthesised weekday such as "(Sat)": it adds no information, and
    # matching it with %a would make parsing depend on the process locale.
    opening = text.find("(")
    closing = text.find(")", opening + 1)
    if opening != -1 and closing != -1:
        text = " ".join((text[:opening] + " " + text[closing + 1:]).split())

    formats = (
        "%Y-%m-%dT%H:%M:%SZ",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%d",
        "%m/%d/%Y",
        "%Y/%m/%d %H:%M",
        "%Y/%m/%d",
    )
    for fmt in formats:
        try:
            return datetime.strptime(text, fmt).replace(tzinfo=timezone.utc)
        except ValueError:
            continue
    return fallback
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class LongMemEvalAdapter(BenchmarkAdapter):
    """Adapter for LongMemEval (ICLR 2025) benchmark."""

    @property
    def name(self) -> str:
        """Short identifier of this benchmark."""
        return "longmemeval"

    def load_dataset(self, data_dir: str) -> list[BenchmarkInstance]:
        """Download and parse LongMemEval, grouping questions by shared haystack.

        Args:
            data_dir: Directory used to cache the downloaded dataset file.

        Returns:
            One ``BenchmarkInstance`` per distinct haystack (identified by the
            content hash of ``haystack_sessions``), carrying every question
            asked against that haystack.
        """
        path = _download_dataset(Path(data_dir))
        # JSON files are UTF-8; do not depend on the platform default encoding.
        with open(path, encoding="utf-8") as f:
            raw_items = json.load(f)

        # Group questions by haystack content hash
        groups: dict[str, dict] = {}
        for item in raw_items:
            # Use the same value for hashing and storage so an item without a
            # haystack is grouped under the empty haystack instead of raising.
            haystack = item.get("haystack_sessions", [])
            h = _session_hash(haystack)
            if h not in groups:
                groups[h] = {
                    "haystack_sessions": haystack,
                    "haystack_session_ids": item.get("haystack_session_ids", []),
                    "haystack_dates": item.get("haystack_dates", []),
                    "questions": [],
                }
            groups[h]["questions"].append(item)

        instances = []
        for h, group in groups.items():
            raw_sessions = group["haystack_sessions"]
            feed_sessions = _sessions_to_feed_items(
                raw_sessions,
                group["haystack_session_ids"],
                group["haystack_dates"],
            )

            questions = [
                BenchmarkQuestion(
                    id=item["question_id"],
                    text=item["question"],
                    gold_answer=str(item["answer"]),
                    category=item.get("question_type", "unknown"),
                    evidence_session_ids=item.get("answer_session_ids", []),
                    metadata={
                        "question_date": item.get("question_date", ""),
                    },
                )
                for item in group["questions"]
            ]

            # Total turns across all sessions. A session stored as a single turn
            # dict counts as one turn, mirroring format_full_context.
            num_turns = sum(
                len(session) if isinstance(session, list) else 1
                for session in raw_sessions
            )

            instances.append(BenchmarkInstance(
                id=f"lme-{h}",
                sessions=feed_sessions,
                questions=questions,
                raw_sessions=raw_sessions,
                metadata={
                    "haystack_hash": h,
                    "num_sessions": len(feed_sessions),
                    "num_turns": num_turns,
                },
            ))

        print(f"[longmemeval] loaded {sum(len(i.questions) for i in instances)} questions "
              f"across {len(instances)} unique haystacks")
        return instances

    def format_full_context(self, instance: BenchmarkInstance) -> str:
        """Render the full conversation history for the baseline condition.

        Each turn becomes a ``"Role: content"`` paragraph. A ``---`` paragraph
        follows every session that is stored as a list of turns. Paragraphs are
        joined with blank lines.
        """
        lines = []
        for session in instance.raw_sessions:
            if isinstance(session, list):
                for turn in session:
                    role = turn.get("role", "unknown").capitalize()
                    content = turn.get("content", "")
                    lines.append(f"{role}: {content}")
                lines.append("---")  # session separator
            elif isinstance(session, dict):
                # A session given as a single turn dict: no separator is added.
                role = session.get("role", "unknown").capitalize()
                content = session.get("content", "")
                lines.append(f"{role}: {content}")
        return "\n\n".join(lines)
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Query pipeline — benchmark questions → LLM answers under 3 conditions.
|
|
2
|
+
|
|
3
|
+
Condition A (sinain-memory): answer from knowledge graph facts
|
|
4
|
+
Condition B (full-context): answer from full conversation history
|
|
5
|
+
Condition C (knowledge-doc): answer from portable knowledge document
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import sys
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
# Add sinain-memory to path
|
|
14
|
+
_koog_dir = str(Path(__file__).resolve().parent.parent.parent)
|
|
15
|
+
if _koog_dir not in sys.path:
|
|
16
|
+
sys.path.insert(0, _koog_dir)
|
|
17
|
+
|
|
18
|
+
from common import call_llm # noqa: E402
|
|
19
|
+
|
|
20
|
+
from .base_adapter import BenchmarkQuestion
|
|
21
|
+
from .config import QA_MODEL, MAX_FACTS_PER_QUERY
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _extract_keywords(query: str) -> list[str]:
    """Return the search keywords found in ``query``, in order of appearance.

    The query is lower-cased and split into tokens that start with a letter and
    continue with letters, digits or hyphens. Tokens of two characters or fewer
    and common English stopwords are dropped. Duplicates are kept.
    """
    import re

    ignored = frozenset((
        "the", "is", "in", "on", "for", "and", "or", "of", "to", "a", "an",
        "it", "was", "not", "how", "what", "when", "does", "did", "do", "my",
        "your", "their", "have", "has", "had", "are", "were", "been", "being",
        "about", "from", "with", "that", "this", "which", "who", "whom",
        "where", "why", "can", "could", "would", "should",
    ))

    keywords: list[str] = []
    for token in re.findall(r"[a-zA-Z][a-zA-Z0-9-]+", query.lower()):
        if len(token) <= 2 or token in ignored:
            continue
        keywords.append(token)
    return keywords
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _get_all_facts_text(db_path: str) -> str:
    """Render the knowledge graph's top facts (at most 50) as formatted text.

    Serves as the broad fallback context when targeted retrieval finds nothing.
    The rendered text is capped via ``max_chars=6000``. When the store yields no
    facts, the placeholder ``"(no knowledge available)"`` is returned.
    """
    from graph_query import format_facts_text, query_top_facts

    top_facts = query_top_facts(db_path, limit=50)
    if top_facts:
        return format_facts_text(top_facts, max_chars=6000)
    return "(no knowledge available)"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _query_knowledge(db_path: str, question: str) -> str:
    """Return formatted knowledge-graph facts to answer ``question`` from.

    Keywords extracted from the question drive an entity-based lookup, limited
    to ``MAX_FACTS_PER_QUERY`` facts and rendered with ``max_chars=3000``. When
    the question yields no keywords or the lookup returns nothing, the result of
    ``_get_all_facts_text`` is returned instead.
    """
    from graph_query import format_facts_text, query_facts_by_entities

    terms = _extract_keywords(question)
    matched = (
        query_facts_by_entities(db_path, terms, max_facts=MAX_FACTS_PER_QUERY)
        if terms
        else None
    )
    if matched:
        return format_facts_text(matched, max_chars=3000)

    # Nothing targeted was found: fall back to the broad dump.
    return _get_all_facts_text(db_path)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _get_retrieved_facts(db_path: str, question: str, k: int = 10) -> list[dict]:
    """Return up to ``k`` facts retrieved for ``question`` (for retrieval evaluation).

    Tries an entity lookup on the question's keywords first. When there are no
    keywords or the lookup comes back empty, the result of
    ``query_top_facts(db_path, limit=k)`` is returned.
    """
    from graph_query import query_facts_by_entities, query_top_facts

    terms = _extract_keywords(question)
    matched = query_facts_by_entities(db_path, terms, max_facts=k) if terms else None
    if matched:
        return matched

    # Fallback when targeted retrieval produced nothing.
    return query_top_facts(db_path, limit=k)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def compute_content_recall(
    retrieved_facts: list[dict],
    gold_answer: str,
    k_values: list[int] | None = None,
) -> dict:
    """Content-based retrieval metric: do retrieved facts contain the answer?

    Instead of matching entity IDs (which don't align between LongMemEval
    session IDs and sinain entity IDs), we check whether the gold answer's
    key terms appear in any retrieved fact's content.

    Args:
        retrieved_facts: Facts in rank order; each fact's ``entity`` and ``value``
            fields are searched for gold terms.
        gold_answer: Reference answer; its keywords are the gold terms.
        k_values: Cut-offs to report. ``K_VALUES`` from the benchmark config is
            used when this is ``None`` or empty.

    Returns:
        ``{"content_recall@k": 1.0 or 0.0}`` for every cut-off ``k``: 1.0 when at
        least one of the first ``k`` facts shares enough keywords with the gold
        answer. All values are 0.0 when the gold answer yields no keywords.
    """
    if k_values:
        ks = k_values
    else:
        # Load the configured default only when the caller supplied no cut-offs.
        from .config import K_VALUES
        ks = K_VALUES

    gold_terms = set(_extract_keywords(str(gold_answer)))
    if not gold_terms:
        return {f"content_recall@{k}": 0.0 for k in ks}

    # Required overlap: half of the gold terms, rounded DOWN, but at least one.
    # NOTE(review): for an odd number of gold terms this is below 50%
    # (e.g. 1 of 3) — confirm whether rounding up was intended.
    needed = max(1, len(gold_terms) // 2)

    # Tokenise each fact once, not once per cut-off.
    hits = []
    for fact in retrieved_facts:
        fact_text = f"{fact.get('entity', '')} {fact.get('value', '')}".lower()
        fact_terms = set(_extract_keywords(fact_text))
        hits.append(len(gold_terms & fact_terms) >= needed)

    return {f"content_recall@{k}": 1.0 if any(hits[:k]) else 0.0 for k in ks}
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def answer_question(
    question: BenchmarkQuestion,
    condition: str,
    *,
    db_path: str | None = None,
    full_context: str | None = None,
    knowledge_doc: str | None = None,
    model: str | None = None,
) -> str:
    """Generate an answer for a benchmark question under a specific condition.

    Args:
        question: The question to answer; only its ``text`` is used.
        condition: ``"sinain-memory"``, ``"full-context"`` or ``"knowledge-doc"``.
        db_path: Triplestore path; required for ``"sinain-memory"``.
        full_context: Conversation history; required for ``"full-context"``.
            Only the first 100,000 characters are sent to the model.
        knowledge_doc: Knowledge document; required for ``"knowledge-doc"``.
        model: Model override; ``QA_MODEL`` is used when omitted.

    Returns:
        The LLM's answer text, stripped. If the LLM call raises, the string
        ``"(error: <message>)"`` is returned instead of propagating.

    Raises:
        ValueError: If ``condition`` is not one of the three known values.
        AssertionError: If the input required by ``condition`` is missing or empty.
    """
    qa_model = model or QA_MODEL

    # The required-input checks raise explicitly instead of using `assert`, so
    # they stay active when Python runs with -O. AssertionError is kept as the
    # exception type for compatibility with existing callers.
    if condition == "sinain-memory":
        if not db_path:
            raise AssertionError("db_path required for sinain-memory condition")
        facts = _query_knowledge(db_path, question.text)
        system = (
            "Answer the question using ONLY the provided knowledge facts. "
            "If the facts don't contain enough information to answer, say \"I don't know.\""
        )
        user = f"## Knowledge Facts\n{facts}\n\n## Question\n{question.text}"

    elif condition == "full-context":
        if not full_context:
            raise AssertionError("full_context required for full-context condition")
        system = (
            "Answer the question based on the conversation history below. "
            "Be concise and specific."
        )
        # Truncate context if too large (some models have limits); slicing a
        # shorter string returns it unchanged.
        ctx = full_context[:100_000]
        user = f"## Conversation History\n{ctx}\n\n## Question\n{question.text}"

    elif condition == "knowledge-doc":
        if not knowledge_doc:
            raise AssertionError("knowledge_doc required for knowledge-doc condition")
        system = (
            "Answer the question using ONLY the knowledge document provided. "
            "If the document doesn't contain enough information, say \"I don't know.\""
        )
        user = f"## Knowledge Document\n{knowledge_doc}\n\n## Question\n{question.text}"

    else:
        raise ValueError(f"Unknown condition: {condition}")

    try:
        return call_llm(
            system_prompt=system,
            user_prompt=user,
            model=qa_model,
            max_tokens=300,
        ).strip()
    except Exception as e:
        # Deliberate best effort: a failed call becomes an "(error: ...)" answer
        # so one bad question does not abort the whole benchmark run.
        return f"(error: {e})"
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Report generation — markdown, JSON, and LaTeX output."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def generate_markdown(benchmark_name: str, summary: dict, details: list[dict]) -> str:
    """Generate a publishable markdown report.

    Args:
        benchmark_name: Name shown in the report title.
        summary: Aggregates. Recognised keys, all optional: ``ipr`` (fraction),
            ``conditions`` (per-condition ``mean_score``, ``n`` and optional
            ``mean_f1``), ``retrieval`` (metric name -> fraction) and
            ``categories`` (category -> condition -> ``mean_score``/``n``).
        details: Per-question records. Those with a ``sinain-memory`` score feed
            the "hardest questions" section.

    Returns:
        The report as one markdown string. Sections whose data is missing or
        empty are omitted.
    """
    lines = [
        f"# Sinain Knowledge Graph — {benchmark_name} Results",
        f"\nGenerated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}",
        "",
    ]

    # Headline IPR
    ipr = summary.get("ipr")
    # Compare against None: an IPR of 0.0 is a real result and must be shown.
    if ipr is not None:
        lines.append(f"**Information Preservation Rate (IPR)**: {ipr:.1%}")
        lines.append("")

    # Condition scores table
    conditions = summary.get("conditions", {})
    if conditions:
        cond_names = sorted(conditions.keys())
        header = "| Condition | Mean Score (1-5) | Mean F1 | N |"
        sep = "|-----------|------------------|---------|---|"
        lines.extend([header, sep])
        for cond in cond_names:
            c = conditions[cond]
            lines.append(f"| {cond} | {c['mean_score']:.2f} | {c.get('mean_f1', 0):.2f} | {c['n']} |")
        lines.append("")

    # Retrieval metrics
    retrieval = summary.get("retrieval", {})
    if retrieval:
        lines.append("## Retrieval Quality")
        lines.append("| Metric | Score |")
        lines.append("|--------|-------|")
        for k, v in sorted(retrieval.items()):
            lines.append(f"| {k} | {v:.1%} |")
        lines.append("")

    # Category breakdown
    categories = summary.get("categories", {})
    if categories:
        lines.append("## By Category")
        # Columns: every condition that appears in at least one category.
        cond_names = sorted(set(c for cat in categories.values() for c in cat))
        header = "| Category | " + " | ".join(cond_names) + " |"
        sep = "|----------|" + "|".join(["------"] * len(cond_names)) + "|"
        lines.extend([header, sep])
        for cat in sorted(categories):
            cells = []
            for cond in cond_names:
                if cond in categories[cat]:
                    cells.append(f"{categories[cat][cond]['mean_score']:.2f} (n={categories[cat][cond]['n']})")
                else:
                    cells.append("-")
            lines.append(f"| {cat} | " + " | ".join(cells) + " |")
        lines.append("")

    # Failures (worst questions)
    if details:
        sm_details = [d for d in details if d.get("answers", {}).get("sinain-memory", {}).get("score") is not None]
        # Lowest sinain-memory score first.
        sm_details.sort(key=lambda d: d["answers"]["sinain-memory"]["score"])
        if sm_details:
            lines.append("## Hardest Questions for sinain-memory (bottom 5)")
            for d in sm_details[:5]:
                sm = d["answers"]["sinain-memory"]
                fc = d["answers"].get("full-context", {})
                lines.append(f"- **{d['id']}** [{d['category']}]: score={sm['score']}/5 "
                             f"(full-context: {fc.get('score', '?')}/5)")
                question = d['question']
                # Mark the preview with an ellipsis only when text was cut off.
                preview = question[:100] + ("..." if len(question) > 100 else "")
                lines.append(f" Q: {preview}")
            lines.append("")

    return "\n".join(lines)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def generate_json(benchmark_name: str, summary: dict, details: list[dict]) -> str:
    """Serialise the benchmark results as a pretty-printed JSON document.

    The document has four top-level keys in this order: ``benchmark``,
    ``timestamp`` (current UTC time in ISO format), ``summary`` and ``details``.
    Non-ASCII characters are written as-is and indentation is two spaces.
    """
    generated_at = datetime.now(timezone.utc).isoformat()
    report = {
        "benchmark": benchmark_name,
        "timestamp": generated_at,
        "summary": summary,
        "details": details,
    }
    return json.dumps(report, indent=2, ensure_ascii=False)
|