@geravant/sinain 1.11.0 → 1.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/package.json +1 -1
  2. package/sinain-core/package-lock.json +963 -0
  3. package/sinain-core/package.json +1 -0
  4. package/sinain-core/src/buffers/feed-buffer.ts +32 -0
  5. package/sinain-core/src/embedding/service.ts +66 -0
  6. package/sinain-core/src/escalation/escalator.ts +1 -0
  7. package/sinain-core/src/escalation/message-builder.ts +45 -118
  8. package/sinain-core/src/index.ts +19 -2
  9. package/sinain-core/src/learning/local-curation.ts +137 -7
  10. package/sinain-core/src/overlay/commands.ts +16 -3
  11. package/sinain-core/src/overlay/ws-handler.ts +4 -1
  12. package/sinain-core/src/server.ts +31 -0
  13. package/sinain-core/src/types.ts +3 -0
  14. package/sinain-memory/README.md +105 -0
  15. package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
  16. package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
  17. package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
  18. package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
  19. package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
  20. package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
  21. package/sinain-memory/embed_client.py +117 -0
  22. package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
  23. package/sinain-memory/eval/benchmarks/__init__.py +0 -0
  24. package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
  25. package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
  26. package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
  27. package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
  28. package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
  29. package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
  30. package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
  31. package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
  32. package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
  33. package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
  34. package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
  35. package/sinain-memory/eval/benchmarks/base_adapter.py +43 -0
  36. package/sinain-memory/eval/benchmarks/config.py +23 -0
  37. package/sinain-memory/eval/benchmarks/evaluate.py +146 -0
  38. package/sinain-memory/eval/benchmarks/ingest.py +152 -0
  39. package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
  40. package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
  41. package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
  42. package/sinain-memory/eval/benchmarks/judges/qa_judge.py +81 -0
  43. package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +177 -0
  44. package/sinain-memory/eval/benchmarks/meeting_adapter.py +81 -0
  45. package/sinain-memory/eval/benchmarks/meeting_runner.py +230 -0
  46. package/sinain-memory/eval/benchmarks/query.py +193 -0
  47. package/sinain-memory/eval/benchmarks/report.py +87 -0
  48. package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +318 -0
  49. package/sinain-memory/eval/benchmarks/runner.py +283 -0
  50. package/sinain-memory/graph_query.py +257 -15
  51. package/sinain-memory/knowledge_integrator.py +365 -72
  52. package/sinain-memory/koog-config.json +11 -0
  53. package/sinain-memory/memory-config.json +1 -1
  54. package/sinain-memory/session_distiller.py +43 -19
  55. package/sinain-memory/triplestore.py +60 -0
@@ -0,0 +1,318 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ # ── Meeting Memory Benchmark — end-to-end capture + evaluate ─────────────────
5
+ # 1. Opens meeting recording fullscreen in QuickTime
6
+ # 2. Starts sinain (audio + sense capture, no agent, no overlay)
7
+ # 3. Waits for recording to finish
8
+ # 4. Stops sinain → saves pending session
9
+ # 5. Restarts sinain → distills pending session into knowledge graph
10
+ # 6. Runs evaluation harness against the distilled DB
11
+ #
12
+ # Usage: ./run_meeting_bench.sh <mp4_path>
13
+ # Output: eval/benchmarks/results/meeting_results.md
14
+
15
# Resolve every path from this script's own location so the benchmark can be
# launched from any working directory.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
SINAIN_ROOT="$(cd "$SCRIPT_DIR/../../../.." && pwd)"
KOOG_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"

# ANSI styling. These stay as literal backslash sequences (not raw ESC bytes):
# they are expanded later by `echo -e` and by the `printf` calls that build
# the sed log prefixes.
BOLD='\033[1m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
RED='\033[0;31m'
CYAN='\033[0;36m'
RESET='\033[0m'

# _emit <badge> <message...>
# Print one "[bench]" line. <badge> is either empty or a pre-coloured status
# glyph including its trailing space.
_emit() {
  local badge="$1"
  shift
  echo -e "${BOLD}[bench]${RESET} ${badge}$*"
}

log()  { _emit "" "$@"; }
ok()   { _emit "${GREEN}✓${RESET} " "$@"; }
warn() { _emit "${YELLOW}⚠${RESET} " "$@"; }
# fail prints the message and then aborts the whole script with status 1.
fail() { _emit "${RED}✗${RESET} " "$@"; exit 1; }
30
+
31
+ # ── Args ─────────────────────────────────────────────────────────────────────
32
# The recording to replay is the single mandatory argument; it must be an
# existing regular file.
RECORDING="${1:-}"
[[ -n "$RECORDING" && -f "$RECORDING" ]] || fail "Usage: $0 <path-to-mp4>"

# ── Setup ──
# Every run gets its own scratch directory, keyed by the start time in epoch
# seconds.
BENCH_DIR="/tmp/sinain-bench-$(date +%s)"
mkdir -p "$BENCH_DIR"
log "Benchmark directory: ${CYAN}${BENCH_DIR}${RESET}"
41
+
42
# Load API keys and audio config from the first .env file that exists.
# This is a deliberately small parser (KEY=VALUE lines only) rather than
# `source`, so nothing in the file is ever executed as shell code.
#   * blank lines and lines starting with '#' are skipped
#   * everything after the first '#' in a value is treated as a comment
#   * variables already set in the environment are never overridden
for _env_file in "$SINAIN_ROOT/.env" "$SINAIN_ROOT/sinain-core/.env" "$HOME/.sinain/.env"; do
  [ -f "$_env_file" ] || continue
  log "Loading $_env_file"
  # `read` returns non-zero on a final line that has no trailing newline but
  # still fills the variables; the `|| [ -n "$_k" ]` keeps that last entry.
  while IFS='=' read -r _k _v || [ -n "$_k" ]; do
    [[ -z "$_k" || "$_k" =~ ^[[:space:]]*# ]] && continue
    # xargs trims surrounding whitespace and strips quotes. It exits non-zero
    # on an unbalanced quote, which would otherwise abort the script under
    # `set -e`, so such entries are skipped instead.
    _k=$(printf '%s\n' "$_k" | xargs 2>/dev/null) || continue
    [[ -z "$_k" ]] && continue
    # An invalid identifier would make the indirect expansion ${!_k+x} below a
    # fatal "bad substitution" error.
    if [[ ! "$_k" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
      warn "Ignoring .env entry with invalid variable name: $_k"
      continue
    fi
    # Only the key is ever printed: values may be secrets.
    if ! _v=$(printf '%s\n' "$_v" | xargs 2>/dev/null); then
      warn "Ignoring $_k: value could not be parsed (unbalanced quote?)"
      continue
    fi
    _v="${_v%%#*}"
    _v=$(printf '%s\n' "$_v" | xargs 2>/dev/null) || continue
    [[ -z "$_v" ]] && continue
    if [ -z "${!_k+x}" ]; then export "$_k=$_v"; fi
  done < "$_env_file"
  # Only the first file found is used.
  break
done
58
+
59
# Bench-specific overrides. SINAIN_MEMORY_DIR points at the per-run scratch
# directory, which is where this script later looks for pending-session.json
# and knowledge-graph.db.
export SINAIN_MEMORY_DIR="$BENCH_DIR"
export AGENT_ENABLED=false
export ESCALATION_MODE=off

# Local whisper setup (from start-local.sh). Values already present in the
# environment win over these defaults.
MODEL_DIR="$HOME/models"
MODEL_NAME="ggml-large-v3-turbo.bin"
export LOCAL_WHISPER_MODEL="${LOCAL_WHISPER_MODEL:-$MODEL_DIR/$MODEL_NAME}"
export LOCAL_WHISPER_BIN="${LOCAL_WHISPER_BIN:-whisper-cli}"
export TRANSCRIPTION_BACKEND=local

# ── Get recording duration ──
# Ask ffprobe for the container duration and keep whole seconds. Anything that
# is not a plain integer (ffprobe missing, empty output, "N/A") falls back to
# a fixed estimate; otherwise the arithmetic below would abort the script.
FALLBACK_DURATION=1620 # 27 min
DURATION_RAW=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$RECORDING" 2>/dev/null || echo "")
DURATION="${DURATION_RAW%%.*}"
if [[ "$DURATION" =~ ^[0-9]+$ ]]; then
  # Force base 10 so a value with a leading zero is not parsed as octal.
  DURATION=$((10#$DURATION))
else
  warn "Could not read duration via ffprobe — assuming ${FALLBACK_DURATION}s"
  DURATION=$FALLBACK_DURATION
fi
log "Recording duration: ${DURATION}s (~$((DURATION / 60))m)"
79
+
80
+ # ── Cleanup handler ──────────────────────────────────────────────────────────
81
# PIDs of the two background pipelines; empty while nothing is running.
# Both pipelines end in `sed`, and $! reports the last process of a pipeline,
# so these are the sed log-prefixers rather than tsx / sense_client.
CORE_PID=""
SENSE_PID=""

# cleanup — EXIT trap: stop everything this script may have started.
# Sends SIGTERM first, waits two seconds, then SIGKILL. Every step is
# best-effort (`|| true`) so a missing process never fails the trap.
cleanup() {
  local pid
  log "Cleaning up..."
  for pid in "$SENSE_PID" "$CORE_PID"; do
    [ -n "$pid" ] && kill "$pid" 2>/dev/null || true
  done
  # Killing the sed wrappers does not stop the real workers, so match them by
  # command line as well (same patterns as the stale-process sweep).
  pkill -f "tsx.*src/index.ts" 2>/dev/null || true
  pkill -f "python3 -m sense_client" 2>/dev/null || true
  pkill -f "Python -m sense_client" 2>/dev/null || true
  sleep 2
  for pid in "$SENSE_PID" "$CORE_PID"; do
    [ -n "$pid" ] && kill -9 "$pid" 2>/dev/null || true
  done
  pkill -9 -f "tsx.*src/index.ts" 2>/dev/null || true
  pkill -9 -f "python3 -m sense_client" 2>/dev/null || true
  pkill -9 -f "Python -m sense_client" 2>/dev/null || true
  # Kill anything still listening on port 9500
  lsof -i :9500 -sTCP:LISTEN -t 2>/dev/null | xargs kill -9 2>/dev/null || true
  # Close QuickTime
  osascript -e 'tell application "QuickTime Player" to quit' 2>/dev/null || true
}
trap cleanup EXIT
97
+
98
+ # ── Kill stale sinain processes ──────────────────────────────────────────────
99
# Sweep away leftovers from earlier runs: match known command lines, then
# anything still listening on the core port.
log "Killing stale processes..."
for stale_pattern in \
  "tsx.*src/index.ts" \
  "python3 -m sense_client" \
  "Python -m sense_client" \
  "tools/sck-capture/sck-capture"; do
  pkill -f "$stale_pattern" 2>/dev/null || true
done
lsof -i :9500 -sTCP:LISTEN -t 2>/dev/null | xargs kill -9 2>/dev/null || true
sleep 2

# ── Phase 1a: Open video fullscreen ──
log "Opening recording in QuickTime (fullscreen)..."
open -a "QuickTime Player" "$RECORDING"
sleep 3
# Each -e contributes one line of the AppleScript: present the document
# fullscreen, give it a second, then start playback.
osascript \
  -e 'tell application "QuickTime Player"' \
  -e 'present front document' \
  -e 'delay 1' \
  -e 'play front document' \
  -e 'end tell' \
  2>/dev/null || warn "Could not auto-play — check QuickTime"
ok "Video playing fullscreen"
119
+
120
+ # ── Phase 1b: Start sinain-core ──────────────────────────────────────────────
121
log "Starting sinain-core (capture-only, local whisper)..."
# Coloured prefix that sed prepends to every line of core output.
CORE_TAG="$(printf "${CYAN}[core]${RESET} ")"
(cd "$SINAIN_ROOT/sinain-core" && npx tsx src/index.ts 2>&1) \
  | sed -u "s/^/${CORE_TAG}/" &
CORE_PID=$!

# Poll the health endpoint once a second, for at most 20 attempts.
CORE_OK=false
attempt=0
while (( attempt < 20 )); do
  if curl -sf http://localhost:9500/health >/dev/null 2>&1; then
    CORE_OK=true
    break
  fi
  sleep 1
  attempt=$((attempt + 1))
done
if [ "$CORE_OK" = true ]; then
  ok "sinain-core healthy on :9500"
else
  fail "sinain-core did not start"
fi

# ── Phase 1c: Start sense_client ──
log "Starting sense_client (screen capture + OCR)..."

# Propagate privacy mode: default both settings to "full" when unset or empty.
: "${PRIVACY_OCR_OPENROUTER:=full}"
: "${PRIVACY_IMAGES_OPENROUTER:=full}"
export PRIVACY_OCR_OPENROUTER PRIVACY_IMAGES_OPENROUTER

SENSE_TAG="$(printf "${YELLOW}[sense]${RESET} ")"
(cd "$SINAIN_ROOT" && python3 -m sense_client 2>&1) \
  | sed -u "s/^/${SENSE_TAG}/" &
SENSE_PID=$!
sleep 2

# A dead pipeline after two seconds means sense_client never came up; the
# benchmark then carries on with audio only.
if ! kill -0 "$SENSE_PID" 2>/dev/null; then
  warn "sense_client failed to start — continuing with audio only"
  SENSE_PID=""
else
  ok "sense_client running"
fi
159
+
160
+ # ── Phase 1d: Wait for recording to finish ───────────────────────────────────
161
BUFFER=60 # extra time for trailing transcription/OCR
TOTAL_WAIT=$((DURATION + BUFFER))
log "Waiting ${TOTAL_WAIT}s for recording + buffer..."
log " (recording ends at $(date -v+${DURATION}S '+%H:%M:%S'), buffer until $(date -v+${TOTAL_WAIT}S '+%H:%M:%S'))"

# _feed_count — print the number of messages currently in the core feed, or
# "?" when the endpoint cannot be reached or its reply cannot be parsed.
_feed_count() {
  curl -sf http://localhost:9500/feed 2>/dev/null \
    | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('messages',[])))" 2>/dev/null \
    || echo "?"
}

# Sleep in chunks of at most five minutes, reporting progress after every
# chunk except the last one.
ELAPSED=0
while (( ELAPSED < TOTAL_WAIT )); do
  REMAINING=$((TOTAL_WAIT - ELAPSED))
  SLEEP_CHUNK=$(( REMAINING < 300 ? REMAINING : 300 ))
  sleep "$SLEEP_CHUNK"
  ELAPSED=$((ELAPSED + SLEEP_CHUNK))
  REMAINING=$((TOTAL_WAIT - ELAPSED))
  if (( REMAINING > 0 )); then
    FEED_COUNT=$(_feed_count)
    log " ${ELAPSED}s elapsed, ${REMAINING}s remaining — feed items: ${FEED_COUNT}"
  fi
done

ok "Recording capture complete"

# Check what we captured
FEED_COUNT=$(_feed_count)
log "Captured ${FEED_COUNT} feed items"
188
+
189
+ # ── Phase 1e: Stop sinain (saves pending session) ───────────────────────────
190
log "Stopping sinain (saving pending session)..."
[ -n "$SENSE_PID" ] && kill "$SENSE_PID" 2>/dev/null || true
SENSE_PID=""
# $SENSE_PID is the sed at the end of the sense pipeline, so killing it does
# not stop the capture process itself — signal sense_client by command line.
pkill -f "python3 -m sense_client" 2>/dev/null || true
pkill -f "Python -m sense_client" 2>/dev/null || true

# Send SIGINT directly to the tsx/node process (not the pipe wrapper).
# The pipe means $CORE_PID is sed, not tsx — so we pkill the actual process.
pkill -INT -f "tsx src/index.ts" 2>/dev/null || true
log "Sent SIGINT to tsx, waiting for graceful shutdown..."
# Give it up to 10 seconds, but move on as soon as the process has exited.
for _ in $(seq 1 10); do
  pgrep -f "tsx src/index.ts" >/dev/null 2>&1 || break
  sleep 1
done

# Force if still alive. A forced kill can interrupt the shutdown handler, so
# say so: the pending-session check that follows may then come up empty.
if pgrep -f "tsx src/index.ts" >/dev/null 2>&1; then
  warn "tsx still running after 10s — forcing shutdown"
fi
pkill -9 -f "tsx src/index.ts" 2>/dev/null || true
kill -9 "$CORE_PID" 2>/dev/null || true
CORE_PID=""
lsof -i :9500 -sTCP:LISTEN -t 2>/dev/null | xargs kill -9 2>/dev/null || true
sleep 2

# Close QuickTime
osascript -e 'tell application "QuickTime Player" to quit' 2>/dev/null || true
209
+
210
# Confirm the shutdown left something to evaluate: either the pending session
# file, or a knowledge graph (meaning the session was already distilled and the
# pending file consumed).
PENDING_FILE="$BENCH_DIR/pending-session.json"
GRAPH_DB="$BENCH_DIR/knowledge-graph.db"

if [[ -f "$PENDING_FILE" ]]; then
  PENDING_ITEMS=$(python3 -c "import json; print(len(json.load(open('$PENDING_FILE')).get('items',[])))" 2>/dev/null || echo "?")
  ok "Pending session saved: ${PENDING_ITEMS} items"
elif [[ -f "$GRAPH_DB" ]]; then
  ok "Inline distillation completed (pending-session.json already consumed)"
else
  warn "No pending-session.json and no knowledge-graph.db — will retry with longer shutdown"
  # Second attempt: run core briefly so it captures a few items, then shut it
  # down gracefully. Output goes to a log file instead of the terminal.
  log "Starting core for a brief capture + shutdown cycle..."
  (cd "$SINAIN_ROOT/sinain-core" && npx tsx src/index.ts 2>&1) > /tmp/sinain-bench-retry.log &
  RETRY_PID=$!
  sleep 15 # let it start and capture a few items
  # $RETRY_PID is the wrapping subshell; look up the actual node process and
  # interrupt that one.
  NODE_PID=$(pgrep -f "tsx src/index.ts" 2>/dev/null | head -1 || true)
  if [[ -n "$NODE_PID" ]]; then
    kill -INT "$NODE_PID" 2>/dev/null || true
    sleep 10
    kill -9 "$NODE_PID" 2>/dev/null || true
  fi
  kill -9 "$RETRY_PID" 2>/dev/null || true
  lsof -i :9500 -sTCP:LISTEN -t 2>/dev/null | xargs kill -9 2>/dev/null || true
  sleep 2
  if [[ ! -f "$PENDING_FILE" && ! -f "$GRAPH_DB" ]]; then
    fail "Could not capture any session data"
  fi
  ok "Recovery succeeded"
fi
239
+
240
+ # ── Phase 1f: Restart for distillation ───────────────────────────────────────
241
log "Restarting sinain-core for distillation..."
(cd "$SINAIN_ROOT/sinain-core" && npx tsx src/index.ts 2>&1) | \
  sed -u "s/^/$(printf "${CYAN}[core]${RESET} ")/" &
CORE_PID=$!

# Wait for health (up to 20 s). Unlike the first start this is not fatal —
# a graph may already exist from the shutdown — but it must not pass silently.
RESTART_OK=false
for _ in $(seq 1 20); do
  if curl -sf http://localhost:9500/health >/dev/null 2>&1; then
    RESTART_OK=true
    break
  fi
  sleep 1
done
if [ "$RESTART_OK" != true ]; then
  warn "sinain-core did not report healthy after restart — distillation may not run"
fi

# Wait for distillation to complete: poll every 5 s for up to 10 minutes.
# Done when the graph DB exists and is either larger than 4096 bytes or the
# pending session file is gone (distillation consumed it).
log "Waiting for distillation..."
DISTILLED=false
for _ in $(seq 1 120); do
  if [ -f "$BENCH_DIR/knowledge-graph.db" ]; then
    # `stat -f%z` is the BSD/macOS way of printing a file size in bytes.
    DB_SIZE=$(stat -f%z "$BENCH_DIR/knowledge-graph.db" 2>/dev/null || echo "0")
    if [ "$DB_SIZE" -gt 4096 ] || [ ! -f "$BENCH_DIR/pending-session.json" ]; then
      ok "Distillation complete (DB: ${DB_SIZE} bytes)"
      DISTILLED=true
      break
    fi
  fi
  sleep 5
done
if [ "$DISTILLED" != true ]; then
  warn "Distillation did not finish within 10 minutes — results may be incomplete"
fi

# Keep core running for /embed endpoint during evaluation
log "Keeping sinain-core running for embedding service during evaluation..."
275
+
276
+ # ── Phase 2: Evaluate ────────────────────────────────────────────────────────
277
RULE="═══════════════════════════════════════════════"

log ""
log "$RULE"
log " Phase 2: Evaluation"
log "$RULE"
log ""

# Nothing can be evaluated without the distilled graph.
DB_PATH="$BENCH_DIR/knowledge-graph.db"
[[ -f "$DB_PATH" ]] || fail "No knowledge-graph.db found — distillation may have failed"

# Show what's in the DB: fact count, entity count and up to ten entity names.
# Purely informational — a failure here only produces a warning.
log "Knowledge graph contents:"
cd "$KOOG_DIR"
python3 -c "
from triplestore import TripleStore
ts = TripleStore('$DB_PATH')
facts = ts.all_facts()
print(f' Total facts: {len(facts)}')
entities = set()
for f in facts:
    entities.add(f.get('entity', ''))
print(f' Unique entities: {len(entities)}')
for e in sorted(entities)[:10]:
    print(f' - {e}')
if len(entities) > 10:
    print(f' ... and {len(entities) - 10} more')
" 2>/dev/null || warn "Could not inspect DB"

# Run the QA evaluation against the distilled graph (paths are relative to
# $KOOG_DIR, entered above).
log "Running QA evaluation..."
python3 eval/benchmarks/meeting_runner.py \
  --db "$DB_PATH" \
  --conditions sinain-memory,full-context \
  --format json,markdown

log ""
log "$RULE"
log " Done!"
log " Results: eval/benchmarks/results/meeting_results.md"
log " DB: $DB_PATH"
log "$RULE"
@@ -0,0 +1,283 @@
1
+ #!/usr/bin/env python3
2
+ """Benchmark runner — evaluates sinain's knowledge graph against published benchmarks.
3
+
4
+ Usage:
5
+ python3 eval/benchmarks/runner.py --benchmarks longmemeval --subset 5
6
+ python3 eval/benchmarks/runner.py --benchmarks longmemeval --conditions sinain-memory,full-context
7
+ python3 eval/benchmarks/runner.py --benchmarks longmemeval --format markdown --resume
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import json
14
+ import sys
15
+ from pathlib import Path
16
+
17
+ # Add sinain-memory to path
18
+ _koog_dir = str(Path(__file__).resolve().parent.parent.parent)
19
+ if _koog_dir not in sys.path:
20
+ sys.path.insert(0, _koog_dir)
21
+
22
+ from eval.benchmarks.config import DATA_DIR, RESULTS_DIR, QA_MODEL, JUDGE_MODEL
23
+ from eval.benchmarks.base_adapter import BenchmarkAdapter, BenchmarkInstance
24
+ from eval.benchmarks.longmemeval_adapter import LongMemEvalAdapter
25
+ from eval.benchmarks.ingest import ingest_instance, get_knowledge_doc
26
+ from eval.benchmarks.query import answer_question, _get_retrieved_facts, compute_content_recall
27
+ from eval.benchmarks.evaluate import (
28
+ token_f1, aggregate_results,
29
+ )
30
+ from eval.benchmarks.judges.qa_judge import judge_qa
31
+ from eval.benchmarks.report import generate_markdown, generate_json
32
+
33
+
34
+ def _get_adapter(name: str) -> BenchmarkAdapter:
35
+ if name == "longmemeval":
36
+ return LongMemEvalAdapter()
37
+ raise ValueError(f"Unknown benchmark: {name}. Available: longmemeval")
38
+
39
+
40
+ def _load_resume(resume_path: Path) -> dict[str, dict]:
41
+ """Load previously computed results for resume support."""
42
+ results = {}
43
+ if resume_path.exists():
44
+ for line in resume_path.read_text().strip().split("\n"):
45
+ if line:
46
+ entry = json.loads(line)
47
+ results[entry["id"]] = entry
48
+ return results
49
+
50
+
51
def _select_questions(all_questions: list, subset: int | None, stratified: bool) -> list:
    """Apply the --subset / --stratified options to (instance, question) pairs."""
    if not subset:
        return all_questions
    if not stratified:
        return all_questions[:subset]

    from collections import defaultdict

    # Take equal samples from each question category.
    by_cat: dict[str, list] = defaultdict(list)
    for pair in all_questions:
        by_cat[pair[1].category].append(pair)
    if not by_cat:
        # No questions at all: avoid dividing by zero categories below.
        return []
    per_cat = max(1, subset // len(by_cat))
    sampled: list = []
    for cat in sorted(by_cat):
        sampled.extend(by_cat[cat][:per_cat])
    return sampled[:subset]


def _all_conditions_scored(prev: dict, conditions: list[str]) -> bool:
    """Return True when a resumed entry holds a score for every condition."""
    return all(
        prev.get("answers", {}).get(c, {}).get("score") is not None
        for c in conditions
    )


def run_benchmark(
    benchmark_name: str,
    conditions: list[str],
    *,
    subset: int | None = None,
    qa_model: str = QA_MODEL,
    judge_model: str = JUDGE_MODEL,
    output_dir: Path = RESULTS_DIR,
    cache_dir: Path = DATA_DIR,
    resume: bool = False,
    skip_llm: bool = False,
    stratified: bool = False,
) -> tuple[dict, list[dict]]:
    """Run a benchmark end-to-end.

    For each selected question: ingest its instance once (only when a
    condition needs the knowledge graph), compute retrieval metrics, then
    answer and score the question under every condition. Each finished entry
    is appended to ``<output_dir>/<benchmark_name>_progress.jsonl``.

    Args:
        benchmark_name: Benchmark to run; resolved by ``_get_adapter``.
        conditions: Condition names to evaluate. ``"sinain-memory"`` and
            ``"knowledge-doc"`` require ingestion.
        subset: Evaluate at most this many questions; ``None`` or 0 means all.
        qa_model: Model used to generate answers.
        judge_model: Model used to judge answers.
        output_dir: Directory for the progress file (created if missing).
        cache_dir: Dataset cache; ingested data goes under
            ``cache_dir / benchmark_name``.
        resume: Reuse entries from the progress file that already have a
            score for every requested condition.
        skip_llm: Skip answer generation and judging; entries then carry
            placeholder answers with ``None`` scores.
        stratified: With ``subset``, sample evenly across question categories
            instead of taking the first N questions.

    Returns:
        ``(summary, details)``: the aggregate produced by
        ``aggregate_results`` and the list of per-question entries.

    Raises:
        ValueError: If ``benchmark_name`` is unknown.
    """
    adapter = _get_adapter(benchmark_name)

    # Load dataset
    print(f"\n{'='*60}")
    print(f" Benchmark: {benchmark_name}")
    print(f" Conditions: {', '.join(conditions)}")
    print(f" QA model: {qa_model}")
    print(f" Judge model: {judge_model}")
    print(f"{'='*60}\n")

    instances = adapter.load_dataset(str(cache_dir))

    # Flatten to (instance, question) pairs, then apply subset options.
    all_questions = [(inst, q) for inst in instances for q in inst.questions]
    all_questions = _select_questions(all_questions, subset, stratified)

    total = len(all_questions)
    print(f"[runner] evaluating {total} questions\n")

    # Resume support.
    # NOTE(review): the progress file is appended to even when resume is
    # False, so records from earlier runs stay in it — confirm this is intended.
    resume_path = output_dir / f"{benchmark_name}_progress.jsonl"
    completed = _load_resume(resume_path) if resume else {}
    output_dir.mkdir(parents=True, exist_ok=True)

    needs_ingest = "sinain-memory" in conditions or "knowledge-doc" in conditions

    # Per-instance ingestion cache: DB path (None = not ingested / failed) and
    # the knowledge document derived from it.
    instance_dbs: dict[str, Path | None] = {}
    instance_docs: dict[str, str] = {}

    details: list[dict] = []

    for idx, (inst, question) in enumerate(all_questions, start=1):
        qid = question.id

        # Skip if already done with all conditions scored; otherwise re-run
        # the question (the previous attempt had failures).
        if qid in completed and _all_conditions_scored(completed[qid], conditions):
            details.append(completed[qid])
            continue

        print(f"[{idx}/{total}] {qid} [{question.category}]")

        # Ingest instance if not done yet
        if inst.id not in instance_dbs:
            if needs_ingest:
                print(f" ingesting {inst.id} ({len(inst.sessions)} sessions)...")
                db = ingest_instance(inst, cache_dir / benchmark_name)
                instance_dbs[inst.id] = db
                if db:
                    instance_docs[inst.id] = get_knowledge_doc(db)
                    print(f" -> ingested ({db.stat().st_size} bytes)")
                else:
                    instance_docs[inst.id] = "(ingestion failed)"
                    print(" -> ingestion failed")
            else:
                instance_dbs[inst.id] = None
                instance_docs[inst.id] = ""

        db_path = instance_dbs.get(inst.id)
        knowledge_doc = instance_docs.get(inst.id, "")
        full_context = adapter.format_full_context(inst)

        # Retrieval metrics (content-based: do retrieved facts contain the answer?)
        retrieval = {}
        if db_path and "sinain-memory" in conditions:
            retrieved_facts = _get_retrieved_facts(str(db_path), question.text)
            retrieval = compute_content_recall(retrieved_facts, question.gold_answer)

        # Generate answers per condition
        answers = {}
        for cond in conditions:
            if skip_llm:
                answers[cond] = {"text": "(skipped)", "score": None, "f1": None}
                continue

            # Graph-backed conditions cannot be answered without a DB; they
            # are recorded with the fixed score 1 rather than being dropped.
            if cond in ("sinain-memory", "knowledge-doc") and not db_path:
                answers[cond] = {
                    "text": "(ingestion failed)",
                    "score": 1,
                    "f1": 0.0,
                    "reasoning": "ingestion failed",
                }
                print(f" [{cond}] skipped (ingestion failed)")
                continue

            print(f" [{cond}] generating answer...")
            answer_text = answer_question(
                question, cond,
                db_path=str(db_path) if db_path else None,
                full_context=full_context,
                knowledge_doc=knowledge_doc,
                model=qa_model,
            )

            # Score: mechanical token overlap plus an LLM judge verdict.
            f1 = token_f1(answer_text, question.gold_answer)
            judge_result = judge_qa(
                question.text, question.gold_answer, answer_text,
                condition=cond, model=judge_model,
            )
            # A falsy judge result leaves score and reasoning as None.
            score = judge_result["score"] if judge_result else None
            reasoning = judge_result["reasoning"] if judge_result else None

            answers[cond] = {
                "text": answer_text[:500],  # stored answer is capped at 500 chars
                "score": score,
                "f1": round(f1, 4),
                "reasoning": reasoning,
            }
            print(f" score={score}/5 f1={f1:.2f}")

        entry = {
            "id": qid,
            "question": question.text,
            "gold_answer": question.gold_answer,
            "category": question.category,
            "retrieval": retrieval,
            "answers": answers,
        }
        details.append(entry)

        # Save progress incrementally so an interrupted run can be resumed.
        with open(resume_path, "a") as f:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    # Aggregate
    summary = aggregate_results(details)
    return summary, details
215
+
216
+
217
def main() -> None:
    """Command-line entry point: run each requested benchmark and write reports.

    Parses the options, calls ``run_benchmark`` once per benchmark name,
    writes ``<name>_results.json`` and/or ``<name>_results.md`` into the
    output directory, and prints a short summary.
    """
    parser = argparse.ArgumentParser(description="Sinain Knowledge Graph Benchmark Runner")
    parser.add_argument("--benchmarks", default="longmemeval",
                        help="Comma-separated benchmark names (available: longmemeval)")
    parser.add_argument("--conditions", default="sinain-memory,full-context,knowledge-doc",
                        help="Comma-separated conditions to evaluate")
    parser.add_argument("--subset", type=int, default=None,
                        help="Run only first N questions (for dev iteration)")
    parser.add_argument("--qa-model", default=QA_MODEL, help="Model for QA generation")
    parser.add_argument("--judge-model", default=JUDGE_MODEL, help="Model for QA judging")
    parser.add_argument("--output-dir", type=Path, default=RESULTS_DIR)
    parser.add_argument("--cache-dir", type=Path, default=DATA_DIR)
    parser.add_argument("--format", default="json,markdown",
                        help="Output formats (json, markdown)")
    parser.add_argument("--resume", action="store_true", help="Resume from partial results")
    parser.add_argument("--skip-llm", action="store_true",
                        help="Skip LLM calls (retrieval + mechanical metrics only)")
    parser.add_argument("--stratified", action="store_true",
                        help="Sample equally from each question category (with --subset)")
    args = parser.parse_args()

    # Drop empty items so stray or trailing commas ("a,,b", "a,") are harmless.
    conditions = [c.strip() for c in args.conditions.split(",") if c.strip()]
    formats = [f.strip() for f in args.format.split(",") if f.strip()]
    bench_names = [b.strip() for b in args.benchmarks.split(",") if b.strip()]

    for bench_name in bench_names:
        summary, details = run_benchmark(
            bench_name, conditions,
            subset=args.subset,
            qa_model=args.qa_model,
            judge_model=args.judge_model,
            output_dir=args.output_dir,
            cache_dir=args.cache_dir,
            resume=args.resume,
            skip_llm=args.skip_llm,
            stratified=args.stratified,
        )

        # Write outputs. UTF-8 is explicit so reports containing non-ASCII
        # text do not depend on the platform's default encoding.
        args.output_dir.mkdir(parents=True, exist_ok=True)

        if "json" in formats:
            json_path = args.output_dir / f"{bench_name}_results.json"
            json_path.write_text(generate_json(bench_name, summary, details), encoding="utf-8")
            print(f"\n[output] JSON: {json_path}")

        if "markdown" in formats:
            md_path = args.output_dir / f"{bench_name}_results.md"
            md_path.write_text(generate_markdown(bench_name, summary, details), encoding="utf-8")
            print(f"[output] Markdown: {md_path}")

        # Print summary
        print(f"\n{'='*60}")
        print(f" {bench_name} — Summary")
        print(f"{'='*60}")
        ipr = summary.get("ipr")
        # `is not None` rather than truthiness: 0.0 is a real value to report.
        if ipr is not None:
            print(f" IPR: {ipr:.1%}")
        for cond, data in summary.get("conditions", {}).items():
            # NOTE(review): presumably mean_score can be None when nothing was
            # scored (e.g. --skip-llm) — confirm against aggregate_results.
            mean = data.get("mean_score")
            mean_text = f"{mean:.2f}" if mean is not None else "n/a"
            print(f" {cond}: {mean_text}/5 (n={data['n']})")
        for k, v in summary.get("retrieval", {}).items():
            print(f" {k}: {v:.1%}")
        print()
280
+
281
+
282
+ if __name__ == "__main__":
283
+ main()