@geravant/sinain 1.11.0 → 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/package.json +1 -1
  2. package/sinain-core/src/escalation/escalator.ts +1 -0
  3. package/sinain-core/src/escalation/message-builder.ts +45 -118
  4. package/sinain-core/src/overlay/commands.ts +16 -3
  5. package/sinain-core/src/overlay/ws-handler.ts +4 -1
  6. package/sinain-core/src/types.ts +3 -0
  7. package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
  8. package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
  9. package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
  10. package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
  11. package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
  12. package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
  13. package/sinain-memory/eval/benchmarks/__init__.py +0 -0
  14. package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
  15. package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
  16. package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
  17. package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
  18. package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
  19. package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
  20. package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
  21. package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
  22. package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
  23. package/sinain-memory/eval/benchmarks/base_adapter.py +43 -0
  24. package/sinain-memory/eval/benchmarks/config.py +23 -0
  25. package/sinain-memory/eval/benchmarks/evaluate.py +146 -0
  26. package/sinain-memory/eval/benchmarks/ingest.py +152 -0
  27. package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
  28. package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
  29. package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
  30. package/sinain-memory/eval/benchmarks/judges/qa_judge.py +81 -0
  31. package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +177 -0
  32. package/sinain-memory/eval/benchmarks/query.py +172 -0
  33. package/sinain-memory/eval/benchmarks/report.py +87 -0
  34. package/sinain-memory/eval/benchmarks/runner.py +276 -0
  35. package/sinain-memory/koog-config.json +11 -0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@geravant/sinain",
3
- "version": "1.11.0",
3
+ "version": "1.12.0",
4
4
  "description": "Ambient intelligence that sees what you see, hears what you hear, and acts on your behalf",
5
5
  "type": "module",
6
6
  "bin": {
package/sinain-core/src/escalation/escalator.ts CHANGED
@@ -237,6 +237,7 @@ export class Escalator {
237
237
  escalationReason,
238
238
  undefined,
239
239
  this.pendingUserCommand ?? undefined,
240
+ this.deps.wsHandler.getState().responseSize ?? "medium",
240
241
  );
241
242
 
242
243
  // Clear user command after building the message (consumed once)
package/sinain-core/src/escalation/message-builder.ts CHANGED
@@ -1,4 +1,4 @@
1
- import type { ContextWindow, AgentEntry, EscalationMode, FeedbackRecord, UserCommand } from "../types.js";
1
+ import type { ContextWindow, AgentEntry, EscalationMode, FeedbackRecord, UserCommand, ResponseSize } from "../types.js";
2
2
  import { normalizeAppName } from "../agent/context-window.js";
3
3
  import { levelFor, applyLevel } from "../privacy/index.js";
4
4
 
@@ -67,11 +67,18 @@ export function isCodingContext(context: ContextWindow): CodingContextResult {
67
67
  };
68
68
  }
69
69
 
70
- function getInstructions(mode: EscalationMode, context: ContextWindow): string {
70
+ function sizeInstruction(size: ResponseSize): string {
71
+ switch (size) {
72
+ case "small": return "1-2 sentences";
73
+ case "large": return "3-5 sentences";
74
+ default: return "2-3 sentences";
75
+ }
76
+ }
77
+
78
+ function getInstructions(context: ContextWindow): string {
71
79
  const { coding, needsSolution } = isCodingContext(context);
72
80
 
73
81
  if (needsSolution) {
74
- // Coding challenge/problem - be very action-oriented
75
82
  return `The user is working on a coding problem. Be PROACTIVE and SOLVE IT:
76
83
 
77
84
  1. Provide a solution approach and working code based on what you can see
@@ -92,13 +99,10 @@ Response should be actionable: working code with brief explanation.`;
92
99
  - If it's a non-code file (config, markdown, email): share a relevant insight, action item, or connection to their current project
93
100
  - If context is minimal: tell a short clever joke (tech humor — never repeat recent ones)
94
101
 
95
- NEVER just describe what the user is doing. Every response must teach, suggest, or connect dots.
96
- (2-5 sentences, or more + code if there's an error or code question).`;
102
+ NEVER just describe what the user is doing. Every response must teach, suggest, or connect dots.`;
97
103
  }
98
104
 
99
- // Non-coding context — proactive insights instead of activity descriptions
100
- if (mode === "focus" || mode === "rich") {
101
- return `Based on the above, ALWAYS provide a useful response for the user's HUD.
105
+ return `Based on the above, ALWAYS provide a useful response for the user's HUD.
102
106
  Important: Do NOT respond with NO_REPLY — a response is always required.
103
107
 
104
108
  - If there's an error: investigate and suggest a fix
@@ -109,40 +113,25 @@ Important: Do NOT respond with NO_REPLY — a response is always required.
109
113
 
110
114
  NEVER just describe what the user is doing — they can see their own screen.
111
115
  NEVER respond with "standing by", "monitoring", or similar filler.
112
- Every response must teach something, suggest something, or connect dots the user hasn't noticed.
113
- (2-5 sentences). Be specific and actionable.`;
114
- }
115
-
116
- return `Based on the above, proactively help the user:
117
- - If there's an error: investigate and suggest a fix
118
- - If they seem stuck: offer guidance
119
- - If they're coding: provide relevant insights
120
- - Keep your response concise and actionable (2-5 sentences)`;
116
+ Every response must teach something, suggest something, or connect dots the user hasn't noticed.`;
121
117
  }
122
118
 
123
119
  /**
124
- * Build a structured escalation message with richness proportional to the context window preset.
125
- *
126
- * Expected message sizes:
127
- * lean (selective): ~7 KB / ~1,700 tokens
128
- * standard (focus): ~25 KB / ~6,000 tokens
129
- * rich: ~111 KB / ~28,000 tokens
120
+ * Build a structured escalation message with full context (rich mode).
130
121
  *
131
- * All fit within the 256 KB HTTP hooks limit and 200K+ model context.
132
- *
133
- * In selective mode, sections are prioritized by relevance:
134
- * - Error escalations prioritize error sections
135
- * - Question escalations prioritize audio sections
136
- * - App context is always included
122
+ * Always includes all sections (screen, audio, errors).
123
+ * Response length is controlled by the `responseSize` parameter (small/medium/large)
124
+ * which is set by the user via the HUD overlay slider.
137
125
  */
138
126
  export function buildEscalationMessage(
139
127
  digest: string,
140
128
  context: ContextWindow,
141
129
  entry: AgentEntry,
142
- mode: EscalationMode,
130
+ _mode: EscalationMode,
143
131
  escalationReason?: string,
144
132
  recentFeedback?: FeedbackRecord[],
145
133
  userCommand?: UserCommand,
134
+ responseSize: ResponseSize = "medium",
146
135
  ): string {
147
136
  const sections: string[] = [];
148
137
 
@@ -167,7 +156,6 @@ export function buildEscalationMessage(
167
156
  // Errors — extracted from OCR, full stack traces in rich mode
168
157
  const errors = context.screen.filter(e => hasErrorPattern(e.ocr));
169
158
  const hasErrors = errors.length > 0;
170
- const hasQuestion = escalationReason?.startsWith("question:");
171
159
 
172
160
  // Privacy levels for agent_gateway destination
173
161
  let ocrLevel: import("../types.js").PrivacyLevel = "full";
@@ -183,99 +171,35 @@ export function buildEscalationMessage(
183
171
  const applyAudio = (text: string) => applyLevel(text.slice(0, context.preset.maxTranscriptChars), audioLevel, "audio");
184
172
  const applyTitle = (title: string | undefined) => title ? applyLevel(title, titlesLevel, "titles") : "";
185
173
 
186
- // In selective mode, prioritize sections based on escalation reason
187
- // In focus/rich modes, include everything
188
- if (mode === "selective") {
189
- // Error-triggered: prioritize errors, then screen
190
- if (hasErrors) {
191
- sections.push("## Errors (high priority)");
192
- for (const e of errors) {
193
- sections.push(`\`\`\`\n${applyOcr(e.ocr)}\n\`\`\``);
194
- }
195
- // Include screen context (reduced)
196
- if (context.screen.length > 0) {
197
- sections.push("## Screen (recent OCR)");
198
- for (const e of context.screen.slice(0, 5)) { // Limit in selective mode
199
- const ago = Math.round((Date.now() - e.ts) / 1000);
200
- const app = normalizeAppName(e.meta.app);
201
- const title = applyTitle(e.meta.windowTitle);
202
- const titlePart = title ? ` [${title}]` : "";
203
- sections.push(`- [${ago}s ago] [${app}]${titlePart} ${applyOcr(e.ocr)}`);
204
- }
205
- }
206
- }
207
- // Question-triggered: prioritize audio, then screen
208
- else if (hasQuestion) {
209
- if (context.audio.length > 0) {
210
- sections.push("## Audio (recent transcripts)");
211
- for (const e of context.audio) {
212
- const ago = Math.round((Date.now() - e.ts) / 1000);
213
- sections.push(`- [${ago}s ago] "${applyAudio(e.text)}"`);
214
- }
215
- }
216
- // Include screen context (reduced)
217
- if (context.screen.length > 0) {
218
- sections.push("## Screen (recent OCR)");
219
- for (const e of context.screen.slice(0, 5)) {
220
- const ago = Math.round((Date.now() - e.ts) / 1000);
221
- const app = normalizeAppName(e.meta.app);
222
- const title = applyTitle(e.meta.windowTitle);
223
- const titlePart = title ? ` [${title}]` : "";
224
- sections.push(`- [${ago}s ago] [${app}]${titlePart} ${applyOcr(e.ocr)}`);
225
- }
226
- }
227
- }
228
- // Other triggers: balanced sections
229
- else {
230
- if (context.screen.length > 0) {
231
- sections.push("## Screen (recent OCR)");
232
- for (const e of context.screen) {
233
- const ago = Math.round((Date.now() - e.ts) / 1000);
234
- const app = normalizeAppName(e.meta.app);
235
- const title = applyTitle(e.meta.windowTitle);
236
- const titlePart = title ? ` [${title}]` : "";
237
- sections.push(`- [${ago}s ago] [${app}]${titlePart} ${applyOcr(e.ocr)}`);
238
- }
239
- }
240
- if (context.audio.length > 0) {
241
- sections.push("## Audio (recent transcripts)");
242
- for (const e of context.audio) {
243
- const ago = Math.round((Date.now() - e.ts) / 1000);
244
- sections.push(`- [${ago}s ago] "${applyAudio(e.text)}"`);
245
- }
246
- }
247
- }
248
- } else {
249
- // Focus/rich mode: include all sections
250
- if (hasErrors) {
251
- sections.push("## Errors (high priority)");
252
- for (const e of errors) {
253
- sections.push(`\`\`\`\n${applyOcr(e.ocr)}\n\`\`\``);
254
- }
174
+ // Always include all sections (rich mode)
175
+ if (hasErrors) {
176
+ sections.push("## Errors (high priority)");
177
+ for (const e of errors) {
178
+ sections.push(`\`\`\`\n${applyOcr(e.ocr)}\n\`\`\``);
255
179
  }
180
+ }
256
181
 
257
- if (context.screen.length > 0) {
258
- sections.push("## Screen (recent OCR)");
259
- for (const e of context.screen) {
260
- const ago = Math.round((Date.now() - e.ts) / 1000);
261
- const app = normalizeAppName(e.meta.app);
262
- const title = applyTitle(e.meta.windowTitle);
263
- const titlePart = title ? ` [${title}]` : "";
264
- sections.push(`- [${ago}s ago] [${app}]${titlePart} ${applyOcr(e.ocr)}`);
265
- }
182
+ if (context.screen.length > 0) {
183
+ sections.push("## Screen (recent OCR)");
184
+ for (const e of context.screen) {
185
+ const ago = Math.round((Date.now() - e.ts) / 1000);
186
+ const app = normalizeAppName(e.meta.app);
187
+ const title = applyTitle(e.meta.windowTitle);
188
+ const titlePart = title ? ` [${title}]` : "";
189
+ sections.push(`- [${ago}s ago] [${app}]${titlePart} ${applyOcr(e.ocr)}`);
266
190
  }
191
+ }
267
192
 
268
- if (context.audio.length > 0) {
269
- sections.push("## Audio (recent transcripts)");
270
- for (const e of context.audio) {
271
- const ago = Math.round((Date.now() - e.ts) / 1000);
272
- sections.push(`- [${ago}s ago] "${applyAudio(e.text)}"`);
273
- }
193
+ if (context.audio.length > 0) {
194
+ sections.push("## Audio (recent transcripts)");
195
+ for (const e of context.audio) {
196
+ const ago = Math.round((Date.now() - e.ts) / 1000);
197
+ sections.push(`- [${ago}s ago] "${applyAudio(e.text)}"`);
274
198
  }
275
199
  }
276
200
 
277
- // Mode-specific instructions (now context-aware)
278
- sections.push(getInstructions(mode, context));
201
+ // Context-aware instructions (no size — that's in the response length section below)
202
+ sections.push(getInstructions(context));
279
203
 
280
204
  // Stale escalation hint — forces a proactive response after prolonged silence
281
205
  if (escalationReason === "stale") {
@@ -293,7 +217,10 @@ the local analyzer reported idle/no-change. Provide a PROACTIVE response:
293
217
  sections.push(formatInlineFeedback(recentFeedback));
294
218
  }
295
219
 
296
- sections.push("Respond naturally — this will appear on the user's HUD overlay.");
220
+ // Response length — single authoritative size instruction, placed last for salience
221
+ const limit = sizeInstruction(responseSize);
222
+ sections.push(`## Response Length
223
+ Your response MUST be ${limit}. This appears on the user's HUD overlay — be specific and actionable.`);
297
224
 
298
225
  return sections.join("\n\n");
299
226
  }
package/sinain-core/src/overlay/commands.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import { execFile } from "node:child_process";
2
- import type { InboundMessage } from "../types.js";
2
+ import type { InboundMessage, ResponseSize } from "../types.js";
3
3
  import type { WsHandler } from "./ws-handler.js";
4
4
  import type { AudioPipeline } from "../audio/pipeline.js";
5
5
  import type { CoreConfig } from "../types.js";
@@ -101,7 +101,7 @@ export function setupCommands(deps: CommandDeps): void {
101
101
  break;
102
102
  }
103
103
  case "command": {
104
- handleCommand(msg.action, deps);
104
+ handleCommand(msg, deps);
105
105
  log(TAG, `command processed: ${msg.action}`);
106
106
  break;
107
107
  }
@@ -109,8 +109,11 @@ export function setupCommands(deps: CommandDeps): void {
109
109
  });
110
110
  }
111
111
 
112
- function handleCommand(action: string, deps: CommandDeps): void {
112
+ const VALID_RESPONSE_SIZES = new Set<ResponseSize>(["small", "medium", "large"]);
113
+
114
+ function handleCommand(msg: InboundMessage & { action: string }, deps: CommandDeps): void {
113
115
  const { wsHandler, systemAudioPipeline, micPipeline } = deps;
116
+ const action = msg.action;
114
117
 
115
118
  switch (action) {
116
119
  case "toggle_audio": {
@@ -173,6 +176,16 @@ function handleCommand(action: string, deps: CommandDeps): void {
173
176
  log(TAG, `escalation toggled ${nowActive ? "ON" : "OFF"}`);
174
177
  break;
175
178
  }
179
+ case "set_response_size": {
180
+ const size = (msg as any).responseSize as string;
181
+ if (VALID_RESPONSE_SIZES.has(size as ResponseSize)) {
182
+ wsHandler.updateState({ responseSize: size as ResponseSize });
183
+ log(TAG, `response size set to ${size}`);
184
+ } else {
185
+ log(TAG, `invalid response size: ${size}`);
186
+ }
187
+ break;
188
+ }
176
189
  case "open_settings": {
177
190
  const envPath = loadedEnvPath || `${process.env.HOME || process.env.USERPROFILE}/.sinain/.env`;
178
191
  const cmd = process.platform === "win32" ? "notepad" : "open";
package/sinain-core/src/overlay/ws-handler.ts CHANGED
@@ -39,6 +39,7 @@ export class WsHandler {
39
39
  screen: "off",
40
40
  escalation: "active",
41
41
  connection: "disconnected",
42
+ responseSize: "medium",
42
43
  };
43
44
  private replayBuffer: FeedMessage[] = [];
44
45
  private spawnTaskBuffer: Map<string, SpawnTaskMessage> = new Map();
@@ -75,6 +76,7 @@ export class WsHandler {
75
76
  screen: this.state.screen,
76
77
  escalation: this.state.escalation,
77
78
  connection: this.state.connection,
79
+ responseSize: this.state.responseSize,
78
80
  });
79
81
 
80
82
  // Replay recent feed messages for late-joining clients
@@ -151,13 +153,14 @@ export class WsHandler {
151
153
 
152
154
  /** Send a status update to all connected overlays. */
153
155
  broadcastStatus(): void {
154
- const msg: StatusMessage & { envPath?: string; escalation?: string } = {
156
+ const msg: StatusMessage & { envPath?: string; escalation?: string; responseSize?: string } = {
155
157
  type: "status",
156
158
  audio: this.state.audio,
157
159
  mic: this.state.mic,
158
160
  screen: this.state.screen,
159
161
  escalation: this.state.escalation,
160
162
  connection: this.state.connection,
163
+ responseSize: this.state.responseSize,
161
164
  };
162
165
  if (loadedEnvPath) msg.envPath = loadedEnvPath;
163
166
  this.broadcastMessage(msg);
package/sinain-core/src/types.ts CHANGED
@@ -20,6 +20,7 @@ export interface StatusMessage {
20
20
  screen: string;
21
21
  escalation?: string;
22
22
  connection: string;
23
+ responseSize?: string;
23
24
  }
24
25
 
25
26
  /** sinain-core → Overlay: heartbeat ping */
@@ -244,6 +245,7 @@ export interface StopResult {
244
245
 
245
246
  export type EscalationMode = "off" | "selective" | "focus" | "rich";
246
247
  export type ContextRichness = "lean" | "standard" | "rich";
248
+ export type ResponseSize = "small" | "medium" | "large";
247
249
 
248
250
  export type AnalysisProvider = "openrouter" | "ollama";
249
251
 
@@ -393,6 +395,7 @@ export interface BridgeState {
393
395
  screen: "active" | "off";
394
396
  escalation: "active" | "paused";
395
397
  connection: "connected" | "disconnected" | "connecting";
398
+ responseSize: ResponseSize;
396
399
  }
397
400
 
398
401
  // ── Learning / feedback types ──
File without changes
package/sinain-memory/eval/benchmarks/base_adapter.py ADDED
@@ -0,0 +1,43 @@
1
+ """Base adapter and data classes for benchmark evaluation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from dataclasses import dataclass, field
7
+
8
+
9
+ @dataclass
10
+ class BenchmarkQuestion:
11
+ id: str
12
+ text: str
13
+ gold_answer: str
14
+ category: str # single-session, multi-session, temporal, etc.
15
+ evidence_session_ids: list[str] = field(default_factory=list)
16
+ metadata: dict = field(default_factory=dict)
17
+
18
+
19
+ @dataclass
20
+ class BenchmarkInstance:
21
+ """A set of conversations + questions that share the same context."""
22
+ id: str
23
+ sessions: list[list[dict]] # list of sessions, each a list of feed items {source, text, ts}
24
+ questions: list[BenchmarkQuestion] = field(default_factory=list)
25
+ raw_sessions: list[dict] = field(default_factory=list) # original benchmark format (for full-context condition)
26
+ metadata: dict = field(default_factory=dict)
27
+
28
+
29
+ class BenchmarkAdapter(ABC):
30
+ """Abstract adapter: converts a published benchmark into sinain's format."""
31
+
32
+ @property
33
+ @abstractmethod
34
+ def name(self) -> str:
35
+ """Benchmark name (e.g. 'longmemeval', 'locomo')."""
36
+
37
+ @abstractmethod
38
+ def load_dataset(self, data_dir: str) -> list[BenchmarkInstance]:
39
+ """Download (if needed) and parse the benchmark dataset."""
40
+
41
+ @abstractmethod
42
+ def format_full_context(self, instance: BenchmarkInstance) -> str:
43
+ """Render the full conversation history as a text string for the baseline condition."""
package/sinain-memory/eval/benchmarks/config.py ADDED
@@ -0,0 +1,23 @@
1
+ """Benchmark configuration — models, paths, thresholds."""
2
+
3
+ from pathlib import Path
4
+
5
+ BENCHMARKS_DIR = Path(__file__).resolve().parent
6
+ DATA_DIR = BENCHMARKS_DIR / "data"
7
+ RESULTS_DIR = BENCHMARKS_DIR / "results"
8
+
9
+ # LLM models (via OpenRouter)
10
+ QA_MODEL = "google/gemini-2.5-flash"
11
+ JUDGE_MODEL = "openai/gpt-4o"
12
+
13
+ # Retrieval
14
+ K_VALUES = [1, 3, 5, 10]
15
+ MAX_FACTS_PER_QUERY = 10
16
+
17
+ # Ingestion
18
+ DISTILLER_TIMEOUT_S = 30
19
+ INTEGRATOR_TIMEOUT_S = 60
20
+
21
+ # Dataset URLs
22
+ LONGMEMEVAL_HF = "xiaowu0162/longmemeval-cleaned"
23
+ LOCOMO_GITHUB = "https://raw.githubusercontent.com/snap-research/locomo/main/data/locomo10.json"
package/sinain-memory/eval/benchmarks/evaluate.py ADDED
@@ -0,0 +1,146 @@
1
+ """Evaluation pipeline — score answers and compute aggregate metrics.
2
+
3
+ Combines:
4
+ - LLM-as-Judge (QA scoring, 1-5 scale)
5
+ - Retrieval metrics (Recall@k, NDCG@k)
6
+ - Token F1 overlap (mechanical, free)
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import math
12
+ import re
13
+ from collections import defaultdict
14
+
15
+ from .base_adapter import BenchmarkQuestion
16
+ from .config import K_VALUES
17
+
18
+
19
+ # ── Token F1 (mechanical, no LLM needed) ─────────────────────────────────────
20
+
21
+ def _tokenize(text: str) -> list[str]:
22
+ """Simple whitespace + punctuation tokenizer."""
23
+ return re.findall(r"\w+", text.lower())
24
+
25
+
26
+ def token_f1(predicted: str, gold: str | int) -> float:
27
+ """Compute token-level F1 between predicted and gold answers."""
28
+ pred_tokens = set(_tokenize(str(predicted)))
29
+ gold_tokens = set(_tokenize(str(gold)))
30
+ if not gold_tokens or not pred_tokens:
31
+ return 0.0
32
+ overlap = pred_tokens & gold_tokens
33
+ if not overlap:
34
+ return 0.0
35
+ precision = len(overlap) / len(pred_tokens)
36
+ recall = len(overlap) / len(gold_tokens)
37
+ return 2 * precision * recall / (precision + recall)
38
+
39
+
40
+ # ── Retrieval metrics (reuse logic from retrieval_evaluator.py) ───────────────
41
+
42
+ def dcg_at_k(relevant_positions: list[int], k: int) -> float:
43
+ """Discounted Cumulative Gain at k."""
44
+ score = 0.0
45
+ for pos in relevant_positions:
46
+ if pos < k:
47
+ score += 1.0 / math.log2(pos + 2)
48
+ return score
49
+
50
+
51
+ def ndcg_at_k(relevant_positions: list[int], num_relevant: int, k: int) -> float:
52
+ """Normalized DCG at k."""
53
+ dcg = dcg_at_k(relevant_positions, k)
54
+ ideal_positions = list(range(min(num_relevant, k)))
55
+ idcg = dcg_at_k(ideal_positions, k)
56
+ return dcg / idcg if idcg > 0 else 0.0
57
+
58
+
59
+ def compute_retrieval_metrics(
60
+ retrieved_ids: list[str],
61
+ expected_ids: list[str],
62
+ k_values: list[int] | None = None,
63
+ ) -> dict:
64
+ """Compute Recall@k and NDCG@k for a single question."""
65
+ ks = k_values or K_VALUES
66
+ expected_set = set(expected_ids)
67
+ relevant_positions = [i for i, rid in enumerate(retrieved_ids) if rid in expected_set]
68
+
69
+ result = {}
70
+ for k in ks:
71
+ hit = any(pos < k for pos in relevant_positions)
72
+ result[f"recall@{k}"] = 1.0 if hit else 0.0
73
+ result[f"ndcg@{k}"] = ndcg_at_k(relevant_positions, len(expected_set), k)
74
+ return result
75
+
76
+
77
+ # ── Aggregate metrics ─────────────────────────────────────────────────────────
78
+
79
+ def aggregate_results(per_question: list[dict]) -> dict:
80
+ """Compute aggregate metrics from per-question results.
81
+
82
+ Each per_question entry has:
83
+ {id, category, retrieval: {recall@k, ndcg@k}, answers: {condition: {score, f1}}}
84
+ """
85
+ if not per_question:
86
+ return {"error": "no results"}
87
+
88
+ # Per-condition scores
89
+ condition_scores: dict[str, list[float]] = defaultdict(list)
90
+ condition_f1s: dict[str, list[float]] = defaultdict(list)
91
+ # Per-category per-condition
92
+ cat_scores: dict[str, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
93
+ # Retrieval
94
+ retrieval_metrics: dict[str, list[float]] = defaultdict(list)
95
+
96
+ for q in per_question:
97
+ cat = q.get("category", "unknown")
98
+
99
+ for cond, data in q.get("answers", {}).items():
100
+ if data.get("score") is not None:
101
+ condition_scores[cond].append(data["score"])
102
+ cat_scores[cat][cond].append(data["score"])
103
+ if data.get("f1") is not None:
104
+ condition_f1s[cond].append(data["f1"])
105
+
106
+ for metric, val in q.get("retrieval", {}).items():
107
+ if isinstance(val, (int, float)):
108
+ retrieval_metrics[metric].append(val)
109
+
110
+ def _mean(lst: list[float]) -> float:
111
+ return round(sum(lst) / len(lst), 4) if lst else 0.0
112
+
113
+ # Build summary
114
+ conditions = {}
115
+ for cond in sorted(condition_scores):
116
+ conditions[cond] = {
117
+ "mean_score": _mean(condition_scores[cond]),
118
+ "mean_f1": _mean(condition_f1s.get(cond, [])),
119
+ "n": len(condition_scores[cond]),
120
+ }
121
+
122
+ # IPR: sinain-memory vs full-context
123
+ sm_scores = condition_scores.get("sinain-memory", [])
124
+ fc_scores = condition_scores.get("full-context", [])
125
+ ipr = _mean(sm_scores) / _mean(fc_scores) if fc_scores and _mean(fc_scores) > 0 else None
126
+
127
+ # Category breakdown
128
+ categories = {}
129
+ for cat in sorted(cat_scores):
130
+ categories[cat] = {}
131
+ for cond in sorted(cat_scores[cat]):
132
+ categories[cat][cond] = {
133
+ "mean_score": _mean(cat_scores[cat][cond]),
134
+ "n": len(cat_scores[cat][cond]),
135
+ }
136
+
137
+ # Retrieval summary
138
+ retrieval = {k: _mean(v) for k, v in sorted(retrieval_metrics.items())}
139
+
140
+ return {
141
+ "total_questions": len(per_question),
142
+ "conditions": conditions,
143
+ "ipr": round(ipr, 4) if ipr else None,
144
+ "categories": categories,
145
+ "retrieval": retrieval,
146
+ }