@geravant/sinain 1.11.0 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/sinain-core/src/escalation/escalator.ts +1 -0
- package/sinain-core/src/escalation/message-builder.ts +45 -118
- package/sinain-core/src/overlay/commands.ts +16 -3
- package/sinain-core/src/overlay/ws-handler.ts +4 -1
- package/sinain-core/src/types.ts +3 -0
- package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
- package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/base_adapter.py +43 -0
- package/sinain-memory/eval/benchmarks/config.py +23 -0
- package/sinain-memory/eval/benchmarks/evaluate.py +146 -0
- package/sinain-memory/eval/benchmarks/ingest.py +152 -0
- package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/qa_judge.py +81 -0
- package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +177 -0
- package/sinain-memory/eval/benchmarks/query.py +172 -0
- package/sinain-memory/eval/benchmarks/report.py +87 -0
- package/sinain-memory/eval/benchmarks/runner.py +276 -0
- package/sinain-memory/koog-config.json +11 -0
package/package.json
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { ContextWindow, AgentEntry, EscalationMode, FeedbackRecord, UserCommand } from "../types.js";
|
|
1
|
+
import type { ContextWindow, AgentEntry, EscalationMode, FeedbackRecord, UserCommand, ResponseSize } from "../types.js";
|
|
2
2
|
import { normalizeAppName } from "../agent/context-window.js";
|
|
3
3
|
import { levelFor, applyLevel } from "../privacy/index.js";
|
|
4
4
|
|
|
@@ -67,11 +67,18 @@ export function isCodingContext(context: ContextWindow): CodingContextResult {
|
|
|
67
67
|
};
|
|
68
68
|
}
|
|
69
69
|
|
|
70
|
-
function
|
|
70
|
+
/**
 * Translate a response-size setting into the sentence budget quoted in the prompt.
 *
 * @param size - Requested response size.
 * @returns `"1-2 sentences"` for `"small"`, `"3-5 sentences"` for `"large"`,
 *   and `"2-3 sentences"` for every other value (including `"medium"`).
 */
function sizeInstruction(size: ResponseSize): string {
  if (size === "small") {
    return "1-2 sentences";
  }
  if (size === "large") {
    return "3-5 sentences";
  }
  // "medium" — and anything unexpected — falls through to the middle budget.
  return "2-3 sentences";
}
|
|
77
|
+
|
|
78
|
+
function getInstructions(context: ContextWindow): string {
|
|
71
79
|
const { coding, needsSolution } = isCodingContext(context);
|
|
72
80
|
|
|
73
81
|
if (needsSolution) {
|
|
74
|
-
// Coding challenge/problem - be very action-oriented
|
|
75
82
|
return `The user is working on a coding problem. Be PROACTIVE and SOLVE IT:
|
|
76
83
|
|
|
77
84
|
1. Provide a solution approach and working code based on what you can see
|
|
@@ -92,13 +99,10 @@ Response should be actionable: working code with brief explanation.`;
|
|
|
92
99
|
- If it's a non-code file (config, markdown, email): share a relevant insight, action item, or connection to their current project
|
|
93
100
|
- If context is minimal: tell a short clever joke (tech humor — never repeat recent ones)
|
|
94
101
|
|
|
95
|
-
NEVER just describe what the user is doing. Every response must teach, suggest, or connect dots
|
|
96
|
-
(2-5 sentences, or more + code if there's an error or code question).`;
|
|
102
|
+
NEVER just describe what the user is doing. Every response must teach, suggest, or connect dots.`;
|
|
97
103
|
}
|
|
98
104
|
|
|
99
|
-
|
|
100
|
-
if (mode === "focus" || mode === "rich") {
|
|
101
|
-
return `Based on the above, ALWAYS provide a useful response for the user's HUD.
|
|
105
|
+
return `Based on the above, ALWAYS provide a useful response for the user's HUD.
|
|
102
106
|
Important: Do NOT respond with NO_REPLY — a response is always required.
|
|
103
107
|
|
|
104
108
|
- If there's an error: investigate and suggest a fix
|
|
@@ -109,40 +113,25 @@ Important: Do NOT respond with NO_REPLY — a response is always required.
|
|
|
109
113
|
|
|
110
114
|
NEVER just describe what the user is doing — they can see their own screen.
|
|
111
115
|
NEVER respond with "standing by", "monitoring", or similar filler.
|
|
112
|
-
Every response must teach something, suggest something, or connect dots the user hasn't noticed
|
|
113
|
-
(2-5 sentences). Be specific and actionable.`;
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
return `Based on the above, proactively help the user:
|
|
117
|
-
- If there's an error: investigate and suggest a fix
|
|
118
|
-
- If they seem stuck: offer guidance
|
|
119
|
-
- If they're coding: provide relevant insights
|
|
120
|
-
- Keep your response concise and actionable (2-5 sentences)`;
|
|
116
|
+
Every response must teach something, suggest something, or connect dots the user hasn't noticed.`;
|
|
121
117
|
}
|
|
122
118
|
|
|
123
119
|
/**
|
|
124
|
-
* Build a structured escalation message with
|
|
125
|
-
*
|
|
126
|
-
* Expected message sizes:
|
|
127
|
-
* lean (selective): ~7 KB / ~1,700 tokens
|
|
128
|
-
* standard (focus): ~25 KB / ~6,000 tokens
|
|
129
|
-
* rich: ~111 KB / ~28,000 tokens
|
|
120
|
+
* Build a structured escalation message with full context (rich mode).
|
|
130
121
|
*
|
|
131
|
-
*
|
|
132
|
-
*
|
|
133
|
-
*
|
|
134
|
-
* - Error escalations prioritize error sections
|
|
135
|
-
* - Question escalations prioritize audio sections
|
|
136
|
-
* - App context is always included
|
|
122
|
+
* Always includes all sections (screen, audio, errors).
|
|
123
|
+
* Response length is controlled by the `responseSize` parameter (small/medium/large)
|
|
124
|
+
* which is set by the user via the HUD overlay slider.
|
|
137
125
|
*/
|
|
138
126
|
export function buildEscalationMessage(
|
|
139
127
|
digest: string,
|
|
140
128
|
context: ContextWindow,
|
|
141
129
|
entry: AgentEntry,
|
|
142
|
-
|
|
130
|
+
_mode: EscalationMode,
|
|
143
131
|
escalationReason?: string,
|
|
144
132
|
recentFeedback?: FeedbackRecord[],
|
|
145
133
|
userCommand?: UserCommand,
|
|
134
|
+
responseSize: ResponseSize = "medium",
|
|
146
135
|
): string {
|
|
147
136
|
const sections: string[] = [];
|
|
148
137
|
|
|
@@ -167,7 +156,6 @@ export function buildEscalationMessage(
|
|
|
167
156
|
// Errors — extracted from OCR, full stack traces in rich mode
|
|
168
157
|
const errors = context.screen.filter(e => hasErrorPattern(e.ocr));
|
|
169
158
|
const hasErrors = errors.length > 0;
|
|
170
|
-
const hasQuestion = escalationReason?.startsWith("question:");
|
|
171
159
|
|
|
172
160
|
// Privacy levels for agent_gateway destination
|
|
173
161
|
let ocrLevel: import("../types.js").PrivacyLevel = "full";
|
|
@@ -183,99 +171,35 @@ export function buildEscalationMessage(
|
|
|
183
171
|
const applyAudio = (text: string) => applyLevel(text.slice(0, context.preset.maxTranscriptChars), audioLevel, "audio");
|
|
184
172
|
const applyTitle = (title: string | undefined) => title ? applyLevel(title, titlesLevel, "titles") : "";
|
|
185
173
|
|
|
186
|
-
//
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
sections.push("## Errors (high priority)");
|
|
192
|
-
for (const e of errors) {
|
|
193
|
-
sections.push(`\`\`\`\n${applyOcr(e.ocr)}\n\`\`\``);
|
|
194
|
-
}
|
|
195
|
-
// Include screen context (reduced)
|
|
196
|
-
if (context.screen.length > 0) {
|
|
197
|
-
sections.push("## Screen (recent OCR)");
|
|
198
|
-
for (const e of context.screen.slice(0, 5)) { // Limit in selective mode
|
|
199
|
-
const ago = Math.round((Date.now() - e.ts) / 1000);
|
|
200
|
-
const app = normalizeAppName(e.meta.app);
|
|
201
|
-
const title = applyTitle(e.meta.windowTitle);
|
|
202
|
-
const titlePart = title ? ` [${title}]` : "";
|
|
203
|
-
sections.push(`- [${ago}s ago] [${app}]${titlePart} ${applyOcr(e.ocr)}`);
|
|
204
|
-
}
|
|
205
|
-
}
|
|
206
|
-
}
|
|
207
|
-
// Question-triggered: prioritize audio, then screen
|
|
208
|
-
else if (hasQuestion) {
|
|
209
|
-
if (context.audio.length > 0) {
|
|
210
|
-
sections.push("## Audio (recent transcripts)");
|
|
211
|
-
for (const e of context.audio) {
|
|
212
|
-
const ago = Math.round((Date.now() - e.ts) / 1000);
|
|
213
|
-
sections.push(`- [${ago}s ago] "${applyAudio(e.text)}"`);
|
|
214
|
-
}
|
|
215
|
-
}
|
|
216
|
-
// Include screen context (reduced)
|
|
217
|
-
if (context.screen.length > 0) {
|
|
218
|
-
sections.push("## Screen (recent OCR)");
|
|
219
|
-
for (const e of context.screen.slice(0, 5)) {
|
|
220
|
-
const ago = Math.round((Date.now() - e.ts) / 1000);
|
|
221
|
-
const app = normalizeAppName(e.meta.app);
|
|
222
|
-
const title = applyTitle(e.meta.windowTitle);
|
|
223
|
-
const titlePart = title ? ` [${title}]` : "";
|
|
224
|
-
sections.push(`- [${ago}s ago] [${app}]${titlePart} ${applyOcr(e.ocr)}`);
|
|
225
|
-
}
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
// Other triggers: balanced sections
|
|
229
|
-
else {
|
|
230
|
-
if (context.screen.length > 0) {
|
|
231
|
-
sections.push("## Screen (recent OCR)");
|
|
232
|
-
for (const e of context.screen) {
|
|
233
|
-
const ago = Math.round((Date.now() - e.ts) / 1000);
|
|
234
|
-
const app = normalizeAppName(e.meta.app);
|
|
235
|
-
const title = applyTitle(e.meta.windowTitle);
|
|
236
|
-
const titlePart = title ? ` [${title}]` : "";
|
|
237
|
-
sections.push(`- [${ago}s ago] [${app}]${titlePart} ${applyOcr(e.ocr)}`);
|
|
238
|
-
}
|
|
239
|
-
}
|
|
240
|
-
if (context.audio.length > 0) {
|
|
241
|
-
sections.push("## Audio (recent transcripts)");
|
|
242
|
-
for (const e of context.audio) {
|
|
243
|
-
const ago = Math.round((Date.now() - e.ts) / 1000);
|
|
244
|
-
sections.push(`- [${ago}s ago] "${applyAudio(e.text)}"`);
|
|
245
|
-
}
|
|
246
|
-
}
|
|
247
|
-
}
|
|
248
|
-
} else {
|
|
249
|
-
// Focus/rich mode: include all sections
|
|
250
|
-
if (hasErrors) {
|
|
251
|
-
sections.push("## Errors (high priority)");
|
|
252
|
-
for (const e of errors) {
|
|
253
|
-
sections.push(`\`\`\`\n${applyOcr(e.ocr)}\n\`\`\``);
|
|
254
|
-
}
|
|
174
|
+
// Always include all sections (rich mode)
|
|
175
|
+
if (hasErrors) {
|
|
176
|
+
sections.push("## Errors (high priority)");
|
|
177
|
+
for (const e of errors) {
|
|
178
|
+
sections.push(`\`\`\`\n${applyOcr(e.ocr)}\n\`\`\``);
|
|
255
179
|
}
|
|
180
|
+
}
|
|
256
181
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
}
|
|
182
|
+
if (context.screen.length > 0) {
|
|
183
|
+
sections.push("## Screen (recent OCR)");
|
|
184
|
+
for (const e of context.screen) {
|
|
185
|
+
const ago = Math.round((Date.now() - e.ts) / 1000);
|
|
186
|
+
const app = normalizeAppName(e.meta.app);
|
|
187
|
+
const title = applyTitle(e.meta.windowTitle);
|
|
188
|
+
const titlePart = title ? ` [${title}]` : "";
|
|
189
|
+
sections.push(`- [${ago}s ago] [${app}]${titlePart} ${applyOcr(e.ocr)}`);
|
|
266
190
|
}
|
|
191
|
+
}
|
|
267
192
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
}
|
|
193
|
+
if (context.audio.length > 0) {
|
|
194
|
+
sections.push("## Audio (recent transcripts)");
|
|
195
|
+
for (const e of context.audio) {
|
|
196
|
+
const ago = Math.round((Date.now() - e.ts) / 1000);
|
|
197
|
+
sections.push(`- [${ago}s ago] "${applyAudio(e.text)}"`);
|
|
274
198
|
}
|
|
275
199
|
}
|
|
276
200
|
|
|
277
|
-
//
|
|
278
|
-
sections.push(getInstructions(
|
|
201
|
+
// Context-aware instructions (no size — that's in the response length section below)
|
|
202
|
+
sections.push(getInstructions(context));
|
|
279
203
|
|
|
280
204
|
// Stale escalation hint — forces a proactive response after prolonged silence
|
|
281
205
|
if (escalationReason === "stale") {
|
|
@@ -293,7 +217,10 @@ the local analyzer reported idle/no-change. Provide a PROACTIVE response:
|
|
|
293
217
|
sections.push(formatInlineFeedback(recentFeedback));
|
|
294
218
|
}
|
|
295
219
|
|
|
296
|
-
|
|
220
|
+
// Response length — single authoritative size instruction, placed last for salience
|
|
221
|
+
const limit = sizeInstruction(responseSize);
|
|
222
|
+
sections.push(`## Response Length
|
|
223
|
+
Your response MUST be ${limit}. This appears on the user's HUD overlay — be specific and actionable.`);
|
|
297
224
|
|
|
298
225
|
return sections.join("\n\n");
|
|
299
226
|
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { execFile } from "node:child_process";
|
|
2
|
-
import type { InboundMessage } from "../types.js";
|
|
2
|
+
import type { InboundMessage, ResponseSize } from "../types.js";
|
|
3
3
|
import type { WsHandler } from "./ws-handler.js";
|
|
4
4
|
import type { AudioPipeline } from "../audio/pipeline.js";
|
|
5
5
|
import type { CoreConfig } from "../types.js";
|
|
@@ -101,7 +101,7 @@ export function setupCommands(deps: CommandDeps): void {
|
|
|
101
101
|
break;
|
|
102
102
|
}
|
|
103
103
|
case "command": {
|
|
104
|
-
handleCommand(msg
|
|
104
|
+
handleCommand(msg, deps);
|
|
105
105
|
log(TAG, `command processed: ${msg.action}`);
|
|
106
106
|
break;
|
|
107
107
|
}
|
|
@@ -109,8 +109,11 @@ export function setupCommands(deps: CommandDeps): void {
|
|
|
109
109
|
});
|
|
110
110
|
}
|
|
111
111
|
|
|
112
|
-
|
|
112
|
+
/** Response sizes accepted by the `set_response_size` command; any other value is logged as invalid and ignored. */
const VALID_RESPONSE_SIZES = new Set<ResponseSize>(["small", "medium", "large"]);
|
|
113
|
+
|
|
114
|
+
function handleCommand(msg: InboundMessage & { action: string }, deps: CommandDeps): void {
|
|
113
115
|
const { wsHandler, systemAudioPipeline, micPipeline } = deps;
|
|
116
|
+
const action = msg.action;
|
|
114
117
|
|
|
115
118
|
switch (action) {
|
|
116
119
|
case "toggle_audio": {
|
|
@@ -173,6 +176,16 @@ function handleCommand(action: string, deps: CommandDeps): void {
|
|
|
173
176
|
log(TAG, `escalation toggled ${nowActive ? "ON" : "OFF"}`);
|
|
174
177
|
break;
|
|
175
178
|
}
|
|
179
|
+
case "set_response_size": {
|
|
180
|
+
const size = (msg as any).responseSize as string;
|
|
181
|
+
if (VALID_RESPONSE_SIZES.has(size as ResponseSize)) {
|
|
182
|
+
wsHandler.updateState({ responseSize: size as ResponseSize });
|
|
183
|
+
log(TAG, `response size set to ${size}`);
|
|
184
|
+
} else {
|
|
185
|
+
log(TAG, `invalid response size: ${size}`);
|
|
186
|
+
}
|
|
187
|
+
break;
|
|
188
|
+
}
|
|
176
189
|
case "open_settings": {
|
|
177
190
|
const envPath = loadedEnvPath || `${process.env.HOME || process.env.USERPROFILE}/.sinain/.env`;
|
|
178
191
|
const cmd = process.platform === "win32" ? "notepad" : "open";
|
|
@@ -39,6 +39,7 @@ export class WsHandler {
|
|
|
39
39
|
screen: "off",
|
|
40
40
|
escalation: "active",
|
|
41
41
|
connection: "disconnected",
|
|
42
|
+
responseSize: "medium",
|
|
42
43
|
};
|
|
43
44
|
private replayBuffer: FeedMessage[] = [];
|
|
44
45
|
private spawnTaskBuffer: Map<string, SpawnTaskMessage> = new Map();
|
|
@@ -75,6 +76,7 @@ export class WsHandler {
|
|
|
75
76
|
screen: this.state.screen,
|
|
76
77
|
escalation: this.state.escalation,
|
|
77
78
|
connection: this.state.connection,
|
|
79
|
+
responseSize: this.state.responseSize,
|
|
78
80
|
});
|
|
79
81
|
|
|
80
82
|
// Replay recent feed messages for late-joining clients
|
|
@@ -151,13 +153,14 @@ export class WsHandler {
|
|
|
151
153
|
|
|
152
154
|
/** Send a status update to all connected overlays. */
|
|
153
155
|
broadcastStatus(): void {
|
|
154
|
-
const msg: StatusMessage & { envPath?: string; escalation?: string } = {
|
|
156
|
+
const msg: StatusMessage & { envPath?: string; escalation?: string; responseSize?: string } = {
|
|
155
157
|
type: "status",
|
|
156
158
|
audio: this.state.audio,
|
|
157
159
|
mic: this.state.mic,
|
|
158
160
|
screen: this.state.screen,
|
|
159
161
|
escalation: this.state.escalation,
|
|
160
162
|
connection: this.state.connection,
|
|
163
|
+
responseSize: this.state.responseSize,
|
|
161
164
|
};
|
|
162
165
|
if (loadedEnvPath) msg.envPath = loadedEnvPath;
|
|
163
166
|
this.broadcastMessage(msg);
|
package/sinain-core/src/types.ts
CHANGED
|
@@ -20,6 +20,7 @@ export interface StatusMessage {
|
|
|
20
20
|
screen: string;
|
|
21
21
|
escalation?: string;
|
|
22
22
|
connection: string;
|
|
23
|
+
responseSize?: string;
|
|
23
24
|
}
|
|
24
25
|
|
|
25
26
|
/** sinain-core → Overlay: heartbeat ping */
|
|
@@ -244,6 +245,7 @@ export interface StopResult {
|
|
|
244
245
|
|
|
245
246
|
export type EscalationMode = "off" | "selective" | "focus" | "rich";
|
|
246
247
|
export type ContextRichness = "lean" | "standard" | "rich";
|
|
248
|
+
export type ResponseSize = "small" | "medium" | "large";
|
|
247
249
|
|
|
248
250
|
export type AnalysisProvider = "openrouter" | "ollama";
|
|
249
251
|
|
|
@@ -393,6 +395,7 @@ export interface BridgeState {
|
|
|
393
395
|
screen: "active" | "off";
|
|
394
396
|
escalation: "active" | "paused";
|
|
395
397
|
connection: "connected" | "disconnected" | "connecting";
|
|
398
|
+
responseSize: ResponseSize;
|
|
396
399
|
}
|
|
397
400
|
|
|
398
401
|
// ── Learning / feedback types ──
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
File without changes
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Base adapter and data classes for benchmark evaluation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class BenchmarkQuestion:
    """One benchmark question together with its reference ("gold") answer."""

    id: str
    text: str
    gold_answer: str
    category: str  # single-session, multi-session, temporal, etc.
    # presumably the ids of the sessions that contain the supporting evidence,
    # used as the relevance ground truth for retrieval metrics — TODO confirm
    evidence_session_ids: list[str] = field(default_factory=list)
    # free-form extra data; the schema is not fixed here
    metadata: dict = field(default_factory=dict)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class BenchmarkInstance:
    """A set of conversations + questions that share the same context."""

    id: str
    sessions: list[list[dict]]  # list of sessions, each a list of feed items {source, text, ts}
    # questions to be answered against the sessions above
    questions: list[BenchmarkQuestion] = field(default_factory=list)
    raw_sessions: list[dict] = field(default_factory=list)  # original benchmark format (for full-context condition)
    # free-form extra data; the schema is not fixed here
    metadata: dict = field(default_factory=dict)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class BenchmarkAdapter(ABC):
    """Abstract adapter: converts a published benchmark into sinain's format.

    Concrete adapters must implement the ``name`` property as well as
    ``load_dataset`` and ``format_full_context``; all three are abstract.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Benchmark name (e.g. 'longmemeval', 'locomo')."""

    @abstractmethod
    def load_dataset(self, data_dir: str) -> list[BenchmarkInstance]:
        """Download (if needed) and parse the benchmark dataset.

        Args:
            data_dir: Directory for the dataset files (presumably also where a
                download is stored — confirm against the concrete adapters).

        Returns:
            The benchmark parsed into ``BenchmarkInstance`` objects.
        """

    @abstractmethod
    def format_full_context(self, instance: BenchmarkInstance) -> str:
        """Render the full conversation history as a text string for the baseline condition.

        Args:
            instance: The benchmark instance whose conversations are rendered.

        Returns:
            The conversation history as a single string.
        """
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Benchmark configuration — models, paths, thresholds."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
# Directory that contains this module; data and results directories sit beside it.
BENCHMARKS_DIR = Path(__file__).resolve().parent
DATA_DIR = BENCHMARKS_DIR / "data"
RESULTS_DIR = BENCHMARKS_DIR / "results"

# LLM models (via OpenRouter)
QA_MODEL = "google/gemini-2.5-flash"
JUDGE_MODEL = "openai/gpt-4o"

# Retrieval
K_VALUES = [1, 3, 5, 10]  # default cutoffs for the @k retrieval metrics
MAX_FACTS_PER_QUERY = 10

# Ingestion
# NOTE(review): the _S suffix suggests these are seconds — confirm at the call sites.
DISTILLER_TIMEOUT_S = 30
INTEGRATOR_TIMEOUT_S = 60

# Dataset sources
# NOTE(review): LONGMEMEVAL_HF looks like a Hugging Face dataset id rather than a URL — confirm.
LONGMEMEVAL_HF = "xiaowu0162/longmemeval-cleaned"
LOCOMO_GITHUB = "https://raw.githubusercontent.com/snap-research/locomo/main/data/locomo10.json"
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Evaluation pipeline — score answers and compute aggregate metrics.
|
|
2
|
+
|
|
3
|
+
Combines:
|
|
4
|
+
- LLM-as-Judge (QA scoring, 1-5 scale)
|
|
5
|
+
- Retrieval metrics (Recall@k, NDCG@k)
|
|
6
|
+
- Token F1 overlap (mechanical, free)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import math
|
|
12
|
+
import re
|
|
13
|
+
from collections import defaultdict
|
|
14
|
+
|
|
15
|
+
from .base_adapter import BenchmarkQuestion
|
|
16
|
+
from .config import K_VALUES
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# ── Token F1 (mechanical, no LLM needed) ─────────────────────────────────────
|
|
20
|
+
|
|
21
|
+
def _tokenize(text: str) -> list[str]:
|
|
22
|
+
"""Simple whitespace + punctuation tokenizer."""
|
|
23
|
+
return re.findall(r"\w+", text.lower())
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def token_f1(predicted: str, gold: str | int) -> float:
    """Return the F1 overlap between the distinct tokens of ``predicted`` and ``gold``.

    Both arguments are converted with ``str`` and split by ``_tokenize``. The
    comparison is made on token sets, so repeated tokens count once.

    Returns:
        The harmonic mean of precision and recall, or 0.0 when either side has
        no tokens or the two sides have no token in common.
    """
    predicted_vocab = set(_tokenize(str(predicted)))
    gold_vocab = set(_tokenize(str(gold)))
    shared = len(predicted_vocab & gold_vocab)
    # Also covers empty inputs: an empty set has an empty intersection.
    if shared == 0:
        return 0.0
    precision = shared / len(predicted_vocab)
    recall = shared / len(gold_vocab)
    return 2 * precision * recall / (precision + recall)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ── Retrieval metrics (reuse logic from retrieval_evaluator.py) ───────────────
|
|
41
|
+
|
|
42
|
+
def dcg_at_k(relevant_positions: list[int], k: int) -> float:
|
|
43
|
+
"""Discounted Cumulative Gain at k."""
|
|
44
|
+
score = 0.0
|
|
45
|
+
for pos in relevant_positions:
|
|
46
|
+
if pos < k:
|
|
47
|
+
score += 1.0 / math.log2(pos + 2)
|
|
48
|
+
return score
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def ndcg_at_k(relevant_positions: list[int], num_relevant: int, k: int) -> float:
|
|
52
|
+
"""Normalized DCG at k."""
|
|
53
|
+
dcg = dcg_at_k(relevant_positions, k)
|
|
54
|
+
ideal_positions = list(range(min(num_relevant, k)))
|
|
55
|
+
idcg = dcg_at_k(ideal_positions, k)
|
|
56
|
+
return dcg / idcg if idcg > 0 else 0.0
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def compute_retrieval_metrics(
|
|
60
|
+
retrieved_ids: list[str],
|
|
61
|
+
expected_ids: list[str],
|
|
62
|
+
k_values: list[int] | None = None,
|
|
63
|
+
) -> dict:
|
|
64
|
+
"""Compute Recall@k and NDCG@k for a single question."""
|
|
65
|
+
ks = k_values or K_VALUES
|
|
66
|
+
expected_set = set(expected_ids)
|
|
67
|
+
relevant_positions = [i for i, rid in enumerate(retrieved_ids) if rid in expected_set]
|
|
68
|
+
|
|
69
|
+
result = {}
|
|
70
|
+
for k in ks:
|
|
71
|
+
hit = any(pos < k for pos in relevant_positions)
|
|
72
|
+
result[f"recall@{k}"] = 1.0 if hit else 0.0
|
|
73
|
+
result[f"ndcg@{k}"] = ndcg_at_k(relevant_positions, len(expected_set), k)
|
|
74
|
+
return result
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ── Aggregate metrics ─────────────────────────────────────────────────────────
|
|
78
|
+
|
|
79
|
+
def aggregate_results(per_question: list[dict]) -> dict:
    """Compute aggregate metrics from per-question results.

    Each per_question entry has:
      {id, category, retrieval: {recall@k, ndcg@k}, answers: {condition: {score, f1}}}

    Missing keys are tolerated: ``category`` defaults to "unknown", and absent
    ``answers`` / ``retrieval`` contribute nothing. ``score`` and ``f1`` values
    of None are skipped.

    Returns:
        ``{"error": "no results"}`` for an empty input, otherwise a dict with:
          - total_questions: number of entries
          - conditions: per condition, mean_score, mean_f1 and n (number of scores)
          - ipr: mean "sinain-memory" score divided by mean "full-context" score,
            or None when either condition has no scores or the full-context
            mean is not positive
          - categories: per category and condition, mean_score and n
          - retrieval: mean of every numeric retrieval metric
        All means are rounded to 4 decimals.
    """
    if not per_question:
        return {"error": "no results"}

    # Per-condition scores
    condition_scores: dict[str, list[float]] = defaultdict(list)
    condition_f1s: dict[str, list[float]] = defaultdict(list)
    # Per-category per-condition
    cat_scores: dict[str, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
    # Retrieval
    retrieval_metrics: dict[str, list[float]] = defaultdict(list)

    for q in per_question:
        cat = q.get("category", "unknown")

        for cond, data in q.get("answers", {}).items():
            if data.get("score") is not None:
                condition_scores[cond].append(data["score"])
                cat_scores[cat][cond].append(data["score"])
            if data.get("f1") is not None:
                condition_f1s[cond].append(data["f1"])

        for metric, val in q.get("retrieval", {}).items():
            if isinstance(val, (int, float)):
                retrieval_metrics[metric].append(val)

    def _mean(lst: list[float]) -> float:
        return round(sum(lst) / len(lst), 4) if lst else 0.0

    # Build summary
    conditions = {}
    for cond in sorted(condition_scores):
        conditions[cond] = {
            "mean_score": _mean(condition_scores[cond]),
            "mean_f1": _mean(condition_f1s.get(cond, [])),
            "n": len(condition_scores[cond]),
        }

    # IPR: sinain-memory vs full-context
    sm_scores = condition_scores.get("sinain-memory", [])
    fc_scores = condition_scores.get("full-context", [])
    # Decide explicitly when the ratio is undefined instead of relying on
    # truthiness: a genuine ratio of 0.0 must be reported as 0.0, not as None.
    ipr = None
    if sm_scores and fc_scores:
        fc_mean = _mean(fc_scores)
        if fc_mean > 0:
            ipr = round(_mean(sm_scores) / fc_mean, 4)

    # Category breakdown
    categories = {}
    for cat in sorted(cat_scores):
        categories[cat] = {}
        for cond in sorted(cat_scores[cat]):
            categories[cat][cond] = {
                "mean_score": _mean(cat_scores[cat][cond]),
                "n": len(cat_scores[cat][cond]),
            }

    # Retrieval summary
    retrieval = {k: _mean(v) for k, v in sorted(retrieval_metrics.items())}

    return {
        "total_questions": len(per_question),
        "conditions": conditions,
        "ipr": ipr,
        "categories": categories,
        "retrieval": retrieval,
    }
|