@geravant/sinain 1.10.1 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/sinain-agent/CLAUDE.md +1 -1
- package/sinain-agent/run.sh +66 -7
- package/sinain-core/src/agent/analyzer.ts +4 -27
- package/sinain-core/src/agent/loop.ts +10 -40
- package/sinain-core/src/agent/situation-writer.ts +0 -16
- package/sinain-core/src/config.ts +1 -9
- package/sinain-core/src/escalation/escalator.ts +44 -16
- package/sinain-core/src/escalation/message-builder.ts +45 -118
- package/sinain-core/src/index.ts +20 -36
- package/sinain-core/src/learning/local-curation.ts +4 -4
- package/sinain-core/src/overlay/commands.ts +46 -13
- package/sinain-core/src/overlay/ws-handler.ts +13 -1
- package/sinain-core/src/server.ts +121 -0
- package/sinain-core/src/types.ts +25 -28
- package/sinain-mcp-server/index.ts +28 -0
- package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
- package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/assertions.py +0 -21
- package/sinain-memory/eval/benchmarks/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/base_adapter.py +43 -0
- package/sinain-memory/eval/benchmarks/config.py +23 -0
- package/sinain-memory/eval/benchmarks/evaluate.py +146 -0
- package/sinain-memory/eval/benchmarks/ingest.py +152 -0
- package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/qa_judge.py +81 -0
- package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +177 -0
- package/sinain-memory/eval/benchmarks/query.py +172 -0
- package/sinain-memory/eval/benchmarks/report.py +87 -0
- package/sinain-memory/eval/benchmarks/runner.py +276 -0
- package/sinain-memory/koog-config.json +11 -0
- package/sinain-core/src/agent/traits.ts +0 -520
package/sinain-core/src/types.ts
CHANGED
|
@@ -18,7 +18,9 @@ export interface StatusMessage {
|
|
|
18
18
|
audio: string;
|
|
19
19
|
mic: string;
|
|
20
20
|
screen: string;
|
|
21
|
+
escalation?: string;
|
|
21
22
|
connection: string;
|
|
23
|
+
responseSize?: string;
|
|
22
24
|
}
|
|
23
25
|
|
|
24
26
|
/** sinain-core → Overlay: heartbeat ping */
|
|
@@ -28,7 +30,7 @@ export interface PingMessage {
|
|
|
28
30
|
}
|
|
29
31
|
|
|
30
32
|
/** sinain-core → Overlay: spawn task lifecycle update */
|
|
31
|
-
export type SpawnTaskStatus = "spawned" | "polling" | "completed" | "failed" | "timeout";
|
|
33
|
+
export type SpawnTaskStatus = "spawned" | "polling" | "completed" | "failed" | "timeout" | "awaiting_input" | "awaiting_permission";
|
|
32
34
|
|
|
33
35
|
export interface SpawnTaskMessage {
|
|
34
36
|
type: "spawn_task";
|
|
@@ -38,6 +40,10 @@ export interface SpawnTaskMessage {
|
|
|
38
40
|
startedAt: number;
|
|
39
41
|
completedAt?: number;
|
|
40
42
|
resultPreview?: string;
|
|
43
|
+
/** Question the spawn is asking the user (status=awaiting_input) */
|
|
44
|
+
question?: string;
|
|
45
|
+
/** Tool permission request (status=awaiting_permission) */
|
|
46
|
+
permission?: { tool: string; input: Record<string, unknown> };
|
|
41
47
|
}
|
|
42
48
|
|
|
43
49
|
/** Overlay → sinain-core: user typed a message */
|
|
@@ -78,6 +84,20 @@ export interface SpawnCommandMessage {
|
|
|
78
84
|
text: string;
|
|
79
85
|
}
|
|
80
86
|
|
|
87
|
+
/** Overlay → sinain-core: reply to a spawn question */
|
|
88
|
+
export interface SpawnReplyMessage {
|
|
89
|
+
type: "spawn_reply";
|
|
90
|
+
taskId: string;
|
|
91
|
+
text: string;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
/** Overlay → sinain-core: reply to a spawn permission request */
|
|
95
|
+
export interface SpawnPermissionReplyMessage {
|
|
96
|
+
type: "spawn_permission_reply";
|
|
97
|
+
taskId: string;
|
|
98
|
+
decision: "allow" | "deny";
|
|
99
|
+
}
|
|
100
|
+
|
|
81
101
|
/** Cost update broadcast to overlay. */
|
|
82
102
|
export interface CostMessage {
|
|
83
103
|
type: "cost";
|
|
@@ -108,7 +128,7 @@ export interface CostSnapshot {
|
|
|
108
128
|
}
|
|
109
129
|
|
|
110
130
|
export type OutboundMessage = FeedMessage | StatusMessage | PingMessage | SpawnTaskMessage | CostMessage;
|
|
111
|
-
export type InboundMessage = UserMessage | CommandMessage | PongMessage | ProfilingMessage | UserCommandMessage | SpawnCommandMessage;
|
|
131
|
+
export type InboundMessage = UserMessage | CommandMessage | PongMessage | ProfilingMessage | UserCommandMessage | SpawnCommandMessage | SpawnReplyMessage | SpawnPermissionReplyMessage;
|
|
112
132
|
|
|
113
133
|
/** Abstraction for user commands (text now, voice later). */
|
|
114
134
|
export interface UserCommand {
|
|
@@ -163,27 +183,6 @@ export interface AudioPipelineConfig {
|
|
|
163
183
|
gainDb: number;
|
|
164
184
|
}
|
|
165
185
|
|
|
166
|
-
export interface TraitConfig {
|
|
167
|
-
enabled: boolean;
|
|
168
|
-
configPath: string; // path to ~/.sinain/traits.json
|
|
169
|
-
entropyHigh: boolean; // Phase 2: boosts entropy roll to 15%
|
|
170
|
-
logDir: string; // path to ~/.sinain-core/traits/
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
export interface TraitLogEntry {
|
|
174
|
-
ts: string;
|
|
175
|
-
tickId: number;
|
|
176
|
-
enabled: boolean;
|
|
177
|
-
voice: string;
|
|
178
|
-
voice_stat: number;
|
|
179
|
-
voice_confidence: number;
|
|
180
|
-
activation_scores: Record<string, number>;
|
|
181
|
-
context_app: string;
|
|
182
|
-
hud_length: number;
|
|
183
|
-
synthesis: boolean;
|
|
184
|
-
}
|
|
185
|
-
|
|
186
|
-
|
|
187
186
|
export interface AudioChunk {
|
|
188
187
|
buffer: Buffer;
|
|
189
188
|
source: string;
|
|
@@ -246,6 +245,7 @@ export interface StopResult {
|
|
|
246
245
|
|
|
247
246
|
export type EscalationMode = "off" | "selective" | "focus" | "rich";
|
|
248
247
|
export type ContextRichness = "lean" | "standard" | "rich";
|
|
248
|
+
export type ResponseSize = "small" | "medium" | "large";
|
|
249
249
|
|
|
250
250
|
export type AnalysisProvider = "openrouter" | "ollama";
|
|
251
251
|
|
|
@@ -282,9 +282,6 @@ export interface AgentResult {
|
|
|
282
282
|
tokensOut: number;
|
|
283
283
|
model: string;
|
|
284
284
|
parsedOk: boolean;
|
|
285
|
-
voice?: string;
|
|
286
|
-
voice_stat?: number;
|
|
287
|
-
voice_confidence?: number;
|
|
288
285
|
/** Actual USD cost returned by OpenRouter (undefined if not available). */
|
|
289
286
|
cost?: number;
|
|
290
287
|
}
|
|
@@ -396,8 +393,9 @@ export interface BridgeState {
|
|
|
396
393
|
audio: "active" | "muted";
|
|
397
394
|
mic: "active" | "muted";
|
|
398
395
|
screen: "active" | "off";
|
|
399
|
-
|
|
396
|
+
escalation: "active" | "paused";
|
|
400
397
|
connection: "connected" | "disconnected" | "connecting";
|
|
398
|
+
responseSize: ResponseSize;
|
|
401
399
|
}
|
|
402
400
|
|
|
403
401
|
// ── Learning / feedback types ──
|
|
@@ -479,6 +477,5 @@ export interface CoreConfig {
|
|
|
479
477
|
costDisplayEnabled: boolean;
|
|
480
478
|
traceDir: string;
|
|
481
479
|
learningConfig: LearningConfig;
|
|
482
|
-
traitConfig: TraitConfig;
|
|
483
480
|
privacyConfig: PrivacyConfig;
|
|
484
481
|
}
|
|
@@ -451,6 +451,34 @@ server.tool(
|
|
|
451
451
|
},
|
|
452
452
|
);
|
|
453
453
|
|
|
454
|
+
// 15. sinain_ask_user — blocking question to the user via overlay
|
|
455
|
+
server.tool(
|
|
456
|
+
"sinain_ask_user",
|
|
457
|
+
"Ask the user a question and wait for their reply. Use when you need clarification, confirmation, or a decision. The question appears on the user's HUD overlay and blocks until they respond.",
|
|
458
|
+
{
|
|
459
|
+
question: z.string().describe("The question to ask the user"),
|
|
460
|
+
},
|
|
461
|
+
async ({ question }) => {
|
|
462
|
+
// Use the spawn task ID from the environment if available
|
|
463
|
+
const taskId = process.env.SINAIN_SPAWN_TASK_ID || `ask-${Date.now()}`;
|
|
464
|
+
try {
|
|
465
|
+
const resp = await fetch(`${SINAIN_CORE_URL}/spawn/ask`, {
|
|
466
|
+
method: "POST",
|
|
467
|
+
headers: { "Content-Type": "application/json" },
|
|
468
|
+
body: JSON.stringify({ taskId, question }),
|
|
469
|
+
signal: AbortSignal.timeout(6 * 60_000), // 6 min (server times out at 5)
|
|
470
|
+
});
|
|
471
|
+
const data = await resp.json() as { ok: boolean; answer?: string };
|
|
472
|
+
if (data.ok && data.answer) {
|
|
473
|
+
return textResult(`User replied: ${data.answer}`);
|
|
474
|
+
}
|
|
475
|
+
return textResult("User did not reply.");
|
|
476
|
+
} catch (err: any) {
|
|
477
|
+
return textResult(`Failed to ask user: ${err.message}`);
|
|
478
|
+
}
|
|
479
|
+
},
|
|
480
|
+
);
|
|
481
|
+
|
|
454
482
|
// ---------------------------------------------------------------------------
|
|
455
483
|
// Startup
|
|
456
484
|
// ---------------------------------------------------------------------------
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -206,27 +206,6 @@ def assert_playbook_header_footer_intact(playbook_text: str) -> dict:
|
|
|
206
206
|
f"missing playbook comments: {', '.join(missing)}")
|
|
207
207
|
|
|
208
208
|
|
|
209
|
-
# ---------------------------------------------------------------------------
|
|
210
|
-
# Trait voice assertions (sinain-core wiring verification)
|
|
211
|
-
# ---------------------------------------------------------------------------
|
|
212
|
-
|
|
213
|
-
def assert_situation_has_active_voice(
|
|
214
|
-
situation_content: str, expected_trait: str | None = None
|
|
215
|
-
) -> dict:
|
|
216
|
-
"""Check SITUATION.md contains an Active Voice section (after trait wiring).
|
|
217
|
-
|
|
218
|
-
Called by tick_evaluator.py when processing live ticks that have SITUATION.md
|
|
219
|
-
content and a trait was selected for that tick.
|
|
220
|
-
"""
|
|
221
|
-
has_section = "## Active Voice" in situation_content
|
|
222
|
-
if not has_section:
|
|
223
|
-
return _result("situation_has_active_voice", False, "no '## Active Voice' section")
|
|
224
|
-
if expected_trait and expected_trait not in situation_content:
|
|
225
|
-
return _result("situation_has_active_voice", False,
|
|
226
|
-
f"section present but '{expected_trait}' not found")
|
|
227
|
-
return _result("situation_has_active_voice", True, "Active Voice section present")
|
|
228
|
-
|
|
229
|
-
|
|
230
209
|
# ---------------------------------------------------------------------------
|
|
231
210
|
# Runner: execute all applicable assertions for a tick
|
|
232
211
|
# ---------------------------------------------------------------------------
|
|
File without changes
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Base adapter and data classes for benchmark evaluation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
|
|
10
|
+
class BenchmarkQuestion:
|
|
11
|
+
id: str
|
|
12
|
+
text: str
|
|
13
|
+
gold_answer: str
|
|
14
|
+
category: str # single-session, multi-session, temporal, etc.
|
|
15
|
+
evidence_session_ids: list[str] = field(default_factory=list)
|
|
16
|
+
metadata: dict = field(default_factory=dict)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
|
|
20
|
+
class BenchmarkInstance:
|
|
21
|
+
"""A set of conversations + questions that share the same context."""
|
|
22
|
+
id: str
|
|
23
|
+
sessions: list[list[dict]] # list of sessions, each a list of feed items {source, text, ts}
|
|
24
|
+
questions: list[BenchmarkQuestion] = field(default_factory=list)
|
|
25
|
+
raw_sessions: list[dict] = field(default_factory=list) # original benchmark format (for full-context condition)
|
|
26
|
+
metadata: dict = field(default_factory=dict)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class BenchmarkAdapter(ABC):
|
|
30
|
+
"""Abstract adapter: converts a published benchmark into sinain's format."""
|
|
31
|
+
|
|
32
|
+
@property
|
|
33
|
+
@abstractmethod
|
|
34
|
+
def name(self) -> str:
|
|
35
|
+
"""Benchmark name (e.g. 'longmemeval', 'locomo')."""
|
|
36
|
+
|
|
37
|
+
@abstractmethod
|
|
38
|
+
def load_dataset(self, data_dir: str) -> list[BenchmarkInstance]:
|
|
39
|
+
"""Download (if needed) and parse the benchmark dataset."""
|
|
40
|
+
|
|
41
|
+
@abstractmethod
|
|
42
|
+
def format_full_context(self, instance: BenchmarkInstance) -> str:
|
|
43
|
+
"""Render the full conversation history as a text string for the baseline condition."""
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Benchmark configuration — models, paths, thresholds."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
BENCHMARKS_DIR = Path(__file__).resolve().parent
|
|
6
|
+
DATA_DIR = BENCHMARKS_DIR / "data"
|
|
7
|
+
RESULTS_DIR = BENCHMARKS_DIR / "results"
|
|
8
|
+
|
|
9
|
+
# LLM models (via OpenRouter)
|
|
10
|
+
QA_MODEL = "google/gemini-2.5-flash"
|
|
11
|
+
JUDGE_MODEL = "openai/gpt-4o"
|
|
12
|
+
|
|
13
|
+
# Retrieval
|
|
14
|
+
K_VALUES = [1, 3, 5, 10]
|
|
15
|
+
MAX_FACTS_PER_QUERY = 10
|
|
16
|
+
|
|
17
|
+
# Ingestion
|
|
18
|
+
DISTILLER_TIMEOUT_S = 30
|
|
19
|
+
INTEGRATOR_TIMEOUT_S = 60
|
|
20
|
+
|
|
21
|
+
# Dataset URLs
|
|
22
|
+
LONGMEMEVAL_HF = "xiaowu0162/longmemeval-cleaned"
|
|
23
|
+
LOCOMO_GITHUB = "https://raw.githubusercontent.com/snap-research/locomo/main/data/locomo10.json"
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Evaluation pipeline — score answers and compute aggregate metrics.
|
|
2
|
+
|
|
3
|
+
Combines:
|
|
4
|
+
- LLM-as-Judge (QA scoring, 1-5 scale)
|
|
5
|
+
- Retrieval metrics (Recall@k, NDCG@k)
|
|
6
|
+
- Token F1 overlap (mechanical, free)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import math
|
|
12
|
+
import re
|
|
13
|
+
from collections import defaultdict
|
|
14
|
+
|
|
15
|
+
from .base_adapter import BenchmarkQuestion
|
|
16
|
+
from .config import K_VALUES
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# ── Token F1 (mechanical, no LLM needed) ─────────────────────────────────────
|
|
20
|
+
|
|
21
|
+
def _tokenize(text: str) -> list[str]:
|
|
22
|
+
"""Simple whitespace + punctuation tokenizer."""
|
|
23
|
+
return re.findall(r"\w+", text.lower())
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def token_f1(predicted: str, gold: str | int) -> float:
|
|
27
|
+
"""Compute token-level F1 between predicted and gold answers."""
|
|
28
|
+
pred_tokens = set(_tokenize(str(predicted)))
|
|
29
|
+
gold_tokens = set(_tokenize(str(gold)))
|
|
30
|
+
if not gold_tokens or not pred_tokens:
|
|
31
|
+
return 0.0
|
|
32
|
+
overlap = pred_tokens & gold_tokens
|
|
33
|
+
if not overlap:
|
|
34
|
+
return 0.0
|
|
35
|
+
precision = len(overlap) / len(pred_tokens)
|
|
36
|
+
recall = len(overlap) / len(gold_tokens)
|
|
37
|
+
return 2 * precision * recall / (precision + recall)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ── Retrieval metrics (reuse logic from retrieval_evaluator.py) ───────────────
|
|
41
|
+
|
|
42
|
+
def dcg_at_k(relevant_positions: list[int], k: int) -> float:
|
|
43
|
+
"""Discounted Cumulative Gain at k."""
|
|
44
|
+
score = 0.0
|
|
45
|
+
for pos in relevant_positions:
|
|
46
|
+
if pos < k:
|
|
47
|
+
score += 1.0 / math.log2(pos + 2)
|
|
48
|
+
return score
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def ndcg_at_k(relevant_positions: list[int], num_relevant: int, k: int) -> float:
|
|
52
|
+
"""Normalized DCG at k."""
|
|
53
|
+
dcg = dcg_at_k(relevant_positions, k)
|
|
54
|
+
ideal_positions = list(range(min(num_relevant, k)))
|
|
55
|
+
idcg = dcg_at_k(ideal_positions, k)
|
|
56
|
+
return dcg / idcg if idcg > 0 else 0.0
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def compute_retrieval_metrics(
|
|
60
|
+
retrieved_ids: list[str],
|
|
61
|
+
expected_ids: list[str],
|
|
62
|
+
k_values: list[int] | None = None,
|
|
63
|
+
) -> dict:
|
|
64
|
+
"""Compute Recall@k and NDCG@k for a single question."""
|
|
65
|
+
ks = k_values or K_VALUES
|
|
66
|
+
expected_set = set(expected_ids)
|
|
67
|
+
relevant_positions = [i for i, rid in enumerate(retrieved_ids) if rid in expected_set]
|
|
68
|
+
|
|
69
|
+
result = {}
|
|
70
|
+
for k in ks:
|
|
71
|
+
hit = any(pos < k for pos in relevant_positions)
|
|
72
|
+
result[f"recall@{k}"] = 1.0 if hit else 0.0
|
|
73
|
+
result[f"ndcg@{k}"] = ndcg_at_k(relevant_positions, len(expected_set), k)
|
|
74
|
+
return result
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ── Aggregate metrics ─────────────────────────────────────────────────────────
|
|
78
|
+
|
|
79
|
+
def aggregate_results(per_question: list[dict]) -> dict:
|
|
80
|
+
"""Compute aggregate metrics from per-question results.
|
|
81
|
+
|
|
82
|
+
Each per_question entry has:
|
|
83
|
+
{id, category, retrieval: {recall@k, ndcg@k}, answers: {condition: {score, f1}}}
|
|
84
|
+
"""
|
|
85
|
+
if not per_question:
|
|
86
|
+
return {"error": "no results"}
|
|
87
|
+
|
|
88
|
+
# Per-condition scores
|
|
89
|
+
condition_scores: dict[str, list[float]] = defaultdict(list)
|
|
90
|
+
condition_f1s: dict[str, list[float]] = defaultdict(list)
|
|
91
|
+
# Per-category per-condition
|
|
92
|
+
cat_scores: dict[str, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
|
|
93
|
+
# Retrieval
|
|
94
|
+
retrieval_metrics: dict[str, list[float]] = defaultdict(list)
|
|
95
|
+
|
|
96
|
+
for q in per_question:
|
|
97
|
+
cat = q.get("category", "unknown")
|
|
98
|
+
|
|
99
|
+
for cond, data in q.get("answers", {}).items():
|
|
100
|
+
if data.get("score") is not None:
|
|
101
|
+
condition_scores[cond].append(data["score"])
|
|
102
|
+
cat_scores[cat][cond].append(data["score"])
|
|
103
|
+
if data.get("f1") is not None:
|
|
104
|
+
condition_f1s[cond].append(data["f1"])
|
|
105
|
+
|
|
106
|
+
for metric, val in q.get("retrieval", {}).items():
|
|
107
|
+
if isinstance(val, (int, float)):
|
|
108
|
+
retrieval_metrics[metric].append(val)
|
|
109
|
+
|
|
110
|
+
def _mean(lst: list[float]) -> float:
|
|
111
|
+
return round(sum(lst) / len(lst), 4) if lst else 0.0
|
|
112
|
+
|
|
113
|
+
# Build summary
|
|
114
|
+
conditions = {}
|
|
115
|
+
for cond in sorted(condition_scores):
|
|
116
|
+
conditions[cond] = {
|
|
117
|
+
"mean_score": _mean(condition_scores[cond]),
|
|
118
|
+
"mean_f1": _mean(condition_f1s.get(cond, [])),
|
|
119
|
+
"n": len(condition_scores[cond]),
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
# IPR: sinain-memory vs full-context
|
|
123
|
+
sm_scores = condition_scores.get("sinain-memory", [])
|
|
124
|
+
fc_scores = condition_scores.get("full-context", [])
|
|
125
|
+
ipr = _mean(sm_scores) / _mean(fc_scores) if fc_scores and _mean(fc_scores) > 0 else None
|
|
126
|
+
|
|
127
|
+
# Category breakdown
|
|
128
|
+
categories = {}
|
|
129
|
+
for cat in sorted(cat_scores):
|
|
130
|
+
categories[cat] = {}
|
|
131
|
+
for cond in sorted(cat_scores[cat]):
|
|
132
|
+
categories[cat][cond] = {
|
|
133
|
+
"mean_score": _mean(cat_scores[cat][cond]),
|
|
134
|
+
"n": len(cat_scores[cat][cond]),
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
# Retrieval summary
|
|
138
|
+
retrieval = {k: _mean(v) for k, v in sorted(retrieval_metrics.items())}
|
|
139
|
+
|
|
140
|
+
return {
|
|
141
|
+
"total_questions": len(per_question),
|
|
142
|
+
"conditions": conditions,
|
|
143
|
+
"ipr": round(ipr, 4) if ipr else None,
|
|
144
|
+
"categories": categories,
|
|
145
|
+
"retrieval": retrieval,
|
|
146
|
+
}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Ingestion pipeline — benchmark conversations → sinain triplestore.
|
|
2
|
+
|
|
3
|
+
Runs session_distiller.py + knowledge_integrator.py via subprocess (exact production path).
|
|
4
|
+
Caches results aggressively to avoid repeated LLM calls.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
import shutil
|
|
13
|
+
import tempfile
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from subprocess import run, PIPE, TimeoutExpired
|
|
16
|
+
|
|
17
|
+
from .base_adapter import BenchmarkInstance
|
|
18
|
+
from .config import DISTILLER_TIMEOUT_S, INTEGRATOR_TIMEOUT_S
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _scripts_dir() -> Path:
|
|
22
|
+
"""Locate sinain-memory scripts directory."""
|
|
23
|
+
return Path(__file__).resolve().parent.parent.parent
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _content_hash(sessions: list[list[dict]]) -> str:
|
|
27
|
+
"""Hash session content for caching."""
|
|
28
|
+
raw = json.dumps(sessions, sort_keys=True, ensure_ascii=False)
|
|
29
|
+
return hashlib.sha256(raw.encode()).hexdigest()[:16]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _run_script(script_name: str, args: list[str], timeout: int) -> str | None:
|
|
33
|
+
"""Run a Python script from sinain-memory, return stdout or None on failure."""
|
|
34
|
+
script_path = _scripts_dir() / script_name
|
|
35
|
+
if not script_path.exists():
|
|
36
|
+
print(f"[ingest] {script_name} not found at {script_path}")
|
|
37
|
+
return None
|
|
38
|
+
|
|
39
|
+
env = {**os.environ, "PYTHONPATH": str(_scripts_dir())}
|
|
40
|
+
# Ensure a working model is available (common.py defaults may reference unreleased models)
|
|
41
|
+
if "SINAIN_BENCH_MODEL" in os.environ:
|
|
42
|
+
env["SINAIN_FAST_MODEL"] = os.environ["SINAIN_BENCH_MODEL"]
|
|
43
|
+
try:
|
|
44
|
+
result = run(
|
|
45
|
+
["python3", str(script_path)] + args,
|
|
46
|
+
capture_output=True, text=True, timeout=timeout, env=env,
|
|
47
|
+
)
|
|
48
|
+
if result.returncode != 0:
|
|
49
|
+
print(f"[ingest] {script_name} failed: {result.stderr[:200]}")
|
|
50
|
+
return None
|
|
51
|
+
return result.stdout.strip()
|
|
52
|
+
except TimeoutExpired:
|
|
53
|
+
print(f"[ingest] {script_name} timed out ({timeout}s)")
|
|
54
|
+
return None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def ingest_instance(
|
|
58
|
+
instance: BenchmarkInstance,
|
|
59
|
+
cache_dir: Path,
|
|
60
|
+
) -> Path | None:
|
|
61
|
+
"""Ingest a benchmark instance into a triplestore. Returns db_path or None.
|
|
62
|
+
|
|
63
|
+
Uses caching: if the same haystack was already ingested, returns the cached DB.
|
|
64
|
+
"""
|
|
65
|
+
ch = _content_hash(instance.sessions)
|
|
66
|
+
cache_path = cache_dir / "stores" / f"{ch}.db"
|
|
67
|
+
|
|
68
|
+
if cache_path.exists():
|
|
69
|
+
return cache_path
|
|
70
|
+
|
|
71
|
+
cache_path.parent.mkdir(parents=True, exist_ok=True)
|
|
72
|
+
|
|
73
|
+
# Create temp memory directory
|
|
74
|
+
tmp = tempfile.mkdtemp(prefix="sinain-bench-")
|
|
75
|
+
mem_dir = Path(tmp) / "memory"
|
|
76
|
+
for subdir in ["", "playbook-logs", "playbook-archive"]:
|
|
77
|
+
(mem_dir / subdir).mkdir(parents=True, exist_ok=True)
|
|
78
|
+
|
|
79
|
+
# Write a minimal playbook so integrator doesn't fail
|
|
80
|
+
(mem_dir / "sinain-playbook.md").write_text("# Sinain Playbook\n\n(benchmark run)\n")
|
|
81
|
+
|
|
82
|
+
success = False
|
|
83
|
+
try:
|
|
84
|
+
# Batch sessions into chunks of ~10 for fewer LLM calls.
|
|
85
|
+
# Each chunk becomes one distiller call with a combined transcript.
|
|
86
|
+
BATCH_SIZE = 10
|
|
87
|
+
num_sessions = len(instance.sessions)
|
|
88
|
+
batch_idx = 0
|
|
89
|
+
|
|
90
|
+
for start in range(0, num_sessions, BATCH_SIZE):
|
|
91
|
+
batch = instance.sessions[start:start + BATCH_SIZE]
|
|
92
|
+
# Flatten batch into one transcript
|
|
93
|
+
combined: list[dict] = []
|
|
94
|
+
for session in batch:
|
|
95
|
+
combined.extend(session)
|
|
96
|
+
if len(combined) < 3:
|
|
97
|
+
continue
|
|
98
|
+
|
|
99
|
+
first_ts = combined[0].get("ts", "2025-01-01T10:00:00Z")
|
|
100
|
+
meta = json.dumps({
|
|
101
|
+
"ts": first_ts,
|
|
102
|
+
"sessionKey": f"benchmark-batch-{batch_idx}",
|
|
103
|
+
"durationMs": len(combined) * 30000,
|
|
104
|
+
})
|
|
105
|
+
batch_idx += 1
|
|
106
|
+
|
|
107
|
+
# Step 1: Distill the batch
|
|
108
|
+
digest_json = _run_script("session_distiller.py", [
|
|
109
|
+
"--memory-dir", str(mem_dir),
|
|
110
|
+
"--transcript", json.dumps(combined),
|
|
111
|
+
"--session-meta", meta,
|
|
112
|
+
], DISTILLER_TIMEOUT_S)
|
|
113
|
+
|
|
114
|
+
if not digest_json:
|
|
115
|
+
continue
|
|
116
|
+
|
|
117
|
+
try:
|
|
118
|
+
digest = json.loads(digest_json)
|
|
119
|
+
except json.JSONDecodeError:
|
|
120
|
+
continue
|
|
121
|
+
|
|
122
|
+
if digest.get("isEmpty") or digest.get("error"):
|
|
123
|
+
continue
|
|
124
|
+
|
|
125
|
+
# Step 2: Integrate into knowledge graph
|
|
126
|
+
_run_script("knowledge_integrator.py", [
|
|
127
|
+
"--memory-dir", str(mem_dir),
|
|
128
|
+
"--digest", json.dumps(digest),
|
|
129
|
+
], INTEGRATOR_TIMEOUT_S)
|
|
130
|
+
|
|
131
|
+
# Copy the resulting DB to cache
|
|
132
|
+
db_path = mem_dir / "knowledge-graph.db"
|
|
133
|
+
if db_path.exists() and db_path.stat().st_size > 0:
|
|
134
|
+
shutil.copy2(db_path, cache_path)
|
|
135
|
+
success = True
|
|
136
|
+
|
|
137
|
+
finally:
|
|
138
|
+
shutil.rmtree(tmp, ignore_errors=True)
|
|
139
|
+
|
|
140
|
+
return cache_path if success else None
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def get_knowledge_doc(db_path: Path) -> str:
|
|
144
|
+
"""Render a sinain-knowledge.md style document from a triplestore."""
|
|
145
|
+
import sys
|
|
146
|
+
sys.path.insert(0, str(_scripts_dir()))
|
|
147
|
+
from graph_query import query_top_facts, format_facts_text
|
|
148
|
+
|
|
149
|
+
facts = query_top_facts(str(db_path), limit=30)
|
|
150
|
+
if not facts:
|
|
151
|
+
return "(no knowledge available)"
|
|
152
|
+
return format_facts_text(facts, max_chars=6000)
|
|
File without changes
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""LLM-as-Judge: QA answer quality evaluator (LongMemEval-compatible, 1-5 scale).
|
|
2
|
+
|
|
3
|
+
Uses GPT-4o via OpenRouter for comparability with published results.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
# Add sinain-memory to path for common imports
|
|
12
|
+
_koog_dir = str(Path(__file__).resolve().parent.parent.parent.parent)
|
|
13
|
+
if _koog_dir not in sys.path:
|
|
14
|
+
sys.path.insert(0, _koog_dir)
|
|
15
|
+
|
|
16
|
+
from common import LLMError, call_llm, extract_json # noqa: E402
|
|
17
|
+
|
|
18
|
+
SYSTEM_PROMPT = """\
|
|
19
|
+
You are evaluating whether a predicted answer correctly answers a question.
|
|
20
|
+
The gold (reference) answer is provided.
|
|
21
|
+
|
|
22
|
+
Score on a scale of 1-5:
|
|
23
|
+
5: Perfect — captures all key information from the gold answer, no errors
|
|
24
|
+
4: Mostly correct — minor omissions or imprecision, main point is right
|
|
25
|
+
3: Partially correct — captures some key points but misses important details
|
|
26
|
+
2: Related but mostly wrong — touches the topic but answer is largely incorrect
|
|
27
|
+
1: Completely wrong, contradicts the gold answer, or says "I don't know" when the answer exists
|
|
28
|
+
|
|
29
|
+
Special cases:
|
|
30
|
+
- If the gold answer indicates abstention is correct (e.g. "I don't know" or "not mentioned"),
|
|
31
|
+
then a predicted "I don't know" scores 5.
|
|
32
|
+
- Numeric answers within 10% of gold = full credit.
|
|
33
|
+
- Getting the gist right but missing specifics = 3-4 depending on importance.
|
|
34
|
+
|
|
35
|
+
Respond with ONLY a JSON object: {"score": <1-5>, "reasoning": "brief explanation"}"""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def judge_qa(
|
|
39
|
+
question: str,
|
|
40
|
+
gold_answer: str,
|
|
41
|
+
predicted_answer: str,
|
|
42
|
+
*,
|
|
43
|
+
condition: str = "",
|
|
44
|
+
model: str | None = None,
|
|
45
|
+
) -> dict | None:
|
|
46
|
+
"""Score a QA answer. Returns {"score": 1-5, "reasoning": str} or None on failure."""
|
|
47
|
+
user_parts = [
|
|
48
|
+
f"## Question\n{question}",
|
|
49
|
+
f"\n## Gold Answer\n{gold_answer}",
|
|
50
|
+
f"\n## Predicted Answer\n{predicted_answer}",
|
|
51
|
+
]
|
|
52
|
+
if condition:
|
|
53
|
+
user_parts.append(f"\n## Context Condition: {condition}")
|
|
54
|
+
|
|
55
|
+
try:
|
|
56
|
+
kwargs: dict = {
|
|
57
|
+
"system_prompt": SYSTEM_PROMPT,
|
|
58
|
+
"user_prompt": "\n".join(user_parts),
|
|
59
|
+
"max_tokens": 200,
|
|
60
|
+
"json_mode": True,
|
|
61
|
+
}
|
|
62
|
+
if model:
|
|
63
|
+
kwargs["model"] = model
|
|
64
|
+
else:
|
|
65
|
+
kwargs["script"] = "meeting_benchmark"
|
|
66
|
+
|
|
67
|
+
raw = call_llm(**kwargs)
|
|
68
|
+
result = extract_json(raw)
|
|
69
|
+
|
|
70
|
+
score = result.get("score")
|
|
71
|
+
reasoning = result.get("reasoning", "")
|
|
72
|
+
|
|
73
|
+
if not isinstance(score, (int, float)) or not (1 <= score <= 5):
|
|
74
|
+
print(f"[warn] qa_judge returned invalid score: {score}", file=sys.stderr)
|
|
75
|
+
return None
|
|
76
|
+
|
|
77
|
+
return {"score": int(score), "reasoning": str(reasoning)[:300]}
|
|
78
|
+
|
|
79
|
+
except (ValueError, LLMError, KeyError) as e:
|
|
80
|
+
print(f"[warn] qa_judge call failed: {e}", file=sys.stderr)
|
|
81
|
+
return None
|