@geravant/sinain 1.11.0 → 1.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/sinain-core/package-lock.json +963 -0
- package/sinain-core/package.json +1 -0
- package/sinain-core/src/buffers/feed-buffer.ts +32 -0
- package/sinain-core/src/embedding/service.ts +66 -0
- package/sinain-core/src/escalation/escalator.ts +1 -0
- package/sinain-core/src/escalation/message-builder.ts +45 -118
- package/sinain-core/src/index.ts +19 -2
- package/sinain-core/src/learning/local-curation.ts +137 -7
- package/sinain-core/src/overlay/commands.ts +16 -3
- package/sinain-core/src/overlay/ws-handler.ts +4 -1
- package/sinain-core/src/server.ts +31 -0
- package/sinain-core/src/types.ts +3 -0
- package/sinain-memory/README.md +105 -0
- package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
- package/sinain-memory/embed_client.py +117 -0
- package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/base_adapter.py +43 -0
- package/sinain-memory/eval/benchmarks/config.py +23 -0
- package/sinain-memory/eval/benchmarks/evaluate.py +146 -0
- package/sinain-memory/eval/benchmarks/ingest.py +152 -0
- package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/qa_judge.py +81 -0
- package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +177 -0
- package/sinain-memory/eval/benchmarks/meeting_adapter.py +81 -0
- package/sinain-memory/eval/benchmarks/meeting_runner.py +230 -0
- package/sinain-memory/eval/benchmarks/query.py +193 -0
- package/sinain-memory/eval/benchmarks/report.py +87 -0
- package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +318 -0
- package/sinain-memory/eval/benchmarks/runner.py +283 -0
- package/sinain-memory/graph_query.py +257 -15
- package/sinain-memory/knowledge_integrator.py +365 -72
- package/sinain-memory/koog-config.json +11 -0
- package/sinain-memory/memory-config.json +1 -1
- package/sinain-memory/session_distiller.py +43 -19
- package/sinain-memory/triplestore.py +60 -0
|
@@ -39,6 +39,7 @@ export class WsHandler {
|
|
|
39
39
|
screen: "off",
|
|
40
40
|
escalation: "active",
|
|
41
41
|
connection: "disconnected",
|
|
42
|
+
responseSize: "medium",
|
|
42
43
|
};
|
|
43
44
|
private replayBuffer: FeedMessage[] = [];
|
|
44
45
|
private spawnTaskBuffer: Map<string, SpawnTaskMessage> = new Map();
|
|
@@ -75,6 +76,7 @@ export class WsHandler {
|
|
|
75
76
|
screen: this.state.screen,
|
|
76
77
|
escalation: this.state.escalation,
|
|
77
78
|
connection: this.state.connection,
|
|
79
|
+
responseSize: this.state.responseSize,
|
|
78
80
|
});
|
|
79
81
|
|
|
80
82
|
// Replay recent feed messages for late-joining clients
|
|
@@ -151,13 +153,14 @@ export class WsHandler {
|
|
|
151
153
|
|
|
152
154
|
/** Send a status update to all connected overlays. */
|
|
153
155
|
broadcastStatus(): void {
|
|
154
|
-
const msg: StatusMessage & { envPath?: string; escalation?: string } = {
|
|
156
|
+
const msg: StatusMessage & { envPath?: string; escalation?: string; responseSize?: string } = {
|
|
155
157
|
type: "status",
|
|
156
158
|
audio: this.state.audio,
|
|
157
159
|
mic: this.state.mic,
|
|
158
160
|
screen: this.state.screen,
|
|
159
161
|
escalation: this.state.escalation,
|
|
160
162
|
connection: this.state.connection,
|
|
163
|
+
responseSize: this.state.responseSize,
|
|
161
164
|
};
|
|
162
165
|
if (loadedEnvPath) msg.envPath = loadedEnvPath;
|
|
163
166
|
this.broadcastMessage(msg);
|
|
@@ -184,6 +184,8 @@ export interface ServerDeps {
|
|
|
184
184
|
onSpawnCommand?: (text: string) => void;
|
|
185
185
|
getSpawnPending?: () => { id: string; task: string; label: string; ts: number } | null;
|
|
186
186
|
respondSpawn?: (id: string, result: string) => { ok: boolean; error?: string };
|
|
187
|
+
embedTexts?: (texts: string[]) => Promise<Float32Array[]>;
|
|
188
|
+
isEmbeddingReady?: () => boolean;
|
|
187
189
|
}
|
|
188
190
|
|
|
189
191
|
function readBody(req: IncomingMessage, maxBytes: number): Promise<string> {
|
|
@@ -519,6 +521,35 @@ export function createAppServer(deps: ServerDeps) {
|
|
|
519
521
|
return;
|
|
520
522
|
}
|
|
521
523
|
|
|
524
|
+
// ── /embed ── (used by knowledge_integrator.py and graph_query.py)
|
|
525
|
+
if (req.method === "POST" && url.pathname === "/embed") {
  // POST /embed  { texts: string[] }  ->  { embeddings: base64[], dims: number }
  // Each embedding is the raw float32 bytes of one vector, base64-encoded.
  const embedTexts = deps.embedTexts;
  if (!embedTexts || !deps.isEmbeddingReady?.()) {
    // No embedding service wired in, or it reports that it is not ready yet.
    res.writeHead(503);
    res.end(JSON.stringify({ error: "embedding model loading" }));
    return;
  }
  // Collect raw chunks and decode once at the end: appending Buffers to a
  // string decodes every chunk on its own, which corrupts multi-byte UTF-8
  // sequences that happen to be split across two chunks.
  const chunks: Buffer[] = [];
  req.on("data", (c: Buffer) => { chunks.push(c); });
  req.on("end", async () => {
    try {
      const parsed: unknown = JSON.parse(Buffer.concat(chunks).toString("utf8"));
      const texts: unknown = (parsed as { texts?: unknown } | null)?.texts;
      if (!Array.isArray(texts) || texts.length === 0) {
        res.writeHead(400);
        res.end(JSON.stringify({ error: "texts array required" }));
        return;
      }
      if (!texts.every((t): t is string => typeof t === "string")) {
        res.writeHead(400);
        res.end(JSON.stringify({ error: "texts must be strings" }));
        return;
      }
      const embeddings = await embedTexts(texts);
      // Encode only the bytes that belong to each vector. A Float32Array may be
      // a view into a larger shared ArrayBuffer (e.g. a subarray of a batched
      // output), in which case `e.buffer` alone would serialise the whole batch.
      const encoded = embeddings.map((e) =>
        Buffer.from(e.buffer, e.byteOffset, e.byteLength).toString("base64"),
      );
      // Report the real vector length; 384 is kept as the fallback value.
      res.end(JSON.stringify({ embeddings: encoded, dims: embeddings[0]?.length ?? 384 }));
    } catch (err: unknown) {
      const message = err instanceof Error ? err.message : String(err);
      res.writeHead(500);
      res.end(JSON.stringify({ error: message.slice(0, 200) }));
    }
  });
  return;
}
|
|
552
|
+
|
|
522
553
|
// ── /health ──
|
|
523
554
|
if (req.method === "GET" && url.pathname === "/health") {
|
|
524
555
|
res.end(JSON.stringify({
|
package/sinain-core/src/types.ts
CHANGED
|
@@ -20,6 +20,7 @@ export interface StatusMessage {
|
|
|
20
20
|
screen: string;
|
|
21
21
|
escalation?: string;
|
|
22
22
|
connection: string;
|
|
23
|
+
responseSize?: string;
|
|
23
24
|
}
|
|
24
25
|
|
|
25
26
|
/** sinain-core → Overlay: heartbeat ping */
|
|
@@ -244,6 +245,7 @@ export interface StopResult {
|
|
|
244
245
|
|
|
245
246
|
export type EscalationMode = "off" | "selective" | "focus" | "rich";
|
|
246
247
|
export type ContextRichness = "lean" | "standard" | "rich";
|
|
248
|
+
export type ResponseSize = "small" | "medium" | "large";
|
|
247
249
|
|
|
248
250
|
export type AnalysisProvider = "openrouter" | "ollama";
|
|
249
251
|
|
|
@@ -393,6 +395,7 @@ export interface BridgeState {
|
|
|
393
395
|
screen: "active" | "off";
|
|
394
396
|
escalation: "active" | "paused";
|
|
395
397
|
connection: "connected" | "disconnected" | "connecting";
|
|
398
|
+
responseSize: ResponseSize;
|
|
396
399
|
}
|
|
397
400
|
|
|
398
401
|
// ── Learning / feedback types ──
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# sinain-memory
|
|
2
|
+
|
|
3
|
+
Local-first knowledge pipeline for SinainHUD. Captures what the user sees and hears, distills it into a knowledge graph, and makes it retrievable for the agent's context.
|
|
4
|
+
|
|
5
|
+
## Architecture
|
|
6
|
+
|
|
7
|
+
Two-step pipeline: **LLM extraction** (what to remember) + **deterministic integration** (how to store it).
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
Audio transcripts + Screen OCR
|
|
11
|
+
|
|
|
12
|
+
session_distiller.py (LLM)
|
|
13
|
+
Extracts: facts[], entities[], decisions[]
|
|
14
|
+
|
|
|
15
|
+
knowledge_integrator.py (code — no LLM)
|
|
16
|
+
- Converts facts to graph assertions (deterministic)
|
|
17
|
+
- Creates entity:* nodes with freeform types
|
|
18
|
+
- Links facts to entities via ref edges
|
|
19
|
+
- Infers cross-entity relationships from fact content
|
|
20
|
+
- Deduplicates via embedding similarity (cosine 0.78)
|
|
21
|
+
- Auto-curates playbook (tag overlap, no LLM)
|
|
22
|
+
|
|
|
23
|
+
triplestore.py (SQLite EAV)
|
|
24
|
+
- 4 covering indexes: EAVT, AEVT, VAET, AVET
|
|
25
|
+
- FTS5 full-text search on fact values
|
|
26
|
+
- Confidence decay (60-day half-life)
|
|
27
|
+
- Touched-entities tracking for cache invalidation
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Key Design Decisions
|
|
31
|
+
|
|
32
|
+
**Deterministic integration.** The integrator does NOT use an LLM. Early experiments showed that LLM-based integration produced 0-20 facts per run depending on model mood, token truncation, and format errors. The deterministic approach converts every distiller fact to a graph assertion — consistent, fast, and reliable.
|
|
33
|
+
|
|
34
|
+
**Two-layer entity model.** `fact:*` entities store individual claims (searchable via FTS5 and tags). `entity:*` nodes represent real-world entities connected by typed ref edges. The VAET index enables backref traversal: "find all facts about Citibank" is an O(log n) index lookup.
|
|
35
|
+
|
|
36
|
+
**Incremental distillation.** Long sessions (>8 min) trigger distillation when the feed buffer reaches capacity, before items are lost to the ring buffer. The `onFull` callback fires when 50% new items have accumulated since the last pass.
|
|
37
|
+
|
|
38
|
+
**Embedding-based dedup.** sinain-core hosts an in-process all-MiniLM-L6-v2 model (384 dims, 2-4ms per embedding). The `/embed` endpoint is used for both write-time dedup (prevent storing semantic duplicates) and read-time re-ranking (surface most relevant facts for a query).
|
|
39
|
+
|
|
40
|
+
## Triplestore
|
|
41
|
+
|
|
42
|
+
SQLite-backed EAV store inspired by Datomic/RhizomeDB with 4 covering indexes:
|
|
43
|
+
|
|
44
|
+
| Index | Query Pattern | Example |
|
|
45
|
+
|-------|-------------|---------|
|
|
46
|
+
| **EAVT** | What does entity X look like? | `store.entity("entity:citibank")` |
|
|
47
|
+
| **AEVT** | Which entities have attribute Y? | `store.entities_with_attr("type")` |
|
|
48
|
+
| **VAET** | What references entity Z? (backrefs) | `store.backrefs("entity:citibank")` |
|
|
49
|
+
| **AVET** | Find entity by attribute+value | `store.lookup("type", "person")` |
|
|
50
|
+
|
|
51
|
+
Additional features:
|
|
52
|
+
- **FTS5** full-text search on fact values with auto-sync triggers
|
|
53
|
+
- **Confidence decay**: exponential half-life (60 days) — facts lose relevance without reinforcement
|
|
54
|
+
- **Temporal queries**: `entity_as_of(id, date)` for point-in-time knowledge
|
|
55
|
+
- **Touched-entities index**: O(1) "was entity X modified since tx Y?" for cache invalidation
|
|
56
|
+
- **Soft retraction**: facts are marked retracted, not deleted — preserves history
|
|
57
|
+
|
|
58
|
+
## Retrieval
|
|
59
|
+
|
|
60
|
+
Hybrid retrieval with Reciprocal Rank Fusion (RRF):
|
|
61
|
+
|
|
62
|
+
1. **FTS5** keyword search on fact values
|
|
63
|
+
2. **Tag-based** entity matching via AVET index
|
|
64
|
+
3. **Top-confidence** facts as baseline
|
|
65
|
+
4. **Entity graph boost**: facts linked to query-mentioned entities via backrefs get an RRF score bonus
|
|
66
|
+
5. **Embedding re-ranking** (when sinain-core is running): semantic similarity between query and facts
|
|
67
|
+
6. **Confidence decay** applied as tiebreaker
|
|
68
|
+
|
|
69
|
+
Results are grouped by entity for cross-fact reasoning.
|
|
70
|
+
|
|
71
|
+
## Distiller Output Schema
|
|
72
|
+
|
|
73
|
+
```json
|
|
74
|
+
{
|
|
75
|
+
"whatHappened": "2-3 sentence summary",
|
|
76
|
+
"facts": ["self-contained factual sentence", ...],
|
|
77
|
+
"decisions": ["who decided what, with deadline", ...],
|
|
78
|
+
"entities": [{"name": "entity-slug", "type": "freeform-type"}, ...],
|
|
79
|
+
"patterns": ["reusable technique or workflow", ...],
|
|
80
|
+
"preferences": ["user preference or habit", ...],
|
|
81
|
+
"isEmpty": false
|
|
82
|
+
}
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Facts are guided by 5 diversity dimensions: **WHO** (people, roles), **WHAT** (properties, descriptions), **HOW MUCH** (numbers, dates), **WHAT CHANGED** (decisions, agreements), **WHAT'S NEXT** (commitments, plans).
|
|
86
|
+
|
|
87
|
+
## Files
|
|
88
|
+
|
|
89
|
+
| File | Role |
|
|
90
|
+
|------|------|
|
|
91
|
+
| `session_distiller.py` | LLM extraction: transcript to structured digest |
|
|
92
|
+
| `knowledge_integrator.py` | Deterministic storage: digest to graph ops + playbook |
|
|
93
|
+
| `triplestore.py` | SQLite EAV with 4 indexes + FTS5 + temporal |
|
|
94
|
+
| `graph_query.py` | Hybrid retrieval with RRF fusion |
|
|
95
|
+
| `embed_client.py` | Python client for sinain-core `/embed` endpoint |
|
|
96
|
+
| `common.py` | Shared LLM call utilities |
|
|
97
|
+
| `memory-config.json` | Model selection, token limits, timeouts |
|
|
98
|
+
|
|
99
|
+
## Configuration
|
|
100
|
+
|
|
101
|
+
| Env Var | Default | Description |
|
|
102
|
+
|---------|---------|-------------|
|
|
103
|
+
| `SINAIN_MEMORY_DIR` | `~/.sinain/memory` | Knowledge graph directory |
|
|
104
|
+
| `LEARNING_ENABLED` | `true` | Enable/disable distillation pipeline |
|
|
105
|
+
| `AGENT_ENABLED` | `true` | Set `false` for capture-only mode |
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Embedding client — calls sinain-core's /embed endpoint for vector operations.
|
|
2
|
+
|
|
3
|
+
Provides semantic similarity for:
|
|
4
|
+
- Write path: dedup before asserting facts (knowledge_integrator.py)
|
|
5
|
+
- Read path: semantic retrieval (graph_query.py)
|
|
6
|
+
|
|
7
|
+
Falls back gracefully if sinain-core is not running or model not loaded.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import base64
|
|
11
|
+
import json
|
|
12
|
+
import struct
|
|
13
|
+
import urllib.request
|
|
14
|
+
from functools import lru_cache
|
|
15
|
+
|
|
16
|
+
SINAIN_CORE_URL = "http://localhost:9500"
|
|
17
|
+
EMBED_TIMEOUT_S = 5
|
|
18
|
+
SIMILARITY_THRESHOLD = 0.78 # calibrated: catches rephrased facts, rejects different facts
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def embed(texts: list[str]) -> list[list[float]] | None:
|
|
22
|
+
"""Embed texts via sinain-core /embed endpoint. Returns None if unavailable."""
|
|
23
|
+
try:
|
|
24
|
+
data = json.dumps({"texts": texts}).encode()
|
|
25
|
+
req = urllib.request.Request(
|
|
26
|
+
f"{SINAIN_CORE_URL}/embed",
|
|
27
|
+
data=data,
|
|
28
|
+
headers={"Content-Type": "application/json"},
|
|
29
|
+
method="POST",
|
|
30
|
+
)
|
|
31
|
+
with urllib.request.urlopen(req, timeout=EMBED_TIMEOUT_S) as resp:
|
|
32
|
+
result = json.loads(resp.read())
|
|
33
|
+
# Decode base64 float32 arrays
|
|
34
|
+
embeddings = []
|
|
35
|
+
for b64 in result["embeddings"]:
|
|
36
|
+
raw = base64.b64decode(b64)
|
|
37
|
+
floats = list(struct.unpack(f"{len(raw)//4}f", raw))
|
|
38
|
+
embeddings.append(floats)
|
|
39
|
+
return embeddings
|
|
40
|
+
except Exception:
|
|
41
|
+
return None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def cosine(a: list[float], b: list[float]) -> float:
    """Cosine similarity between two vectors.

    Divides the dot product by both vector norms, so the result is correct
    even when the inputs are not unit-length. For already-normalized vectors
    this matches the plain dot product up to floating-point rounding.

    Returns 0.0 when either vector has zero magnitude (similarity undefined).
    Elements are paired with ``zip``, so extra trailing elements of the longer
    vector are ignored.
    """
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(y * y for y in b) ** 0.5
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def find_duplicates_batch(
    new_texts: list[str],
    existing_texts: list[str],
    threshold: float = SIMILARITY_THRESHOLD,
) -> dict[int, int]:
    """Find duplicates for multiple new texts against existing texts in one batch.

    Returns {new_index: existing_index} for texts with similarity >= threshold;
    each new text maps to its most similar existing text (first one on ties).
    Single HTTP call for all texts — avoids per-fact round trips.

    Returns an empty dict when either input is empty, when the embedding
    service is unavailable, or when it returns a different number of vectors
    than texts sent (the new/existing split could not be trusted then).
    """
    if not existing_texts or not new_texts:
        return {}

    all_texts = new_texts + existing_texts
    embeddings = embed(all_texts)
    if embeddings is None or len(embeddings) != len(all_texts):
        return {}

    n_new = len(new_texts)
    result = {}

    for i in range(n_new):
        best_idx = None
        best_sim = None
        # Existing texts occupy indices n_new.. in the combined embedding list.
        for j in range(n_new, len(embeddings)):
            sim = cosine(embeddings[i], embeddings[j])
            # Inclusive threshold, as documented; strict '>' keeps the first best match.
            if sim >= threshold and (best_sim is None or sim > best_sim):
                best_sim = sim
                best_idx = j - n_new
        if best_idx is not None:
            result[i] = best_idx

    return result
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def find_duplicate(
    new_text: str,
    existing_texts: list[str],
    threshold: float = SIMILARITY_THRESHOLD,
) -> int | None:
    """Single-text convenience wrapper around ``find_duplicates_batch``.

    Returns the index into ``existing_texts`` that the batch lookup reports
    for ``new_text``, or ``None`` when it reports no match.
    """
    matches = find_duplicates_batch([new_text], existing_texts, threshold)
    # The only new text sits at index 0 of the one-element batch.
    if 0 in matches:
        return matches[0]
    return None
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def rank_by_similarity(
|
|
96
|
+
query: str,
|
|
97
|
+
texts: list[str],
|
|
98
|
+
) -> list[tuple[int, float]] | None:
|
|
99
|
+
"""Rank texts by semantic similarity to query. Returns [(index, score), ...] descending.
|
|
100
|
+
|
|
101
|
+
Returns None if embedding service unavailable (caller should fall back to keyword).
|
|
102
|
+
"""
|
|
103
|
+
if not texts:
|
|
104
|
+
return []
|
|
105
|
+
|
|
106
|
+
all_texts = [query] + texts
|
|
107
|
+
embeddings = embed(all_texts)
|
|
108
|
+
if embeddings is None:
|
|
109
|
+
return None
|
|
110
|
+
|
|
111
|
+
query_emb = embeddings[0]
|
|
112
|
+
scored = []
|
|
113
|
+
for i, emb in enumerate(embeddings[1:]):
|
|
114
|
+
scored.append((i, cosine(query_emb, emb)))
|
|
115
|
+
|
|
116
|
+
scored.sort(key=lambda x: x[1], reverse=True)
|
|
117
|
+
return scored
|
|
Binary file
|
|
File without changes
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Base adapter and data classes for benchmark evaluation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class BenchmarkQuestion:
    """A single benchmark question together with its reference answer."""

    id: str
    text: str
    gold_answer: str
    category: str  # single-session, multi-session, temporal, etc.
    # Presumably the ids of the sessions that contain the supporting evidence — confirm against the adapters.
    evidence_session_ids: list[str] = field(default_factory=list)
    # Free-form extra data; default_factory gives every instance its own dict.
    metadata: dict = field(default_factory=dict)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class BenchmarkInstance:
    """A set of conversations + questions that share the same context."""

    id: str
    sessions: list[list[dict]]  # list of sessions, each a list of feed items {source, text, ts}
    # Questions asked against this instance's sessions.
    questions: list[BenchmarkQuestion] = field(default_factory=list)
    raw_sessions: list[dict] = field(default_factory=list)  # original benchmark format (for full-context condition)
    # Free-form extra data; default_factory gives every instance its own dict.
    metadata: dict = field(default_factory=dict)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class BenchmarkAdapter(ABC):
    """Abstract adapter: converts a published benchmark into sinain's format.

    All three members below are abstract, so a concrete adapter must
    implement each of them before it can be instantiated.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Benchmark name (e.g. 'longmemeval', 'locomo')."""

    @abstractmethod
    def load_dataset(self, data_dir: str) -> list[BenchmarkInstance]:
        """Download (if needed) and parse the benchmark dataset.

        Returns the parsed dataset as a list of ``BenchmarkInstance`` objects.
        """

    @abstractmethod
    def format_full_context(self, instance: BenchmarkInstance) -> str:
        """Render the full conversation history as a text string for the baseline condition."""
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Benchmark configuration — models, paths, thresholds."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
BENCHMARKS_DIR = Path(__file__).resolve().parent
|
|
6
|
+
DATA_DIR = BENCHMARKS_DIR / "data"
|
|
7
|
+
RESULTS_DIR = BENCHMARKS_DIR / "results"
|
|
8
|
+
|
|
9
|
+
# LLM models (via OpenRouter)
|
|
10
|
+
QA_MODEL = "google/gemini-2.5-flash"
|
|
11
|
+
JUDGE_MODEL = "openai/gpt-4o"
|
|
12
|
+
|
|
13
|
+
# Retrieval
|
|
14
|
+
K_VALUES = [1, 3, 5, 10]
|
|
15
|
+
MAX_FACTS_PER_QUERY = 10
|
|
16
|
+
|
|
17
|
+
# Ingestion
|
|
18
|
+
DISTILLER_TIMEOUT_S = 30
|
|
19
|
+
INTEGRATOR_TIMEOUT_S = 60
|
|
20
|
+
|
|
21
|
+
# Dataset URLs
|
|
22
|
+
LONGMEMEVAL_HF = "xiaowu0162/longmemeval-cleaned"
|
|
23
|
+
LOCOMO_GITHUB = "https://raw.githubusercontent.com/snap-research/locomo/main/data/locomo10.json"
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Evaluation pipeline — score answers and compute aggregate metrics.
|
|
2
|
+
|
|
3
|
+
Combines:
|
|
4
|
+
- LLM-as-Judge (QA scoring, 1-5 scale)
|
|
5
|
+
- Retrieval metrics (Recall@k, NDCG@k)
|
|
6
|
+
- Token F1 overlap (mechanical, free)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import math
|
|
12
|
+
import re
|
|
13
|
+
from collections import defaultdict
|
|
14
|
+
|
|
15
|
+
from .base_adapter import BenchmarkQuestion
|
|
16
|
+
from .config import K_VALUES
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# ── Token F1 (mechanical, no LLM needed) ─────────────────────────────────────
|
|
20
|
+
|
|
21
|
+
def _tokenize(text: str) -> list[str]:
|
|
22
|
+
"""Simple whitespace + punctuation tokenizer."""
|
|
23
|
+
return re.findall(r"\w+", text.lower())
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def token_f1(predicted: str, gold: str | int) -> float:
|
|
27
|
+
"""Compute token-level F1 between predicted and gold answers."""
|
|
28
|
+
pred_tokens = set(_tokenize(str(predicted)))
|
|
29
|
+
gold_tokens = set(_tokenize(str(gold)))
|
|
30
|
+
if not gold_tokens or not pred_tokens:
|
|
31
|
+
return 0.0
|
|
32
|
+
overlap = pred_tokens & gold_tokens
|
|
33
|
+
if not overlap:
|
|
34
|
+
return 0.0
|
|
35
|
+
precision = len(overlap) / len(pred_tokens)
|
|
36
|
+
recall = len(overlap) / len(gold_tokens)
|
|
37
|
+
return 2 * precision * recall / (precision + recall)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ── Retrieval metrics (reuse logic from retrieval_evaluator.py) ───────────────
|
|
41
|
+
|
|
42
|
+
def dcg_at_k(relevant_positions: list[int], k: int) -> float:
    """Discounted cumulative gain over the top ``k`` ranks.

    ``relevant_positions`` holds 0-based ranks of relevant items; each rank
    below ``k`` contributes ``1 / log2(rank + 2)``, ranks at or beyond ``k``
    contribute nothing.
    """
    gains = (1.0 / math.log2(rank + 2) for rank in relevant_positions if rank < k)
    # Float start value keeps the empty case at 0.0 rather than int 0.
    return sum(gains, 0.0)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def ndcg_at_k(relevant_positions: list[int], num_relevant: int, k: int) -> float:
    """DCG at ``k`` divided by the ideal DCG at ``k``.

    The ideal ranking places ``min(num_relevant, k)`` relevant items at the
    very top. Returns 0.0 when that ideal DCG is not positive.
    """
    actual = dcg_at_k(relevant_positions, k)
    best_case_ranks = list(range(min(num_relevant, k)))
    ideal = dcg_at_k(best_case_ranks, k)
    if ideal > 0:
        return actual / ideal
    return 0.0
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def compute_retrieval_metrics(
|
|
60
|
+
retrieved_ids: list[str],
|
|
61
|
+
expected_ids: list[str],
|
|
62
|
+
k_values: list[int] | None = None,
|
|
63
|
+
) -> dict:
|
|
64
|
+
"""Compute Recall@k and NDCG@k for a single question."""
|
|
65
|
+
ks = k_values or K_VALUES
|
|
66
|
+
expected_set = set(expected_ids)
|
|
67
|
+
relevant_positions = [i for i, rid in enumerate(retrieved_ids) if rid in expected_set]
|
|
68
|
+
|
|
69
|
+
result = {}
|
|
70
|
+
for k in ks:
|
|
71
|
+
hit = any(pos < k for pos in relevant_positions)
|
|
72
|
+
result[f"recall@{k}"] = 1.0 if hit else 0.0
|
|
73
|
+
result[f"ndcg@{k}"] = ndcg_at_k(relevant_positions, len(expected_set), k)
|
|
74
|
+
return result
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ── Aggregate metrics ─────────────────────────────────────────────────────────
|
|
78
|
+
|
|
79
|
+
def aggregate_results(per_question: list[dict]) -> dict:
    """Compute aggregate metrics from per-question results.

    Each per_question entry has:
    {id, category, retrieval: {recall@k, ndcg@k}, answers: {condition: {score, f1}}}

    Returns ``{"error": "no results"}`` for empty input, otherwise a dict with:
    - ``total_questions``: number of entries
    - ``conditions``: per-condition ``mean_score``, ``mean_f1`` and ``n``
    - ``ipr``: mean "sinain-memory" score divided by mean "full-context"
      score, or None when either condition has no scores or the full-context
      mean is not positive
    - ``categories``: per-category, per-condition ``mean_score`` and ``n``
    - ``retrieval``: mean of every numeric retrieval metric

    All means are rounded to 4 decimals; entries whose ``score``/``f1`` is
    None are left out of the corresponding mean.
    """
    if not per_question:
        return {"error": "no results"}

    # Per-condition scores
    condition_scores: dict[str, list[float]] = defaultdict(list)
    condition_f1s: dict[str, list[float]] = defaultdict(list)
    # Per-category per-condition
    cat_scores: dict[str, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
    # Retrieval
    retrieval_metrics: dict[str, list[float]] = defaultdict(list)

    for q in per_question:
        cat = q.get("category", "unknown")

        for cond, data in q.get("answers", {}).items():
            if data.get("score") is not None:
                condition_scores[cond].append(data["score"])
                cat_scores[cat][cond].append(data["score"])
            if data.get("f1") is not None:
                condition_f1s[cond].append(data["f1"])

        for metric, val in q.get("retrieval", {}).items():
            if isinstance(val, (int, float)):
                retrieval_metrics[metric].append(val)

    def _mean(lst: list[float]) -> float:
        return round(sum(lst) / len(lst), 4) if lst else 0.0

    # Build summary
    conditions = {}
    for cond in sorted(condition_scores):
        conditions[cond] = {
            "mean_score": _mean(condition_scores[cond]),
            "mean_f1": _mean(condition_f1s.get(cond, [])),
            "n": len(condition_scores[cond]),
        }

    # IPR: sinain-memory vs full-context. Needs scores for BOTH conditions;
    # a missing sinain-memory condition must not be reported as a ratio of 0.
    sm_scores = condition_scores.get("sinain-memory", [])
    fc_scores = condition_scores.get("full-context", [])
    ipr = None
    if sm_scores and fc_scores:
        fc_mean = _mean(fc_scores)
        if fc_mean > 0:
            ipr = _mean(sm_scores) / fc_mean

    # Category breakdown
    categories = {}
    for cat in sorted(cat_scores):
        categories[cat] = {}
        for cond in sorted(cat_scores[cat]):
            categories[cat][cond] = {
                "mean_score": _mean(cat_scores[cat][cond]),
                "n": len(cat_scores[cat][cond]),
            }

    # Retrieval summary
    retrieval = {k: _mean(v) for k, v in sorted(retrieval_metrics.items())}

    return {
        "total_questions": len(per_question),
        "conditions": conditions,
        # Explicit None check: a genuine ratio of 0.0 is a result, not "missing".
        "ipr": round(ipr, 4) if ipr is not None else None,
        "categories": categories,
        "retrieval": retrieval,
    }
|