@geravant/sinain 1.12.0 → 1.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +4 -2
- package/config-shared.js +1 -0
- package/package.json +4 -1
- package/sinain-agent/run.sh +36 -4
- package/sinain-core/package-lock.json +963 -0
- package/sinain-core/package.json +1 -0
- package/sinain-core/src/buffers/feed-buffer.ts +34 -0
- package/sinain-core/src/embedding/service.ts +66 -0
- package/sinain-core/src/index.ts +65 -17
- package/sinain-core/src/learning/local-curation.ts +137 -7
- package/sinain-core/src/server.ts +31 -0
- package/sinain-memory/README.md +105 -0
- package/sinain-memory/embed_client.py +117 -0
- package/sinain-memory/graph_query.py +269 -18
- package/sinain-memory/knowledge_integrator.py +551 -74
- package/sinain-memory/memory-config.json +1 -1
- package/sinain-memory/session_distiller.py +43 -19
- package/sinain-memory/triplestore.py +60 -0
- package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
- package/sinain-memory/eval/__init__.py +0 -0
- package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/assertions.py +0 -267
- package/sinain-memory/eval/benchmarks/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
- package/sinain-memory/eval/benchmarks/config.py +0 -23
- package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
- package/sinain-memory/eval/benchmarks/ingest.py +0 -152
- package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
- package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
- package/sinain-memory/eval/benchmarks/query.py +0 -172
- package/sinain-memory/eval/benchmarks/report.py +0 -87
- package/sinain-memory/eval/benchmarks/runner.py +0 -276
- package/sinain-memory/eval/judges/__init__.py +0 -0
- package/sinain-memory/eval/judges/base_judge.py +0 -61
- package/sinain-memory/eval/judges/curation_judge.py +0 -46
- package/sinain-memory/eval/judges/insight_judge.py +0 -48
- package/sinain-memory/eval/judges/mining_judge.py +0 -42
- package/sinain-memory/eval/judges/signal_judge.py +0 -45
- package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
- package/sinain-memory/eval/retrieval_evaluator.py +0 -186
- package/sinain-memory/eval/schemas.py +0 -247
- package/sinain-memory/tests/__init__.py +0 -0
- package/sinain-memory/tests/conftest.py +0 -189
- package/sinain-memory/tests/test_curator_helpers.py +0 -94
- package/sinain-memory/tests/test_embedder.py +0 -210
- package/sinain-memory/tests/test_extract_json.py +0 -124
- package/sinain-memory/tests/test_feedback_computation.py +0 -121
- package/sinain-memory/tests/test_miner_helpers.py +0 -71
- package/sinain-memory/tests/test_module_management.py +0 -458
- package/sinain-memory/tests/test_parsers.py +0 -96
- package/sinain-memory/tests/test_tick_evaluator.py +0 -430
- package/sinain-memory/tests/test_triple_extractor.py +0 -255
- package/sinain-memory/tests/test_triple_ingest.py +0 -191
- package/sinain-memory/tests/test_triple_migrate.py +0 -138
- package/sinain-memory/tests/test_triplestore.py +0 -248
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"eval_reporter": { "model": "smart", "maxTokens": 1000 },
|
|
15
15
|
"triple_extractor": { "model": "fast", "maxTokens": 1500, "timeout": 30 },
|
|
16
16
|
"session_distiller": { "model": "smart", "maxTokens": 1500, "timeout": 30 },
|
|
17
|
-
"knowledge_integrator": { "model": "smart", "maxTokens":
|
|
17
|
+
"knowledge_integrator": { "model": "smart", "maxTokens": 4000, "timeout": 60 }
|
|
18
18
|
},
|
|
19
19
|
"defaults": { "model": "fast", "maxTokens": 1500 },
|
|
20
20
|
"triplestore": {
|
|
@@ -28,7 +28,7 @@ from common import (
|
|
|
28
28
|
|
|
29
29
|
SYSTEM_PROMPT = """\
|
|
30
30
|
You are a session distiller for a personal AI overlay system (sinain).
|
|
31
|
-
Your job: analyze a session transcript and extract
|
|
31
|
+
Your job: analyze a session transcript and extract ALL knowledge worth remembering.
|
|
32
32
|
|
|
33
33
|
The transcript contains feed items from sinain-core:
|
|
34
34
|
- audio: transcribed speech from the user's environment
|
|
@@ -37,24 +37,42 @@ The transcript contains feed items from sinain-core:
|
|
|
37
37
|
- system: system events and status messages
|
|
38
38
|
|
|
39
39
|
Extract:
|
|
40
|
-
1. whatHappened: 2-3 sentences summarizing what
|
|
41
|
-
2.
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
40
|
+
1. whatHappened: 2-3 sentences summarizing what occurred in this session
|
|
41
|
+
2. facts: up to 15 concrete factual claims. Each must be a self-contained sentence. \
|
|
42
|
+
IMPORTANT — spread across these dimensions (do not let one theme dominate):
|
|
43
|
+
- WHO: people mentioned, their roles, backgrounds, relationships to each other
|
|
44
|
+
- WHAT: specific claims, properties, descriptions of things discussed
|
|
45
|
+
- HOW MUCH: any numbers, quantities, dates, durations, counts stated
|
|
46
|
+
- WHAT CHANGED: decisions made, agreements reached, state changes
|
|
47
|
+
- WHAT'S NEXT: commitments, action items, plans, deadlines
|
|
48
|
+
If you have 5+ facts about one dimension and 0 about another that was discussed, \
|
|
49
|
+
you are missing something. Breadth over depth.
|
|
50
|
+
Good: "The CTO of Al-Futaim previously worked at Citibank for 17 years as Director of IT in Singapore"
|
|
51
|
+
Good: "Citibank has 2400 IntelliJ subscriptions and heavy TeamCity usage"
|
|
52
|
+
Good: "The meeting is 45 minutes, scheduled for Tuesday"
|
|
53
|
+
Bad: "client-understanding-key: True"
|
|
54
|
+
Bad: five variations of "Al-Futaim is moving to the cloud"
|
|
55
|
+
3. decisions: up to 5 decisions or agreements made (who decided what, with any deadline)
|
|
56
|
+
4. entities: named things discussed or interacted with — as objects with name \
|
|
57
|
+
(lowercase-hyphenated slug) and type (freeform — person, org, tool, file, concept, \
|
|
58
|
+
service, framework, error, whatever fits the context).
|
|
59
|
+
Examples: {"name": "citibank", "type": "org"}, {"name": "auth-module", "type": "file"}, \
|
|
60
|
+
{"name": "react-native", "type": "framework"}
|
|
61
|
+
5. patterns: up to 3 reusable techniques or workflows (if any — skip if none)
|
|
62
|
+
6. preferences: up to 3 user preferences or habits observed
|
|
63
|
+
|
|
64
|
+
If existing entities are provided, reference them by name to enable reinforcement.
|
|
65
|
+
Focus on CONCRETE, SPECIFIC knowledge. Skip vague observations.
|
|
66
|
+
If the session was idle or empty, say so briefly.
|
|
49
67
|
|
|
50
68
|
Respond with ONLY a JSON object:
|
|
51
69
|
{
|
|
52
70
|
"whatHappened": "string",
|
|
53
|
-
"
|
|
54
|
-
"
|
|
55
|
-
"
|
|
56
|
-
"
|
|
57
|
-
"
|
|
71
|
+
"facts": ["self-contained factual sentence", ...],
|
|
72
|
+
"decisions": ["decision sentence with who/what/when", ...],
|
|
73
|
+
"entities": [{"name": "citibank", "type": "org"}, {"name": "artom", "type": "person"}, ...],
|
|
74
|
+
"patterns": ["reusable technique or workflow", ...],
|
|
75
|
+
"preferences": ["user preference or habit", ...],
|
|
58
76
|
"isEmpty": false
|
|
59
77
|
}"""
|
|
60
78
|
|
|
@@ -95,6 +113,7 @@ def main() -> None:
|
|
|
95
113
|
parser.add_argument("--memory-dir", required=True, help="Path to memory/ directory")
|
|
96
114
|
parser.add_argument("--transcript", required=True, help="JSON array of feed items")
|
|
97
115
|
parser.add_argument("--session-meta", default="{}", help="JSON session metadata")
|
|
116
|
+
parser.add_argument("--existing-entities", default="", help="Compact summary of existing knowledge graph entities")
|
|
98
117
|
args = parser.parse_args()
|
|
99
118
|
|
|
100
119
|
# Parse inputs
|
|
@@ -111,11 +130,11 @@ def main() -> None:
|
|
|
111
130
|
if not items or len(items) < 2:
|
|
112
131
|
output_json({
|
|
113
132
|
"whatHappened": "Empty or trivial session",
|
|
133
|
+
"facts": [],
|
|
134
|
+
"decisions": [],
|
|
135
|
+
"entities": [],
|
|
114
136
|
"patterns": [],
|
|
115
|
-
"antiPatterns": [],
|
|
116
137
|
"preferences": [],
|
|
117
|
-
"entities": [],
|
|
118
|
-
"toolInsights": [],
|
|
119
138
|
"isEmpty": True,
|
|
120
139
|
})
|
|
121
140
|
return
|
|
@@ -130,11 +149,16 @@ def main() -> None:
|
|
|
130
149
|
lines = [l for l in playbook.splitlines() if l.strip() and not l.startswith("<!--")]
|
|
131
150
|
playbook_summary = f"\n\n## Current Playbook (for reference — don't repeat known patterns)\n{chr(10).join(lines[:30])}"
|
|
132
151
|
|
|
152
|
+
# Include existing entities for retrieve-before-extract (Mem0 pattern)
|
|
153
|
+
existing_section = ""
|
|
154
|
+
if args.existing_entities and args.existing_entities.strip():
|
|
155
|
+
existing_section = f"\n\n## Existing Knowledge (reinforce or update these if the session confirms/changes them)\n{args.existing_entities}"
|
|
156
|
+
|
|
133
157
|
user_prompt = f"""## Session Transcript ({len(items)} items)
|
|
134
158
|
{transcript_text}
|
|
135
159
|
|
|
136
160
|
## Session Metadata
|
|
137
|
-
{json.dumps(meta, indent=2)}{playbook_summary}"""
|
|
161
|
+
{json.dumps(meta, indent=2)}{playbook_summary}{existing_section}"""
|
|
138
162
|
|
|
139
163
|
try:
|
|
140
164
|
raw = call_llm_with_fallback(
|
|
@@ -79,6 +79,40 @@ CREATE INDEX IF NOT EXISTS idx_avet
|
|
|
79
79
|
ON triples(attribute, value, entity_id, tx_id);
|
|
80
80
|
"""
|
|
81
81
|
|
|
82
|
+
_FTS_SQL = """
|
|
83
|
+
-- Full-text search on fact values (for hybrid retrieval)
|
|
84
|
+
CREATE VIRTUAL TABLE IF NOT EXISTS triples_fts
|
|
85
|
+
USING fts5(entity_id, value, content=triples, content_rowid=id);
|
|
86
|
+
|
|
87
|
+
-- Triggers to keep FTS in sync with triples table
|
|
88
|
+
CREATE TRIGGER IF NOT EXISTS triples_ai AFTER INSERT ON triples BEGIN
|
|
89
|
+
INSERT INTO triples_fts(rowid, entity_id, value) VALUES (new.id, new.entity_id, new.value);
|
|
90
|
+
END;
|
|
91
|
+
|
|
92
|
+
CREATE TRIGGER IF NOT EXISTS triples_ad AFTER DELETE ON triples BEGIN
|
|
93
|
+
INSERT INTO triples_fts(triples_fts, rowid, entity_id, value) VALUES ('delete', old.id, old.entity_id, old.value);
|
|
94
|
+
END;
|
|
95
|
+
|
|
96
|
+
CREATE TRIGGER IF NOT EXISTS triples_au AFTER UPDATE ON triples BEGIN
|
|
97
|
+
INSERT INTO triples_fts(triples_fts, rowid, entity_id, value) VALUES ('delete', old.id, old.entity_id, old.value);
|
|
98
|
+
INSERT INTO triples_fts(rowid, entity_id, value) VALUES (new.id, new.entity_id, new.value);
|
|
99
|
+
END;
|
|
100
|
+
"""
|
|
101
|
+
|
|
102
|
+
_TOUCHED_SQL = """
|
|
103
|
+
-- Track which entities are modified per transaction (for fast novelty checks)
|
|
104
|
+
CREATE TABLE IF NOT EXISTS touched_entities (
|
|
105
|
+
tx_id INTEGER NOT NULL,
|
|
106
|
+
entity_id TEXT NOT NULL,
|
|
107
|
+
PRIMARY KEY (tx_id, entity_id)
|
|
108
|
+
);
|
|
109
|
+
|
|
110
|
+
CREATE TRIGGER IF NOT EXISTS track_touched AFTER INSERT ON triples BEGIN
|
|
111
|
+
INSERT OR IGNORE INTO touched_entities (tx_id, entity_id)
|
|
112
|
+
VALUES (new.tx_id, new.entity_id);
|
|
113
|
+
END;
|
|
114
|
+
"""
|
|
115
|
+
|
|
82
116
|
|
|
83
117
|
def _now_iso() -> str:
|
|
84
118
|
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
|
|
@@ -119,6 +153,14 @@ class TripleStore:
|
|
|
119
153
|
self._conn.execute("PRAGMA journal_mode=WAL")
|
|
120
154
|
self._conn.execute("PRAGMA busy_timeout=10000")
|
|
121
155
|
self._conn.executescript(_SCHEMA_SQL)
|
|
156
|
+
try:
|
|
157
|
+
self._conn.executescript(_FTS_SQL)
|
|
158
|
+
except sqlite3.OperationalError:
|
|
159
|
+
pass # FTS5 not available on this Python build — degrade gracefully
|
|
160
|
+
try:
|
|
161
|
+
self._conn.executescript(_TOUCHED_SQL)
|
|
162
|
+
except sqlite3.OperationalError:
|
|
163
|
+
pass
|
|
122
164
|
self._migrate()
|
|
123
165
|
self._conn.commit()
|
|
124
166
|
|
|
@@ -409,6 +451,24 @@ class TripleStore:
|
|
|
409
451
|
).fetchall()
|
|
410
452
|
return [dict(r) for r in rows]
|
|
411
453
|
|
|
454
|
+
# ----- Touched entities (fast novelty check) -----
|
|
455
|
+
|
|
456
|
+
def was_touched(self, entity_id: str, since_tx: int) -> bool:
|
|
457
|
+
"""Check if entity was modified since a given transaction. O(1) via index."""
|
|
458
|
+
row = self._conn.execute(
|
|
459
|
+
"SELECT 1 FROM touched_entities WHERE entity_id = ? AND tx_id > ? LIMIT 1",
|
|
460
|
+
(entity_id, since_tx),
|
|
461
|
+
).fetchone()
|
|
462
|
+
return row is not None
|
|
463
|
+
|
|
464
|
+
def touched_entities_since(self, since_tx: int) -> list[str]:
|
|
465
|
+
"""Return entity_ids modified since a transaction."""
|
|
466
|
+
rows = self._conn.execute(
|
|
467
|
+
"SELECT DISTINCT entity_id FROM touched_entities WHERE tx_id > ?",
|
|
468
|
+
(since_tx,),
|
|
469
|
+
).fetchall()
|
|
470
|
+
return [r[0] for r in rows]
|
|
471
|
+
|
|
412
472
|
# ----- Stats -----
|
|
413
473
|
|
|
414
474
|
def stats(self) -> dict:
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
File without changes
|
|
Binary file
|
|
@@ -1,267 +0,0 @@
|
|
|
1
|
-
"""Behavioral assertion library for sinain-koog tick evaluation.
|
|
2
|
-
|
|
3
|
-
Each assertion function validates a runtime invariant of the pipeline.
|
|
4
|
-
Returns ``{"name": str, "passed": bool, "detail": str}``.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from __future__ import annotations
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def _result(name: str, passed: bool, detail: str) -> dict:
|
|
11
|
-
return {"name": name, "passed": passed, "detail": detail}
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
# ---------------------------------------------------------------------------
|
|
15
|
-
# Playbook curator assertions
|
|
16
|
-
# ---------------------------------------------------------------------------
|
|
17
|
-
|
|
18
|
-
def assert_playbook_under_limit(curator_result: dict, limit: int = 50) -> dict:
|
|
19
|
-
"""Verify playbook body stays under the line limit."""
|
|
20
|
-
lines = curator_result.get("playbookLines", 0)
|
|
21
|
-
if lines <= limit:
|
|
22
|
-
return _result("playbook_under_limit", True, f"body has {lines} lines (limit {limit})")
|
|
23
|
-
return _result("playbook_under_limit", False, f"body has {lines} lines, exceeds limit of {limit}")
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def assert_curator_respected_directive(curator_result: dict, directive: str) -> dict:
|
|
27
|
-
"""Check that curator changes align with the curate directive."""
|
|
28
|
-
changes = curator_result.get("changes", {})
|
|
29
|
-
added = len(changes.get("added", []))
|
|
30
|
-
pruned = len(changes.get("pruned", []))
|
|
31
|
-
|
|
32
|
-
if directive == "aggressive_prune":
|
|
33
|
-
# Should have pruned items
|
|
34
|
-
if pruned > 0:
|
|
35
|
-
return _result("curator_respected_directive", True,
|
|
36
|
-
f"aggressive_prune: pruned {pruned} items")
|
|
37
|
-
if added == 0 and pruned == 0:
|
|
38
|
-
return _result("curator_respected_directive", True,
|
|
39
|
-
"aggressive_prune: no changes (acceptable if playbook already lean)")
|
|
40
|
-
return _result("curator_respected_directive", False,
|
|
41
|
-
f"aggressive_prune: added {added} but pruned {pruned} — expected pruning")
|
|
42
|
-
|
|
43
|
-
if directive == "stability":
|
|
44
|
-
# Should not aggressively prune established patterns
|
|
45
|
-
if pruned > added + 2:
|
|
46
|
-
return _result("curator_respected_directive", False,
|
|
47
|
-
f"stability: pruned {pruned} items (only added {added}) — too aggressive for stability mode")
|
|
48
|
-
return _result("curator_respected_directive", True,
|
|
49
|
-
f"stability: added {added}, pruned {pruned} — conservative")
|
|
50
|
-
|
|
51
|
-
# normal / insufficient_data — any reasonable mix is fine
|
|
52
|
-
return _result("curator_respected_directive", True,
|
|
53
|
-
f"{directive}: added {added}, pruned {pruned}")
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
# ---------------------------------------------------------------------------
|
|
57
|
-
# Signal analyzer assertions
|
|
58
|
-
# ---------------------------------------------------------------------------
|
|
59
|
-
|
|
60
|
-
def assert_no_repeat_action(signal_result: dict, recent_logs: list[dict], window: int = 3) -> dict:
|
|
61
|
-
"""Verify recommendedAction doesn't repeat the last N ticks' actions."""
|
|
62
|
-
action = signal_result.get("recommendedAction")
|
|
63
|
-
if action is None or action.get("action") == "skip":
|
|
64
|
-
return _result("no_repeat_action", True, "no action recommended (skip/null)")
|
|
65
|
-
|
|
66
|
-
task = (action.get("task") or "").lower().strip()
|
|
67
|
-
if not task:
|
|
68
|
-
return _result("no_repeat_action", True, "no task description to compare")
|
|
69
|
-
|
|
70
|
-
# Collect recent action tasks
|
|
71
|
-
recent_tasks: list[str] = []
|
|
72
|
-
for log in recent_logs[:window]:
|
|
73
|
-
log_actions = log.get("actionsConsidered", [])
|
|
74
|
-
for a in log_actions:
|
|
75
|
-
if a.get("chosen"):
|
|
76
|
-
recent_tasks.append((a.get("reason") or a.get("task") or "").lower().strip())
|
|
77
|
-
|
|
78
|
-
# Check for near-duplicate (substring match to catch rephrasing)
|
|
79
|
-
for prev_task in recent_tasks:
|
|
80
|
-
if not prev_task:
|
|
81
|
-
continue
|
|
82
|
-
# If >60% of words overlap, consider it a repeat
|
|
83
|
-
task_words = set(task.split())
|
|
84
|
-
prev_words = set(prev_task.split())
|
|
85
|
-
if not task_words or not prev_words:
|
|
86
|
-
continue
|
|
87
|
-
overlap = len(task_words & prev_words) / max(len(task_words), len(prev_words))
|
|
88
|
-
if overlap > 0.6:
|
|
89
|
-
return _result("no_repeat_action", False,
|
|
90
|
-
f"action task '{task[:60]}' overlaps with recent '{prev_task[:60]}' ({overlap:.0%} word overlap)")
|
|
91
|
-
|
|
92
|
-
return _result("no_repeat_action", True,
|
|
93
|
-
f"action task is distinct from last {window} ticks")
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def assert_signal_confidence_threshold(signal_result: dict, threshold: float = 0.5) -> dict:
|
|
97
|
-
"""Verify actions are only recommended above the confidence threshold."""
|
|
98
|
-
action = signal_result.get("recommendedAction")
|
|
99
|
-
if action is None or action.get("action") == "skip":
|
|
100
|
-
return _result("signal_confidence_threshold", True, "no action recommended")
|
|
101
|
-
|
|
102
|
-
confidence = action.get("confidence")
|
|
103
|
-
if confidence is None:
|
|
104
|
-
return _result("signal_confidence_threshold", False,
|
|
105
|
-
"action recommended but no confidence value provided")
|
|
106
|
-
|
|
107
|
-
if confidence >= threshold:
|
|
108
|
-
return _result("signal_confidence_threshold", True,
|
|
109
|
-
f"confidence {confidence:.2f} >= threshold {threshold}")
|
|
110
|
-
return _result("signal_confidence_threshold", False,
|
|
111
|
-
f"confidence {confidence:.2f} < threshold {threshold}")
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
# ---------------------------------------------------------------------------
|
|
115
|
-
# Insight synthesizer assertions
|
|
116
|
-
# ---------------------------------------------------------------------------
|
|
117
|
-
|
|
118
|
-
def assert_insight_char_limit(synth_result: dict, limit: int = 500) -> dict:
|
|
119
|
-
"""Verify suggestion+insight stays under the character limit."""
|
|
120
|
-
if synth_result.get("skip", False):
|
|
121
|
-
return _result("insight_char_limit", True, "output skipped")
|
|
122
|
-
|
|
123
|
-
suggestion = synth_result.get("suggestion", "")
|
|
124
|
-
insight = synth_result.get("insight", "")
|
|
125
|
-
total = len(suggestion) + len(insight)
|
|
126
|
-
|
|
127
|
-
if total <= limit:
|
|
128
|
-
return _result("insight_char_limit", True, f"total {total} chars (limit {limit})")
|
|
129
|
-
return _result("insight_char_limit", False, f"total {total} chars exceeds limit of {limit}")
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
def assert_skip_reason_specific(synth_result: dict) -> dict:
|
|
133
|
-
"""If skip=true, verify the reason is specific (not generic boilerplate)."""
|
|
134
|
-
if not synth_result.get("skip", False):
|
|
135
|
-
return _result("skip_reason_specific", True, "output not skipped")
|
|
136
|
-
|
|
137
|
-
reason = (synth_result.get("skipReason") or "").strip()
|
|
138
|
-
if not reason:
|
|
139
|
-
return _result("skip_reason_specific", False, "skip=true but no skipReason provided")
|
|
140
|
-
|
|
141
|
-
# Check against known-generic patterns
|
|
142
|
-
generic_phrases = [
|
|
143
|
-
"no new data",
|
|
144
|
-
"nothing new",
|
|
145
|
-
"no updates",
|
|
146
|
-
"insufficient data",
|
|
147
|
-
"not enough information",
|
|
148
|
-
"no changes",
|
|
149
|
-
]
|
|
150
|
-
reason_lower = reason.lower()
|
|
151
|
-
for phrase in generic_phrases:
|
|
152
|
-
if reason_lower == phrase or (len(reason_lower) < 30 and phrase in reason_lower):
|
|
153
|
-
return _result("skip_reason_specific", False,
|
|
154
|
-
f"skipReason is too generic: '{reason}'")
|
|
155
|
-
|
|
156
|
-
return _result("skip_reason_specific", True, f"skipReason is specific ({len(reason)} chars)")
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
# ---------------------------------------------------------------------------
|
|
160
|
-
# Memory miner assertions
|
|
161
|
-
# ---------------------------------------------------------------------------
|
|
162
|
-
|
|
163
|
-
def assert_miner_references_sources(miner_result: dict, daily_files: list[str]) -> dict:
|
|
164
|
-
"""Verify mining findings reference actual source files that were provided."""
|
|
165
|
-
mined = miner_result.get("minedSources", [])
|
|
166
|
-
if not mined:
|
|
167
|
-
return _result("miner_references_sources", True, "no sources mined (early return)")
|
|
168
|
-
|
|
169
|
-
# daily_files contains basenames like "2026-02-21.md"
|
|
170
|
-
known_basenames = set(daily_files)
|
|
171
|
-
unknown = [s for s in mined if s not in known_basenames]
|
|
172
|
-
|
|
173
|
-
if unknown:
|
|
174
|
-
return _result("miner_references_sources", False,
|
|
175
|
-
f"minedSources references unknown files: {unknown}")
|
|
176
|
-
return _result("miner_references_sources", True,
|
|
177
|
-
f"all {len(mined)} mined sources are valid")
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
# ---------------------------------------------------------------------------
|
|
181
|
-
# Cross-script / structural assertions
|
|
182
|
-
# ---------------------------------------------------------------------------
|
|
183
|
-
|
|
184
|
-
def assert_schema_valid(script_name: str, output: dict, schema_errors: list[str]) -> dict:
|
|
185
|
-
"""Wrap schema validation result as an assertion."""
|
|
186
|
-
if not schema_errors:
|
|
187
|
-
return _result(f"schema_valid_{script_name}", True, "output matches schema")
|
|
188
|
-
return _result(f"schema_valid_{script_name}", False,
|
|
189
|
-
f"{len(schema_errors)} schema errors: {'; '.join(schema_errors[:3])}")
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
def assert_playbook_header_footer_intact(playbook_text: str) -> dict:
|
|
193
|
-
"""Verify the playbook still has its mining-index header and effectiveness footer."""
|
|
194
|
-
has_header = "<!-- mining-index:" in playbook_text
|
|
195
|
-
has_footer = "<!-- effectiveness:" in playbook_text
|
|
196
|
-
|
|
197
|
-
if has_header and has_footer:
|
|
198
|
-
return _result("playbook_header_footer_intact", True,
|
|
199
|
-
"both mining-index and effectiveness comments present")
|
|
200
|
-
missing = []
|
|
201
|
-
if not has_header:
|
|
202
|
-
missing.append("mining-index")
|
|
203
|
-
if not has_footer:
|
|
204
|
-
missing.append("effectiveness")
|
|
205
|
-
return _result("playbook_header_footer_intact", False,
|
|
206
|
-
f"missing playbook comments: {', '.join(missing)}")
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
# ---------------------------------------------------------------------------
|
|
210
|
-
# Runner: execute all applicable assertions for a tick
|
|
211
|
-
# ---------------------------------------------------------------------------
|
|
212
|
-
|
|
213
|
-
def run_tick_assertions(
|
|
214
|
-
log_entry: dict,
|
|
215
|
-
recent_logs: list[dict],
|
|
216
|
-
playbook_text: str,
|
|
217
|
-
daily_files: list[str],
|
|
218
|
-
) -> list[dict]:
|
|
219
|
-
"""Run all applicable assertions against a single tick's log entry.
|
|
220
|
-
|
|
221
|
-
Returns a list of assertion result dicts.
|
|
222
|
-
"""
|
|
223
|
-
results: list[dict] = []
|
|
224
|
-
|
|
225
|
-
# Signal analyzer assertions
|
|
226
|
-
signals = log_entry.get("signals")
|
|
227
|
-
if signals is not None:
|
|
228
|
-
results.append(assert_signal_confidence_threshold(
|
|
229
|
-
{"signals": signals, "recommendedAction": log_entry.get("recommendedAction")},
|
|
230
|
-
))
|
|
231
|
-
results.append(assert_no_repeat_action(
|
|
232
|
-
{"signals": signals, "recommendedAction": log_entry.get("recommendedAction")},
|
|
233
|
-
recent_logs,
|
|
234
|
-
))
|
|
235
|
-
|
|
236
|
-
# Curator assertions — playbookChanges can be {"note": "skipped"} or full output
|
|
237
|
-
curator = log_entry.get("playbookChanges")
|
|
238
|
-
if isinstance(curator, dict) and "changes" in curator:
|
|
239
|
-
curator_with_lines = {**curator}
|
|
240
|
-
if "playbookLines" not in curator_with_lines:
|
|
241
|
-
curator_with_lines["playbookLines"] = curator.get("playbookLines", 0)
|
|
242
|
-
results.append(assert_playbook_under_limit(curator_with_lines))
|
|
243
|
-
|
|
244
|
-
directive = log_entry.get("curateDirective", "normal")
|
|
245
|
-
results.append(assert_curator_respected_directive(curator_with_lines, directive))
|
|
246
|
-
|
|
247
|
-
# Insight synthesizer assertions — output can be null (pipeline-level skip)
|
|
248
|
-
output = log_entry.get("output")
|
|
249
|
-
if isinstance(output, dict):
|
|
250
|
-
results.append(assert_insight_char_limit(output))
|
|
251
|
-
results.append(assert_skip_reason_specific(output))
|
|
252
|
-
|
|
253
|
-
# Mining assertions — log uses miningFindings (str) and minedSources (list)
|
|
254
|
-
mining = log_entry.get("miningResult")
|
|
255
|
-
if mining is not None:
|
|
256
|
-
results.append(assert_miner_references_sources(mining, daily_files))
|
|
257
|
-
elif log_entry.get("minedSources"):
|
|
258
|
-
# Reconstruct mining result from flat log fields
|
|
259
|
-
results.append(assert_miner_references_sources(
|
|
260
|
-
{"minedSources": log_entry.get("minedSources", [])}, daily_files
|
|
261
|
-
))
|
|
262
|
-
|
|
263
|
-
# Playbook health (if we have playbook text)
|
|
264
|
-
if playbook_text:
|
|
265
|
-
results.append(assert_playbook_header_footer_intact(playbook_text))
|
|
266
|
-
|
|
267
|
-
return results
|
|
File without changes
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
"""Base adapter and data classes for benchmark evaluation."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
from abc import ABC, abstractmethod
|
|
6
|
-
from dataclasses import dataclass, field
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
@dataclass
|
|
10
|
-
class BenchmarkQuestion:
|
|
11
|
-
id: str
|
|
12
|
-
text: str
|
|
13
|
-
gold_answer: str
|
|
14
|
-
category: str # single-session, multi-session, temporal, etc.
|
|
15
|
-
evidence_session_ids: list[str] = field(default_factory=list)
|
|
16
|
-
metadata: dict = field(default_factory=dict)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@dataclass
|
|
20
|
-
class BenchmarkInstance:
|
|
21
|
-
"""A set of conversations + questions that share the same context."""
|
|
22
|
-
id: str
|
|
23
|
-
sessions: list[list[dict]] # list of sessions, each a list of feed items {source, text, ts}
|
|
24
|
-
questions: list[BenchmarkQuestion] = field(default_factory=list)
|
|
25
|
-
raw_sessions: list[dict] = field(default_factory=list) # original benchmark format (for full-context condition)
|
|
26
|
-
metadata: dict = field(default_factory=dict)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class BenchmarkAdapter(ABC):
|
|
30
|
-
"""Abstract adapter: converts a published benchmark into sinain's format."""
|
|
31
|
-
|
|
32
|
-
@property
|
|
33
|
-
@abstractmethod
|
|
34
|
-
def name(self) -> str:
|
|
35
|
-
"""Benchmark name (e.g. 'longmemeval', 'locomo')."""
|
|
36
|
-
|
|
37
|
-
@abstractmethod
|
|
38
|
-
def load_dataset(self, data_dir: str) -> list[BenchmarkInstance]:
|
|
39
|
-
"""Download (if needed) and parse the benchmark dataset."""
|
|
40
|
-
|
|
41
|
-
@abstractmethod
|
|
42
|
-
def format_full_context(self, instance: BenchmarkInstance) -> str:
|
|
43
|
-
"""Render the full conversation history as a text string for the baseline condition."""
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
"""Benchmark configuration — models, paths, thresholds."""
|
|
2
|
-
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
BENCHMARKS_DIR = Path(__file__).resolve().parent
|
|
6
|
-
DATA_DIR = BENCHMARKS_DIR / "data"
|
|
7
|
-
RESULTS_DIR = BENCHMARKS_DIR / "results"
|
|
8
|
-
|
|
9
|
-
# LLM models (via OpenRouter)
|
|
10
|
-
QA_MODEL = "google/gemini-2.5-flash"
|
|
11
|
-
JUDGE_MODEL = "openai/gpt-4o"
|
|
12
|
-
|
|
13
|
-
# Retrieval
|
|
14
|
-
K_VALUES = [1, 3, 5, 10]
|
|
15
|
-
MAX_FACTS_PER_QUERY = 10
|
|
16
|
-
|
|
17
|
-
# Ingestion
|
|
18
|
-
DISTILLER_TIMEOUT_S = 30
|
|
19
|
-
INTEGRATOR_TIMEOUT_S = 60
|
|
20
|
-
|
|
21
|
-
# Dataset URLs
|
|
22
|
-
LONGMEMEVAL_HF = "xiaowu0162/longmemeval-cleaned"
|
|
23
|
-
LOCOMO_GITHUB = "https://raw.githubusercontent.com/snap-research/locomo/main/data/locomo10.json"
|