@geravant/sinain 1.13.0 → 1.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +4 -2
- package/config-shared.js +1 -0
- package/package.json +4 -1
- package/sinain-agent/run.sh +36 -4
- package/sinain-core/src/buffers/feed-buffer.ts +6 -4
- package/sinain-core/src/index.ts +50 -19
- package/sinain-memory/graph_query.py +12 -3
- package/sinain-memory/knowledge_integrator.py +194 -10
- package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
- package/sinain-memory/eval/__init__.py +0 -0
- package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/assertions.py +0 -267
- package/sinain-memory/eval/benchmarks/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
- package/sinain-memory/eval/benchmarks/config.py +0 -23
- package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
- package/sinain-memory/eval/benchmarks/ingest.py +0 -152
- package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
- package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
- package/sinain-memory/eval/benchmarks/meeting_adapter.py +0 -81
- package/sinain-memory/eval/benchmarks/meeting_runner.py +0 -230
- package/sinain-memory/eval/benchmarks/query.py +0 -193
- package/sinain-memory/eval/benchmarks/report.py +0 -87
- package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +0 -318
- package/sinain-memory/eval/benchmarks/runner.py +0 -283
- package/sinain-memory/eval/judges/__init__.py +0 -0
- package/sinain-memory/eval/judges/base_judge.py +0 -61
- package/sinain-memory/eval/judges/curation_judge.py +0 -46
- package/sinain-memory/eval/judges/insight_judge.py +0 -48
- package/sinain-memory/eval/judges/mining_judge.py +0 -42
- package/sinain-memory/eval/judges/signal_judge.py +0 -45
- package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
- package/sinain-memory/eval/retrieval_evaluator.py +0 -186
- package/sinain-memory/eval/schemas.py +0 -247
- package/sinain-memory/tests/__init__.py +0 -0
- package/sinain-memory/tests/conftest.py +0 -189
- package/sinain-memory/tests/test_curator_helpers.py +0 -94
- package/sinain-memory/tests/test_embedder.py +0 -210
- package/sinain-memory/tests/test_extract_json.py +0 -124
- package/sinain-memory/tests/test_feedback_computation.py +0 -121
- package/sinain-memory/tests/test_miner_helpers.py +0 -71
- package/sinain-memory/tests/test_module_management.py +0 -458
- package/sinain-memory/tests/test_parsers.py +0 -96
- package/sinain-memory/tests/test_tick_evaluator.py +0 -430
- package/sinain-memory/tests/test_triple_extractor.py +0 -255
- package/sinain-memory/tests/test_triple_ingest.py +0 -191
- package/sinain-memory/tests/test_triple_migrate.py +0 -138
- package/sinain-memory/tests/test_triplestore.py +0 -248
|
@@ -1,193 +0,0 @@
|
|
|
1
|
-
"""Query pipeline — benchmark questions → LLM answers under 3 conditions.
|
|
2
|
-
|
|
3
|
-
Condition A (sinain-memory): answer from knowledge graph facts
|
|
4
|
-
Condition B (full-context): answer from full conversation history
|
|
5
|
-
Condition C (knowledge-doc): answer from portable knowledge document
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
from __future__ import annotations
|
|
9
|
-
|
|
10
|
-
import sys
|
|
11
|
-
from pathlib import Path
|
|
12
|
-
|
|
13
|
-
# Add sinain-memory to path
|
|
14
|
-
_koog_dir = str(Path(__file__).resolve().parent.parent.parent)
|
|
15
|
-
if _koog_dir not in sys.path:
|
|
16
|
-
sys.path.insert(0, _koog_dir)
|
|
17
|
-
|
|
18
|
-
from common import call_llm # noqa: E402
|
|
19
|
-
|
|
20
|
-
from .base_adapter import BenchmarkQuestion
|
|
21
|
-
from .config import QA_MODEL, MAX_FACTS_PER_QUERY
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def _extract_keywords(query: str) -> list[str]:
    """Return the lowercase content words of *query*, in order of appearance.

    A token is a letter followed by one or more letters, digits or hyphens.
    Tokens of one or two characters and common English stopwords are dropped.
    Duplicates are kept. (Per the original note, this reuses the logic from
    retrieval_evaluator.)
    """
    import re

    ignored = frozenset((
        "the", "is", "in", "on", "for", "and", "or", "of", "to", "a", "an",
        "it", "was", "not", "how", "what", "when", "does", "did", "do", "my",
        "your", "their", "have", "has", "had", "are", "were", "been", "being",
        "about", "from", "with", "that", "this", "which", "who", "whom",
        "where", "why", "can", "could", "would", "should",
    ))

    keywords = []
    for token in re.findall(r"[a-zA-Z][a-zA-Z0-9-]+", query.lower()):
        if len(token) <= 2 or token in ignored:
            continue
        keywords.append(token)
    return keywords
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
def _get_all_facts_text(db_path: str) -> str:
    """Render the top facts stored at *db_path* as one block of text.

    Up to 50 facts are requested from ``query_top_facts`` and formatted with a
    6000-character budget. Rationale from the original note: sinain
    triplestores are small (10-50 facts per session), so including everything
    is feasible and avoids tag-matching failures.

    Returns the placeholder "(no knowledge available)" when no facts come back.
    """
    from graph_query import query_top_facts, format_facts_text

    top_facts = query_top_facts(db_path, limit=50)
    if top_facts:
        return format_facts_text(top_facts, max_chars=6000)
    return "(no knowledge available)"
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def _query_knowledge(db_path: str, question: str) -> str:
    """Return formatted knowledge-graph facts relevant to *question*.

    Strategy: retrieve a broad candidate set (30 facts), then re-rank it so
    that facts specific to the question (e.g. the CTO's background) beat
    generic ones (e.g. the meeting schedule) when the question is about the CTO.

    Re-ranking prefers embedding similarity from ``embed_client``. If that
    module is not importable the keyword-overlap fallback is used silently;
    any other failure in the embedding path is reported on stderr before
    falling back, so a benchmark run never changes ranking strategy unnoticed.

    The best ``MAX_FACTS_PER_QUERY`` facts are formatted within 3000 characters.
    Returns "(no knowledge available)" when the graph yields no candidates.
    """
    from graph_query import query_facts_hybrid, query_top_facts, format_facts_text

    # Retrieve a broad candidate set; fall back to the top facts overall.
    candidates = query_facts_hybrid(db_path, question, max_facts=30)
    if not candidates:
        candidates = query_top_facts(db_path, limit=30)
    if not candidates:
        return "(no knowledge available)"

    # Preferred path: re-rank by embedding similarity.
    try:
        from embed_client import rank_by_similarity

        fact_texts = [str(f.get("value", "")) for f in candidates]
        ranked_indices = rank_by_similarity(question, fact_texts)
        if ranked_indices is not None:
            ranked = [candidates[i] for i, _ in ranked_indices[:MAX_FACTS_PER_QUERY]]
            return format_facts_text(ranked, max_chars=3000)
    except ImportError:
        # Embedding client not available: keyword fallback is the expected path.
        pass
    except Exception as exc:
        # Best effort: never fail the query because re-ranking broke, but say so.
        print(
            f"[query] embedding re-rank failed, using keyword overlap: {exc}",
            file=sys.stderr,
        )

    # Fallback: keyword overlap ranking
    q_keywords = set(_extract_keywords(question))

    def _relevance(fact: dict) -> float:
        # Fraction of the question's keywords found in the fact's value/entity.
        if not q_keywords:
            return 0.0
        value = str(fact.get("value", "")).lower()
        entity = str(fact.get("entity", "")).lower()
        fact_words = set(_extract_keywords(value + " " + entity))
        return len(q_keywords & fact_words) / len(q_keywords)

    # sorted() is stable, so ties keep the retrieval order.
    ranked = sorted(candidates, key=_relevance, reverse=True)
    return format_facts_text(ranked[:MAX_FACTS_PER_QUERY], max_chars=3000)
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
def _get_retrieved_facts(db_path: str, question: str, k: int = 10) -> list[dict]:
    """Fetch up to *k* facts for *question*, for use in retrieval evaluation.

    Hybrid search is tried first. If it returns nothing, the result of
    ``query_top_facts`` with limit *k* is returned instead (top facts by
    confidence, per the original note).
    """
    from graph_query import query_facts_hybrid, query_top_facts

    hybrid_hits = query_facts_hybrid(db_path, question, max_facts=k)
    return hybrid_hits if hybrid_hits else query_top_facts(db_path, limit=k)
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
def compute_content_recall(
    retrieved_facts: list[dict],
    gold_answer: str,
    k_values: list[int] | None = None,
) -> dict:
    """Content-based retrieval metric: do the retrieved facts contain the answer?

    Instead of matching entity IDs (which don't align between LongMemEval
    session IDs and sinain entity IDs), this checks whether the gold answer's
    key terms appear in any retrieved fact's content.

    Args:
        retrieved_facts: Ranked facts; each is read via ``entity`` and ``value``.
        gold_answer: Reference answer whose keywords are looked for.
        k_values: Cut-offs to evaluate. ``None`` or an empty list selects
            ``K_VALUES`` from the benchmark config.

    Returns:
        A dict mapping ``"content_recall@{k}"`` to 1.0 when at least one of the
        first *k* facts matches, else 0.0. All values are 0.0 when the gold
        answer yields no keywords.
    """
    if k_values:
        ks = k_values
    else:
        # Imported lazily: callers that pass explicit cut-offs never need config.
        from .config import K_VALUES

        ks = K_VALUES

    gold_terms = set(_extract_keywords(str(gold_answer)))
    if not gold_terms:
        return {f"content_recall@{k}": 0.0 for k in ks}

    # A fact matches when it shares at least half of the gold terms, rounded
    # DOWN, with a minimum of one term (so 1 of 3 gold terms is enough).
    needed = max(1, len(gold_terms) // 2)

    def _contains_answer(fact: dict) -> bool:
        fact_text = f"{fact.get('entity', '')} {fact.get('value', '')}".lower()
        fact_terms = set(_extract_keywords(fact_text))
        return len(gold_terms & fact_terms) >= needed

    return {
        f"content_recall@{k}": (
            1.0 if any(_contains_answer(fact) for fact in retrieved_facts[:k]) else 0.0
        )
        for k in ks
    }
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
def answer_question(
    question: BenchmarkQuestion,
    condition: str,
    *,
    db_path: str | None = None,
    full_context: str | None = None,
    knowledge_doc: str | None = None,
    model: str | None = None,
) -> str:
    """Answer one benchmark question with the LLM under the given condition.

    Conditions and the input each one requires:
      * ``"sinain-memory"``  - facts queried from the knowledge graph (*db_path*)
      * ``"full-context"``   - the conversation history, cut to its first
        100,000 characters (*full_context*)
      * ``"knowledge-doc"``  - the portable knowledge document (*knowledge_doc*)

    *model* overrides ``QA_MODEL`` when given.

    Returns the stripped answer text, or ``"(error: ...)"`` if the LLM call
    raises.

    Raises:
        ValueError: *condition* is not one of the three above.
        AssertionError: the input required by *condition* is missing or empty.
    """
    qa_model = model if model else QA_MODEL

    if condition == "sinain-memory":
        assert db_path, "db_path required for sinain-memory condition"
        heading = "Knowledge Facts"
        material = _query_knowledge(db_path, question.text)
        system = (
            "Answer the question using ONLY the provided knowledge facts. "
            "If the facts don't contain enough information to answer, say \"I don't know.\""
        )
    elif condition == "full-context":
        assert full_context, "full_context required for full-context condition"
        heading = "Conversation History"
        # Truncate context if too large (some models have limits); slicing is
        # a no-op for shorter histories.
        material = full_context[:100_000]
        system = (
            "Answer the question based on the conversation history below. "
            "Be concise and specific."
        )
    elif condition == "knowledge-doc":
        assert knowledge_doc, "knowledge_doc required for knowledge-doc condition"
        heading = "Knowledge Document"
        material = knowledge_doc
        system = (
            "Answer the question using ONLY the knowledge document provided. "
            "If the document doesn't contain enough information, say \"I don't know.\""
        )
    else:
        raise ValueError(f"Unknown condition: {condition}")

    user = f"## {heading}\n{material}\n\n## Question\n{question.text}"

    try:
        reply = call_llm(
            system_prompt=system,
            user_prompt=user,
            model=qa_model,
            max_tokens=300,
        )
        return reply.strip()
    except Exception as exc:
        return f"(error: {exc})"
|
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
"""Report generation — markdown, JSON, and LaTeX output."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import json
|
|
6
|
-
from datetime import datetime, timezone
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def generate_markdown(benchmark_name: str, summary: dict, details: list[dict]) -> str:
    """Generate a publishable markdown report.

    Sections are emitted only when their data is present in *summary*:
    the headline IPR (``"ipr"``), per-condition scores (``"conditions"``),
    retrieval metrics (``"retrieval"``) and the per-category breakdown
    (``"categories"``). *details* feeds the list of the five lowest-scoring
    sinain-memory questions.

    Returns the report as a single newline-joined string.
    """
    lines = [
        f"# Sinain Knowledge Graph — {benchmark_name} Results",
        f"\nGenerated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}",
        "",
    ]

    # Headline IPR. Compared against None so a legitimate 0.0 is still shown.
    ipr = summary.get("ipr")
    if ipr is not None:
        lines.append(f"**Information Preservation Rate (IPR)**: {ipr:.1%}")
        lines.append("")

    # Condition scores table
    conditions = summary.get("conditions", {})
    if conditions:
        cond_names = sorted(conditions.keys())
        header = "| Condition | Mean Score (1-5) | Mean F1 | N |"
        sep = "|-----------|------------------|---------|---|"
        lines.extend([header, sep])
        for cond in cond_names:
            c = conditions[cond]
            lines.append(f"| {cond} | {c['mean_score']:.2f} | {c.get('mean_f1', 0):.2f} | {c['n']} |")
        lines.append("")

    # Retrieval metrics
    retrieval = summary.get("retrieval", {})
    if retrieval:
        lines.append("## Retrieval Quality")
        lines.append("| Metric | Score |")
        lines.append("|--------|-------|")
        for k, v in sorted(retrieval.items()):
            lines.append(f"| {k} | {v:.1%} |")
        lines.append("")

    # Category breakdown: one row per category, one column per condition seen
    # in any category; "-" marks a condition missing for that category.
    categories = summary.get("categories", {})
    if categories:
        lines.append("## By Category")
        cond_names = sorted(set(c for cat in categories.values() for c in cat))
        header = "| Category | " + " | ".join(cond_names) + " |"
        sep = "|----------|" + "|".join(["------"] * len(cond_names)) + "|"
        lines.extend([header, sep])
        for cat in sorted(categories):
            cells = []
            for cond in cond_names:
                if cond in categories[cat]:
                    cells.append(f"{categories[cat][cond]['mean_score']:.2f} (n={categories[cat][cond]['n']})")
                else:
                    cells.append("-")
            lines.append(f"| {cat} | " + " | ".join(cells) + " |")
        lines.append("")

    # Failures (worst questions): only entries with a sinain-memory score.
    if details:
        sm_details = [d for d in details if d.get("answers", {}).get("sinain-memory", {}).get("score") is not None]
        sm_details.sort(key=lambda d: d["answers"]["sinain-memory"]["score"])
        if sm_details:
            lines.append("## Hardest Questions for sinain-memory (bottom 5)")
            for d in sm_details[:5]:
                sm = d["answers"]["sinain-memory"]
                fc = d["answers"].get("full-context", {})
                lines.append(f"- **{d['id']}** [{d['category']}]: score={sm['score']}/5 "
                             f"(full-context: {fc.get('score', '?')}/5)")
                lines.append(f" Q: {d['question'][:100]}...")
            lines.append("")

    return "\n".join(lines)
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def generate_json(benchmark_name: str, summary: dict, details: list[dict]) -> str:
    """Serialize the benchmark results as an indented JSON document.

    The document has four keys: ``benchmark``, ``timestamp`` (current UTC time
    in ISO 8601 form), ``summary`` and ``details``. Non-ASCII characters are
    written as-is rather than escaped.
    """
    report = {
        "benchmark": benchmark_name,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "summary": summary,
        "details": details,
    }
    return json.dumps(report, indent=2, ensure_ascii=False)
|
|
@@ -1,318 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
set -euo pipefail
|
|
3
|
-
|
|
4
|
-
# ── Meeting Memory Benchmark — end-to-end capture + evaluate ─────────────────
|
|
5
|
-
# 1. Opens meeting recording fullscreen in QuickTime
|
|
6
|
-
# 2. Starts sinain (audio + sense capture, no agent, no overlay)
|
|
7
|
-
# 3. Waits for recording to finish
|
|
8
|
-
# 4. Stops sinain → saves pending session
|
|
9
|
-
# 5. Restarts sinain → distills pending session into knowledge graph
|
|
10
|
-
# 6. Runs evaluation harness against the distilled DB
|
|
11
|
-
#
|
|
12
|
-
# Usage: ./run_meeting_bench.sh <mp4_path>
|
|
13
|
-
# Output: eval/benchmarks/results/meeting_results.md
|
|
14
|
-
|
|
15
|
-
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
16
|
-
SINAIN_ROOT="$(cd "$SCRIPT_DIR/../../../.." && pwd)"
|
|
17
|
-
KOOG_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
|
18
|
-
|
|
19
|
-
BOLD='\033[1m'
|
|
20
|
-
GREEN='\033[0;32m'
|
|
21
|
-
YELLOW='\033[0;33m'
|
|
22
|
-
RED='\033[0;31m'
|
|
23
|
-
CYAN='\033[0;36m'
|
|
24
|
-
RESET='\033[0m'
|
|
25
|
-
|
|
26
|
-
# Console helpers. Every message carries a bold "[bench]" prefix; backslash
# escapes in the prefix and in the message text are expanded (%b), which is
# what lets callers embed colour variables such as ${CYAN} in a message.
_bench_say() { printf '%b\n' "${BOLD}[bench]${RESET} $*"; }

# Plain progress message.
log() { _bench_say "$*"; }
# Success message (green check mark).
ok() { _bench_say "${GREEN}✓${RESET} $*"; }
# Non-fatal problem (yellow warning sign).
warn() { _bench_say "${YELLOW}⚠${RESET} $*"; }
# Fatal problem (red cross): print the message, then exit with status 1.
fail() { _bench_say "${RED}✗${RESET} $*"; exit 1; }
|
|
30
|
-
|
|
31
|
-
# ── Args ─────────────────────────────────────────────────────────────────────
# The recording is the only argument and must name an existing regular file.
RECORDING="${1:-}"
[[ -n "$RECORDING" && -f "$RECORDING" ]] || fail "Usage: $0 <path-to-mp4>"
|
|
36
|
-
|
|
37
|
-
# ── Setup ────────────────────────────────────────────────────────────────────
BENCH_DIR="/tmp/sinain-bench-$(date +%s)"
mkdir -p "$BENCH_DIR"
log "Benchmark directory: ${CYAN}${BENCH_DIR}${RESET}"

# Source .env for API keys and audio config (safe parser from start-local.sh).
# Only the first existing file is loaded, and variables already present in the
# environment win over values from the file.
for _env_file in "$SINAIN_ROOT/.env" "$SINAIN_ROOT/sinain-core/.env" "$HOME/.sinain/.env"; do
  [ -f "$_env_file" ] || continue
  log "Loading $_env_file"
  # `|| [ -n "$_k" ]` keeps a final line that has no trailing newline, which
  # `read` would otherwise report as end-of-file and drop.
  while IFS='=' read -r _k _v || [ -n "$_k" ]; do
    # Skip blank lines and comments.
    [[ -z "$_k" || "$_k" =~ ^[[:space:]]*# ]] && continue
    _k=$(echo "$_k" | xargs)   # trim surrounding whitespace
    # Skip keys that are not valid shell identifiers: the indirect expansion
    # and `export` below would otherwise abort the script under `set -e`.
    [[ "$_k" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || continue
    _v=$(echo "$_v" | xargs)
    _v="${_v%%#*}"             # drop a trailing inline comment
    _v=$(echo "$_v" | xargs)
    [[ -z "$_v" ]] && continue
    # Export only when the variable is not already set.
    if [ -z "${!_k+x}" ]; then export "$_k=$_v"; fi
  done < "$_env_file"
  break
done

# Bench-specific overrides
export SINAIN_MEMORY_DIR="$BENCH_DIR"
export AGENT_ENABLED=false
export ESCALATION_MODE=off

# Local whisper setup (from start-local.sh)
MODEL_DIR="$HOME/models"
MODEL_NAME="ggml-large-v3-turbo.bin"
export LOCAL_WHISPER_MODEL="${LOCAL_WHISPER_MODEL:-$MODEL_DIR/$MODEL_NAME}"
export LOCAL_WHISPER_BIN="${LOCAL_WHISPER_BIN:-whisper-cli}"
export TRANSCRIPTION_BACKEND=local
|
|
70
|
-
|
|
71
|
-
# ── Get recording duration ───────────────────────────────────────────────────
# ffprobe prints the duration in seconds with a fractional part; keep only the
# integer part. Anything that is not a plain non-negative integer (ffprobe
# missing or failing, "N/A", empty output) falls back to 27 minutes, so the
# arithmetic below can never abort the script.
DURATION_RAW=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$RECORDING" 2>/dev/null || echo "")
DURATION="${DURATION_RAW%%.*}"
if [[ "$DURATION" =~ ^[0-9]+$ ]]; then
  DURATION=$((10#$DURATION))  # force base 10 so leading zeros are not read as octal
else
  DURATION=1620  # fallback: 27 min
fi
log "Recording duration: ${DURATION}s (~$((DURATION / 60))m)"
|
|
79
|
-
|
|
80
|
-
# ── Cleanup handler ──────────────────────────────────────────────────────────
CORE_PID=""
SENSE_PID=""

# Runs on every script exit: ask the tracked background jobs to terminate,
# wait two seconds, force-kill them, then free port 9500 and quit QuickTime.
# Every step is best-effort and never fails the handler.
cleanup() {
  local sig pid
  log "Cleaning up..."
  for sig in TERM KILL; do
    for pid in "$SENSE_PID" "$CORE_PID"; do
      if [ -n "$pid" ]; then
        kill -s "$sig" "$pid" 2>/dev/null || true
      fi
    done
    # Grace period between the polite and the forced signal.
    if [ "$sig" = TERM ]; then
      sleep 2
    fi
  done
  # Kill anything on port 9500
  lsof -i :9500 -sTCP:LISTEN -t 2>/dev/null | xargs kill -9 2>/dev/null || true
  # Close QuickTime
  osascript -e 'tell application "QuickTime Player" to quit' 2>/dev/null || true
}
trap cleanup EXIT
|
|
97
|
-
|
|
98
|
-
# ── Kill stale sinain processes ──────────────────────────────────────────────
# Best-effort sweep of leftovers from earlier runs, matched on the full
# command line; a pattern that matches nothing is not an error.
log "Killing stale processes..."
for _stale_pattern in \
  "tsx.*src/index.ts" \
  "python3 -m sense_client" \
  "Python -m sense_client" \
  "tools/sck-capture/sck-capture"; do
  pkill -f "$_stale_pattern" 2>/dev/null || true
done
# Free the core's port in case something else still listens on it.
lsof -i :9500 -sTCP:LISTEN -t 2>/dev/null | xargs kill -9 2>/dev/null || true
sleep 2
|
|
106
|
-
|
|
107
|
-
# ── Phase 1a: Open video fullscreen ─────────────────────────────────────────
log "Opening recording in QuickTime (fullscreen)..."
open -a "QuickTime Player" "$RECORDING"
sleep 3
# Present the front document fullscreen and start playback. Success is only
# reported when the AppleScript actually ran; otherwise warn and carry on so
# the operator can start playback by hand.
if osascript -e '
  tell application "QuickTime Player"
    present front document
    delay 1
    play front document
  end tell
' 2>/dev/null; then
  ok "Video playing fullscreen"
else
  warn "Could not auto-play — check QuickTime"
fi
|
|
119
|
-
|
|
120
|
-
# ── Phase 1b: Start sinain-core ──────────────────────────────────────────────
log "Starting sinain-core (capture-only, local whisper)..."
# Output is prefixed with a coloured "[core]" tag. Because of the pipe, $! is
# the PID of the sed stage, not of the tsx process itself.
(cd "$SINAIN_ROOT/sinain-core" && npx tsx src/index.ts 2>&1) | \
  sed -u "s/^/$(printf "${CYAN}[core]${RESET} ")/" &
CORE_PID=$!

# Wait for health: poll the endpoint once per second, 20 attempts at most.
CORE_OK=false
for i in {1..20}; do
  if curl -sf http://localhost:9500/health >/dev/null 2>&1; then
    CORE_OK=true
    break
  fi
  sleep 1
done
[ "$CORE_OK" = true ] || fail "sinain-core did not start"
ok "sinain-core healthy on :9500"
|
|
140
|
-
|
|
141
|
-
# ── Phase 1c: Start sense_client ─────────────────────────────────────────────
log "Starting sense_client (screen capture + OCR)..."

# Propagate privacy mode (defaults to "full" unless already set).
export PRIVACY_OCR_OPENROUTER="${PRIVACY_OCR_OPENROUTER:-full}"
export PRIVACY_IMAGES_OPENROUTER="${PRIVACY_IMAGES_OPENROUTER:-full}"

# Output is prefixed with a coloured "[sense]" tag; $! is the sed stage.
(cd "$SINAIN_ROOT" && python3 -m sense_client 2>&1) | \
  sed -u "s/^/$(printf "${YELLOW}[sense]${RESET} ")/" &
SENSE_PID=$!
sleep 2

# A missing sense_client is not fatal: the run continues with audio only and
# the PID is cleared so later steps do not try to signal it.
if ! kill -0 "$SENSE_PID" 2>/dev/null; then
  warn "sense_client failed to start — continuing with audio only"
  SENSE_PID=""
else
  ok "sense_client running"
fi
|
|
159
|
-
|
|
160
|
-
# ── Phase 1d: Wait for recording to finish ───────────────────────────────────
BUFFER=60 # extra time for trailing transcription/OCR
TOTAL_WAIT=$((DURATION + BUFFER))
log "Waiting ${TOTAL_WAIT}s for recording + buffer..."
log " (recording ends at $(date -v+${DURATION}S '+%H:%M:%S'), buffer until $(date -v+${TOTAL_WAIT}S '+%H:%M:%S'))"

# Print the number of messages in the core's /feed response, or "?" when the
# endpoint cannot be reached or its output cannot be parsed.
_feed_count() {
  curl -sf http://localhost:9500/feed 2>/dev/null \
    | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('messages',[])))" 2>/dev/null \
    || echo "?"
}

# Progress updates every 5 minutes; the last chunk is shortened so the total
# sleep is exactly TOTAL_WAIT seconds.
ELAPSED=0
while (( ELAPSED < TOTAL_WAIT )); do
  SLEEP_CHUNK=300
  if (( ELAPSED + SLEEP_CHUNK > TOTAL_WAIT )); then
    SLEEP_CHUNK=$((TOTAL_WAIT - ELAPSED))
  fi
  sleep $SLEEP_CHUNK
  ELAPSED=$((ELAPSED + SLEEP_CHUNK))
  REMAINING=$((TOTAL_WAIT - ELAPSED))
  if (( REMAINING > 0 )); then
    # Check feed count
    FEED_COUNT=$(_feed_count)
    log " ${ELAPSED}s elapsed, ${REMAINING}s remaining — feed items: ${FEED_COUNT}"
  fi
done

ok "Recording capture complete"

# Check what we captured
FEED_COUNT=$(_feed_count)
log "Captured ${FEED_COUNT} feed items"
|
|
188
|
-
|
|
189
|
-
# ── Phase 1e: Stop sinain (saves pending session) ───────────────────────────
log "Stopping sinain (saving pending session)..."
if [ -n "$SENSE_PID" ]; then
  kill "$SENSE_PID" 2>/dev/null || true
fi
SENSE_PID=""

# Send SIGINT directly to the tsx/node process (not the pipe wrapper).
# The pipe means $CORE_PID is sed, not tsx — so the real process is matched
# by its command line instead.
_core_cmdline="tsx src/index.ts"
pkill -INT -f "$_core_cmdline" 2>/dev/null || true
log "Sent SIGINT to tsx, waiting for graceful shutdown..."
sleep 10

# Force if still alive, then drop the pipe wrapper and free the port.
pkill -9 -f "$_core_cmdline" 2>/dev/null || true
kill -9 "$CORE_PID" 2>/dev/null || true
CORE_PID=""
lsof -i :9500 -sTCP:LISTEN -t 2>/dev/null | xargs kill -9 2>/dev/null || true
sleep 2

# Close QuickTime
osascript -e 'tell application "QuickTime Player" to quit' 2>/dev/null || true
|
|
209
|
-
|
|
210
|
-
# Verify pending session was saved (or inline distillation already consumed it).
# If neither artefact exists, run one short capture + graceful shutdown cycle
# and give up when that produces nothing either.
_pending_file="$BENCH_DIR/pending-session.json"
_graph_file="$BENCH_DIR/knowledge-graph.db"

if [[ -f "$_pending_file" ]]; then
  PENDING_ITEMS=$(python3 -c "import json; print(len(json.load(open('$BENCH_DIR/pending-session.json')).get('items',[])))" 2>/dev/null || echo "?")
  ok "Pending session saved: ${PENDING_ITEMS} items"
elif [[ -f "$_graph_file" ]]; then
  ok "Inline distillation completed (pending-session.json already consumed)"
else
  warn "No pending-session.json and no knowledge-graph.db — will retry with longer shutdown"
  # Try again: start core briefly, let it capture a few items, then shut down gracefully
  log "Starting core for a brief capture + shutdown cycle..."
  (cd "$SINAIN_ROOT/sinain-core" && npx tsx src/index.ts 2>&1) > /tmp/sinain-bench-retry.log &
  RETRY_PID=$!
  sleep 15 # let it start and capture a few items
  # Get the actual node PID and send SIGINT, then force-kill after 10 seconds.
  NODE_PID=$(pgrep -f "tsx src/index.ts" 2>/dev/null | head -1 || true)
  if [[ -n "$NODE_PID" ]]; then
    kill -INT "$NODE_PID" 2>/dev/null || true
    sleep 10
    kill -9 "$NODE_PID" 2>/dev/null || true
  fi
  kill -9 "$RETRY_PID" 2>/dev/null || true
  lsof -i :9500 -sTCP:LISTEN -t 2>/dev/null | xargs kill -9 2>/dev/null || true
  sleep 2
  if [[ ! -f "$_pending_file" && ! -f "$_graph_file" ]]; then
    fail "Could not capture any session data"
  fi
  ok "Recovery succeeded"
fi
|
|
239
|
-
|
|
240
|
-
# ── Phase 1f: Restart for distillation ───────────────────────────────────────
log "Restarting sinain-core for distillation..."
(cd "$SINAIN_ROOT/sinain-core" && npx tsx src/index.ts 2>&1) | \
  sed -u "s/^/$(printf "${CYAN}[core]${RESET} ")/" &
CORE_PID=$!

# Wait for health (up to 20 one-second polls; the result is not checked here).
for i in {1..20}; do
  if curl -sf http://localhost:9500/health >/dev/null 2>&1; then
    break
  fi
  sleep 1
done

# Size of the knowledge graph DB in bytes (BSD stat), or 0 when stat fails.
_graph_db_size() {
  stat -f%z "$BENCH_DIR/knowledge-graph.db" 2>/dev/null || echo "0"
}

# Wait for distillation to complete (knowledge-graph.db appears or grows).
# Polls every 5 seconds, 120 times at most; a timeout is not an error here.
log "Waiting for distillation..."
for i in {1..120}; do
  if [[ -f "$BENCH_DIR/knowledge-graph.db" ]]; then
    DB_SIZE=$(_graph_db_size)
    if [ "$DB_SIZE" -gt 4096 ]; then
      ok "Distillation complete (DB: ${DB_SIZE} bytes)"
      break
    fi
  fi
  # Also check if pending-session.json is gone (distillation consumed it)
  if [[ ! -f "$BENCH_DIR/pending-session.json" && -f "$BENCH_DIR/knowledge-graph.db" ]]; then
    DB_SIZE=$(_graph_db_size)
    ok "Distillation complete (DB: ${DB_SIZE} bytes)"
    break
  fi
  sleep 5
done

# Keep core running for /embed endpoint during evaluation
log "Keeping sinain-core running for embedding service during evaluation..."
|
|
275
|
-
|
|
276
|
-
# ── Phase 2: Evaluate ────────────────────────────────────────────────────────
_banner_rule="═══════════════════════════════════════════════"
for _banner_line in "" "$_banner_rule" " Phase 2: Evaluation" "$_banner_rule" ""; do
  log "$_banner_line"
done

DB_PATH="$BENCH_DIR/knowledge-graph.db"
[[ -f "$DB_PATH" ]] || fail "No knowledge-graph.db found — distillation may have failed"

# Show what's in the DB: fact count, entity count and the first ten entities.
# Runs from the sinain-memory directory so `triplestore` is importable; a
# failure here only produces a warning.
log "Knowledge graph contents:"
cd "$KOOG_DIR"
python3 -c "
from triplestore import TripleStore
ts = TripleStore('$DB_PATH')
facts = ts.all_facts()
print(f' Total facts: {len(facts)}')
entities = set()
for f in facts:
    entities.add(f.get('entity', ''))
print(f' Unique entities: {len(entities)}')
for e in sorted(entities)[:10]:
    print(f' - {e}')
if len(entities) > 10:
    print(f' ... and {len(entities) - 10} more')
" 2>/dev/null || warn "Could not inspect DB"

# Run evaluation
log "Running QA evaluation..."
python3 eval/benchmarks/meeting_runner.py \
  --db "$DB_PATH" \
  --conditions sinain-memory,full-context \
  --format json,markdown

for _banner_line in \
  "" \
  "$_banner_rule" \
  " Done!" \
  " Results: eval/benchmarks/results/meeting_results.md" \
  " DB: $DB_PATH" \
  "$_banner_rule"; do
  log "$_banner_line"
done
|