@geravant/sinain 1.13.0 → 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/.env.example +4 -2
  2. package/config-shared.js +1 -0
  3. package/package.json +4 -1
  4. package/sinain-agent/run.sh +36 -4
  5. package/sinain-core/src/buffers/feed-buffer.ts +6 -4
  6. package/sinain-core/src/index.ts +50 -19
  7. package/sinain-memory/graph_query.py +12 -3
  8. package/sinain-memory/knowledge_integrator.py +194 -10
  9. package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
  10. package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
  11. package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
  12. package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
  13. package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
  14. package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
  15. package/sinain-memory/eval/__init__.py +0 -0
  16. package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
  17. package/sinain-memory/eval/assertions.py +0 -267
  18. package/sinain-memory/eval/benchmarks/__init__.py +0 -0
  19. package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
  20. package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
  21. package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
  22. package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
  23. package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
  24. package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
  25. package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
  26. package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
  27. package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
  28. package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
  29. package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
  30. package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
  31. package/sinain-memory/eval/benchmarks/config.py +0 -23
  32. package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
  33. package/sinain-memory/eval/benchmarks/ingest.py +0 -152
  34. package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
  35. package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
  36. package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
  37. package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
  38. package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
  39. package/sinain-memory/eval/benchmarks/meeting_adapter.py +0 -81
  40. package/sinain-memory/eval/benchmarks/meeting_runner.py +0 -230
  41. package/sinain-memory/eval/benchmarks/query.py +0 -193
  42. package/sinain-memory/eval/benchmarks/report.py +0 -87
  43. package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +0 -318
  44. package/sinain-memory/eval/benchmarks/runner.py +0 -283
  45. package/sinain-memory/eval/judges/__init__.py +0 -0
  46. package/sinain-memory/eval/judges/base_judge.py +0 -61
  47. package/sinain-memory/eval/judges/curation_judge.py +0 -46
  48. package/sinain-memory/eval/judges/insight_judge.py +0 -48
  49. package/sinain-memory/eval/judges/mining_judge.py +0 -42
  50. package/sinain-memory/eval/judges/signal_judge.py +0 -45
  51. package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
  52. package/sinain-memory/eval/retrieval_evaluator.py +0 -186
  53. package/sinain-memory/eval/schemas.py +0 -247
  54. package/sinain-memory/tests/__init__.py +0 -0
  55. package/sinain-memory/tests/conftest.py +0 -189
  56. package/sinain-memory/tests/test_curator_helpers.py +0 -94
  57. package/sinain-memory/tests/test_embedder.py +0 -210
  58. package/sinain-memory/tests/test_extract_json.py +0 -124
  59. package/sinain-memory/tests/test_feedback_computation.py +0 -121
  60. package/sinain-memory/tests/test_miner_helpers.py +0 -71
  61. package/sinain-memory/tests/test_module_management.py +0 -458
  62. package/sinain-memory/tests/test_parsers.py +0 -96
  63. package/sinain-memory/tests/test_tick_evaluator.py +0 -430
  64. package/sinain-memory/tests/test_triple_extractor.py +0 -255
  65. package/sinain-memory/tests/test_triple_ingest.py +0 -191
  66. package/sinain-memory/tests/test_triple_migrate.py +0 -138
  67. package/sinain-memory/tests/test_triplestore.py +0 -248
@@ -1,193 +0,0 @@
1
- """Query pipeline — benchmark questions → LLM answers under 3 conditions.
2
-
3
- Condition A (sinain-memory): answer from knowledge graph facts
4
- Condition B (full-context): answer from full conversation history
5
- Condition C (knowledge-doc): answer from portable knowledge document
6
- """
7
-
8
- from __future__ import annotations
9
-
10
- import sys
11
- from pathlib import Path
12
-
13
- # Add sinain-memory to path
14
- _koog_dir = str(Path(__file__).resolve().parent.parent.parent)
15
- if _koog_dir not in sys.path:
16
- sys.path.insert(0, _koog_dir)
17
-
18
- from common import call_llm # noqa: E402
19
-
20
- from .base_adapter import BenchmarkQuestion
21
- from .config import QA_MODEL, MAX_FACTS_PER_QUERY
22
-
23
-
24
- def _extract_keywords(query: str) -> list[str]:
25
- """Extract search keywords (reuses logic from retrieval_evaluator)."""
26
- import re
27
- words = re.findall(r"[a-zA-Z][a-zA-Z0-9-]+", query.lower())
28
- stopwords = {
29
- "the", "is", "in", "on", "for", "and", "or", "of", "to", "a", "an",
30
- "it", "was", "not", "how", "what", "when", "does", "did", "do", "my",
31
- "your", "their", "have", "has", "had", "are", "were", "been", "being",
32
- "about", "from", "with", "that", "this", "which", "who", "whom",
33
- "where", "why", "can", "could", "would", "should",
34
- }
35
- return [w for w in words if len(w) > 2 and w not in stopwords]
36
-
37
-
38
- def _get_all_facts_text(db_path: str) -> str:
39
- """Dump ALL facts from the knowledge graph as formatted text.
40
-
41
- Sinain triplestores are small (10-50 facts per session), so including
42
- everything is feasible and avoids tag-matching failures.
43
- """
44
- from graph_query import query_top_facts, format_facts_text
45
-
46
- facts = query_top_facts(db_path, limit=50)
47
- if not facts:
48
- return "(no knowledge available)"
49
- return format_facts_text(facts, max_chars=6000)
50
-
51
-
52
def _query_knowledge(db_path: str, question: str) -> str:
    """Query the sinain knowledge graph for facts relevant to a question.

    Strategy: retrieve a broad candidate set, then re-rank it against the
    question so that specific facts (CTO background) beat generic ones
    (meeting schedule) when the question asks about the CTO.

    Re-ranking uses ``embed_client.rank_by_similarity`` when that module can
    be imported and the call succeeds with a non-``None`` result; otherwise
    candidates are ordered by keyword overlap with the question.

    Args:
        db_path: Path to the knowledge-graph database.
        question: The natural-language question.

    Returns:
        Formatted text for at most ``MAX_FACTS_PER_QUERY`` facts, or
        ``"(no knowledge available)"`` when no candidates are found.
    """
    import logging

    from graph_query import query_facts_hybrid, query_top_facts, format_facts_text

    # Retrieve a broad candidate set, falling back to the top facts.
    candidates = query_facts_hybrid(db_path, question, max_facts=30)
    if not candidates:
        candidates = query_top_facts(db_path, limit=30)
    if not candidates:
        return "(no knowledge available)"

    # The embedding client is optional: its absence is expected and silent.
    try:
        from embed_client import rank_by_similarity
    except ImportError:
        rank_by_similarity = None

    if rank_by_similarity is not None:
        try:
            fact_texts = [str(f.get("value", "")) for f in candidates]
            ranked_indices = rank_by_similarity(question, fact_texts)
            if ranked_indices is not None:
                # Each entry is unpacked as a pair whose first item is an
                # index into ``candidates``.
                ranked = [candidates[i] for i, _ in ranked_indices[:MAX_FACTS_PER_QUERY]]
                return format_facts_text(ranked, max_chars=3000)
        except Exception as exc:
            # Best-effort: keep answering via the keyword fallback, but leave
            # a trace instead of swallowing the failure.
            logging.getLogger(__name__).warning(
                "Embedding re-rank failed; falling back to keyword overlap: %s", exc
            )

    # Fallback: keyword overlap ranking.
    q_keywords = set(_extract_keywords(question))

    def _relevance(fact: dict) -> float:
        """Fraction of question keywords present in the fact's value/entity."""
        if not q_keywords:
            return 0.0
        value = str(fact.get("value", "")).lower()
        entity = str(fact.get("entity", "")).lower()
        fact_words = set(_extract_keywords(value + " " + entity))
        return len(q_keywords & fact_words) / len(q_keywords)

    # sorted() is stable, so ties keep their retrieval order.
    ranked = sorted(candidates, key=_relevance, reverse=True)
    return format_facts_text(ranked[:MAX_FACTS_PER_QUERY], max_chars=3000)
91
-
92
-
93
- def _get_retrieved_facts(db_path: str, question: str, k: int = 10) -> list[dict]:
94
- """Get facts retrieved for a question (for retrieval evaluation)."""
95
- from graph_query import query_facts_hybrid, query_top_facts
96
-
97
- facts = query_facts_hybrid(db_path, question, max_facts=k)
98
- if facts:
99
- return facts
100
-
101
- # Fallback: top facts by confidence
102
- return query_top_facts(db_path, limit=k)
103
-
104
-
105
def compute_content_recall(
    retrieved_facts: list[dict],
    gold_answer: str,
    k_values: list[int] | None = None,
) -> dict:
    """Content-based retrieval metric: do retrieved facts contain the answer?

    Instead of matching entity IDs (which don't align between LongMemEval
    session IDs and sinain entity IDs), this checks whether the gold answer's
    key terms appear in a single retrieved fact's entity/value text.

    A fact counts as a hit when it shares at least ``max(1, n // 2)`` of the
    ``n`` distinct gold keywords. Because of the floor division this is
    "about half", not strictly >= 50% (3 gold terms need only 1 match).

    Args:
        retrieved_facts: Facts in rank order; ``entity`` and ``value`` keys
            are read with ``dict.get``.
        gold_answer: Reference answer; converted with ``str()`` first.
        k_values: Cut-offs to report. When ``None`` or empty, ``K_VALUES``
            from the benchmark config is used.

    Returns:
        ``{"content_recall@k": 1.0 or 0.0}`` for every requested ``k``.
        All values are 0.0 when the gold answer yields no keywords.
    """
    if k_values:
        ks = k_values
    else:
        # Imported only when needed so explicit cut-offs don't require config.
        from .config import K_VALUES
        ks = K_VALUES

    gold_terms = set(_extract_keywords(str(gold_answer)))
    if not gold_terms:
        return {f"content_recall@{k}": 0.0 for k in ks}

    required_overlap = max(1, len(gold_terms) // 2)

    def _covers_answer(fact: dict) -> bool:
        """True when the fact shares enough keywords with the gold answer."""
        fact_text = f"{fact.get('entity', '')} {fact.get('value', '')}".lower()
        fact_terms = set(_extract_keywords(fact_text))
        return len(gold_terms & fact_terms) >= required_overlap

    # Score every fact once; each cut-off then only inspects its prefix.
    hits = [_covers_answer(fact) for fact in retrieved_facts]
    return {f"content_recall@{k}": 1.0 if any(hits[:k]) else 0.0 for k in ks}
138
-
139
-
140
def answer_question(
    question: BenchmarkQuestion,
    condition: str,
    *,
    db_path: str | None = None,
    full_context: str | None = None,
    knowledge_doc: str | None = None,
    model: str | None = None,
) -> str:
    """Generate an answer for a benchmark question under a specific condition.

    Args:
        question: The benchmark question; only its ``text`` attribute is read.
        condition: ``"sinain-memory"``, ``"full-context"`` or ``"knowledge-doc"``.
        db_path: Knowledge-graph database; required for ``"sinain-memory"``.
        full_context: Conversation history; required for ``"full-context"``.
            Only the first 100,000 characters are sent to the model.
        knowledge_doc: Knowledge document; required for ``"knowledge-doc"``.
        model: Model override; defaults to ``QA_MODEL``.

    Returns:
        The LLM's answer text with surrounding whitespace stripped. If the
        LLM call raises, the string ``"(error: <exception>)"`` is returned
        instead of propagating the exception.

    Raises:
        ValueError: If ``condition`` is unknown, or the input required by the
            chosen condition is missing or empty.
    """
    qa_model = model or QA_MODEL

    # Explicit raises rather than assert: asserts are removed under ``-O``,
    # which would let a missing input reach the retrieval/LLM call.
    if condition == "sinain-memory":
        if not db_path:
            raise ValueError("db_path required for sinain-memory condition")
        facts = _query_knowledge(db_path, question.text)
        system = (
            "Answer the question using ONLY the provided knowledge facts. "
            "If the facts don't contain enough information to answer, say \"I don't know.\""
        )
        user = f"## Knowledge Facts\n{facts}\n\n## Question\n{question.text}"

    elif condition == "full-context":
        if not full_context:
            raise ValueError("full_context required for full-context condition")
        system = (
            "Answer the question based on the conversation history below. "
            "Be concise and specific."
        )
        # Truncate context if too large (some models have limits); slicing a
        # shorter string is a no-op, so no length check is needed.
        ctx = full_context[:100_000]
        user = f"## Conversation History\n{ctx}\n\n## Question\n{question.text}"

    elif condition == "knowledge-doc":
        if not knowledge_doc:
            raise ValueError("knowledge_doc required for knowledge-doc condition")
        system = (
            "Answer the question using ONLY the knowledge document provided. "
            "If the document doesn't contain enough information, say \"I don't know.\""
        )
        user = f"## Knowledge Document\n{knowledge_doc}\n\n## Question\n{question.text}"

    else:
        raise ValueError(f"Unknown condition: {condition}")

    try:
        return call_llm(
            system_prompt=system,
            user_prompt=user,
            model=qa_model,
            max_tokens=300,
        ).strip()
    except Exception as e:
        # Deliberate best-effort: a failed call becomes a scoreable answer so
        # one bad question does not abort the whole benchmark run.
        return f"(error: {e})"
@@ -1,87 +0,0 @@
1
- """Report generation — markdown, JSON, and LaTeX output."""
2
-
3
- from __future__ import annotations
4
-
5
- import json
6
- from datetime import datetime, timezone
7
-
8
-
9
def generate_markdown(benchmark_name: str, summary: dict, details: list[dict]) -> str:
    """Generate a publishable markdown report.

    Args:
        benchmark_name: Name inserted into the report title.
        summary: Aggregates. Optional keys read here: ``ipr`` (float),
            ``conditions`` (name -> ``mean_score``/``n`` and optional
            ``mean_f1``), ``retrieval`` (metric -> float) and ``categories``
            (category -> condition -> ``mean_score``/``n``).
        details: Per-question records. Those carrying a non-``None``
            ``answers["sinain-memory"]["score"]`` feed the "hardest
            questions" section.

    Returns:
        The report as one newline-joined string. Sections whose data is
        missing or empty are omitted.
    """
    lines = [
        f"# Sinain Knowledge Graph — {benchmark_name} Results",
        f"\nGenerated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}",
        "",
    ]

    # Headline IPR. Compare against None: a measured 0.0 is a real result
    # and must still be reported.
    ipr = summary.get("ipr")
    if ipr is not None:
        lines.append(f"**Information Preservation Rate (IPR)**: {ipr:.1%}")
        lines.append("")

    # Condition scores table
    conditions = summary.get("conditions", {})
    if conditions:
        cond_names = sorted(conditions.keys())
        header = "| Condition | Mean Score (1-5) | Mean F1 | N |"
        sep = "|-----------|------------------|---------|---|"
        lines.extend([header, sep])
        for cond in cond_names:
            c = conditions[cond]
            lines.append(f"| {cond} | {c['mean_score']:.2f} | {c.get('mean_f1', 0):.2f} | {c['n']} |")
        lines.append("")

    # Retrieval metrics
    retrieval = summary.get("retrieval", {})
    if retrieval:
        lines.append("## Retrieval Quality")
        lines.append("| Metric | Score |")
        lines.append("|--------|-------|")
        for k, v in sorted(retrieval.items()):
            lines.append(f"| {k} | {v:.1%} |")
        lines.append("")

    # Category breakdown: one column per condition seen in any category.
    categories = summary.get("categories", {})
    if categories:
        lines.append("## By Category")
        cond_names = sorted(set(c for cat in categories.values() for c in cat))
        header = "| Category | " + " | ".join(cond_names) + " |"
        sep = "|----------|" + "|".join(["------"] * len(cond_names)) + "|"
        lines.extend([header, sep])
        for cat in sorted(categories):
            cells = []
            for cond in cond_names:
                if cond in categories[cat]:
                    cells.append(f"{categories[cat][cond]['mean_score']:.2f} (n={categories[cat][cond]['n']})")
                else:
                    cells.append("-")
            lines.append(f"| {cat} | " + " | ".join(cells) + " |")
        lines.append("")

    # Failures (worst questions)
    if details:
        sm_details = [d for d in details if d.get("answers", {}).get("sinain-memory", {}).get("score") is not None]
        sm_details.sort(key=lambda d: d["answers"]["sinain-memory"]["score"])
        if sm_details:
            lines.append("## Hardest Questions for sinain-memory (bottom 5)")
            for d in sm_details[:5]:
                sm = d["answers"]["sinain-memory"]
                fc = d["answers"].get("full-context", {})
                lines.append(f"- **{d['id']}** [{d['category']}]: score={sm['score']}/5 "
                             f"(full-context: {fc.get('score', '?')}/5)")
                # Add the ellipsis only when the question was actually cut.
                question = d["question"]
                preview = question if len(question) <= 100 else question[:100] + "..."
                lines.append(f" Q: {preview}")
            lines.append("")

    return "\n".join(lines)
78
-
79
-
80
def generate_json(benchmark_name: str, summary: dict, details: list[dict]) -> str:
    """Serialise the benchmark results as a JSON document.

    The payload holds the benchmark name, the current UTC time in ISO-8601
    form, and the given ``summary`` and ``details`` unchanged. Output is
    indented by two spaces and non-ASCII characters are written verbatim.
    """
    generated_at = datetime.now(timezone.utc).isoformat()
    payload = {
        "benchmark": benchmark_name,
        "timestamp": generated_at,
        "summary": summary,
        "details": details,
    }
    return json.dumps(payload, indent=2, ensure_ascii=False)
@@ -1,318 +0,0 @@
1
#!/usr/bin/env bash
set -euo pipefail

# ── Meeting Memory Benchmark — end-to-end capture + evaluate ─────────────────
# Steps performed by this script:
#   1. Open the meeting recording fullscreen in QuickTime
#   2. Start sinain (audio + sense capture, no agent, no overlay)
#   3. Wait for the recording to finish
#   4. Stop sinain, which saves the pending session
#   5. Restart sinain so the pending session is distilled into the knowledge graph
#   6. Run the evaluation harness against the distilled DB
#
# Usage:  ./run_meeting_bench.sh <mp4_path>
# Output: eval/benchmarks/results/meeting_results.md

# Directory layout, resolved relative to this script's own location.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
KOOG_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
SINAIN_ROOT="$(cd "$SCRIPT_DIR/../../../.." && pwd)"

# ANSI escape sequences, stored unexpanded; the log helpers interpret them.
RESET='\033[0m'
BOLD='\033[1m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
CYAN='\033[0;36m'
26
# Print a "[bench]"-prefixed line. Backslash escapes in the message are
# interpreted, so callers may embed the colour variables.
log() {
  printf '%b\n' "${BOLD}[bench]${RESET} $*"
}

# Success line (green check mark).
ok() {
  printf '%b\n' "${BOLD}[bench]${RESET} ${GREEN}✓${RESET} $*"
}

# Warning line (yellow sign); execution continues.
warn() {
  printf '%b\n' "${BOLD}[bench]${RESET} ${YELLOW}⚠${RESET} $*"
}

# Error line (red cross), then terminate the script with status 1.
fail() {
  printf '%b\n' "${BOLD}[bench]${RESET} ${RED}✗${RESET} $*"
  exit 1
}
30
-
31
# ── Args ─────────────────────────────────────────────────────────────────────
RECORDING="${1:-}"
if [ -z "$RECORDING" ]; then
  fail "Usage: $0 <path-to-mp4>"
fi
# Report a bad path explicitly instead of only repeating the usage line.
if [ ! -f "$RECORDING" ]; then
  fail "Recording not found: $RECORDING (usage: $0 <path-to-mp4>)"
fi

# ── Setup ────────────────────────────────────────────────────────────────────
# Per-run scratch directory; the epoch suffix keeps runs apart.
BENCH_DIR="/tmp/sinain-bench-$(date +%s)"
mkdir -p "$BENCH_DIR"
log "Benchmark directory: ${CYAN}${BENCH_DIR}${RESET}"
41
-
42
# Source .env for API keys and audio config (safe parser from start-local.sh).
# Only the first existing file is loaded. Variables already present in the
# environment are never overridden.
for _env_file in "$SINAIN_ROOT/.env" "$SINAIN_ROOT/sinain-core/.env" "$HOME/.sinain/.env"; do
  if [ -f "$_env_file" ]; then
    log "Loading $_env_file"
    # `|| [ -n "$_k" ]` keeps a final line that lacks a trailing newline:
    # read returns non-zero at EOF but has still filled the variables.
    while IFS='=' read -r _k _v || [ -n "$_k" ]; do
      # Skip blank lines and comment lines.
      [[ -z "$_k" || "$_k" =~ ^[[:space:]]*# ]] && continue
      _k=$(echo "$_k" | xargs)
      # Skip keys that are not valid variable names (e.g. "export FOO");
      # the indirect expansion below would abort the script on them.
      [[ "$_k" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || continue
      _v=$(echo "$_v" | xargs)
      _v="${_v%%#*}"          # drop everything from the first '#'
      _v=$(echo "$_v" | xargs)
      [[ -z "$_v" ]] && continue
      if [ -z "${!_k+x}" ]; then export "$_k=$_v"; fi
    done < "$_env_file"
    break
  fi
done
58
-
59
# Bench-specific overrides: write memory into the scratch directory and run
# with the agent and escalation switched off.
export SINAIN_MEMORY_DIR="$BENCH_DIR"
export AGENT_ENABLED=false
export ESCALATION_MODE=off

# Local whisper setup (from start-local.sh). Model and binary may be
# overridden from the environment; the backend is always forced to local.
MODEL_DIR="$HOME/models"
MODEL_NAME="ggml-large-v3-turbo.bin"
export LOCAL_WHISPER_MODEL="${LOCAL_WHISPER_MODEL:-$MODEL_DIR/$MODEL_NAME}"
export LOCAL_WHISPER_BIN="${LOCAL_WHISPER_BIN:-whisper-cli}"
export TRANSCRIPTION_BACKEND=local

# ── Get recording duration ───────────────────────────────────────────────────
DURATION_RAW=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$RECORDING" 2>/dev/null || echo "")
# Keep the whole-second part, then make sure it really is a number: empty or
# non-numeric output would break the arithmetic below.
DURATION="${DURATION_RAW%%.*}"
if ! [[ "$DURATION" =~ ^[0-9]+$ ]]; then
  DURATION=1620  # fallback: 27 min
fi
log "Recording duration: ${DURATION}s (~$((DURATION / 60))m)"
79
-
80
# ── Cleanup handler ──────────────────────────────────────────────────────────
CORE_PID=""
SENSE_PID=""

# Runs on every script exit: stops the capture processes, frees port 9500
# and closes QuickTime. Every step is best-effort and never fails the trap.
cleanup() {
  log "Cleaning up..."
  [ -n "$SENSE_PID" ] && kill "$SENSE_PID" 2>/dev/null || true
  [ -n "$CORE_PID" ] && kill "$CORE_PID" 2>/dev/null || true
  # Both services are started as `( ... ) | sed ... &`, so the saved PIDs
  # belong to sed. Signal the real processes by the same command patterns
  # the stale-process sweep uses.
  pkill -f "tsx.*src/index.ts" 2>/dev/null || true
  pkill -f "python3 -m sense_client" 2>/dev/null || true
  pkill -f "Python -m sense_client" 2>/dev/null || true
  sleep 2
  [ -n "$SENSE_PID" ] && kill -9 "$SENSE_PID" 2>/dev/null || true
  [ -n "$CORE_PID" ] && kill -9 "$CORE_PID" 2>/dev/null || true
  # Kill anything on port 9500
  lsof -i :9500 -sTCP:LISTEN -t 2>/dev/null | xargs kill -9 2>/dev/null || true
  # Close QuickTime
  osascript -e 'tell application "QuickTime Player" to quit' 2>/dev/null || true
}
trap cleanup EXIT
97
-
98
# ── Kill stale sinain processes ──────────────────────────────────────────────
# Sweep leftovers from earlier runs so the port and capture devices are free.
log "Killing stale processes..."
for _stale_pattern in \
  "tsx.*src/index.ts" \
  "python3 -m sense_client" \
  "Python -m sense_client" \
  "tools/sck-capture/sck-capture"
do
  pkill -f "$_stale_pattern" 2>/dev/null || true
done
lsof -i :9500 -sTCP:LISTEN -t 2>/dev/null | xargs kill -9 2>/dev/null || true
sleep 2
106
-
107
# ── Phase 1a: Open video fullscreen ─────────────────────────────────────────
log "Opening recording in QuickTime (fullscreen)..."
open -a "QuickTime Player" "$RECORDING"
sleep 3
# Present (fullscreen) and start playback. Report success only when the
# AppleScript call actually succeeded; otherwise warn and carry on.
if osascript -e '
  tell application "QuickTime Player"
    present front document
    delay 1
    play front document
  end tell
' 2>/dev/null; then
  ok "Video playing fullscreen"
else
  warn "Could not auto-play — check QuickTime"
fi
119
-
120
# ── Phase 1b: Start sinain-core ──────────────────────────────────────────────
log "Starting sinain-core (capture-only, local whisper)..."
# Output is prefixed with a coloured "[core]" tag by sed. Note that $! is the
# PID of the last pipeline stage (sed), not of tsx.
(cd "$SINAIN_ROOT/sinain-core" && npx tsx src/index.ts 2>&1) | \
  sed -u "s/^/$(printf "${CYAN}[core]${RESET} ")/" &
CORE_PID=$!

# Poll the health endpoint: up to 20 attempts, one second apart.
CORE_OK=false
_health_try=0
while [ "$_health_try" -lt 20 ]; do
  if curl -sf http://localhost:9500/health >/dev/null 2>&1; then
    CORE_OK=true
    break
  fi
  sleep 1
  _health_try=$((_health_try + 1))
done

if [ "$CORE_OK" = true ]; then
  ok "sinain-core healthy on :9500"
else
  fail "sinain-core did not start"
fi
140
-
141
# ── Phase 1c: Start sense_client ─────────────────────────────────────────────
log "Starting sense_client (screen capture + OCR)..."

# Propagate privacy mode: default both settings to "full" when unset or empty.
: "${PRIVACY_OCR_OPENROUTER:=full}"
: "${PRIVACY_IMAGES_OPENROUTER:=full}"
export PRIVACY_OCR_OPENROUTER PRIVACY_IMAGES_OPENROUTER

# Same pipeline shape as sinain-core: $! is the sed stage.
(cd "$SINAIN_ROOT" && python3 -m sense_client 2>&1) | \
  sed -u "s/^/$(printf "${YELLOW}[sense]${RESET} ")/" &
SENSE_PID=$!
sleep 2

# A dead pipeline after two seconds means the client never came up; the
# benchmark then continues without screen capture.
if ! kill -0 "$SENSE_PID" 2>/dev/null; then
  warn "sense_client failed to start — continuing with audio only"
  SENSE_PID=""
else
  ok "sense_client running"
fi
159
-
160
# ── Phase 1d: Wait for recording to finish ───────────────────────────────────
# Print the number of messages currently in the core feed, or "?" when the
# endpoint cannot be read or parsed.
feed_count() {
  curl -sf http://localhost:9500/feed 2>/dev/null \
    | python3 -c "import sys,json; print(len(json.load(sys.stdin).get('messages',[])))" 2>/dev/null \
    || echo "?"
}

BUFFER=60  # extra time for trailing transcription/OCR
TOTAL_WAIT=$((DURATION + BUFFER))
log "Waiting ${TOTAL_WAIT}s for recording + buffer..."
log " (recording ends at $(date -v+${DURATION}S '+%H:%M:%S'), buffer until $(date -v+${TOTAL_WAIT}S '+%H:%M:%S'))"

# Sleep in chunks of at most five minutes, reporting progress after every
# chunk except the last.
ELAPSED=0
while [ $ELAPSED -lt $TOTAL_WAIT ]; do
  REMAINING=$((TOTAL_WAIT - ELAPSED))
  SLEEP_CHUNK=$((REMAINING < 300 ? REMAINING : 300))
  sleep $SLEEP_CHUNK
  ELAPSED=$((ELAPSED + SLEEP_CHUNK))
  REMAINING=$((TOTAL_WAIT - ELAPSED))
  if [ $REMAINING -gt 0 ]; then
    # Check feed count
    FEED_COUNT=$(feed_count)
    log " ${ELAPSED}s elapsed, ${REMAINING}s remaining — feed items: ${FEED_COUNT}"
  fi
done

ok "Recording capture complete"

# Check what we captured
FEED_COUNT=$(feed_count)
log "Captured ${FEED_COUNT} feed items"
188
-
189
# ── Phase 1e: Stop sinain (saves pending session) ───────────────────────────
log "Stopping sinain (saving pending session)..."
if [ -n "$SENSE_PID" ]; then
  kill "$SENSE_PID" 2>/dev/null || true
fi
SENSE_PID=""

# $CORE_PID is the sed end of the pipeline, not tsx, so the SIGINT that
# triggers a graceful shutdown is delivered by command pattern instead.
pkill -INT -f "tsx src/index.ts" 2>/dev/null || true
log "Sent SIGINT to tsx, waiting for graceful shutdown..."
sleep 10

# Anything still alive after the grace period is killed outright.
pkill -9 -f "tsx src/index.ts" 2>/dev/null || true
kill -9 "$CORE_PID" 2>/dev/null || true
CORE_PID=""
lsof -i :9500 -sTCP:LISTEN -t 2>/dev/null | xargs kill -9 2>/dev/null || true
sleep 2

# Playback is over; close the player.
osascript -e 'tell application "QuickTime Player" to quit' 2>/dev/null || true
209
-
210
# Verify pending session was saved (or inline distillation already consumed it)
_pending_file="$BENCH_DIR/pending-session.json"
_graph_db="$BENCH_DIR/knowledge-graph.db"

if [ -f "$_pending_file" ]; then
  PENDING_ITEMS=$(python3 -c "import json; print(len(json.load(open('$_pending_file')).get('items',[])))" 2>/dev/null || echo "?")
  ok "Pending session saved: ${PENDING_ITEMS} items"
elif [ -f "$_graph_db" ]; then
  ok "Inline distillation completed (pending-session.json already consumed)"
else
  warn "No pending-session.json and no knowledge-graph.db — will retry with longer shutdown"
  # Recovery: run the core briefly, let it capture a few items, then shut it
  # down gracefully so it gets another chance to save the session.
  log "Starting core for a brief capture + shutdown cycle..."
  (cd "$SINAIN_ROOT/sinain-core" && npx tsx src/index.ts 2>&1) > /tmp/sinain-bench-retry.log &
  RETRY_PID=$!
  sleep 15  # let it start and capture a few items

  # Find the actual node process and interrupt it, forcing after 10s.
  NODE_PID=$(pgrep -f "tsx src/index.ts" 2>/dev/null | head -1 || true)
  if [ -n "$NODE_PID" ]; then
    kill -INT "$NODE_PID" 2>/dev/null || true
    sleep 10
    kill -9 "$NODE_PID" 2>/dev/null || true
  fi
  kill -9 "$RETRY_PID" 2>/dev/null || true
  lsof -i :9500 -sTCP:LISTEN -t 2>/dev/null | xargs kill -9 2>/dev/null || true
  sleep 2

  if [ ! -f "$_pending_file" ] && [ ! -f "$_graph_db" ]; then
    fail "Could not capture any session data"
  fi
  ok "Recovery succeeded"
fi
239
-
240
# ── Phase 1f: Restart for distillation ───────────────────────────────────────
log "Restarting sinain-core for distillation..."
# As before, $! is the sed stage of the pipeline, not tsx.
(cd "$SINAIN_ROOT/sinain-core" && npx tsx src/index.ts 2>&1) | \
  sed -u "s/^/$(printf "${CYAN}[core]${RESET} ")/" &
CORE_PID=$!

# Wait for health (20 attempts, one second apart) and say so if it never
# comes up. Not fatal: an earlier inline distillation may already have
# written the DB.
_core_restarted=false
for i in $(seq 1 20); do
  if curl -sf http://localhost:9500/health >/dev/null 2>&1; then
    _core_restarted=true
    break
  fi
  sleep 1
done
if [ "$_core_restarted" != true ]; then
  warn "sinain-core did not report healthy after restart — distillation may not run"
fi

# Wait for distillation to complete: the DB exists and is either larger than
# 4096 bytes or pending-session.json is gone (distillation consumed it).
# Polls every 5s, 120 times.
log "Waiting for distillation..."
_distilled=false
for i in $(seq 1 120); do
  if [ -f "$BENCH_DIR/knowledge-graph.db" ]; then
    DB_SIZE=$(stat -f%z "$BENCH_DIR/knowledge-graph.db" 2>/dev/null || echo "0")
    if [ "$DB_SIZE" -gt 4096 ] || [ ! -f "$BENCH_DIR/pending-session.json" ]; then
      ok "Distillation complete (DB: ${DB_SIZE} bytes)"
      _distilled=true
      break
    fi
  fi
  sleep 5
done
if [ "$_distilled" != true ]; then
  warn "Distillation was not confirmed before the wait timed out — continuing"
fi

# Keep core running for /embed endpoint during evaluation
log "Keeping sinain-core running for embedding service during evaluation..."
275
-
276
# ── Phase 2: Evaluation ──────────────────────────────────────────────────────
_banner_rule="═══════════════════════════════════════════════"

log ""
log "$_banner_rule"
log " Phase 2: Evaluation"
log "$_banner_rule"
log ""

DB_PATH="$BENCH_DIR/knowledge-graph.db"
[ -f "$DB_PATH" ] || fail "No knowledge-graph.db found — distillation may have failed"

# Summarise the DB: fact count, entity count, first ten entity names.
# Inspection is best-effort; any error only produces a warning.
log "Knowledge graph contents:"
cd "$KOOG_DIR"
python3 -c "
from triplestore import TripleStore
ts = TripleStore('$DB_PATH')
facts = ts.all_facts()
print(f' Total facts: {len(facts)}')
entities = set()
for f in facts:
    entities.add(f.get('entity', ''))
print(f' Unique entities: {len(entities)}')
for e in sorted(entities)[:10]:
    print(f' - {e}')
if len(entities) > 10:
    print(f' ... and {len(entities) - 10} more')
" 2>/dev/null || warn "Could not inspect DB"

# Run the QA harness from the sinain-memory directory entered above.
log "Running QA evaluation..."
python3 eval/benchmarks/meeting_runner.py \
  --db "$DB_PATH" \
  --conditions sinain-memory,full-context \
  --format json,markdown

log ""
log "$_banner_rule"
log " Done!"
log " Results: eval/benchmarks/results/meeting_results.md"
log " DB: $DB_PATH"
log "$_banner_rule"