@geravant/sinain 1.12.0 → 1.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/sinain-core/package-lock.json +963 -0
- package/sinain-core/package.json +1 -0
- package/sinain-core/src/buffers/feed-buffer.ts +32 -0
- package/sinain-core/src/embedding/service.ts +66 -0
- package/sinain-core/src/index.ts +19 -2
- package/sinain-core/src/learning/local-curation.ts +137 -7
- package/sinain-core/src/server.ts +31 -0
- package/sinain-memory/README.md +105 -0
- package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
- package/sinain-memory/embed_client.py +117 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/meeting_adapter.py +81 -0
- package/sinain-memory/eval/benchmarks/meeting_runner.py +230 -0
- package/sinain-memory/eval/benchmarks/query.py +37 -16
- package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +318 -0
- package/sinain-memory/eval/benchmarks/runner.py +10 -3
- package/sinain-memory/graph_query.py +257 -15
- package/sinain-memory/knowledge_integrator.py +365 -72
- package/sinain-memory/memory-config.json +1 -1
- package/sinain-memory/session_distiller.py +43 -19
- package/sinain-memory/triplestore.py +60 -0
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Meeting Memory adapter — loads QA pairs + ground-truth transcript.
|
|
2
|
+
|
|
3
|
+
Unlike LongMemEval, this adapter does NOT ingest. The real sinain pipeline
|
|
4
|
+
(start-local.sh) produces the knowledge-graph.db during a live capture run.
|
|
5
|
+
This adapter only loads QA pairs and provides the full-context baseline.
|
|
6
|
+
|
|
7
|
+
Data layout:
|
|
8
|
+
eval/benchmarks/data/meeting/
|
|
9
|
+
<name>.txt — ground-truth transcript
|
|
10
|
+
<name>_qa.json — gold QA pairs
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
from .base_adapter import BenchmarkAdapter, BenchmarkInstance, BenchmarkQuestion
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class MeetingMemoryAdapter(BenchmarkAdapter):
    """Adapter for meeting memory benchmarks (real pipeline capture).

    Loads gold QA pairs together with the matching ground-truth transcript
    for every meeting found under ``<data_dir>/meeting/``. No ingestion is
    performed here: ``sessions`` and ``raw_sessions`` are always empty.
    """

    @property
    def name(self) -> str:
        """Return the short identifier of this benchmark."""
        return "meeting"

    def load_dataset(self, data_dir: str) -> list[BenchmarkInstance]:
        """Load meeting QA pairs and transcripts from data/meeting/.

        Each ``<name>_qa.json`` file is paired with ``<name>.txt`` in the same
        directory. QA files without a transcript are skipped with a warning.

        Args:
            data_dir: Directory that contains the ``meeting`` sub-directory.

        Returns:
            One ``BenchmarkInstance`` per meeting, ordered by QA file name.

        Raises:
            FileNotFoundError: If ``<data_dir>/meeting`` does not exist.
            KeyError: If a QA item lacks ``id``, ``question`` or ``gold_answer``.
        """
        meeting_dir = Path(data_dir) / "meeting"
        if not meeting_dir.exists():
            raise FileNotFoundError(f"Meeting data not found: {meeting_dir}")

        qa_suffix = "_qa"
        instances = []
        for qa_path in sorted(meeting_dir.glob("*_qa.json")):
            # Derive transcript path: foo_qa.json → foo.txt
            # The glob guarantees the stem ends with "_qa"; strip only that
            # trailing suffix so a name such as "team_qa_sync_qa.json" maps
            # to "team_qa_sync.txt" rather than "team_sync.txt".
            stem = qa_path.stem[: -len(qa_suffix)]
            transcript_path = qa_path.parent / f"{stem}.txt"
            if not transcript_path.exists():
                print(f"[meeting] warning: no transcript for {qa_path.name}, skipping")
                continue

            # Load QA pairs (explicit UTF-8, same as the transcript below, so
            # the result does not depend on the platform's locale encoding).
            with open(qa_path, encoding="utf-8") as f:
                raw_questions = json.load(f)

            questions = []
            for item in raw_questions:
                evidence = item.get("evidence_timestamps", [])
                questions.append(BenchmarkQuestion(
                    id=item["id"],
                    text=item["question"],
                    gold_answer=item["gold_answer"],
                    category=item.get("category", "unknown"),
                    # Timestamps are passed in the session-id slot and are
                    # also kept under their own name in the metadata.
                    evidence_session_ids=evidence,
                    metadata={
                        "evidence_timestamps": evidence,
                    },
                ))

            # Load transcript
            transcript_text = transcript_path.read_text(encoding="utf-8")

            instances.append(BenchmarkInstance(
                id=f"meeting-{stem}",
                sessions=[],  # Not used — real pipeline does ingestion
                questions=questions,
                raw_sessions=[],
                metadata={
                    "transcript_path": str(transcript_path),
                    "raw_transcript": transcript_text,
                    "stem": stem,
                },
            ))

        total_q = sum(len(i.questions) for i in instances)
        print(f"[meeting] loaded {total_q} questions across {len(instances)} meetings")
        return instances

    def format_full_context(self, instance: BenchmarkInstance) -> str:
        """Return the raw transcript verbatim for the full-context baseline.

        Returns an empty string when the instance metadata has no
        ``raw_transcript`` entry.
        """
        return instance.metadata.get("raw_transcript", "")
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Meeting Memory Benchmark runner — evaluates a captured knowledge-graph.db.
|
|
3
|
+
|
|
4
|
+
Standalone script: does NOT modify or import runner.py.
|
|
5
|
+
Reuses shared infrastructure: query, judge, evaluate, report.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python3 eval/benchmarks/meeting_runner.py \
|
|
9
|
+
--db /tmp/sinain-bench-XXXX/knowledge-graph.db \
|
|
10
|
+
--conditions sinain-memory,full-context
|
|
11
|
+
|
|
12
|
+
# Quick test (3 questions):
|
|
13
|
+
python3 eval/benchmarks/meeting_runner.py \
|
|
14
|
+
--db /tmp/sinain-bench-XXXX/knowledge-graph.db \
|
|
15
|
+
--subset 3
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
import json
|
|
22
|
+
import sys
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
# Add sinain-memory to path
|
|
26
|
+
_koog_dir = str(Path(__file__).resolve().parent.parent.parent)
|
|
27
|
+
if _koog_dir not in sys.path:
|
|
28
|
+
sys.path.insert(0, _koog_dir)
|
|
29
|
+
|
|
30
|
+
from eval.benchmarks.config import DATA_DIR, RESULTS_DIR, QA_MODEL, JUDGE_MODEL
|
|
31
|
+
from eval.benchmarks.meeting_adapter import MeetingMemoryAdapter
|
|
32
|
+
from eval.benchmarks.query import answer_question, _get_retrieved_facts, compute_content_recall
|
|
33
|
+
from eval.benchmarks.evaluate import token_f1, aggregate_results
|
|
34
|
+
from eval.benchmarks.judges.qa_judge import judge_qa
|
|
35
|
+
from eval.benchmarks.report import generate_markdown, generate_json
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _load_completed_entries(resume_path: Path) -> dict[str, dict]:
    """Read previously saved per-question entries from a JSONL progress file.

    Malformed lines (for example a line cut short by an interrupted run) are
    skipped with a warning instead of aborting the resume. When the same
    question id appears more than once, the last entry wins.
    """
    completed: dict[str, dict] = {}
    lines = resume_path.read_text(encoding="utf-8").strip().split("\n")
    for line_no, line in enumerate(lines, start=1):
        if not line:
            continue
        try:
            entry = json.loads(line)
            completed[entry["id"]] = entry
        except (json.JSONDecodeError, KeyError, TypeError):
            print(
                f"[meeting] warning: skipping malformed progress line "
                f"{line_no} in {resume_path.name}"
            )
    return completed


def run_meeting_benchmark(
    db_path: str,
    conditions: list[str],
    *,
    subset: int | None = None,
    meeting_filter: str | None = None,
    qa_model: str = QA_MODEL,
    judge_model: str = JUDGE_MODEL,
    output_dir: Path = RESULTS_DIR,
    data_dir: Path = DATA_DIR,
    resume: bool = False,
) -> tuple[dict, list[dict]]:
    """Run meeting benchmark against a captured knowledge-graph.db.

    Args:
        db_path: Path to the captured knowledge-graph.db; may be empty/None
            when the "sinain-memory" condition is not requested.
        conditions: Condition names to evaluate for every question.
        subset: If truthy, evaluate only the first ``subset`` questions.
        meeting_filter: If given, keep only instances whose id contains it.
        qa_model: Model used to generate answers.
        judge_model: Model used to judge answers.
        output_dir: Directory holding ``meeting_progress.jsonl``.
        data_dir: Directory handed to ``MeetingMemoryAdapter.load_dataset``.
        resume: Reuse entries already stored in the progress file.

    Returns:
        ``(summary, details)`` where ``details`` has one entry per question
        and ``summary`` is ``aggregate_results(details)``. When no meeting
        data is found, ``({"error": "no data"}, [])`` is returned.

    Note:
        Calls ``sys.exit(1)`` when "sinain-memory" is requested but
        ``db_path`` is missing or does not exist.
    """

    adapter = MeetingMemoryAdapter()
    instances = adapter.load_dataset(str(data_dir))

    if meeting_filter:
        instances = [i for i in instances if meeting_filter in i.id]
        print(f"[meeting] filtered to {len(instances)} meeting(s) matching '{meeting_filter}'")

    if not instances:
        print("[meeting] no meeting data found — check data/meeting/ directory")
        return {"error": "no data"}, []

    # Validate DB exists for sinain-memory condition
    if "sinain-memory" in conditions:
        if not db_path or not Path(db_path).exists():
            print(f"[meeting] ERROR: --db path does not exist: {db_path}")
            print(" Run the capture pipeline first (see plan Part 1)")
            sys.exit(1)

    print(f"\n{'='*60}")
    print(" Meeting Memory Benchmark")
    print(f" Conditions: {', '.join(conditions)}")
    print(f" DB: {db_path or '(none)'}")
    print(f" QA model: {qa_model}")
    print(f" Judge model: {judge_model}")
    print(f"{'='*60}\n")

    # Flatten questions
    all_questions = [(inst, q) for inst in instances for q in inst.questions]

    if subset:
        all_questions = all_questions[:subset]

    total = len(all_questions)
    print(f"[meeting] evaluating {total} questions\n")

    # Resume support
    resume_path = output_dir / "meeting_progress.jsonl"
    completed: dict[str, dict] = {}
    if resume and resume_path.exists():
        completed = _load_completed_entries(resume_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    details: list[dict] = []

    for idx, (inst, question) in enumerate(all_questions):
        qid = question.id

        if qid in completed:
            details.append(completed[qid])
            continue

        print(f"[{idx+1}/{total}] {qid} [{question.category}]")
        print(f" Q: {question.text[:80]}...")

        full_context = adapter.format_full_context(inst)

        # Retrieval metrics
        retrieval = {}
        if db_path and "sinain-memory" in conditions:
            retrieved_facts = _get_retrieved_facts(db_path, question.text)
            retrieval = compute_content_recall(retrieved_facts, question.gold_answer)

        # Generate answers per condition
        answers = {}
        for cond in conditions:
            # Defensive only: the validation above already exits when
            # "sinain-memory" is requested without a database path.
            if cond == "sinain-memory" and not db_path:
                answers[cond] = {"text": "(no DB)", "score": 1, "f1": 0.0}
                continue

            print(f" [{cond}] generating answer...")
            answer_text = answer_question(
                question, cond,
                db_path=db_path,
                full_context=full_context,
                model=qa_model,
            )

            f1 = token_f1(answer_text, question.gold_answer)

            judge_result = judge_qa(
                question.text, question.gold_answer, answer_text,
                condition=cond, model=judge_model,
            )
            # A falsy judge result is recorded as "no score".
            score = judge_result["score"] if judge_result else None
            reasoning = judge_result["reasoning"] if judge_result else None

            answers[cond] = {
                "text": answer_text[:500],
                "score": score,
                "f1": round(f1, 4),
                "reasoning": reasoning,
            }
            print(f" score={score}/5 f1={f1:.2f}")

        entry = {
            "id": qid,
            "question": question.text,
            "gold_answer": question.gold_answer,
            "category": question.category,
            "retrieval": retrieval,
            "answers": answers,
        }
        details.append(entry)

        # Save progress after every question so an interrupted run can be
        # resumed. UTF-8 is explicit because ensure_ascii=False may emit
        # non-ASCII characters that a locale encoding cannot represent.
        # NOTE(review): the file is appended to even when resume=False, so
        # entries from earlier runs stay in it — confirm this is intended.
        with open(resume_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    summary = aggregate_results(details)
    return summary, details
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def main() -> None:
    """Command-line entry point for the meeting memory benchmark.

    Parses the CLI arguments, runs ``run_meeting_benchmark``, writes the
    requested report files into ``--output-dir`` and prints a short summary.
    """
    parser = argparse.ArgumentParser(description="Meeting Memory Benchmark")
    parser.add_argument("--db", required=False, default=None,
                        help="Path to knowledge-graph.db from capture run")
    parser.add_argument("--conditions", default="sinain-memory,full-context",
                        help="Comma-separated conditions (sinain-memory, full-context)")
    parser.add_argument("--subset", type=int, default=None,
                        help="Run only first N questions")
    parser.add_argument("--meeting", default=None,
                        help="Filter to specific meeting by stem (e.g. al-futaim-prep-5min)")
    parser.add_argument("--qa-model", default=QA_MODEL)
    parser.add_argument("--judge-model", default=JUDGE_MODEL)
    parser.add_argument("--output-dir", type=Path, default=RESULTS_DIR)
    parser.add_argument("--data-dir", type=Path, default=DATA_DIR)
    parser.add_argument("--format", default="json,markdown")
    parser.add_argument("--resume", action="store_true")
    args = parser.parse_args()

    # Drop empty items so a trailing or doubled comma ("a,b," / "a,,b") does
    # not yield an empty condition or format name.
    conditions = [c.strip() for c in args.conditions.split(",") if c.strip()]
    formats = [f.strip() for f in args.format.split(",") if f.strip()]

    summary, details = run_meeting_benchmark(
        args.db, conditions,
        subset=args.subset,
        meeting_filter=args.meeting,
        qa_model=args.qa_model,
        judge_model=args.judge_model,
        output_dir=args.output_dir,
        data_dir=args.data_dir,
        resume=args.resume,
    )

    # Write outputs (explicit UTF-8 so the reports do not depend on the
    # platform's locale encoding).
    args.output_dir.mkdir(parents=True, exist_ok=True)

    if "json" in formats:
        json_path = args.output_dir / "meeting_results.json"
        json_path.write_text(generate_json("meeting", summary, details), encoding="utf-8")
        print(f"\n[output] JSON: {json_path}")

    if "markdown" in formats:
        md_path = args.output_dir / "meeting_results.md"
        md_path.write_text(generate_markdown("meeting", summary, details), encoding="utf-8")
        print(f"[output] Markdown: {md_path}")

    # Print summary
    print(f"\n{'='*60}")
    print(" Meeting Memory Benchmark — Summary")
    print(f"{'='*60}")
    ipr = summary.get("ipr")
    # Compare against None: an IPR of exactly 0.0 is a result worth showing.
    if ipr is not None:
        print(f" IPR: {ipr:.1%}")
    for cond, data in summary.get("conditions", {}).items():
        print(f" {cond}: {data['mean_score']:.2f}/5 (n={data['n']})")
    for k, v in summary.get("retrieval", {}).items():
        print(f" {k}: {v:.1%}")
    print()


if __name__ == "__main__":
    main()
|
|
@@ -52,30 +52,51 @@ def _get_all_facts_text(db_path: str) -> str:
|
|
|
52
52
|
def _query_knowledge(db_path: str, question: str) -> str:
    """Query sinain knowledge graph for facts relevant to a question.

    Strategy: retrieve broadly, then re-rank against the question. This
    ensures specific facts (CTO background) beat generic ones (meeting
    schedule) when the question asks about the CTO. Re-ranking uses embedding
    similarity when ``embed_client`` is importable and returns a ranking;
    otherwise it falls back to keyword overlap with the question.

    Args:
        db_path: Path to the knowledge-graph database.
        question: Natural-language question to retrieve facts for.

    Returns:
        The top ``MAX_FACTS_PER_QUERY`` facts rendered by
        ``format_facts_text``, or ``"(no knowledge available)"`` when the
        database yields no candidates at all.
    """
    import sys

    from graph_query import query_facts_hybrid, query_top_facts, format_facts_text

    # Retrieve a broad candidate set
    candidates = query_facts_hybrid(db_path, question, max_facts=30)
    if not candidates:
        candidates = query_top_facts(db_path, limit=30)
    if not candidates:
        return "(no knowledge available)"

    # Re-rank by embedding similarity if available, fall back to keyword overlap
    ranked = None
    try:
        from embed_client import rank_by_similarity

        fact_texts = [str(f.get("value", "")) for f in candidates]
        ranked_pairs = rank_by_similarity(question, fact_texts)
        if ranked_pairs is not None:
            # Each ranking item is unpacked as (candidate index, <ignored>).
            ranked = [candidates[i] for i, _ in ranked_pairs[:MAX_FACTS_PER_QUERY]]
    except ImportError:
        # The embedding client is optional; use keyword overlap silently.
        ranked = None
    except Exception as exc:
        # Still best-effort, but report it: a silent switch of ranking
        # strategy would change benchmark results without any trace.
        print(
            f"[query] warning: embedding re-rank failed ({exc!r}); "
            "falling back to keyword overlap",
            file=sys.stderr,
        )
        ranked = None

    if ranked is not None:
        return format_facts_text(ranked, max_chars=3000)

    # Fallback: keyword overlap ranking
    q_keywords = set(_extract_keywords(question))

    def _relevance(fact: dict) -> float:
        """Return the fraction of question keywords present in the fact."""
        if not q_keywords:
            return 0.0
        value = str(fact.get("value", "")).lower()
        entity = str(fact.get("entity", "")).lower()
        fact_words = set(_extract_keywords(value + " " + entity))
        return len(q_keywords & fact_words) / len(q_keywords)

    # sorted() is stable, so equally relevant facts keep retrieval order.
    ranked = sorted(candidates, key=_relevance, reverse=True)
    return format_facts_text(ranked[:MAX_FACTS_PER_QUERY], max_chars=3000)
|
|
68
91
|
|
|
69
92
|
|
|
70
93
|
def _get_retrieved_facts(db_path: str, question: str, k: int = 10) -> list[dict]:
    """Return the facts retrieved for *question*, used for retrieval evaluation.

    Hybrid retrieval limited to ``k`` facts is tried first; when it yields
    nothing, the ``k`` top facts from ``query_top_facts`` are returned instead.
    """
    from graph_query import query_facts_hybrid, query_top_facts

    hybrid_hits = query_facts_hybrid(db_path, question, max_facts=k)
    if not hybrid_hits:
        # Fallback: top facts by confidence
        return query_top_facts(db_path, limit=k)
    return hybrid_hits
|