@geravant/sinain 1.12.0 → 1.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27):
  1. package/package.json +1 -1
  2. package/sinain-core/package-lock.json +963 -0
  3. package/sinain-core/package.json +1 -0
  4. package/sinain-core/src/buffers/feed-buffer.ts +32 -0
  5. package/sinain-core/src/embedding/service.ts +66 -0
  6. package/sinain-core/src/index.ts +19 -2
  7. package/sinain-core/src/learning/local-curation.ts +137 -7
  8. package/sinain-core/src/server.ts +31 -0
  9. package/sinain-memory/README.md +105 -0
  10. package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
  11. package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
  12. package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
  13. package/sinain-memory/embed_client.py +117 -0
  14. package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
  15. package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
  16. package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
  17. package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
  18. package/sinain-memory/eval/benchmarks/meeting_adapter.py +81 -0
  19. package/sinain-memory/eval/benchmarks/meeting_runner.py +230 -0
  20. package/sinain-memory/eval/benchmarks/query.py +37 -16
  21. package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +318 -0
  22. package/sinain-memory/eval/benchmarks/runner.py +10 -3
  23. package/sinain-memory/graph_query.py +257 -15
  24. package/sinain-memory/knowledge_integrator.py +365 -72
  25. package/sinain-memory/memory-config.json +1 -1
  26. package/sinain-memory/session_distiller.py +43 -19
  27. package/sinain-memory/triplestore.py +60 -0
@@ -0,0 +1,81 @@
1
+ """Meeting Memory adapter — loads QA pairs + ground-truth transcript.
2
+
3
+ Unlike LongMemEval, this adapter does NOT ingest. The real sinain pipeline
4
+ (start-local.sh) produces the knowledge-graph.db during a live capture run.
5
+ This adapter only loads QA pairs and provides the full-context baseline.
6
+
7
+ Data layout:
8
+ eval/benchmarks/data/meeting/
9
+ <name>.txt — ground-truth transcript
10
+ <name>_qa.json — gold QA pairs
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ from pathlib import Path
17
+
18
+ from .base_adapter import BenchmarkAdapter, BenchmarkInstance, BenchmarkQuestion
19
+
20
+
21
class MeetingMemoryAdapter(BenchmarkAdapter):
    """Adapter for meeting memory benchmarks (real pipeline capture).

    Loads gold QA pairs and the matching ground-truth transcript from
    ``<data_dir>/meeting/``. It performs no ingestion: every instance is
    built with empty ``sessions`` / ``raw_sessions`` and carries the
    transcript text in ``metadata`` for the full-context baseline.
    """

    @property
    def name(self) -> str:
        """Short identifier of this benchmark."""
        return "meeting"

    def load_dataset(self, data_dir: str) -> list[BenchmarkInstance]:
        """Load meeting QA pairs and transcripts from data/meeting/.

        Each ``<name>_qa.json`` file must contain a list of objects with the
        keys ``id``, ``question`` and ``gold_answer``; ``category`` (default
        ``"unknown"``) and ``evidence_timestamps`` (default ``[]``) are
        optional. QA files without a sibling ``<name>.txt`` transcript are
        skipped with a warning.

        Args:
            data_dir: Directory that contains the ``meeting`` sub-directory.

        Returns:
            One ``BenchmarkInstance`` per usable QA file, in sorted path
            order. Instance metadata holds ``transcript_path``,
            ``raw_transcript`` and ``stem``.

        Raises:
            FileNotFoundError: If ``<data_dir>/meeting`` does not exist.
        """
        meeting_dir = Path(data_dir) / "meeting"
        if not meeting_dir.exists():
            raise FileNotFoundError(f"Meeting data not found: {meeting_dir}")

        qa_suffix = "_qa"
        instances = []
        for qa_path in sorted(meeting_dir.glob("*_qa.json")):
            # Derive transcript path: foo_qa.json → foo.txt
            # Only the trailing suffix is stripped; str.replace would also
            # remove "_qa" from the middle of a name such as "team_qa_sync".
            stem = qa_path.stem
            if stem.endswith(qa_suffix):
                stem = stem[: -len(qa_suffix)]
            transcript_path = qa_path.parent / f"{stem}.txt"
            if not transcript_path.exists():
                print(f"[meeting] warning: no transcript for {qa_path.name}, skipping")
                continue

            # Load QA pairs (explicit UTF-8, same as the transcript below,
            # so the result does not depend on the platform locale).
            with open(qa_path, encoding="utf-8") as f:
                raw_questions = json.load(f)

            questions = [
                BenchmarkQuestion(
                    id=item["id"],
                    text=item["question"],
                    gold_answer=item["gold_answer"],
                    category=item.get("category", "unknown"),
                    evidence_session_ids=item.get("evidence_timestamps", []),
                    metadata={
                        "evidence_timestamps": item.get("evidence_timestamps", []),
                    },
                )
                for item in raw_questions
            ]

            # Load transcript
            transcript_text = transcript_path.read_text(encoding="utf-8")

            instances.append(BenchmarkInstance(
                id=f"meeting-{stem}",
                sessions=[],  # Not used — real pipeline does ingestion
                questions=questions,
                raw_sessions=[],
                metadata={
                    "transcript_path": str(transcript_path),
                    "raw_transcript": transcript_text,
                    "stem": stem,
                },
            ))

        total_q = sum(len(i.questions) for i in instances)
        print(f"[meeting] loaded {total_q} questions across {len(instances)} meetings")
        return instances

    def format_full_context(self, instance: BenchmarkInstance) -> str:
        """Return the raw transcript verbatim for the full-context baseline.

        Falls back to an empty string when the instance metadata has no
        ``raw_transcript`` entry.
        """
        return instance.metadata.get("raw_transcript", "")
@@ -0,0 +1,230 @@
1
+ #!/usr/bin/env python3
2
+ """Meeting Memory Benchmark runner — evaluates a captured knowledge-graph.db.
3
+
4
+ Standalone script: does NOT modify or import runner.py.
5
+ Reuses shared infrastructure: query, judge, evaluate, report.
6
+
7
+ Usage:
8
+ python3 eval/benchmarks/meeting_runner.py \
9
+ --db /tmp/sinain-bench-XXXX/knowledge-graph.db \
10
+ --conditions sinain-memory,full-context
11
+
12
+ # Quick test (3 questions):
13
+ python3 eval/benchmarks/meeting_runner.py \
14
+ --db /tmp/sinain-bench-XXXX/knowledge-graph.db \
15
+ --subset 3
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import sys
23
+ from pathlib import Path
24
+
25
+ # Add sinain-memory to path
26
+ _koog_dir = str(Path(__file__).resolve().parent.parent.parent)
27
+ if _koog_dir not in sys.path:
28
+ sys.path.insert(0, _koog_dir)
29
+
30
+ from eval.benchmarks.config import DATA_DIR, RESULTS_DIR, QA_MODEL, JUDGE_MODEL
31
+ from eval.benchmarks.meeting_adapter import MeetingMemoryAdapter
32
+ from eval.benchmarks.query import answer_question, _get_retrieved_facts, compute_content_recall
33
+ from eval.benchmarks.evaluate import token_f1, aggregate_results
34
+ from eval.benchmarks.judges.qa_judge import judge_qa
35
+ from eval.benchmarks.report import generate_markdown, generate_json
36
+
37
+
38
def _load_progress(progress_path: Path) -> dict[str, dict]:
    """Read saved per-question entries from a JSONL progress file, keyed by their "id"."""
    completed: dict[str, dict] = {}
    text = progress_path.read_text(encoding="utf-8")
    # Split on "\n" only: entries are dumped with ensure_ascii=False, so a
    # string value may legally contain other Unicode line separators.
    for line_no, line in enumerate(text.split("\n"), start=1):
        if not line.strip():
            continue
        try:
            entry = json.loads(line)
        except json.JSONDecodeError:
            # An interrupted run can leave a truncated final line; that
            # question is simply evaluated again.
            print(f"[meeting] warning: skipping malformed progress line {line_no} in {progress_path}")
            continue
        if not isinstance(entry, dict) or "id" not in entry:
            print(f"[meeting] warning: skipping progress line {line_no} without an id in {progress_path}")
            continue
        completed[entry["id"]] = entry
    return completed


def run_meeting_benchmark(
    db_path: str | None,
    conditions: list[str],
    *,
    subset: int | None = None,
    meeting_filter: str | None = None,
    qa_model: str = QA_MODEL,
    judge_model: str = JUDGE_MODEL,
    output_dir: Path = RESULTS_DIR,
    data_dir: Path = DATA_DIR,
    resume: bool = False,
) -> tuple[dict, list[dict]]:
    """Run meeting benchmark against a captured knowledge-graph.db.

    Args:
        db_path: Path to the captured knowledge-graph.db; may be None when
            the "sinain-memory" condition is not requested.
        conditions: Condition names to evaluate for every question.
        subset: If set (and non-zero), evaluate only the first N questions.
        meeting_filter: Keep only instances whose id contains this substring.
        qa_model: Model passed to ``answer_question``.
        judge_model: Model passed to ``judge_qa``.
        output_dir: Directory that receives ``meeting_progress.jsonl``.
        data_dir: Directory passed to ``MeetingMemoryAdapter.load_dataset``.
        resume: Reuse entries already present in the progress file instead
            of re-evaluating those questions.

    Returns:
        ``(summary, details)`` where ``details`` holds one entry per question
        and ``summary`` is ``aggregate_results(details)``. When no meeting
        data is available the result is ``({"error": "no data"}, [])``.

    Note:
        Terminates the process with exit status 1 when "sinain-memory" is
        requested but ``db_path`` is missing or does not exist.
    """

    adapter = MeetingMemoryAdapter()
    instances = adapter.load_dataset(str(data_dir))

    if meeting_filter:
        instances = [i for i in instances if meeting_filter in i.id]
        print(f"[meeting] filtered to {len(instances)} meeting(s) matching '{meeting_filter}'")

    if not instances:
        print("[meeting] no meeting data found — check data/meeting/ directory")
        return {"error": "no data"}, []

    # Validate DB exists for sinain-memory condition
    if "sinain-memory" in conditions:
        if not db_path or not Path(db_path).exists():
            print(f"[meeting] ERROR: --db path does not exist: {db_path}")
            print(" Run the capture pipeline first (see plan Part 1)")
            sys.exit(1)

    print(f"\n{'='*60}")
    print(" Meeting Memory Benchmark")
    print(f" Conditions: {', '.join(conditions)}")
    print(f" DB: {db_path or '(none)'}")
    print(f" QA model: {qa_model}")
    print(f" Judge model: {judge_model}")
    print(f"{'='*60}\n")

    # Flatten questions
    all_questions = [(inst, q) for inst in instances for q in inst.questions]

    if subset:
        all_questions = all_questions[:subset]

    total = len(all_questions)
    print(f"[meeting] evaluating {total} questions\n")

    # Resume support
    resume_path = output_dir / "meeting_progress.jsonl"
    completed: dict[str, dict] = {}
    if resume and resume_path.exists():
        completed = _load_progress(resume_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    if not resume and resume_path.exists():
        # A fresh run must not append to an older run's progress file,
        # otherwise a later --resume would pick up results produced with a
        # different DB, model or condition set.
        resume_path.unlink()

    details: list[dict] = []

    for idx, (inst, question) in enumerate(all_questions):
        qid = question.id

        if qid in completed:
            details.append(completed[qid])
            continue

        print(f"[{idx+1}/{total}] {qid} [{question.category}]")
        print(f" Q: {question.text[:80]}...")

        full_context = adapter.format_full_context(inst)

        # Retrieval metrics
        retrieval = {}
        if db_path and "sinain-memory" in conditions:
            retrieved_facts = _get_retrieved_facts(db_path, question.text)
            retrieval = compute_content_recall(retrieved_facts, question.gold_answer)

        # Generate answers per condition
        answers = {}
        for cond in conditions:
            # Defensive only: the validation above already exits when
            # sinain-memory is requested without a usable DB path.
            if cond == "sinain-memory" and not db_path:
                answers[cond] = {"text": "(no DB)", "score": 1, "f1": 0.0}
                continue

            print(f" [{cond}] generating answer...")
            answer_text = answer_question(
                question, cond,
                db_path=db_path,
                full_context=full_context,
                model=qa_model,
            )

            f1 = token_f1(answer_text, question.gold_answer)

            judge_result = judge_qa(
                question.text, question.gold_answer, answer_text,
                condition=cond, model=judge_model,
            )
            # A falsy judge result is recorded as "no score" rather than 0.
            score = judge_result["score"] if judge_result else None
            reasoning = judge_result["reasoning"] if judge_result else None

            answers[cond] = {
                "text": answer_text[:500],
                "score": score,
                "f1": round(f1, 4),
                "reasoning": reasoning,
            }
            print(f" score={score}/5 f1={f1:.2f}")

        entry = {
            "id": qid,
            "question": question.text,
            "gold_answer": question.gold_answer,
            "category": question.category,
            "retrieval": retrieval,
            "answers": answers,
        }
        details.append(entry)

        # Save progress after every question so an interrupted run can be
        # resumed. UTF-8 is explicit because ensure_ascii=False emits raw
        # non-ASCII characters.
        with open(resume_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    summary = aggregate_results(details)
    return summary, details
168
+
169
+
170
def main() -> None:
    """Command-line entry point.

    Parses the arguments, runs ``run_meeting_benchmark``, writes the
    requested report files into ``--output-dir`` and prints a short summary.
    """
    parser = argparse.ArgumentParser(description="Meeting Memory Benchmark")
    parser.add_argument("--db", required=False, default=None,
                        help="Path to knowledge-graph.db from capture run")
    parser.add_argument("--conditions", default="sinain-memory,full-context",
                        help="Comma-separated conditions (sinain-memory, full-context)")
    parser.add_argument("--subset", type=int, default=None,
                        help="Run only first N questions")
    parser.add_argument("--meeting", default=None,
                        help="Filter to specific meeting by stem (e.g. al-futaim-prep-5min)")
    parser.add_argument("--qa-model", default=QA_MODEL)
    parser.add_argument("--judge-model", default=JUDGE_MODEL)
    parser.add_argument("--output-dir", type=Path, default=RESULTS_DIR)
    parser.add_argument("--data-dir", type=Path, default=DATA_DIR)
    parser.add_argument("--format", default="json,markdown")
    parser.add_argument("--resume", action="store_true")
    args = parser.parse_args()

    # Drop blank items so a trailing or doubled comma does not turn into an
    # empty condition or format name.
    conditions = [c.strip() for c in args.conditions.split(",") if c.strip()]
    formats = [f.strip() for f in args.format.split(",") if f.strip()]

    summary, details = run_meeting_benchmark(
        args.db, conditions,
        subset=args.subset,
        meeting_filter=args.meeting,
        qa_model=args.qa_model,
        judge_model=args.judge_model,
        output_dir=args.output_dir,
        data_dir=args.data_dir,
        resume=args.resume,
    )

    # Write outputs (explicit UTF-8 so non-ASCII question or answer text does
    # not depend on the platform's default encoding).
    args.output_dir.mkdir(parents=True, exist_ok=True)

    if "json" in formats:
        json_path = args.output_dir / "meeting_results.json"
        json_path.write_text(generate_json("meeting", summary, details), encoding="utf-8")
        print(f"\n[output] JSON: {json_path}")

    if "markdown" in formats:
        md_path = args.output_dir / "meeting_results.md"
        md_path.write_text(generate_markdown("meeting", summary, details), encoding="utf-8")
        print(f"[output] Markdown: {md_path}")

    # Print summary
    print(f"\n{'='*60}")
    print(" Meeting Memory Benchmark — Summary")
    print(f"{'='*60}")
    ipr = summary.get("ipr")
    # Compare against None: an IPR of exactly 0.0 is a real result and must
    # still be shown.
    if ipr is not None:
        print(f" IPR: {ipr:.1%}")
    for cond, data in summary.get("conditions", {}).items():
        print(f" {cond}: {data['mean_score']:.2f}/5 (n={data['n']})")
    for k, v in summary.get("retrieval", {}).items():
        print(f" {k}: {v:.1%}")
    print()


if __name__ == "__main__":
    main()
@@ -52,30 +52,51 @@ def _get_all_facts_text(db_path: str) -> str:
52
52
def _query_knowledge(db_path: str, question: str) -> str:
    """Query sinain knowledge graph for facts relevant to a question.

    Strategy: retrieve a broad candidate set, then re-rank it against the
    question — by embedding similarity when ``embed_client`` is usable,
    otherwise by keyword overlap. This ensures specific facts (CTO
    background) beat generic ones (meeting schedule) when the question asks
    about the CTO.

    Returns:
        The formatted text of at most ``MAX_FACTS_PER_QUERY`` facts, or
        ``"(no knowledge available)"`` when the store yields no candidates.
    """
    from graph_query import query_facts_hybrid, query_top_facts, format_facts_text

    # Retrieve a broad candidate set
    candidates = query_facts_hybrid(db_path, question, max_facts=30)
    if not candidates:
        candidates = query_top_facts(db_path, limit=30)
    if not candidates:
        return "(no knowledge available)"

    # Re-rank by embedding similarity if available, fall back to keyword overlap
    ranked = None
    try:
        from embed_client import rank_by_similarity
        fact_texts = [str(f.get("value", "")) for f in candidates]
        ranked_indices = rank_by_similarity(question, fact_texts)
        if ranked_indices is not None:
            ranked = [candidates[i] for i, _ in ranked_indices[:MAX_FACTS_PER_QUERY]]
    except ImportError:
        # Embedding support is optional; use the keyword ranking below.
        ranked = None
    except Exception as exc:
        # Still best-effort, but report the failure instead of hiding it.
        import sys
        print(
            f"[query] embedding re-rank failed ({exc!r}); using keyword overlap",
            file=sys.stderr,
        )
        ranked = None
    if ranked is not None:
        return format_facts_text(ranked, max_chars=3000)

    # Fallback: keyword overlap ranking.
    # Fact text is lowercased before keyword extraction, so the question
    # keywords are lowercased too and both sides compare like with like.
    q_keywords = {kw.lower() for kw in _extract_keywords(question)}
    if not q_keywords:
        # Nothing to score against: every fact would score 0.0 and the
        # stable sort would keep retrieval order, so skip the sort.
        return format_facts_text(candidates[:MAX_FACTS_PER_QUERY], max_chars=3000)

    def _relevance(fact: dict) -> float:
        """Fraction of question keywords that also occur in the fact."""
        value = str(fact.get("value", "")).lower()
        entity = str(fact.get("entity", "")).lower()
        fact_words = set(_extract_keywords(value + " " + entity))
        return len(q_keywords & fact_words) / len(q_keywords)

    ranked = sorted(candidates, key=_relevance, reverse=True)
    return format_facts_text(ranked[:MAX_FACTS_PER_QUERY], max_chars=3000)
68
91
 
69
92
 
70
93
def _get_retrieved_facts(db_path: str, question: str, k: int = 10) -> list[dict]:
    """Return the facts retrieved for *question*, used for retrieval evaluation.

    Hybrid retrieval is tried first with ``max_facts=k``; when it yields
    nothing, the top ``k`` facts of the store are returned instead.
    """
    from graph_query import query_facts_hybrid, query_top_facts

    hybrid_hits = query_facts_hybrid(db_path, question, max_facts=k)
    if not hybrid_hits:
        # Fallback: top facts by confidence
        return query_top_facts(db_path, limit=k)
    return hybrid_hits