@geravant/sinain 1.13.0 → 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/.env.example +4 -2
  2. package/config-shared.js +1 -0
  3. package/package.json +4 -1
  4. package/sinain-agent/run.sh +36 -4
  5. package/sinain-core/src/buffers/feed-buffer.ts +6 -4
  6. package/sinain-core/src/index.ts +50 -19
  7. package/sinain-memory/graph_query.py +12 -3
  8. package/sinain-memory/knowledge_integrator.py +194 -10
  9. package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
  10. package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
  11. package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
  12. package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
  13. package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
  14. package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
  15. package/sinain-memory/eval/__init__.py +0 -0
  16. package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
  17. package/sinain-memory/eval/assertions.py +0 -267
  18. package/sinain-memory/eval/benchmarks/__init__.py +0 -0
  19. package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
  20. package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
  21. package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
  22. package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
  23. package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
  24. package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
  25. package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
  26. package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
  27. package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
  28. package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
  29. package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
  30. package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
  31. package/sinain-memory/eval/benchmarks/config.py +0 -23
  32. package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
  33. package/sinain-memory/eval/benchmarks/ingest.py +0 -152
  34. package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
  35. package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
  36. package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
  37. package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
  38. package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
  39. package/sinain-memory/eval/benchmarks/meeting_adapter.py +0 -81
  40. package/sinain-memory/eval/benchmarks/meeting_runner.py +0 -230
  41. package/sinain-memory/eval/benchmarks/query.py +0 -193
  42. package/sinain-memory/eval/benchmarks/report.py +0 -87
  43. package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +0 -318
  44. package/sinain-memory/eval/benchmarks/runner.py +0 -283
  45. package/sinain-memory/eval/judges/__init__.py +0 -0
  46. package/sinain-memory/eval/judges/base_judge.py +0 -61
  47. package/sinain-memory/eval/judges/curation_judge.py +0 -46
  48. package/sinain-memory/eval/judges/insight_judge.py +0 -48
  49. package/sinain-memory/eval/judges/mining_judge.py +0 -42
  50. package/sinain-memory/eval/judges/signal_judge.py +0 -45
  51. package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
  52. package/sinain-memory/eval/retrieval_evaluator.py +0 -186
  53. package/sinain-memory/eval/schemas.py +0 -247
  54. package/sinain-memory/tests/__init__.py +0 -0
  55. package/sinain-memory/tests/conftest.py +0 -189
  56. package/sinain-memory/tests/test_curator_helpers.py +0 -94
  57. package/sinain-memory/tests/test_embedder.py +0 -210
  58. package/sinain-memory/tests/test_extract_json.py +0 -124
  59. package/sinain-memory/tests/test_feedback_computation.py +0 -121
  60. package/sinain-memory/tests/test_miner_helpers.py +0 -71
  61. package/sinain-memory/tests/test_module_management.py +0 -458
  62. package/sinain-memory/tests/test_parsers.py +0 -96
  63. package/sinain-memory/tests/test_tick_evaluator.py +0 -430
  64. package/sinain-memory/tests/test_triple_extractor.py +0 -255
  65. package/sinain-memory/tests/test_triple_ingest.py +0 -191
  66. package/sinain-memory/tests/test_triple_migrate.py +0 -138
  67. package/sinain-memory/tests/test_triplestore.py +0 -248
@@ -1,267 +0,0 @@
1
- """Behavioral assertion library for sinain-koog tick evaluation.
2
-
3
- Each assertion function validates a runtime invariant of the pipeline.
4
- Returns ``{"name": str, "passed": bool, "detail": str}``.
5
- """
6
-
7
- from __future__ import annotations
8
-
9
-
10
def _result(name: str, passed: bool, detail: str) -> dict:
    """Bundle one assertion outcome into the shared result-dict shape."""
    return dict(name=name, passed=passed, detail=detail)
12
-
13
-
14
- # ---------------------------------------------------------------------------
15
- # Playbook curator assertions
16
- # ---------------------------------------------------------------------------
17
-
18
def assert_playbook_under_limit(curator_result: dict, limit: int = 50) -> dict:
    """Check that the curator's reported ``playbookLines`` does not exceed ``limit``.

    A missing ``playbookLines`` key is treated as 0 lines.
    """
    line_count = curator_result.get("playbookLines", 0)
    if line_count > limit:
        return _result(
            "playbook_under_limit",
            False,
            f"body has {line_count} lines, exceeds limit of {limit}",
        )
    return _result(
        "playbook_under_limit",
        True,
        f"body has {line_count} lines (limit {limit})",
    )
24
-
25
-
26
def assert_curator_respected_directive(curator_result: dict, directive: str) -> dict:
    """Judge whether the curator's add/prune counts fit the given directive.

    Counts come from ``curator_result["changes"]["added"]`` and ``["pruned"]``
    (both default to empty). Only ``aggressive_prune`` and ``stability`` impose
    a rule; every other directive passes.
    """
    label = "curator_respected_directive"
    change_set = curator_result.get("changes", {})
    n_added = len(change_set.get("added", []))
    n_pruned = len(change_set.get("pruned", []))

    if directive == "aggressive_prune":
        if n_pruned > 0:
            # Any pruning at all satisfies the directive.
            return _result(label, True, f"aggressive_prune: pruned {n_pruned} items")
        if n_added == 0:
            # Nothing pruned and nothing added: an untouched playbook is tolerated.
            return _result(
                label,
                True,
                "aggressive_prune: no changes (acceptable if playbook already lean)",
            )
        return _result(
            label,
            False,
            f"aggressive_prune: added {n_added} but pruned {n_pruned} — expected pruning",
        )

    if directive == "stability":
        # Pruning may exceed additions by at most two items in stability mode.
        if n_pruned <= n_added + 2:
            return _result(
                label,
                True,
                f"stability: added {n_added}, pruned {n_pruned} — conservative",
            )
        return _result(
            label,
            False,
            f"stability: pruned {n_pruned} items (only added {n_added}) — too aggressive for stability mode",
        )

    # normal / insufficient_data and anything else: no constraint applied.
    return _result(label, True, f"{directive}: added {n_added}, pruned {n_pruned}")
54
-
55
-
56
- # ---------------------------------------------------------------------------
57
- # Signal analyzer assertions
58
- # ---------------------------------------------------------------------------
59
-
60
def assert_no_repeat_action(signal_result: dict, recent_logs: list[dict], window: int = 3) -> dict:
    """Verify the recommended action's task does not repeat a recently chosen action.

    The task text is lower-cased and split on whitespace into a word set, then
    compared with each chosen entry of ``actionsConsidered`` found in the first
    ``window`` items of ``recent_logs`` (the entry's ``reason`` is used, falling
    back to its ``task``). A repeat is reported when the shared words exceed
    60% of the larger of the two word sets.

    Args:
        signal_result: Dict carrying an optional ``recommendedAction`` dict.
        recent_logs: Earlier tick log entries; only ``recent_logs[:window]`` is read.
        window: How many log entries to inspect.

    Returns:
        The standard ``{"name", "passed", "detail"}`` assertion dict.
    """
    action = signal_result.get("recommendedAction")
    if action is None or action.get("action") == "skip":
        return _result("no_repeat_action", True, "no action recommended (skip/null)")

    task = (action.get("task") or "").lower().strip()
    if not task:
        return _result("no_repeat_action", True, "no task description to compare")

    # Loop-invariant: build the candidate's word set once. ``task`` is
    # non-empty after strip(), so this set always has at least one word.
    task_words = set(task.split())

    for log in recent_logs[:window]:
        # ``actionsConsidered`` can be present but null; treat that as empty.
        for considered in log.get("actionsConsidered") or []:
            if not considered.get("chosen"):
                continue
            prev_task = (considered.get("reason") or considered.get("task") or "").lower().strip()
            if not prev_task:
                continue
            prev_words = set(prev_task.split())
            # Word-set overlap relative to the larger set (not a substring test).
            overlap = len(task_words & prev_words) / max(len(task_words), len(prev_words))
            if overlap > 0.6:
                return _result(
                    "no_repeat_action",
                    False,
                    f"action task '{task[:60]}' overlaps with recent '{prev_task[:60]}' ({overlap:.0%} word overlap)",
                )

    return _result("no_repeat_action", True, f"action task is distinct from last {window} ticks")
94
-
95
-
96
def assert_signal_confidence_threshold(signal_result: dict, threshold: float = 0.5) -> dict:
    """Verify a recommended action carries a confidence at or above ``threshold``.

    Passes when no action is recommended (``recommendedAction`` is None or its
    ``action`` is ``"skip"``). Fails when the confidence is missing, cannot be
    interpreted as a number, or is below the threshold.

    Args:
        signal_result: Dict carrying an optional ``recommendedAction`` dict.
        threshold: Minimum acceptable confidence (inclusive).

    Returns:
        The standard ``{"name", "passed", "detail"}`` assertion dict.
    """
    label = "signal_confidence_threshold"
    action = signal_result.get("recommendedAction")
    if action is None or action.get("action") == "skip":
        return _result(label, True, "no action recommended")

    raw_confidence = action.get("confidence")
    if raw_confidence is None:
        return _result(label, False, "action recommended but no confidence value provided")

    # A confidence that arrives as a string (or any other non-number) must
    # fail this assertion rather than raise and abort the whole tick run.
    try:
        confidence = float(raw_confidence)
    except (TypeError, ValueError):
        return _result(label, False, f"confidence value is not numeric: {raw_confidence!r}")

    if confidence >= threshold:
        return _result(label, True, f"confidence {confidence:.2f} >= threshold {threshold}")
    return _result(label, False, f"confidence {confidence:.2f} < threshold {threshold}")
112
-
113
-
114
- # ---------------------------------------------------------------------------
115
- # Insight synthesizer assertions
116
- # ---------------------------------------------------------------------------
117
-
118
def assert_insight_char_limit(synth_result: dict, limit: int = 500) -> dict:
    """Verify ``suggestion`` plus ``insight`` stay within ``limit`` characters.

    Skipped outputs (``skip`` truthy) pass unconditionally. Missing or null
    ``suggestion`` / ``insight`` values count as empty strings.

    Args:
        synth_result: Insight synthesizer output dict.
        limit: Maximum combined character count (inclusive).

    Returns:
        The standard ``{"name", "passed", "detail"}`` assertion dict.
    """
    if synth_result.get("skip", False):
        return _result("insight_char_limit", True, "output skipped")

    # ``or ""`` covers keys that are present but null, which a plain
    # ``.get(key, "")`` would hand to len() as None.
    suggestion = synth_result.get("suggestion") or ""
    insight = synth_result.get("insight") or ""
    total = len(suggestion) + len(insight)

    if total <= limit:
        return _result("insight_char_limit", True, f"total {total} chars (limit {limit})")
    return _result("insight_char_limit", False, f"total {total} chars exceeds limit of {limit}")
130
-
131
-
132
def assert_skip_reason_specific(synth_result: dict) -> dict:
    """When the output was skipped, require a non-boilerplate ``skipReason``.

    A reason is rejected if it is empty, if it equals one of the known generic
    phrases, or if it is shorter than 30 characters and contains one of them.
    Outputs that were not skipped pass.
    """
    label = "skip_reason_specific"
    if not synth_result.get("skip", False):
        return _result(label, True, "output not skipped")

    reason = (synth_result.get("skipReason") or "").strip()
    if not reason:
        return _result(label, False, "skip=true but no skipReason provided")

    boilerplate = (
        "no new data",
        "nothing new",
        "no updates",
        "insufficient data",
        "not enough information",
        "no changes",
    )
    lowered = reason.lower()
    is_short = len(lowered) < 30
    too_generic = any(
        lowered == phrase or (is_short and phrase in lowered)
        for phrase in boilerplate
    )
    if too_generic:
        return _result(label, False, f"skipReason is too generic: '{reason}'")

    return _result(label, True, f"skipReason is specific ({len(reason)} chars)")
157
-
158
-
159
- # ---------------------------------------------------------------------------
160
- # Memory miner assertions
161
- # ---------------------------------------------------------------------------
162
-
163
def assert_miner_references_sources(miner_result: dict, daily_files: list[str]) -> dict:
    """Check that every entry in ``minedSources`` is one of ``daily_files``.

    An empty or missing ``minedSources`` passes (nothing was mined).
    """
    label = "miner_references_sources"
    mined = miner_result.get("minedSources", [])
    if not mined:
        return _result(label, True, "no sources mined (early return)")

    # daily_files holds basenames such as "2026-02-21.md".
    provided = set(daily_files)
    unknown = []
    for source in mined:
        if source not in provided:
            unknown.append(source)

    if not unknown:
        return _result(label, True, f"all {len(mined)} mined sources are valid")
    return _result(label, False, f"minedSources references unknown files: {unknown}")
178
-
179
-
180
- # ---------------------------------------------------------------------------
181
- # Cross-script / structural assertions
182
- # ---------------------------------------------------------------------------
183
-
184
def assert_schema_valid(script_name: str, output: dict, schema_errors: list[str]) -> dict:
    """Turn a list of schema errors into an assertion result.

    The result is named ``schema_valid_<script_name>``. ``output`` is accepted
    but not inspected here; at most the first three errors are quoted.
    """
    label = f"schema_valid_{script_name}"
    if schema_errors:
        quoted = "; ".join(schema_errors[:3])
        return _result(label, False, f"{len(schema_errors)} schema errors: {quoted}")
    return _result(label, True, "output matches schema")
190
-
191
-
192
def assert_playbook_header_footer_intact(playbook_text: str) -> dict:
    """Check the playbook text still contains both marker comments.

    Looks for the substrings ``<!-- mining-index:`` and ``<!-- effectiveness:``
    anywhere in ``playbook_text`` and names whichever is absent.
    """
    label = "playbook_header_footer_intact"
    required = (
        ("mining-index", "<!-- mining-index:"),
        ("effectiveness", "<!-- effectiveness:"),
    )
    missing = [tag for tag, marker in required if marker not in playbook_text]

    if missing:
        return _result(label, False, f"missing playbook comments: {', '.join(missing)}")
    return _result(label, True, "both mining-index and effectiveness comments present")
207
-
208
-
209
- # ---------------------------------------------------------------------------
210
- # Runner: execute all applicable assertions for a tick
211
- # ---------------------------------------------------------------------------
212
-
213
def run_tick_assertions(
    log_entry: dict,
    recent_logs: list[dict],
    playbook_text: str,
    daily_files: list[str],
) -> list[dict]:
    """Apply every assertion that fits the fields present in one tick log entry.

    Sections run in a fixed order: signal, curator, insight, mining, playbook.
    A section is skipped when its input is absent from ``log_entry``.

    Returns:
        The assertion result dicts, in execution order.
    """
    outcomes: list[dict] = []

    # Signal analyzer: both checks read the same two log fields.
    signals = log_entry.get("signals")
    if signals is not None:
        signal_view = {
            "signals": signals,
            "recommendedAction": log_entry.get("recommendedAction"),
        }
        outcomes.append(assert_signal_confidence_threshold(signal_view))
        outcomes.append(assert_no_repeat_action(signal_view, recent_logs))

    # Curator: playbookChanges may be {"note": "skipped"}; only full output
    # (a dict with a "changes" key) is checked.
    curator = log_entry.get("playbookChanges")
    if isinstance(curator, dict) and "changes" in curator:
        # Default playbookLines to 0 when the curator did not report it.
        curator_view = {"playbookLines": 0, **curator}
        outcomes.append(assert_playbook_under_limit(curator_view))
        directive = log_entry.get("curateDirective", "normal")
        outcomes.append(assert_curator_respected_directive(curator_view, directive))

    # Insight synthesizer: output can be null on a pipeline-level skip.
    output = log_entry.get("output")
    if isinstance(output, dict):
        outcomes.append(assert_insight_char_limit(output))
        outcomes.append(assert_skip_reason_specific(output))

    # Mining: prefer a nested miningResult, otherwise rebuild one from the
    # flat minedSources field when it is non-empty.
    mining = log_entry.get("miningResult")
    if mining is None:
        flat_sources = log_entry.get("minedSources")
        if flat_sources:
            mining = {"minedSources": flat_sources}
    if mining is not None:
        outcomes.append(assert_miner_references_sources(mining, daily_files))

    # Playbook health, only when playbook text was supplied.
    if playbook_text:
        outcomes.append(assert_playbook_header_footer_intact(playbook_text))

    return outcomes
File without changes
@@ -1,43 +0,0 @@
1
- """Base adapter and data classes for benchmark evaluation."""
2
-
3
- from __future__ import annotations
4
-
5
- from abc import ABC, abstractmethod
6
- from dataclasses import dataclass, field
7
-
8
-
9
@dataclass
class BenchmarkQuestion:
    """One benchmark question together with its reference answer."""

    id: str
    text: str
    gold_answer: str
    # Question type: single-session, multi-session, temporal, etc.
    category: str
    evidence_session_ids: list[str] = field(default_factory=list)
    metadata: dict = field(default_factory=dict)
17
-
18
-
19
@dataclass
class BenchmarkInstance:
    """Conversations and the questions asked against that shared context."""

    id: str
    # Each session is a list of feed items shaped {source, text, ts}.
    sessions: list[list[dict]]
    questions: list[BenchmarkQuestion] = field(default_factory=list)
    # Sessions in the original benchmark format (for the full-context condition).
    raw_sessions: list[dict] = field(default_factory=list)
    metadata: dict = field(default_factory=dict)
27
-
28
-
29
class BenchmarkAdapter(ABC):
    """Interface for turning a published benchmark into sinain's format.

    Concrete adapters must supply ``name``, ``load_dataset`` and
    ``format_full_context``.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the benchmark's name (e.g. 'longmemeval', 'locomo')."""

    @abstractmethod
    def load_dataset(self, data_dir: str) -> list[BenchmarkInstance]:
        """Download the dataset if needed, then parse it into instances."""

    @abstractmethod
    def format_full_context(self, instance: BenchmarkInstance) -> str:
        """Render the whole conversation history as text for the baseline condition."""
@@ -1,23 +0,0 @@
1
- """Benchmark configuration — models, paths, thresholds."""
2
-
3
- from pathlib import Path
4
-
5
# Filesystem layout, anchored at the directory that contains this module.
BENCHMARKS_DIR = Path(__file__).resolve().parent
DATA_DIR = BENCHMARKS_DIR.joinpath("data")
RESULTS_DIR = BENCHMARKS_DIR.joinpath("results")

# LLM model identifiers (routed via OpenRouter).
QA_MODEL = "google/gemini-2.5-flash"
JUDGE_MODEL = "openai/gpt-4o"

# Retrieval: cut-offs for the @k metrics and the per-query fact cap.
K_VALUES = [1, 3, 5, 10]
MAX_FACTS_PER_QUERY = 10

# Ingestion: subprocess timeouts, in seconds.
DISTILLER_TIMEOUT_S = 30
INTEGRATOR_TIMEOUT_S = 60

# Dataset locations.
LONGMEMEVAL_HF = "xiaowu0162/longmemeval-cleaned"
LOCOMO_GITHUB = "https://raw.githubusercontent.com/snap-research/locomo/main/data/locomo10.json"
@@ -1,146 +0,0 @@
1
- """Evaluation pipeline — score answers and compute aggregate metrics.
2
-
3
- Combines:
4
- - LLM-as-Judge (QA scoring, 1-5 scale)
5
- - Retrieval metrics (Recall@k, NDCG@k)
6
- - Token F1 overlap (mechanical, free)
7
- """
8
-
9
- from __future__ import annotations
10
-
11
- import math
12
- import re
13
- from collections import defaultdict
14
-
15
- from .base_adapter import BenchmarkQuestion
16
- from .config import K_VALUES
17
-
18
-
19
- # ── Token F1 (mechanical, no LLM needed) ─────────────────────────────────────
20
-
21
def _tokenize(text: str) -> list[str]:
    """Lower-case ``text`` and return its runs of word characters, in order."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\w+", lowered)]
24
-
25
-
26
def token_f1(predicted: str, gold: str | int) -> float:
    """Return the F1 between the distinct tokens of ``predicted`` and ``gold``.

    Both inputs are passed through ``str()`` and tokenized, and duplicates are
    collapsed, so the score is over unique tokens. Returns 0.0 when either
    side has no tokens or when they share none.
    """
    predicted_vocab = set(_tokenize(str(predicted)))
    gold_vocab = set(_tokenize(str(gold)))
    if not (gold_vocab and predicted_vocab):
        return 0.0

    shared = predicted_vocab & gold_vocab
    if not shared:
        return 0.0

    precision = len(shared) / len(predicted_vocab)
    recall = len(shared) / len(gold_vocab)
    return 2 * precision * recall / (precision + recall)
38
-
39
-
40
- # ── Retrieval metrics (reuse logic from retrieval_evaluator.py) ───────────────
41
-
42
def dcg_at_k(relevant_positions: list[int], k: int) -> float:
    """Return the binary-gain Discounted Cumulative Gain over the top ``k`` ranks.

    ``relevant_positions`` are 0-based ranks of relevant hits; each rank below
    ``k`` contributes ``1 / log2(rank + 2)``.
    """
    gain = 0.0
    for rank in relevant_positions:
        if rank >= k:
            continue  # outside the cut-off
        gain += 1.0 / math.log2(rank + 2)
    return gain
49
-
50
-
51
def ndcg_at_k(relevant_positions: list[int], num_relevant: int, k: int) -> float:
    """Return DCG@k divided by the ideal DCG@k, or 0.0 when the ideal is zero.

    The ideal ranking places ``min(num_relevant, k)`` relevant items at ranks
    0, 1, 2, ...
    """
    ideal_ranks = list(range(min(num_relevant, k)))
    ideal = dcg_at_k(ideal_ranks, k)
    if ideal > 0:
        return dcg_at_k(relevant_positions, k) / ideal
    return 0.0
57
-
58
-
59
def compute_retrieval_metrics(
    retrieved_ids: list[str],
    expected_ids: list[str],
    k_values: list[int] | None = None,
) -> dict:
    """Compute ``recall@k`` and ``ndcg@k`` for one question at each cut-off.

    ``recall@k`` here is hit-based: 1.0 if any expected id appears in the top
    ``k`` retrieved ids, else 0.0. Cut-offs default to ``K_VALUES`` when
    ``k_values`` is None or empty.
    """
    cutoffs = k_values or K_VALUES
    wanted = set(expected_ids)
    # 0-based ranks at which a wanted id was retrieved, in ascending order.
    hit_ranks = [rank for rank, rid in enumerate(retrieved_ids) if rid in wanted]

    metrics = {}
    for k in cutoffs:
        # Ranks ascend, so the first hit decides whether any hit is inside k.
        found = bool(hit_ranks) and hit_ranks[0] < k
        metrics[f"recall@{k}"] = 1.0 if found else 0.0
        metrics[f"ndcg@{k}"] = ndcg_at_k(hit_ranks, len(wanted), k)
    return metrics
75
-
76
-
77
- # ── Aggregate metrics ─────────────────────────────────────────────────────────
78
-
79
def aggregate_results(per_question: list[dict]) -> dict:
    """Compute aggregate metrics from per-question results.

    Each ``per_question`` entry has the shape::

        {id, category, retrieval: {recall@k, ndcg@k},
         answers: {condition: {score, f1}}}

    Returns:
        ``{"error": "no results"}`` for empty input, otherwise a dict with:

        * ``total_questions`` - number of entries.
        * ``conditions`` - per condition: ``mean_score``, ``mean_f1``, ``n``
          (the number of scored answers).
        * ``ipr`` - mean "sinain-memory" score divided by mean "full-context"
          score, rounded to 4 places; None when either condition has no
          scores or the full-context mean is not positive.
        * ``categories`` - per category and condition: ``mean_score``, ``n``.
        * ``retrieval`` - mean of every numeric retrieval metric.
    """
    if not per_question:
        return {"error": "no results"}

    # Per-condition scores and F1s.
    condition_scores: dict[str, list[float]] = defaultdict(list)
    condition_f1s: dict[str, list[float]] = defaultdict(list)
    # Per-category, per-condition scores.
    cat_scores: dict[str, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
    # Retrieval metric values keyed by metric name.
    retrieval_metrics: dict[str, list[float]] = defaultdict(list)

    for q in per_question:
        cat = q.get("category", "unknown")

        for cond, data in q.get("answers", {}).items():
            if data.get("score") is not None:
                condition_scores[cond].append(data["score"])
                cat_scores[cat][cond].append(data["score"])
            if data.get("f1") is not None:
                condition_f1s[cond].append(data["f1"])

        for metric, val in q.get("retrieval", {}).items():
            if isinstance(val, (int, float)):
                retrieval_metrics[metric].append(val)

    def _raw_mean(values: list[float]) -> float:
        return sum(values) / len(values) if values else 0.0

    def _mean(values: list[float]) -> float:
        return round(_raw_mean(values), 4) if values else 0.0

    conditions = {}
    for cond in sorted(condition_scores):
        conditions[cond] = {
            "mean_score": _mean(condition_scores[cond]),
            "mean_f1": _mean(condition_f1s.get(cond, [])),
            "n": len(condition_scores[cond]),
        }

    # IPR: sinain-memory vs full-context. Divide the unrounded means and round
    # once at the end; a real ratio of 0.0 is reported as 0.0, not None.
    sm_scores = condition_scores.get("sinain-memory", [])
    fc_scores = condition_scores.get("full-context", [])
    fc_mean = _raw_mean(fc_scores)
    ipr = None
    if sm_scores and fc_scores and fc_mean > 0:
        ipr = round(_raw_mean(sm_scores) / fc_mean, 4)

    categories = {}
    for cat in sorted(cat_scores):
        categories[cat] = {}
        for cond in sorted(cat_scores[cat]):
            categories[cat][cond] = {
                "mean_score": _mean(cat_scores[cat][cond]),
                "n": len(cat_scores[cat][cond]),
            }

    retrieval = {k: _mean(v) for k, v in sorted(retrieval_metrics.items())}

    return {
        "total_questions": len(per_question),
        "conditions": conditions,
        "ipr": ipr,
        "categories": categories,
        "retrieval": retrieval,
    }
@@ -1,152 +0,0 @@
1
- """Ingestion pipeline — benchmark conversations → sinain triplestore.
2
-
3
- Runs session_distiller.py + knowledge_integrator.py via subprocess (exact production path).
4
- Caches results aggressively to avoid repeated LLM calls.
5
- """
6
-
7
- from __future__ import annotations
8
-
9
- import hashlib
10
- import json
11
- import os
12
- import shutil
13
- import tempfile
14
- from pathlib import Path
15
- from subprocess import run, PIPE, TimeoutExpired
16
-
17
- from .base_adapter import BenchmarkInstance
18
- from .config import DISTILLER_TIMEOUT_S, INTEGRATOR_TIMEOUT_S
19
-
20
-
21
def _scripts_dir() -> Path:
    """Return the directory three levels above this file (the sinain-memory scripts dir)."""
    location = Path(__file__).resolve()
    for _ in range(3):
        location = location.parent
    return location
24
-
25
-
26
def _content_hash(sessions: list[list[dict]]) -> str:
    """Return a 16-hex-character cache key derived from the session content.

    The sessions are serialised as key-sorted JSON and hashed with SHA-256;
    the first 16 hex digits are returned.
    """
    canonical = json.dumps(sessions, sort_keys=True, ensure_ascii=False)
    hasher = hashlib.sha256()
    hasher.update(canonical.encode())
    return hasher.hexdigest()[:16]
30
-
31
-
32
def _run_script(script_name: str, args: list[str], timeout: int) -> str | None:
    """Run a Python script from the sinain-memory scripts directory.

    Args:
        script_name: File name of the script inside the scripts directory.
        args: Command-line arguments passed to the script.
        timeout: Maximum run time in seconds.

    Returns:
        The script's stripped stdout, or None when the script is missing,
        cannot be started, exits non-zero, or times out. Each failure is
        reported with a ``[ingest]`` message on stdout.
    """
    import sys  # function-scope: only this helper needs the interpreter path

    scripts_dir = _scripts_dir()
    script_path = scripts_dir / script_name
    if not script_path.exists():
        print(f"[ingest] {script_name} not found at {script_path}")
        return None

    # Put the scripts directory first on PYTHONPATH but keep whatever the
    # caller already had there instead of overwriting it.
    inherited = os.environ.get("PYTHONPATH")
    python_path = str(scripts_dir)
    if inherited:
        python_path = os.pathsep.join([python_path, inherited])
    env = {**os.environ, "PYTHONPATH": python_path}

    # Allow the benchmark to pin the model the scripts use (the original note
    # says common.py defaults may reference unreleased models).
    if "SINAIN_BENCH_MODEL" in os.environ:
        env["SINAIN_FAST_MODEL"] = os.environ["SINAIN_BENCH_MODEL"]

    # Use the interpreter running this process so the child sees the same
    # environment; fall back to "python3" if it is unknown.
    interpreter = sys.executable or "python3"
    try:
        result = run(
            [interpreter, str(script_path), *args],
            capture_output=True, text=True, timeout=timeout, env=env,
        )
    except TimeoutExpired:
        print(f"[ingest] {script_name} timed out ({timeout}s)")
        return None
    except OSError as exc:
        # E.g. interpreter not found, or an argument list too long to exec.
        print(f"[ingest] {script_name} could not be started: {exc}")
        return None

    if result.returncode != 0:
        print(f"[ingest] {script_name} failed: {result.stderr[:200]}")
        return None
    return result.stdout.strip()
55
-
56
-
57
def ingest_instance(
    instance: BenchmarkInstance,
    cache_dir: Path,
) -> Path | None:
    """Ingest a benchmark instance into a triplestore and cache the resulting DB.

    Sessions are grouped in batches of 10. Each batch is flattened into one
    transcript, distilled with ``session_distiller.py`` and, when a usable
    digest comes back, integrated with ``knowledge_integrator.py``. All work
    happens in a temporary memory directory that is always removed.

    Args:
        instance: Benchmark instance whose ``sessions`` are ingested.
        cache_dir: Root of the cache; DBs are stored under ``stores/`` keyed
            by a hash of the session content.

    Returns:
        Path of the cached DB (an existing one is returned without re-running
        anything), or None when ingestion produced no non-empty DB.
    """
    ch = _content_hash(instance.sessions)
    cache_path = cache_dir / "stores" / f"{ch}.db"

    if cache_path.exists():
        return cache_path

    cache_path.parent.mkdir(parents=True, exist_ok=True)

    # Scratch memory directory with the sub-folders the scripts are given.
    tmp = tempfile.mkdtemp(prefix="sinain-bench-")
    mem_dir = Path(tmp) / "memory"
    for subdir in ["", "playbook-logs", "playbook-archive"]:
        (mem_dir / subdir).mkdir(parents=True, exist_ok=True)

    # Write a minimal playbook so the integrator doesn't fail.
    (mem_dir / "sinain-playbook.md").write_text("# Sinain Playbook\n\n(benchmark run)\n")

    success = False
    try:
        # Batch sessions into chunks of 10 for fewer LLM calls; each chunk
        # becomes one distiller call with a combined transcript.
        BATCH_SIZE = 10
        batch_idx = 0

        for start in range(0, len(instance.sessions), BATCH_SIZE):
            batch = instance.sessions[start:start + BATCH_SIZE]
            combined: list[dict] = [item for session in batch for item in session]
            if len(combined) < 3:
                continue  # too little material to distill

            first_ts = combined[0].get("ts", "2025-01-01T10:00:00Z")
            meta = json.dumps({
                "ts": first_ts,
                "sessionKey": f"benchmark-batch-{batch_idx}",
                "durationMs": len(combined) * 30000,
            })
            batch_idx += 1

            # Step 1: distill the batch.
            digest_json = _run_script("session_distiller.py", [
                "--memory-dir", str(mem_dir),
                "--transcript", json.dumps(combined),
                "--session-meta", meta,
            ], DISTILLER_TIMEOUT_S)
            if not digest_json:
                continue

            try:
                digest = json.loads(digest_json)
            except json.JSONDecodeError:
                continue

            # Valid JSON that is not an object cannot be a digest; skip it
            # instead of failing on ``.get``.
            if not isinstance(digest, dict):
                continue
            if digest.get("isEmpty") or digest.get("error"):
                continue

            # Step 2: integrate into the knowledge graph.
            _run_script("knowledge_integrator.py", [
                "--memory-dir", str(mem_dir),
                "--digest", json.dumps(digest),
            ], INTEGRATOR_TIMEOUT_S)

        # Publish the DB to the cache. Copy to a staging name first and then
        # rename, so an interrupted copy can never leave a truncated file at
        # ``cache_path`` that a later run would accept as a cache hit.
        db_path = mem_dir / "knowledge-graph.db"
        if db_path.exists() and db_path.stat().st_size > 0:
            staging = cache_path.with_name(f"{cache_path.name}.{os.getpid()}.tmp")
            shutil.copy2(db_path, staging)
            os.replace(staging, cache_path)
            success = True

    finally:
        shutil.rmtree(tmp, ignore_errors=True)

    return cache_path if success else None
141
-
142
-
143
def get_knowledge_doc(db_path: Path) -> str:
    """Render a sinain-knowledge.md style document from a triplestore.

    Queries up to 30 top facts from the DB at ``db_path`` via ``graph_query``
    and formats them with a 6000-character cap.

    Returns:
        The formatted facts text, or ``"(no knowledge available)"`` when the
        query yields nothing.
    """
    import sys

    # Make graph_query importable, but add the scripts directory only once:
    # an unconditional insert would grow sys.path on every call.
    scripts_dir = str(_scripts_dir())
    if scripts_dir not in sys.path:
        sys.path.insert(0, scripts_dir)
    from graph_query import query_top_facts, format_facts_text

    facts = query_top_facts(str(db_path), limit=30)
    if not facts:
        return "(no knowledge available)"
    return format_facts_text(facts, max_chars=6000)
File without changes