@geravant/sinain 1.13.0 → 1.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. package/.env.example +33 -27
  2. package/cli.js +30 -14
  3. package/config-shared.js +173 -30
  4. package/launcher.js +38 -21
  5. package/onboard.js +36 -20
  6. package/package.json +4 -1
  7. package/sinain-agent/run.sh +600 -127
  8. package/sinain-core/src/agents-loader.ts +254 -0
  9. package/sinain-core/src/buffers/feed-buffer.ts +6 -4
  10. package/sinain-core/src/config.ts +77 -15
  11. package/sinain-core/src/escalation/escalator.ts +178 -18
  12. package/sinain-core/src/index.ts +218 -31
  13. package/sinain-core/src/learning/local-curation.ts +81 -27
  14. package/sinain-core/src/overlay/commands.ts +25 -0
  15. package/sinain-core/src/overlay/ws-handler.ts +3 -0
  16. package/sinain-core/src/server.ts +101 -10
  17. package/sinain-core/src/types.ts +29 -3
  18. package/sinain-memory/graph_query.py +12 -3
  19. package/sinain-memory/knowledge_integrator.py +194 -10
  20. package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
  21. package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
  22. package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
  23. package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
  24. package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
  25. package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
  26. package/sinain-memory/eval/__init__.py +0 -0
  27. package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
  28. package/sinain-memory/eval/assertions.py +0 -267
  29. package/sinain-memory/eval/benchmarks/__init__.py +0 -0
  30. package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
  31. package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
  32. package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
  33. package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
  34. package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
  35. package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
  36. package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
  37. package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
  38. package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
  39. package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
  40. package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
  41. package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
  42. package/sinain-memory/eval/benchmarks/config.py +0 -23
  43. package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
  44. package/sinain-memory/eval/benchmarks/ingest.py +0 -152
  45. package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
  46. package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
  47. package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
  48. package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
  49. package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
  50. package/sinain-memory/eval/benchmarks/meeting_adapter.py +0 -81
  51. package/sinain-memory/eval/benchmarks/meeting_runner.py +0 -230
  52. package/sinain-memory/eval/benchmarks/query.py +0 -193
  53. package/sinain-memory/eval/benchmarks/report.py +0 -87
  54. package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +0 -318
  55. package/sinain-memory/eval/benchmarks/runner.py +0 -283
  56. package/sinain-memory/eval/judges/__init__.py +0 -0
  57. package/sinain-memory/eval/judges/base_judge.py +0 -61
  58. package/sinain-memory/eval/judges/curation_judge.py +0 -46
  59. package/sinain-memory/eval/judges/insight_judge.py +0 -48
  60. package/sinain-memory/eval/judges/mining_judge.py +0 -42
  61. package/sinain-memory/eval/judges/signal_judge.py +0 -45
  62. package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
  63. package/sinain-memory/eval/retrieval_evaluator.py +0 -186
  64. package/sinain-memory/eval/schemas.py +0 -247
  65. package/sinain-memory/tests/__init__.py +0 -0
  66. package/sinain-memory/tests/conftest.py +0 -189
  67. package/sinain-memory/tests/test_curator_helpers.py +0 -94
  68. package/sinain-memory/tests/test_embedder.py +0 -210
  69. package/sinain-memory/tests/test_extract_json.py +0 -124
  70. package/sinain-memory/tests/test_feedback_computation.py +0 -121
  71. package/sinain-memory/tests/test_miner_helpers.py +0 -71
  72. package/sinain-memory/tests/test_module_management.py +0 -458
  73. package/sinain-memory/tests/test_parsers.py +0 -96
  74. package/sinain-memory/tests/test_tick_evaluator.py +0 -430
  75. package/sinain-memory/tests/test_triple_extractor.py +0 -255
  76. package/sinain-memory/tests/test_triple_ingest.py +0 -191
  77. package/sinain-memory/tests/test_triple_migrate.py +0 -138
  78. package/sinain-memory/tests/test_triplestore.py +0 -248
@@ -21,7 +21,9 @@ import json
21
21
  import re
22
22
  import shutil
23
23
  import sys
24
+ import unicodedata
24
25
  from datetime import datetime, timezone
26
+ from difflib import SequenceMatcher
25
27
  from pathlib import Path
26
28
 
27
29
  from common import (
@@ -121,9 +123,50 @@ def _fact_id(entity: str, attribute: str, value: str) -> str:
121
123
  return f"fact:{slug}-{h}"
122
124
 
123
125
 
126
+ _UNICODE_PRE_MAP = str.maketrans({"ß": "ss", "ẞ": "SS"})
127
+
128
+
124
129
  def _normalize_entity(name: str) -> str:
125
- """Normalize entity name to canonical form: lowercase, hyphenated, no punctuation."""
126
- return re.sub(r"[^a-z0-9-]", "", name.lower().replace(" ", "-").replace("_", "-"))
130
+ """Normalize entity name to canonical form: lowercase, hyphenated, ASCII-transliterated."""
131
+ s = name.translate(_UNICODE_PRE_MAP)
132
+ s = unicodedata.normalize("NFKD", s)
133
+ s = s.encode("ascii", "ignore").decode("ascii")
134
+ s = s.lower().replace(" ", "-").replace("_", "-")
135
+ s = re.sub(r"[^a-z0-9-]", "", s)
136
+ s = re.sub(r"-{2,}", "-", s)
137
+ return s.strip("-")
138
+
139
+
140
+ def _find_matching_entity(
141
+ name: str,
142
+ existing_names: dict[str, str],
143
+ ) -> str | None:
144
+ """Find an existing entity that fuzzy-matches `name`. Returns entity_node_id or None."""
145
+ if name in existing_names:
146
+ return existing_names[name]
147
+
148
+ # Hyphen-insensitive exact match (chatgpt == chat-gpt)
149
+ name_compact = name.replace("-", "")
150
+ for existing_name, node_id in existing_names.items():
151
+ if existing_name.replace("-", "") == name_compact:
152
+ return node_id
153
+
154
+ # Edit-distance fuzzy match
155
+ if len(name) < 3:
156
+ return None
157
+ threshold = 0.90
158
+ best_match = None
159
+ best_ratio = threshold
160
+ for existing_name, node_id in existing_names.items():
161
+ if len(existing_name) < 3:
162
+ continue
163
+ if frozenset({name, existing_name}) in _DEDUP_SKIP_PAIRS:
164
+ continue
165
+ ratio = SequenceMatcher(None, name, existing_name).ratio()
166
+ if ratio >= best_ratio:
167
+ best_ratio = ratio
168
+ best_match = node_id
169
+ return best_match
127
170
 
128
171
 
129
172
  def _canonicalize_ops(ops: list[dict], existing_entities: list[str], existing_facts: list[dict]) -> list[dict]:
@@ -528,7 +571,14 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str, digest_ent
528
571
  # --- Build entity graph layer (two-layer model) ---
529
572
  if digest_entities and stats["asserted"] > 0:
530
573
  try:
531
- # Create entity:* nodes from digest entities
574
+ # Load existing entity names for fuzzy matching
575
+ all_entity_nodes: dict[str, str] = {} # {name: entity_node_id}
576
+ for r in store.entities_with_attr("name"):
577
+ if r[0].startswith("entity:"):
578
+ all_entity_nodes[r[1]] = r[0]
579
+
580
+ # Create entity:* nodes from digest entities (with fuzzy dedup)
581
+ entity_resolve: dict[str, str] = {} # {normalized_name: resolved_node_id}
532
582
  for ent in (digest_entities or []):
533
583
  if isinstance(ent, dict):
534
584
  ename = _normalize_entity(ent.get("name", ""))
@@ -539,12 +589,22 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str, digest_ent
539
589
  if not ename or len(ename) < 2:
540
590
  continue
541
591
 
592
+ # Check for fuzzy match against existing entities
593
+ matched_id = _find_matching_entity(ename, all_entity_nodes)
594
+ if matched_id:
595
+ entity_resolve[ename] = matched_id
596
+ if matched_id != f"entity:{ename}":
597
+ print(f" [graph] alias: \"{ename}\" → {matched_id}", file=sys.stderr)
598
+ continue
599
+
542
600
  entity_node_id = f"entity:{ename}"
543
601
  existing = store.entity(entity_node_id)
544
602
  if not existing:
545
603
  tx = store.begin_tx("entity_graph")
546
604
  store.assert_triple(tx, entity_node_id, "name", ename)
547
605
  store.assert_triple(tx, entity_node_id, "type", etype)
606
+ all_entity_nodes[ename] = entity_node_id
607
+ entity_resolve[ename] = entity_node_id
548
608
 
549
609
  # Link facts to their entity nodes via "about" ref edges
550
610
  for op_data in ops:
@@ -554,18 +614,13 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str, digest_ent
554
614
  value = op_data.get("value", "")
555
615
  attribute = op_data.get("attribute", "")
556
616
  fact_eid = _fact_id(entity, attribute, value)
557
- entity_node_id = f"entity:{_normalize_entity(entity)}"
617
+ norm_entity = _normalize_entity(entity)
618
+ entity_node_id = entity_resolve.get(norm_entity, f"entity:{norm_entity}")
558
619
  # Only link if entity node exists
559
620
  if store.entity(entity_node_id):
560
621
  tx = store.begin_tx("entity_graph")
561
622
  store.assert_triple(tx, fact_eid, "about", entity_node_id, value_type="ref")
562
623
 
563
- # Infer cross-entity refs from fact content
564
- all_entity_nodes = {}
565
- for r in store.entities_with_attr("name"):
566
- if r[0].startswith("entity:"):
567
- all_entity_nodes[r[1]] = r[0] # {name: entity_id}
568
-
569
624
  ref_count = 0
570
625
  for fact_eid_row in store.entities_with_attr("value"):
571
626
  fact_eid = fact_eid_row[0]
@@ -695,17 +750,146 @@ def _bootstrap_graph(memory_dir: str, db_path: str) -> dict:
695
750
  return {"bootstrapped": stats.get("asserted", 0)}
696
751
 
697
752
 
753
+ # Pairs that fuzzy matching incorrectly clusters — reviewed and confirmed distinct.
754
+ _DEDUP_SKIP_PAIRS = {
755
+ frozenset({"ai-driven-development", "spac-driven-development"}),
756
+ frozenset({"german", "germany"}),
757
+ frozenset({"llama", "ollama"}),
758
+ frozenset({"gemma", "gemma4"}),
759
+ }
760
+
761
+
762
+ def merge_entity_duplicates(db_path: str, dry_run: bool = True) -> dict:
763
+ """Merge fragmented entity nodes using fuzzy matching.
764
+
765
+ Idempotent: checks for migration:entity-dedup-v1 stamp.
766
+ """
767
+ from triplestore import TripleStore
768
+ store = TripleStore(db_path)
769
+
770
+ # Idempotency check
771
+ stamp = store.entity("migration:entity-dedup-v1")
772
+ if stamp:
773
+ print("migration:entity-dedup-v1 already applied — skipping", file=sys.stderr)
774
+ return {"status": "already_applied"}
775
+
776
+ # Load all entity nodes
777
+ all_entities: dict[str, str] = {} # {name: entity_node_id}
778
+ for entity_id, name in store.entities_with_attr("name"):
779
+ if entity_id.startswith("entity:"):
780
+ all_entities[name] = entity_id
781
+
782
+ print(f"Total entity nodes: {len(all_entities)}", file=sys.stderr)
783
+
784
+ # Build clusters via greedy matching
785
+ remaining = dict(all_entities) # copy
786
+ clusters: list[list[tuple[str, str]]] = [] # [[( name, node_id ), ...], ...]
787
+
788
+ while remaining:
789
+ seed_name, seed_id = next(iter(remaining.items()))
790
+ cluster = [(seed_name, seed_id)]
791
+ del remaining[seed_name]
792
+
793
+ # Find all matches for this seed
794
+ to_remove = []
795
+ for other_name, other_id in remaining.items():
796
+ matched = _find_matching_entity(other_name, {seed_name: seed_id})
797
+ if matched:
798
+ cluster.append((other_name, other_id))
799
+ to_remove.append(other_name)
800
+ for name in to_remove:
801
+ del remaining[name]
802
+
803
+ if len(cluster) > 1:
804
+ # Filter out known false-positive pairs
805
+ names_set = {n for n, _ in cluster}
806
+ if any(pair <= names_set for pair in _DEDUP_SKIP_PAIRS):
807
+ continue
808
+ clusters.append(cluster)
809
+
810
+ print(f"Found {len(clusters)} duplicate clusters", file=sys.stderr)
811
+
812
+ merge_count = 0
813
+ repoint_count = 0
814
+
815
+ for cluster in clusters:
816
+ # Canonical selection: if any entity has significantly more backrefs (5+),
817
+ # use it. Otherwise prefer longest name (most complete spelling).
818
+ max_refs = max(len(store.backrefs(nid)) for _, nid in cluster)
819
+ if max_refs >= 5:
820
+ cluster.sort(key=lambda x: (-len(store.backrefs(x[1])), -len(x[0]), x[0]))
821
+ else:
822
+ cluster.sort(key=lambda x: (-len(x[0]), x[0]))
823
+ canonical_name, canonical_id = cluster[0]
824
+ duplicates = cluster[1:]
825
+
826
+ dup_names = [d[0] for d in duplicates]
827
+ print(f" cluster: {canonical_name} ← {dup_names}", file=sys.stderr)
828
+
829
+ if dry_run:
830
+ merge_count += len(duplicates)
831
+ continue
832
+
833
+ for dup_name, dup_id in duplicates:
834
+ # Re-point all refs pointing to this duplicate
835
+ refs = store.backrefs(dup_id)
836
+ for src_entity, attr in refs:
837
+ tx = store.begin_tx("entity_dedup")
838
+ store.retract_triple(tx, src_entity, attr, dup_id)
839
+ store.assert_triple(tx, src_entity, attr, canonical_id, value_type="ref")
840
+ repoint_count += 1
841
+
842
+ # Retract all triples of the duplicate entity itself
843
+ dup_attrs = store.entity(dup_id)
844
+ tx = store.begin_tx("entity_dedup")
845
+ for attr, values in dup_attrs.items():
846
+ if not isinstance(values, list):
847
+ values = [values]
848
+ for val in values:
849
+ store.retract_triple(tx, dup_id, attr, str(val))
850
+
851
+ merge_count += 1
852
+
853
+ # Stamp migration
854
+ if not dry_run and clusters:
855
+ tx = store.begin_tx("entity_dedup")
856
+ store.assert_triple(tx, "migration:entity-dedup-v1", "applied_at",
857
+ datetime.now(timezone.utc).isoformat())
858
+ store.assert_triple(tx, "migration:entity-dedup-v1", "clusters_merged",
859
+ str(len(clusters)))
860
+
861
+ result = {
862
+ "status": "dry_run" if dry_run else "applied",
863
+ "clusters": len(clusters),
864
+ "entities_merged": merge_count,
865
+ "refs_repointed": repoint_count,
866
+ }
867
+ print(json.dumps(result, indent=2), file=sys.stderr)
868
+ return result
869
+
870
+
698
871
  def main() -> None:
699
872
  parser = argparse.ArgumentParser(description="Knowledge Integrator")
700
873
  parser.add_argument("--memory-dir", required=True, help="Path to memory/ directory")
701
874
  parser.add_argument("--digest", default=None, help="SessionDigest JSON string")
702
875
  parser.add_argument("--bootstrap", action="store_true", help="One-time: seed graph from playbook")
703
876
  parser.add_argument("--retag", action="store_true", help="Re-extract tags for all existing facts")
877
+ parser.add_argument("--dedup-entities", action="store_true", help="Merge fragmented entity nodes")
878
+ parser.add_argument("--dry-run", action="store_true", help="Preview changes without applying")
704
879
  args = parser.parse_args()
705
880
 
706
881
  memory_dir = args.memory_dir
707
882
  db_path = str(Path(memory_dir) / "knowledge-graph.db")
708
883
 
884
+ # Entity dedup mode: merge fragmented entity nodes
885
+ if args.dedup_entities:
886
+ if not Path(db_path).exists():
887
+ output_json({"error": "knowledge-graph.db not found"})
888
+ return
889
+ result = merge_entity_duplicates(db_path, dry_run=args.dry_run)
890
+ output_json(result)
891
+ return
892
+
709
893
  # Bootstrap mode: seed graph from current playbook
710
894
  if args.bootstrap:
711
895
  result = _bootstrap_graph(memory_dir, db_path)
File without changes
@@ -1,267 +0,0 @@
1
- """Behavioral assertion library for sinain-koog tick evaluation.
2
-
3
- Each assertion function validates a runtime invariant of the pipeline.
4
- Returns ``{"name": str, "passed": bool, "detail": str}``.
5
- """
6
-
7
- from __future__ import annotations
8
-
9
-
10
- def _result(name: str, passed: bool, detail: str) -> dict:
11
- return {"name": name, "passed": passed, "detail": detail}
12
-
13
-
14
- # ---------------------------------------------------------------------------
15
- # Playbook curator assertions
16
- # ---------------------------------------------------------------------------
17
-
18
- def assert_playbook_under_limit(curator_result: dict, limit: int = 50) -> dict:
19
- """Verify playbook body stays under the line limit."""
20
- lines = curator_result.get("playbookLines", 0)
21
- if lines <= limit:
22
- return _result("playbook_under_limit", True, f"body has {lines} lines (limit {limit})")
23
- return _result("playbook_under_limit", False, f"body has {lines} lines, exceeds limit of {limit}")
24
-
25
-
26
- def assert_curator_respected_directive(curator_result: dict, directive: str) -> dict:
27
- """Check that curator changes align with the curate directive."""
28
- changes = curator_result.get("changes", {})
29
- added = len(changes.get("added", []))
30
- pruned = len(changes.get("pruned", []))
31
-
32
- if directive == "aggressive_prune":
33
- # Should have pruned items
34
- if pruned > 0:
35
- return _result("curator_respected_directive", True,
36
- f"aggressive_prune: pruned {pruned} items")
37
- if added == 0 and pruned == 0:
38
- return _result("curator_respected_directive", True,
39
- "aggressive_prune: no changes (acceptable if playbook already lean)")
40
- return _result("curator_respected_directive", False,
41
- f"aggressive_prune: added {added} but pruned {pruned} — expected pruning")
42
-
43
- if directive == "stability":
44
- # Should not aggressively prune established patterns
45
- if pruned > added + 2:
46
- return _result("curator_respected_directive", False,
47
- f"stability: pruned {pruned} items (only added {added}) — too aggressive for stability mode")
48
- return _result("curator_respected_directive", True,
49
- f"stability: added {added}, pruned {pruned} — conservative")
50
-
51
- # normal / insufficient_data — any reasonable mix is fine
52
- return _result("curator_respected_directive", True,
53
- f"{directive}: added {added}, pruned {pruned}")
54
-
55
-
56
- # ---------------------------------------------------------------------------
57
- # Signal analyzer assertions
58
- # ---------------------------------------------------------------------------
59
-
60
- def assert_no_repeat_action(signal_result: dict, recent_logs: list[dict], window: int = 3) -> dict:
61
- """Verify recommendedAction doesn't repeat the last N ticks' actions."""
62
- action = signal_result.get("recommendedAction")
63
- if action is None or action.get("action") == "skip":
64
- return _result("no_repeat_action", True, "no action recommended (skip/null)")
65
-
66
- task = (action.get("task") or "").lower().strip()
67
- if not task:
68
- return _result("no_repeat_action", True, "no task description to compare")
69
-
70
- # Collect recent action tasks
71
- recent_tasks: list[str] = []
72
- for log in recent_logs[:window]:
73
- log_actions = log.get("actionsConsidered", [])
74
- for a in log_actions:
75
- if a.get("chosen"):
76
- recent_tasks.append((a.get("reason") or a.get("task") or "").lower().strip())
77
-
78
- # Check for near-duplicate (substring match to catch rephrasing)
79
- for prev_task in recent_tasks:
80
- if not prev_task:
81
- continue
82
- # If >60% of words overlap, consider it a repeat
83
- task_words = set(task.split())
84
- prev_words = set(prev_task.split())
85
- if not task_words or not prev_words:
86
- continue
87
- overlap = len(task_words & prev_words) / max(len(task_words), len(prev_words))
88
- if overlap > 0.6:
89
- return _result("no_repeat_action", False,
90
- f"action task '{task[:60]}' overlaps with recent '{prev_task[:60]}' ({overlap:.0%} word overlap)")
91
-
92
- return _result("no_repeat_action", True,
93
- f"action task is distinct from last {window} ticks")
94
-
95
-
96
- def assert_signal_confidence_threshold(signal_result: dict, threshold: float = 0.5) -> dict:
97
- """Verify actions are only recommended above the confidence threshold."""
98
- action = signal_result.get("recommendedAction")
99
- if action is None or action.get("action") == "skip":
100
- return _result("signal_confidence_threshold", True, "no action recommended")
101
-
102
- confidence = action.get("confidence")
103
- if confidence is None:
104
- return _result("signal_confidence_threshold", False,
105
- "action recommended but no confidence value provided")
106
-
107
- if confidence >= threshold:
108
- return _result("signal_confidence_threshold", True,
109
- f"confidence {confidence:.2f} >= threshold {threshold}")
110
- return _result("signal_confidence_threshold", False,
111
- f"confidence {confidence:.2f} < threshold {threshold}")
112
-
113
-
114
- # ---------------------------------------------------------------------------
115
- # Insight synthesizer assertions
116
- # ---------------------------------------------------------------------------
117
-
118
- def assert_insight_char_limit(synth_result: dict, limit: int = 500) -> dict:
119
- """Verify suggestion+insight stays under the character limit."""
120
- if synth_result.get("skip", False):
121
- return _result("insight_char_limit", True, "output skipped")
122
-
123
- suggestion = synth_result.get("suggestion", "")
124
- insight = synth_result.get("insight", "")
125
- total = len(suggestion) + len(insight)
126
-
127
- if total <= limit:
128
- return _result("insight_char_limit", True, f"total {total} chars (limit {limit})")
129
- return _result("insight_char_limit", False, f"total {total} chars exceeds limit of {limit}")
130
-
131
-
132
- def assert_skip_reason_specific(synth_result: dict) -> dict:
133
- """If skip=true, verify the reason is specific (not generic boilerplate)."""
134
- if not synth_result.get("skip", False):
135
- return _result("skip_reason_specific", True, "output not skipped")
136
-
137
- reason = (synth_result.get("skipReason") or "").strip()
138
- if not reason:
139
- return _result("skip_reason_specific", False, "skip=true but no skipReason provided")
140
-
141
- # Check against known-generic patterns
142
- generic_phrases = [
143
- "no new data",
144
- "nothing new",
145
- "no updates",
146
- "insufficient data",
147
- "not enough information",
148
- "no changes",
149
- ]
150
- reason_lower = reason.lower()
151
- for phrase in generic_phrases:
152
- if reason_lower == phrase or (len(reason_lower) < 30 and phrase in reason_lower):
153
- return _result("skip_reason_specific", False,
154
- f"skipReason is too generic: '{reason}'")
155
-
156
- return _result("skip_reason_specific", True, f"skipReason is specific ({len(reason)} chars)")
157
-
158
-
159
- # ---------------------------------------------------------------------------
160
- # Memory miner assertions
161
- # ---------------------------------------------------------------------------
162
-
163
- def assert_miner_references_sources(miner_result: dict, daily_files: list[str]) -> dict:
164
- """Verify mining findings reference actual source files that were provided."""
165
- mined = miner_result.get("minedSources", [])
166
- if not mined:
167
- return _result("miner_references_sources", True, "no sources mined (early return)")
168
-
169
- # daily_files contains basenames like "2026-02-21.md"
170
- known_basenames = set(daily_files)
171
- unknown = [s for s in mined if s not in known_basenames]
172
-
173
- if unknown:
174
- return _result("miner_references_sources", False,
175
- f"minedSources references unknown files: {unknown}")
176
- return _result("miner_references_sources", True,
177
- f"all {len(mined)} mined sources are valid")
178
-
179
-
180
- # ---------------------------------------------------------------------------
181
- # Cross-script / structural assertions
182
- # ---------------------------------------------------------------------------
183
-
184
- def assert_schema_valid(script_name: str, output: dict, schema_errors: list[str]) -> dict:
185
- """Wrap schema validation result as an assertion."""
186
- if not schema_errors:
187
- return _result(f"schema_valid_{script_name}", True, "output matches schema")
188
- return _result(f"schema_valid_{script_name}", False,
189
- f"{len(schema_errors)} schema errors: {'; '.join(schema_errors[:3])}")
190
-
191
-
192
- def assert_playbook_header_footer_intact(playbook_text: str) -> dict:
193
- """Verify the playbook still has its mining-index header and effectiveness footer."""
194
- has_header = "<!-- mining-index:" in playbook_text
195
- has_footer = "<!-- effectiveness:" in playbook_text
196
-
197
- if has_header and has_footer:
198
- return _result("playbook_header_footer_intact", True,
199
- "both mining-index and effectiveness comments present")
200
- missing = []
201
- if not has_header:
202
- missing.append("mining-index")
203
- if not has_footer:
204
- missing.append("effectiveness")
205
- return _result("playbook_header_footer_intact", False,
206
- f"missing playbook comments: {', '.join(missing)}")
207
-
208
-
209
- # ---------------------------------------------------------------------------
210
- # Runner: execute all applicable assertions for a tick
211
- # ---------------------------------------------------------------------------
212
-
213
- def run_tick_assertions(
214
- log_entry: dict,
215
- recent_logs: list[dict],
216
- playbook_text: str,
217
- daily_files: list[str],
218
- ) -> list[dict]:
219
- """Run all applicable assertions against a single tick's log entry.
220
-
221
- Returns a list of assertion result dicts.
222
- """
223
- results: list[dict] = []
224
-
225
- # Signal analyzer assertions
226
- signals = log_entry.get("signals")
227
- if signals is not None:
228
- results.append(assert_signal_confidence_threshold(
229
- {"signals": signals, "recommendedAction": log_entry.get("recommendedAction")},
230
- ))
231
- results.append(assert_no_repeat_action(
232
- {"signals": signals, "recommendedAction": log_entry.get("recommendedAction")},
233
- recent_logs,
234
- ))
235
-
236
- # Curator assertions — playbookChanges can be {"note": "skipped"} or full output
237
- curator = log_entry.get("playbookChanges")
238
- if isinstance(curator, dict) and "changes" in curator:
239
- curator_with_lines = {**curator}
240
- if "playbookLines" not in curator_with_lines:
241
- curator_with_lines["playbookLines"] = curator.get("playbookLines", 0)
242
- results.append(assert_playbook_under_limit(curator_with_lines))
243
-
244
- directive = log_entry.get("curateDirective", "normal")
245
- results.append(assert_curator_respected_directive(curator_with_lines, directive))
246
-
247
- # Insight synthesizer assertions — output can be null (pipeline-level skip)
248
- output = log_entry.get("output")
249
- if isinstance(output, dict):
250
- results.append(assert_insight_char_limit(output))
251
- results.append(assert_skip_reason_specific(output))
252
-
253
- # Mining assertions — log uses miningFindings (str) and minedSources (list)
254
- mining = log_entry.get("miningResult")
255
- if mining is not None:
256
- results.append(assert_miner_references_sources(mining, daily_files))
257
- elif log_entry.get("minedSources"):
258
- # Reconstruct mining result from flat log fields
259
- results.append(assert_miner_references_sources(
260
- {"minedSources": log_entry.get("minedSources", [])}, daily_files
261
- ))
262
-
263
- # Playbook health (if we have playbook text)
264
- if playbook_text:
265
- results.append(assert_playbook_header_footer_intact(playbook_text))
266
-
267
- return results
File without changes
@@ -1,43 +0,0 @@
1
- """Base adapter and data classes for benchmark evaluation."""
2
-
3
- from __future__ import annotations
4
-
5
- from abc import ABC, abstractmethod
6
- from dataclasses import dataclass, field
7
-
8
-
9
- @dataclass
10
- class BenchmarkQuestion:
11
- id: str
12
- text: str
13
- gold_answer: str
14
- category: str # single-session, multi-session, temporal, etc.
15
- evidence_session_ids: list[str] = field(default_factory=list)
16
- metadata: dict = field(default_factory=dict)
17
-
18
-
19
- @dataclass
20
- class BenchmarkInstance:
21
- """A set of conversations + questions that share the same context."""
22
- id: str
23
- sessions: list[list[dict]] # list of sessions, each a list of feed items {source, text, ts}
24
- questions: list[BenchmarkQuestion] = field(default_factory=list)
25
- raw_sessions: list[dict] = field(default_factory=list) # original benchmark format (for full-context condition)
26
- metadata: dict = field(default_factory=dict)
27
-
28
-
29
- class BenchmarkAdapter(ABC):
30
- """Abstract adapter: converts a published benchmark into sinain's format."""
31
-
32
- @property
33
- @abstractmethod
34
- def name(self) -> str:
35
- """Benchmark name (e.g. 'longmemeval', 'locomo')."""
36
-
37
- @abstractmethod
38
- def load_dataset(self, data_dir: str) -> list[BenchmarkInstance]:
39
- """Download (if needed) and parse the benchmark dataset."""
40
-
41
- @abstractmethod
42
- def format_full_context(self, instance: BenchmarkInstance) -> str:
43
- """Render the full conversation history as a text string for the baseline condition."""
@@ -1,23 +0,0 @@
1
- """Benchmark configuration — models, paths, thresholds."""
2
-
3
- from pathlib import Path
4
-
5
- BENCHMARKS_DIR = Path(__file__).resolve().parent
6
- DATA_DIR = BENCHMARKS_DIR / "data"
7
- RESULTS_DIR = BENCHMARKS_DIR / "results"
8
-
9
- # LLM models (via OpenRouter)
10
- QA_MODEL = "google/gemini-2.5-flash"
11
- JUDGE_MODEL = "openai/gpt-4o"
12
-
13
- # Retrieval
14
- K_VALUES = [1, 3, 5, 10]
15
- MAX_FACTS_PER_QUERY = 10
16
-
17
- # Ingestion
18
- DISTILLER_TIMEOUT_S = 30
19
- INTEGRATOR_TIMEOUT_S = 60
20
-
21
- # Dataset URLs
22
- LONGMEMEVAL_HF = "xiaowu0162/longmemeval-cleaned"
23
- LOCOMO_GITHUB = "https://raw.githubusercontent.com/snap-research/locomo/main/data/locomo10.json"