@geravant/sinain 1.13.0 → 1.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +33 -27
- package/cli.js +30 -14
- package/config-shared.js +173 -30
- package/launcher.js +38 -21
- package/onboard.js +36 -20
- package/package.json +4 -1
- package/sinain-agent/run.sh +600 -127
- package/sinain-core/src/agents-loader.ts +254 -0
- package/sinain-core/src/buffers/feed-buffer.ts +6 -4
- package/sinain-core/src/config.ts +77 -15
- package/sinain-core/src/escalation/escalator.ts +178 -18
- package/sinain-core/src/index.ts +218 -31
- package/sinain-core/src/learning/local-curation.ts +81 -27
- package/sinain-core/src/overlay/commands.ts +25 -0
- package/sinain-core/src/overlay/ws-handler.ts +3 -0
- package/sinain-core/src/server.ts +101 -10
- package/sinain-core/src/types.ts +29 -3
- package/sinain-memory/graph_query.py +12 -3
- package/sinain-memory/knowledge_integrator.py +194 -10
- package/sinain-memory/__pycache__/common.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/embed_client.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/graph_query.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/knowledge_integrator.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/session_distiller.cpython-312.pyc +0 -0
- package/sinain-memory/__pycache__/triplestore.cpython-312.pyc +0 -0
- package/sinain-memory/eval/__init__.py +0 -0
- package/sinain-memory/eval/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/assertions.py +0 -267
- package/sinain-memory/eval/benchmarks/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/base_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/config.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/evaluate.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/ingest.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/longmemeval_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_adapter.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/meeting_runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/query.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/report.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/__pycache__/runner.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/base_adapter.py +0 -43
- package/sinain-memory/eval/benchmarks/config.py +0 -23
- package/sinain-memory/eval/benchmarks/evaluate.py +0 -146
- package/sinain-memory/eval/benchmarks/ingest.py +0 -152
- package/sinain-memory/eval/benchmarks/judges/__init__.py +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/__init__.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/__pycache__/qa_judge.cpython-312.pyc +0 -0
- package/sinain-memory/eval/benchmarks/judges/qa_judge.py +0 -81
- package/sinain-memory/eval/benchmarks/longmemeval_adapter.py +0 -177
- package/sinain-memory/eval/benchmarks/meeting_adapter.py +0 -81
- package/sinain-memory/eval/benchmarks/meeting_runner.py +0 -230
- package/sinain-memory/eval/benchmarks/query.py +0 -193
- package/sinain-memory/eval/benchmarks/report.py +0 -87
- package/sinain-memory/eval/benchmarks/run_meeting_bench.sh +0 -318
- package/sinain-memory/eval/benchmarks/runner.py +0 -283
- package/sinain-memory/eval/judges/__init__.py +0 -0
- package/sinain-memory/eval/judges/base_judge.py +0 -61
- package/sinain-memory/eval/judges/curation_judge.py +0 -46
- package/sinain-memory/eval/judges/insight_judge.py +0 -48
- package/sinain-memory/eval/judges/mining_judge.py +0 -42
- package/sinain-memory/eval/judges/signal_judge.py +0 -45
- package/sinain-memory/eval/retrieval_benchmark.jsonl +0 -12
- package/sinain-memory/eval/retrieval_evaluator.py +0 -186
- package/sinain-memory/eval/schemas.py +0 -247
- package/sinain-memory/tests/__init__.py +0 -0
- package/sinain-memory/tests/conftest.py +0 -189
- package/sinain-memory/tests/test_curator_helpers.py +0 -94
- package/sinain-memory/tests/test_embedder.py +0 -210
- package/sinain-memory/tests/test_extract_json.py +0 -124
- package/sinain-memory/tests/test_feedback_computation.py +0 -121
- package/sinain-memory/tests/test_miner_helpers.py +0 -71
- package/sinain-memory/tests/test_module_management.py +0 -458
- package/sinain-memory/tests/test_parsers.py +0 -96
- package/sinain-memory/tests/test_tick_evaluator.py +0 -430
- package/sinain-memory/tests/test_triple_extractor.py +0 -255
- package/sinain-memory/tests/test_triple_ingest.py +0 -191
- package/sinain-memory/tests/test_triple_migrate.py +0 -138
- package/sinain-memory/tests/test_triplestore.py +0 -248
|
@@ -21,7 +21,9 @@ import json
|
|
|
21
21
|
import re
|
|
22
22
|
import shutil
|
|
23
23
|
import sys
|
|
24
|
+
import unicodedata
|
|
24
25
|
from datetime import datetime, timezone
|
|
26
|
+
from difflib import SequenceMatcher
|
|
25
27
|
from pathlib import Path
|
|
26
28
|
|
|
27
29
|
from common import (
|
|
@@ -121,9 +123,50 @@ def _fact_id(entity: str, attribute: str, value: str) -> str:
|
|
|
121
123
|
return f"fact:{slug}-{h}"
|
|
122
124
|
|
|
123
125
|
|
|
126
|
+
_UNICODE_PRE_MAP = str.maketrans({"ß": "ss", "ẞ": "SS"})
|
|
127
|
+
|
|
128
|
+
|
|
124
129
|
def _normalize_entity(name: str) -> str:
|
|
125
|
-
"""Normalize entity name to canonical form: lowercase, hyphenated,
|
|
126
|
-
|
|
130
|
+
"""Normalize entity name to canonical form: lowercase, hyphenated, ASCII-transliterated."""
|
|
131
|
+
s = name.translate(_UNICODE_PRE_MAP)
|
|
132
|
+
s = unicodedata.normalize("NFKD", s)
|
|
133
|
+
s = s.encode("ascii", "ignore").decode("ascii")
|
|
134
|
+
s = s.lower().replace(" ", "-").replace("_", "-")
|
|
135
|
+
s = re.sub(r"[^a-z0-9-]", "", s)
|
|
136
|
+
s = re.sub(r"-{2,}", "-", s)
|
|
137
|
+
return s.strip("-")
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _find_matching_entity(
|
|
141
|
+
name: str,
|
|
142
|
+
existing_names: dict[str, str],
|
|
143
|
+
) -> str | None:
|
|
144
|
+
"""Find an existing entity that fuzzy-matches `name`. Returns entity_node_id or None."""
|
|
145
|
+
if name in existing_names:
|
|
146
|
+
return existing_names[name]
|
|
147
|
+
|
|
148
|
+
# Hyphen-insensitive exact match (chatgpt == chat-gpt)
|
|
149
|
+
name_compact = name.replace("-", "")
|
|
150
|
+
for existing_name, node_id in existing_names.items():
|
|
151
|
+
if existing_name.replace("-", "") == name_compact:
|
|
152
|
+
return node_id
|
|
153
|
+
|
|
154
|
+
# Edit-distance fuzzy match
|
|
155
|
+
if len(name) < 3:
|
|
156
|
+
return None
|
|
157
|
+
threshold = 0.90
|
|
158
|
+
best_match = None
|
|
159
|
+
best_ratio = threshold
|
|
160
|
+
for existing_name, node_id in existing_names.items():
|
|
161
|
+
if len(existing_name) < 3:
|
|
162
|
+
continue
|
|
163
|
+
if frozenset({name, existing_name}) in _DEDUP_SKIP_PAIRS:
|
|
164
|
+
continue
|
|
165
|
+
ratio = SequenceMatcher(None, name, existing_name).ratio()
|
|
166
|
+
if ratio >= best_ratio:
|
|
167
|
+
best_ratio = ratio
|
|
168
|
+
best_match = node_id
|
|
169
|
+
return best_match
|
|
127
170
|
|
|
128
171
|
|
|
129
172
|
def _canonicalize_ops(ops: list[dict], existing_entities: list[str], existing_facts: list[dict]) -> list[dict]:
|
|
@@ -528,7 +571,14 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str, digest_ent
|
|
|
528
571
|
# --- Build entity graph layer (two-layer model) ---
|
|
529
572
|
if digest_entities and stats["asserted"] > 0:
|
|
530
573
|
try:
|
|
531
|
-
#
|
|
574
|
+
# Load existing entity names for fuzzy matching
|
|
575
|
+
all_entity_nodes: dict[str, str] = {} # {name: entity_node_id}
|
|
576
|
+
for r in store.entities_with_attr("name"):
|
|
577
|
+
if r[0].startswith("entity:"):
|
|
578
|
+
all_entity_nodes[r[1]] = r[0]
|
|
579
|
+
|
|
580
|
+
# Create entity:* nodes from digest entities (with fuzzy dedup)
|
|
581
|
+
entity_resolve: dict[str, str] = {} # {normalized_name: resolved_node_id}
|
|
532
582
|
for ent in (digest_entities or []):
|
|
533
583
|
if isinstance(ent, dict):
|
|
534
584
|
ename = _normalize_entity(ent.get("name", ""))
|
|
@@ -539,12 +589,22 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str, digest_ent
|
|
|
539
589
|
if not ename or len(ename) < 2:
|
|
540
590
|
continue
|
|
541
591
|
|
|
592
|
+
# Check for fuzzy match against existing entities
|
|
593
|
+
matched_id = _find_matching_entity(ename, all_entity_nodes)
|
|
594
|
+
if matched_id:
|
|
595
|
+
entity_resolve[ename] = matched_id
|
|
596
|
+
if matched_id != f"entity:{ename}":
|
|
597
|
+
print(f" [graph] alias: \"{ename}\" → {matched_id}", file=sys.stderr)
|
|
598
|
+
continue
|
|
599
|
+
|
|
542
600
|
entity_node_id = f"entity:{ename}"
|
|
543
601
|
existing = store.entity(entity_node_id)
|
|
544
602
|
if not existing:
|
|
545
603
|
tx = store.begin_tx("entity_graph")
|
|
546
604
|
store.assert_triple(tx, entity_node_id, "name", ename)
|
|
547
605
|
store.assert_triple(tx, entity_node_id, "type", etype)
|
|
606
|
+
all_entity_nodes[ename] = entity_node_id
|
|
607
|
+
entity_resolve[ename] = entity_node_id
|
|
548
608
|
|
|
549
609
|
# Link facts to their entity nodes via "about" ref edges
|
|
550
610
|
for op_data in ops:
|
|
@@ -554,18 +614,13 @@ def _execute_graph_ops(db_path: str, ops: list[dict], digest_ts: str, digest_ent
|
|
|
554
614
|
value = op_data.get("value", "")
|
|
555
615
|
attribute = op_data.get("attribute", "")
|
|
556
616
|
fact_eid = _fact_id(entity, attribute, value)
|
|
557
|
-
|
|
617
|
+
norm_entity = _normalize_entity(entity)
|
|
618
|
+
entity_node_id = entity_resolve.get(norm_entity, f"entity:{norm_entity}")
|
|
558
619
|
# Only link if entity node exists
|
|
559
620
|
if store.entity(entity_node_id):
|
|
560
621
|
tx = store.begin_tx("entity_graph")
|
|
561
622
|
store.assert_triple(tx, fact_eid, "about", entity_node_id, value_type="ref")
|
|
562
623
|
|
|
563
|
-
# Infer cross-entity refs from fact content
|
|
564
|
-
all_entity_nodes = {}
|
|
565
|
-
for r in store.entities_with_attr("name"):
|
|
566
|
-
if r[0].startswith("entity:"):
|
|
567
|
-
all_entity_nodes[r[1]] = r[0] # {name: entity_id}
|
|
568
|
-
|
|
569
624
|
ref_count = 0
|
|
570
625
|
for fact_eid_row in store.entities_with_attr("value"):
|
|
571
626
|
fact_eid = fact_eid_row[0]
|
|
@@ -695,17 +750,146 @@ def _bootstrap_graph(memory_dir: str, db_path: str) -> dict:
|
|
|
695
750
|
return {"bootstrapped": stats.get("asserted", 0)}
|
|
696
751
|
|
|
697
752
|
|
|
753
|
+
# Pairs that fuzzy matching incorrectly clusters — reviewed and confirmed distinct.
# Each pair is stored as a frozenset so a lookup does not depend on which of
# the two names is the candidate and which is the existing entity.
# NOTE(review): "spac-driven-development" may be a typo for
# "spec-driven-development" — confirm against the stored entity names.
_DEDUP_SKIP_PAIRS = {
    frozenset({"ai-driven-development", "spac-driven-development"}),
    frozenset({"german", "germany"}),
    frozenset({"llama", "ollama"}),
    frozenset({"gemma", "gemma4"}),
}
|
|
760
|
+
|
|
761
|
+
|
|
762
|
+
def merge_entity_duplicates(db_path: str, dry_run: bool = True) -> dict:
    """Merge fragmented ``entity:*`` nodes whose names fuzzy-match each other.

    Entities are grouped greedily: the first remaining entity seeds a cluster
    and every other remaining entity that ``_find_matching_entity`` matches
    against that seed joins it. Within a cluster one node is kept as
    canonical; refs pointing at the others are re-pointed to it and the
    duplicates' own triples are retracted.

    Idempotent: a ``migration:entity-dedup-v1`` stamp is written after a
    non-dry run that merged at least one cluster, and the function returns
    early when that stamp already exists.

    Args:
        db_path: Path to the knowledge-graph database.
        dry_run: When true, only report what would be merged.

    Returns:
        ``{"status": "already_applied"}`` if the stamp is present, otherwise
        a dict with ``status`` ("dry_run" or "applied"), ``clusters``,
        ``entities_merged`` and ``refs_repointed``.
    """
    from triplestore import TripleStore
    store = TripleStore(db_path)

    # Idempotency check
    if store.entity("migration:entity-dedup-v1"):
        print("migration:entity-dedup-v1 already applied — skipping", file=sys.stderr)
        return {"status": "already_applied"}

    # Load all entity nodes
    all_entities: dict[str, str] = {}  # {name: entity_node_id}
    for entity_id, name in store.entities_with_attr("name"):
        if entity_id.startswith("entity:"):
            all_entities[name] = entity_id

    print(f"Total entity nodes: {len(all_entities)}", file=sys.stderr)

    # Build clusters via greedy matching
    remaining = dict(all_entities)  # copy
    clusters: list[list[tuple[str, str]]] = []  # [[(name, node_id), ...], ...]

    while remaining:
        seed_name, seed_id = next(iter(remaining.items()))
        cluster = [(seed_name, seed_id)]
        del remaining[seed_name]

        # Find all matches for this seed. Removal is deferred so the dict is
        # not mutated while it is being iterated.
        to_remove = []
        for other_name, other_id in remaining.items():
            if _find_matching_entity(other_name, {seed_name: seed_id}):
                cluster.append((other_name, other_id))
                to_remove.append(other_name)
        for name in to_remove:
            del remaining[name]

        if len(cluster) > 1:
            # Filter out known false-positive pairs
            names_set = {n for n, _ in cluster}
            if any(pair <= names_set for pair in _DEDUP_SKIP_PAIRS):
                continue
            clusters.append(cluster)

    print(f"Found {len(clusters)} duplicate clusters", file=sys.stderr)

    merge_count = 0
    repoint_count = 0

    for cluster in clusters:
        # Canonical selection: if any entity has significantly more backrefs (5+),
        # use it. Otherwise prefer longest name (most complete spelling).
        # Backref counts are queried once per node and reused by the sort key.
        ref_counts = {nid: len(store.backrefs(nid)) for _, nid in cluster}
        if max(ref_counts.values()) >= 5:
            cluster.sort(key=lambda item: (-ref_counts[item[1]], -len(item[0]), item[0]))
        else:
            cluster.sort(key=lambda item: (-len(item[0]), item[0]))
        canonical_name, canonical_id = cluster[0]
        duplicates = cluster[1:]

        dup_names = [d[0] for d in duplicates]
        print(f" cluster: {canonical_name} ← {dup_names}", file=sys.stderr)

        if dry_run:
            merge_count += len(duplicates)
            continue

        for _dup_name, dup_id in duplicates:
            # Re-point all refs pointing to this duplicate
            for src_entity, attr in store.backrefs(dup_id):
                tx = store.begin_tx("entity_dedup")
                store.retract_triple(tx, src_entity, attr, dup_id)
                store.assert_triple(tx, src_entity, attr, canonical_id, value_type="ref")
                repoint_count += 1

            # Retract all triples of the duplicate entity itself. A falsy
            # lookup result is treated as "nothing left to retract".
            dup_attrs = store.entity(dup_id) or {}
            tx = store.begin_tx("entity_dedup")
            for attr, values in dup_attrs.items():
                if not isinstance(values, list):
                    values = [values]
                for val in values:
                    store.retract_triple(tx, dup_id, attr, str(val))

            merge_count += 1

    # Stamp migration
    if not dry_run and clusters:
        tx = store.begin_tx("entity_dedup")
        store.assert_triple(tx, "migration:entity-dedup-v1", "applied_at",
                            datetime.now(timezone.utc).isoformat())
        store.assert_triple(tx, "migration:entity-dedup-v1", "clusters_merged",
                            str(len(clusters)))

    result = {
        "status": "dry_run" if dry_run else "applied",
        "clusters": len(clusters),
        "entities_merged": merge_count,
        "refs_repointed": repoint_count,
    }
    print(json.dumps(result, indent=2), file=sys.stderr)
    return result
|
|
869
|
+
|
|
870
|
+
|
|
698
871
|
def main() -> None:
|
|
699
872
|
parser = argparse.ArgumentParser(description="Knowledge Integrator")
|
|
700
873
|
parser.add_argument("--memory-dir", required=True, help="Path to memory/ directory")
|
|
701
874
|
parser.add_argument("--digest", default=None, help="SessionDigest JSON string")
|
|
702
875
|
parser.add_argument("--bootstrap", action="store_true", help="One-time: seed graph from playbook")
|
|
703
876
|
parser.add_argument("--retag", action="store_true", help="Re-extract tags for all existing facts")
|
|
877
|
+
parser.add_argument("--dedup-entities", action="store_true", help="Merge fragmented entity nodes")
|
|
878
|
+
parser.add_argument("--dry-run", action="store_true", help="Preview changes without applying")
|
|
704
879
|
args = parser.parse_args()
|
|
705
880
|
|
|
706
881
|
memory_dir = args.memory_dir
|
|
707
882
|
db_path = str(Path(memory_dir) / "knowledge-graph.db")
|
|
708
883
|
|
|
884
|
+
# Entity dedup mode: merge fragmented entity nodes
|
|
885
|
+
if args.dedup_entities:
|
|
886
|
+
if not Path(db_path).exists():
|
|
887
|
+
output_json({"error": "knowledge-graph.db not found"})
|
|
888
|
+
return
|
|
889
|
+
result = merge_entity_duplicates(db_path, dry_run=args.dry_run)
|
|
890
|
+
output_json(result)
|
|
891
|
+
return
|
|
892
|
+
|
|
709
893
|
# Bootstrap mode: seed graph from current playbook
|
|
710
894
|
if args.bootstrap:
|
|
711
895
|
result = _bootstrap_graph(memory_dir, db_path)
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
File without changes
|
|
Binary file
|
|
@@ -1,267 +0,0 @@
|
|
|
1
|
-
"""Behavioral assertion library for sinain-koog tick evaluation.
|
|
2
|
-
|
|
3
|
-
Each assertion function validates a runtime invariant of the pipeline.
|
|
4
|
-
Returns ``{"name": str, "passed": bool, "detail": str}``.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from __future__ import annotations
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def _result(name: str, passed: bool, detail: str) -> dict:
|
|
11
|
-
return {"name": name, "passed": passed, "detail": detail}
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
# ---------------------------------------------------------------------------
|
|
15
|
-
# Playbook curator assertions
|
|
16
|
-
# ---------------------------------------------------------------------------
|
|
17
|
-
|
|
18
|
-
def assert_playbook_under_limit(curator_result: dict, limit: int = 50) -> dict:
    """Check that the reported ``playbookLines`` does not exceed ``limit``.

    A missing ``playbookLines`` key counts as zero lines.
    """
    line_count = curator_result.get("playbookLines", 0)
    if line_count > limit:
        return _result("playbook_under_limit", False,
                       f"body has {line_count} lines, exceeds limit of {limit}")
    return _result("playbook_under_limit", True,
                   f"body has {line_count} lines (limit {limit})")
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def assert_curator_respected_directive(curator_result: dict, directive: str) -> dict:
    """Check that the curator's add/prune counts fit the curate directive.

    * ``aggressive_prune`` passes when something was pruned, or when nothing
      changed at all; it fails when items were only added.
    * ``stability`` fails when pruned items exceed added items by more than 2.
    * Any other directive always passes.
    """
    changes = curator_result.get("changes", {})
    n_added = len(changes.get("added", []))
    n_pruned = len(changes.get("pruned", []))

    if directive == "aggressive_prune":
        if n_pruned > 0:
            passed = True
            detail = f"aggressive_prune: pruned {n_pruned} items"
        elif n_added == 0 and n_pruned == 0:
            passed = True
            detail = "aggressive_prune: no changes (acceptable if playbook already lean)"
        else:
            passed = False
            detail = f"aggressive_prune: added {n_added} but pruned {n_pruned} — expected pruning"
    elif directive == "stability":
        # Stability mode tolerates a little pruning, not a purge.
        passed = not (n_pruned > n_added + 2)
        if passed:
            detail = f"stability: added {n_added}, pruned {n_pruned} — conservative"
        else:
            detail = (f"stability: pruned {n_pruned} items (only added {n_added})"
                      " — too aggressive for stability mode")
    else:
        # normal / insufficient_data — any reasonable mix is fine
        passed = True
        detail = f"{directive}: added {n_added}, pruned {n_pruned}"

    return _result("curator_respected_directive", passed, detail)
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
# ---------------------------------------------------------------------------
|
|
57
|
-
# Signal analyzer assertions
|
|
58
|
-
# ---------------------------------------------------------------------------
|
|
59
|
-
|
|
60
|
-
def assert_no_repeat_action(signal_result: dict, recent_logs: list[dict], window: int = 3) -> dict:
    """Check that the recommended task does not repeat a recently chosen one.

    The task text is compared with the chosen actions in the first ``window``
    entries of ``recent_logs``. Two tasks count as a repeat when more than
    60% of their words overlap, measured against the larger word set.
    """
    label = "no_repeat_action"
    recommended = signal_result.get("recommendedAction")
    if recommended is None or recommended.get("action") == "skip":
        return _result(label, True, "no action recommended (skip/null)")

    task = (recommended.get("task") or "").lower().strip()
    if not task:
        return _result(label, True, "no task description to compare")

    # Text of every chosen action in the window; "reason" wins over "task".
    earlier_tasks: list[str] = [
        (entry.get("reason") or entry.get("task") or "").lower().strip()
        for log in recent_logs[:window]
        for entry in log.get("actionsConsidered", [])
        if entry.get("chosen")
    ]

    task_words = set(task.split())
    for prev_task in earlier_tasks:
        prev_words = set(prev_task.split())
        if not (task_words and prev_words):
            continue
        overlap = len(task_words & prev_words) / max(len(task_words), len(prev_words))
        if overlap > 0.6:
            return _result(label, False,
                           f"action task '{task[:60]}' overlaps with recent '{prev_task[:60]}' ({overlap:.0%} word overlap)")

    return _result(label, True, f"action task is distinct from last {window} ticks")
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def assert_signal_confidence_threshold(signal_result: dict, threshold: float = 0.5) -> dict:
    """Check that a recommended action carries a confidence of at least ``threshold``.

    Passes trivially when no action (or a ``skip`` action) is recommended;
    fails when an action is recommended without any confidence value.
    """
    label = "signal_confidence_threshold"
    recommended = signal_result.get("recommendedAction")
    if recommended is None or recommended.get("action") == "skip":
        return _result(label, True, "no action recommended")

    confidence = recommended.get("confidence")
    if confidence is None:
        return _result(label, False, "action recommended but no confidence value provided")

    if confidence < threshold:
        return _result(label, False, f"confidence {confidence:.2f} < threshold {threshold}")
    return _result(label, True, f"confidence {confidence:.2f} >= threshold {threshold}")
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
# ---------------------------------------------------------------------------
|
|
115
|
-
# Insight synthesizer assertions
|
|
116
|
-
# ---------------------------------------------------------------------------
|
|
117
|
-
|
|
118
|
-
def assert_insight_char_limit(synth_result: dict, limit: int = 500) -> dict:
    """Verify the combined length of ``suggestion`` and ``insight`` is within ``limit``.

    Passes trivially when the output is marked ``skip``. A missing or null
    ``suggestion`` / ``insight`` contributes zero characters.
    """
    if synth_result.get("skip", False):
        return _result("insight_char_limit", True, "output skipped")

    # `or ""` also covers keys that are present but null; a plain .get()
    # default would hand None to len() and raise TypeError.
    suggestion = synth_result.get("suggestion") or ""
    insight = synth_result.get("insight") or ""
    total = len(suggestion) + len(insight)

    if total <= limit:
        return _result("insight_char_limit", True, f"total {total} chars (limit {limit})")
    return _result("insight_char_limit", False, f"total {total} chars exceeds limit of {limit}")
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
def assert_skip_reason_specific(synth_result: dict) -> dict:
    """When the output is skipped, check that ``skipReason`` is not boilerplate.

    A reason is rejected when it is empty, when it equals one of the known
    generic phrases, or when it is shorter than 30 characters and contains one.
    """
    label = "skip_reason_specific"
    if not synth_result.get("skip", False):
        return _result(label, True, "output not skipped")

    reason = (synth_result.get("skipReason") or "").strip()
    if not reason:
        return _result(label, False, "skip=true but no skipReason provided")

    # Check against known-generic patterns
    generic_phrases = (
        "no new data",
        "nothing new",
        "no updates",
        "insufficient data",
        "not enough information",
        "no changes",
    )
    lowered = reason.lower()
    is_short = len(lowered) < 30
    too_generic = any(
        lowered == phrase or (is_short and phrase in lowered)
        for phrase in generic_phrases
    )
    if too_generic:
        return _result(label, False, f"skipReason is too generic: '{reason}'")

    return _result(label, True, f"skipReason is specific ({len(reason)} chars)")
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
# ---------------------------------------------------------------------------
|
|
160
|
-
# Memory miner assertions
|
|
161
|
-
# ---------------------------------------------------------------------------
|
|
162
|
-
|
|
163
|
-
def assert_miner_references_sources(miner_result: dict, daily_files: list[str]) -> dict:
    """Check that every entry in ``minedSources`` is one of ``daily_files``.

    Passes trivially when ``minedSources`` is missing or empty.
    """
    label = "miner_references_sources"
    mined = miner_result.get("minedSources", [])
    if not mined:
        return _result(label, True, "no sources mined (early return)")

    # daily_files contains basenames like "2026-02-21.md"
    known_basenames = set(daily_files)
    unknown = []
    for source in mined:
        if source not in known_basenames:
            unknown.append(source)

    if not unknown:
        return _result(label, True, f"all {len(mined)} mined sources are valid")
    return _result(label, False, f"minedSources references unknown files: {unknown}")
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
# ---------------------------------------------------------------------------
|
|
181
|
-
# Cross-script / structural assertions
|
|
182
|
-
# ---------------------------------------------------------------------------
|
|
183
|
-
|
|
184
|
-
def assert_schema_valid(script_name: str, output: dict, schema_errors: list[str]) -> dict:
    """Turn a list of schema errors into an assertion result.

    ``output`` is accepted but not inspected here; only ``schema_errors``
    decides the outcome. At most the first three errors are quoted.
    """
    label = f"schema_valid_{script_name}"
    if schema_errors:
        shown = "; ".join(schema_errors[:3])
        return _result(label, False, f"{len(schema_errors)} schema errors: {shown}")
    return _result(label, True, "output matches schema")
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
def assert_playbook_header_footer_intact(playbook_text: str) -> dict:
    """Check that the playbook still contains both marker comments.

    The markers are ``<!-- mining-index:`` and ``<!-- effectiveness:``; any
    that are absent are named in the failure detail.
    """
    markers = (
        ("mining-index", "<!-- mining-index:"),
        ("effectiveness", "<!-- effectiveness:"),
    )
    missing = [label for label, marker in markers if marker not in playbook_text]

    if missing:
        return _result("playbook_header_footer_intact", False,
                       f"missing playbook comments: {', '.join(missing)}")
    return _result("playbook_header_footer_intact", True,
                   "both mining-index and effectiveness comments present")
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
# ---------------------------------------------------------------------------
|
|
210
|
-
# Runner: execute all applicable assertions for a tick
|
|
211
|
-
# ---------------------------------------------------------------------------
|
|
212
|
-
|
|
213
|
-
def run_tick_assertions(
    log_entry: dict,
    recent_logs: list[dict],
    playbook_text: str,
    daily_files: list[str],
) -> list[dict]:
    """Run every assertion that applies to one tick's log entry.

    Each group of assertions runs only when the log entry carries the data
    it needs. Returns the assertion result dicts in execution order.
    """
    results: list[dict] = []

    # Signal analyzer assertions
    signals = log_entry.get("signals")
    if signals is not None:
        signal_view = {
            "signals": signals,
            "recommendedAction": log_entry.get("recommendedAction"),
        }
        results.append(assert_signal_confidence_threshold(signal_view))
        results.append(assert_no_repeat_action(signal_view, recent_logs))

    # Curator assertions — playbookChanges can be {"note": "skipped"} or full output
    curator = log_entry.get("playbookChanges")
    if isinstance(curator, dict) and "changes" in curator:
        curator_view = dict(curator)
        curator_view.setdefault("playbookLines", 0)
        results.append(assert_playbook_under_limit(curator_view))
        results.append(assert_curator_respected_directive(
            curator_view, log_entry.get("curateDirective", "normal")))

    # Insight synthesizer assertions — output can be null (pipeline-level skip)
    output = log_entry.get("output")
    if isinstance(output, dict):
        results.append(assert_insight_char_limit(output))
        results.append(assert_skip_reason_specific(output))

    # Mining assertions — prefer the nested miningResult, otherwise rebuild
    # one from the flat minedSources field when it is non-empty.
    mining = log_entry.get("miningResult")
    if mining is None:
        flat_sources = log_entry.get("minedSources")
        if flat_sources:
            mining = {"minedSources": flat_sources}
    if mining is not None:
        results.append(assert_miner_references_sources(mining, daily_files))

    # Playbook health (if we have playbook text)
    if playbook_text:
        results.append(assert_playbook_header_footer_intact(playbook_text))

    return results
|
|
File without changes
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
"""Base adapter and data classes for benchmark evaluation."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
from abc import ABC, abstractmethod
|
|
6
|
-
from dataclasses import dataclass, field
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
@dataclass
class BenchmarkQuestion:
    """A single benchmark question with its reference answer."""

    id: str
    text: str
    gold_answer: str
    category: str  # single-session, multi-session, temporal, etc.
    # presumably the ids of the sessions that hold the supporting evidence — TODO confirm
    evidence_session_ids: list[str] = field(default_factory=list)
    # free-form extras; the schema is not visible from this file
    metadata: dict = field(default_factory=dict)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@dataclass
class BenchmarkInstance:
    """A set of conversations + questions that share the same context."""
    id: str
    sessions: list[list[dict]]  # list of sessions, each a list of feed items {source, text, ts}
    questions: list[BenchmarkQuestion] = field(default_factory=list)
    raw_sessions: list[dict] = field(default_factory=list)  # original benchmark format (for full-context condition)
    # free-form extras; the schema is not visible from this file
    metadata: dict = field(default_factory=dict)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class BenchmarkAdapter(ABC):
    """Abstract adapter: converts a published benchmark into sinain's format.

    Subclasses must implement the ``name`` property, ``load_dataset`` and
    ``format_full_context``; the class cannot be instantiated otherwise.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Benchmark name (e.g. 'longmemeval', 'locomo')."""

    @abstractmethod
    def load_dataset(self, data_dir: str) -> list[BenchmarkInstance]:
        """Download (if needed) and parse the benchmark dataset.

        Takes the dataset directory as ``data_dir`` and returns the parsed
        instances.
        """

    @abstractmethod
    def format_full_context(self, instance: BenchmarkInstance) -> str:
        """Render the full conversation history as a text string for the baseline condition."""
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
"""Benchmark configuration — models, paths, thresholds."""

from pathlib import Path

# All benchmark paths are anchored at the directory that contains this file.
BENCHMARKS_DIR = Path(__file__).resolve().parent
DATA_DIR = BENCHMARKS_DIR / "data"
RESULTS_DIR = BENCHMARKS_DIR / "results"

# LLM models (via OpenRouter)
QA_MODEL = "google/gemini-2.5-flash"
JUDGE_MODEL = "openai/gpt-4o"

# Retrieval
# presumably the cut-offs used for @k retrieval metrics — TODO confirm against the evaluator
K_VALUES = [1, 3, 5, 10]
MAX_FACTS_PER_QUERY = 10

# Ingestion
# timeouts in seconds, per the _S suffix
DISTILLER_TIMEOUT_S = 30
INTEGRATOR_TIMEOUT_S = 60

# Dataset URLs
# NOTE(review): LONGMEMEVAL_HF looks like a Hugging Face dataset id rather than a URL — confirm
LONGMEMEVAL_HF = "xiaowu0162/longmemeval-cleaned"
LOCOMO_GITHUB = "https://raw.githubusercontent.com/snap-research/locomo/main/data/locomo10.json"
|