structuremappingmemory 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sma/__init__.py +5 -0
- sma/__main__.py +5 -0
- sma/agent/__init__.py +5 -0
- sma/agent/adapter_draft.py +217 -0
- sma/agent/api.py +67 -0
- sma/agent/comparison.py +591 -0
- sma/agent/llm.py +280 -0
- sma/agent/policies.py +21 -0
- sma/agent/service.py +95 -0
- sma/cli.py +65 -0
- sma/encoders/__init__.py +38 -0
- sma/encoders/agentobs.py +27 -0
- sma/encoders/base.py +23 -0
- sma/encoders/code_treesitter.py +64 -0
- sma/encoders/coverage.py +80 -0
- sma/encoders/draft_adapter.py +183 -0
- sma/encoders/healthcare.py +207 -0
- sma/encoders/logs_drain.py +142 -0
- sma/encoders/prose_tier1.py +57 -0
- sma/encoders/structured.py +57 -0
- sma/encoders/traces.py +45 -0
- sma/eval/__init__.py +2 -0
- sma/eval/agentic/__init__.py +35 -0
- sma/eval/agentic/arms/__init__.py +0 -0
- sma/eval/agentic/arms/cyber.py +48 -0
- sma/eval/agentic/arms/discovery.py +35 -0
- sma/eval/agentic/arms/finance.py +38 -0
- sma/eval/agentic/arms/legal.py +74 -0
- sma/eval/agentic/arms/medicine.py +45 -0
- sma/eval/agentic/harness.py +275 -0
- sma/eval/agentic/memories.py +308 -0
- sma/eval/agentic/metrics.py +82 -0
- sma/eval/agentic_qa/__init__.py +27 -0
- sma/eval/agentic_qa/agent.py +383 -0
- sma/eval/agentic_qa/metrics.py +239 -0
- sma/eval/agentic_qa/pools.py +197 -0
- sma/eval/arn.py +65 -0
- sma/eval/baselines/__init__.py +6 -0
- sma/eval/baselines/bge_dense.py +54 -0
- sma/eval/baselines/bm25.py +18 -0
- sma/eval/baselines/dense.py +42 -0
- sma/eval/baselines/hipporag.py +235 -0
- sma/eval/baselines/hybrid_rrf.py +30 -0
- sma/eval/baselines/longcontext_llm.py +124 -0
- sma/eval/baselines/rerank.py +41 -0
- sma/eval/baselines/splade.py +77 -0
- sma/eval/baselines/wl_kernel.py +163 -0
- sma/eval/bugsinpy.py +358 -0
- sma/eval/bugsinpy_families.py +164 -0
- sma/eval/crossdomain.py +89 -0
- sma/eval/diabetes.py +61 -0
- sma/eval/drift_env.py +26 -0
- sma/eval/drift_metrics.py +24 -0
- sma/eval/family_labels.py +167 -0
- sma/eval/fraud_elliptic/__init__.py +29 -0
- sma/eval/fraud_elliptic/encoder.py +279 -0
- sma/eval/fraud_elliptic/eval.py +269 -0
- sma/eval/fraud_elliptic/test_encoder.py +123 -0
- sma/eval/ieee_cis.py +66 -0
- sma/eval/loghub.py +16 -0
- sma/eval/loghub_eval.py +480 -0
- sma/eval/longmemeval.py +51 -0
- sma/eval/memory_backends/__init__.py +2 -0
- sma/eval/memory_backends/base.py +22 -0
- sma/eval/memory_backends/context_only.py +14 -0
- sma/eval/memory_backends/rag_notes.py +17 -0
- sma/eval/memory_backends/shared_llm.py +30 -0
- sma/eval/memory_backends/sma_memory.py +54 -0
- sma/eval/memory_backends/zep_graphiti.py +33 -0
- sma/eval/metrics.py +32 -0
- sma/eval/ontology_bench.py +219 -0
- sma/eval/report.py +573 -0
- sma/eval/ssb_eval.py +216 -0
- sma/eval/ssb_generator.py +116 -0
- sma/eval/stats.py +108 -0
- sma/eval/transfer_eval.py +844 -0
- sma/index/__init__.py +15 -0
- sma/index/ann.py +21 -0
- sma/index/content_vectors.py +60 -0
- sma/index/inverted.py +63 -0
- sma/index/macfac.py +174 -0
- sma/ir/__init__.py +22 -0
- sma/ir/canon.py +106 -0
- sma/ir/schema.py +165 -0
- sma/ir/sexpr.py +86 -0
- sma/ir/signatures.py +76 -0
- sma/match/__init__.py +20 -0
- sma/match/conflicts.py +46 -0
- sma/match/engine.py +60 -0
- sma/match/explain.py +59 -0
- sma/match/infer.py +54 -0
- sma/match/kernels.py +54 -0
- sma/match/mdl.py +30 -0
- sma/match/merge_cpsat.py +77 -0
- sma/match/merge_greedy.py +15 -0
- sma/match/mh.py +177 -0
- sma/match/ses.py +84 -0
- sma/match/types.py +115 -0
- sma/match/verifier.py +27 -0
- sma/ontology/__init__.py +45 -0
- sma/ontology/attack.py +134 -0
- sma/ontology/cpc.py +69 -0
- sma/ontology/graph.py +58 -0
- sma/ontology/loader.py +262 -0
- sma/ontology/mitre_xml.py +67 -0
- sma/ontology/mount.py +101 -0
- sma/ontology/rdf_loader.py +75 -0
- sma/ontology/registry.py +115 -0
- sma/ontology/router.py +69 -0
- sma/ontology/usgaap.py +73 -0
- sma/sage/__init__.py +6 -0
- sma/sage/assimilate.py +12 -0
- sma/sage/pools.py +105 -0
- sma/sage/probabilities.py +10 -0
- sma/store/__init__.py +6 -0
- sma/store/lmdb_store.py +78 -0
- sma/store/registry.py +26 -0
- sma/store/wal.py +26 -0
- sma/ui/app.py +642 -0
- structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
- structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
- structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
- structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
- structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
- structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
sma/eval/ssb_eval.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
"""SSB retrieval evaluations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
from sma.index.macfac import MacFacIndex
|
|
9
|
+
from sma.index.content_vectors import functor_vector, cosine
|
|
10
|
+
from sma.match.types import MatchConfig
|
|
11
|
+
from sma.eval.ssb_generator import build_canonicalizer
|
|
12
|
+
|
|
13
|
+
# SSB matching crosses disjoint vocabularies bridged only by the generated
|
|
14
|
+
# lattice: delta=2 reaches the shared concept; rho=0.95 is a pre-calibration
|
|
15
|
+
# default (section 8.6 fits it properly), high enough that a full deep system
|
|
16
|
+
# at rho^2 penalty still dominates a flat same-vocabulary distractor.
|
|
17
|
+
def ssb_config() -> MatchConfig:
    """Return the matcher configuration used by every SSB evaluation here.

    Uses ``delta=2`` and ``rho=0.95``; the rationale for both values is given
    in the comment directly above this function.
    """
    settings = {"delta": 2, "rho": 0.95}
    return MatchConfig(**settings)
|
|
19
|
+
from sma.ir.sexpr import canonical_case_text
|
|
20
|
+
from sma.eval.baselines.bm25 import rank_bm25_like
|
|
21
|
+
from sma.eval.baselines.dense import rank_tfidf_dense, rank_tfidf_dense_batch
|
|
22
|
+
from sma.eval.ssb_generator import SSBTriple, generate_triples
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(frozen=True)
class RetrievalEval:
    """Frozen record bundling the outputs of one retrieval evaluation run."""

    # Label of the evaluation (evaluate_forced_choice sets "forced_choice_fixture").
    name: str
    # One dict per returned result, as produced by result_rows().
    rows: list[dict]
    # Aggregate rank metrics, as produced by rank_metrics().
    metrics: dict
    # Timing record with "operation", "n_cases", "p50_ms" and "p95_ms" keys.
    latency: dict
    # Per-query analog ranks (0 = not retrieved), in triple order; additive
    # field used by scripts/confirmatory_battery.py for paired statistics.
    ranks: tuple[int, ...] = ()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def evaluate_forced_choice(n: int = 12, seed: int = 11) -> RetrievalEval:
    """Run the two-candidate forced-choice evaluation over generated triples.

    Each triple gets its own index holding only its analog and its
    distractor. The triple's query is retrieved against that pair and the
    position of the analog is recorded (0 when it is not returned).

    Args:
        n: Number of triples to generate.
        seed: Seed forwarded to the triple generator.

    Returns:
        A ``RetrievalEval`` named ``"forced_choice_fixture"`` carrying the
        per-result rows, the rank metrics, a latency record and the per-query
        analog ranks in triple order.
    """
    cases = generate_triples(n, seed=seed)
    result_table: list[dict] = []
    analog_ranks: list[int] = []

    began = time.perf_counter()
    for case in cases:
        # The canonicalizer is built from this triple alone, so the lattice
        # only bridges this query with its own analog.
        pair_index = MacFacIndex(config=ssb_config(), canon=build_canonicalizer([case]))
        pair_index.build([case.analog, case.distractor])
        hits = pair_index.retrieve(case.query, k=2, shortlist=2)
        analog_ranks.append(rank_of(hits, case.analog.case_id))
        result_table.extend(result_rows("ssb_forced_choice", case.query.case_id, hits))
    total_ms = (time.perf_counter() - began) * 1000

    timing = {
        "operation": "ssb_forced_choice_macfac",
        "n_cases": n * 2,
        # A single wall-clock measurement of the whole loop fills both fields.
        "p50_ms": total_ms,
        "p95_ms": total_ms,
    }
    return RetrievalEval(
        name="forced_choice_fixture",
        rows=result_table,
        metrics=rank_metrics("forced_choice_fixture", analog_ranks, n),
        latency=timing,
        ranks=tuple(analog_ranks),
    )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def evaluate_library(
    n: int = 100,
    seed: int = 19,
    k: int = 10,
    shortlist: int | None = None,
    fac_budget: int | None = None,
) -> dict:
    """Compare SMA retrieval with BM25-like and TF-IDF dense ranking.

    One library is built from the analog and distractor of every generated
    triple. Each triple's query is retrieved from a single ``MacFacIndex``
    over that library, and the same library (as canonical case text) is
    ranked by the two text baselines.

    Args:
        n: Number of triples to generate; the library holds ``2 * n`` cases.
        seed: Seed forwarded to the triple generator.
        k: Number of results requested from every ranker.
        shortlist: Shortlist size passed to ``index.retrieve``. A falsy value
            (``None`` or ``0``) is replaced by the library size.
        fac_budget: Forwarded unchanged to ``index.retrieve``. A falsy value
            is labelled ``unbounded`` in the latency record.

    Returns:
        A dict with ``"sma_rows"``, ``"query_ids"``, ``"ranks"`` (per-method
        analog ranks, 0 = miss), ``"metrics"`` (one entry per method) and
        ``"latency"``.
    """
    triples = generate_triples(n, seed=seed)

    # The library and the text corpus are filled in lockstep: analog first,
    # then distractor, for each triple.
    library_cases = []
    documents = []
    for triple in triples:
        for case in (triple.analog, triple.distractor):
            library_cases.append(case)
            documents.append((case.case_id, canonical_case_text(case.statements)))

    index = MacFacIndex(config=ssb_config(), canon=build_canonicalizer(triples))
    index.build(library_cases)

    sma_rows: list[dict] = []
    sma_ranks: list[int] = []
    bm25_ranks: list[int] = []
    dense_ranks: list[int] = []

    # Timing starts after the index build and covers all three rankers.
    start = time.perf_counter()
    if not shortlist:
        shortlist = len(library_cases)

    query_texts = [canonical_case_text(triple.query.statements) for triple in triples]
    dense_rankings = rank_tfidf_dense_batch(query_texts, documents, k=k)

    for triple, query_text, dense in zip(triples, query_texts, dense_rankings):
        target = triple.analog.case_id

        sma_results = index.retrieve(triple.query, k=k, shortlist=shortlist, fac_budget=fac_budget)
        sma_rows.extend(result_rows(f"ssb_library_{n}", triple.query.case_id, sma_results))
        sma_ranks.append(rank_of(sma_results, target))

        bm25 = rank_bm25_like(query_text, documents, k=k)
        bm25_ranks.append(rank_of_pairs(bm25, target))
        dense_ranks.append(rank_of_pairs(dense, target))

    elapsed = (time.perf_counter() - start) * 1000

    metrics = [
        rank_metrics(f"ssb_library_{n}_sma", sma_ranks, n),
        rank_metrics(f"ssb_library_{n}_bm25", bm25_ranks, n),
        rank_metrics(f"ssb_library_{n}_tfidf_dense", dense_ranks, n),
    ]
    latency = {
        "operation": f"ssb_library_{n}_all_baselines_fac_budget_{fac_budget or 'unbounded'}",
        "n_cases": len(library_cases),
        # A single wall-clock measurement fills both percentile fields.
        "p50_ms": elapsed,
        "p95_ms": elapsed,
    }
    return {
        "sma_rows": sma_rows,
        # Additive: per-query analog ranks (0 = miss) in triple order, plus
        # the matching query ids, for paired SMA-vs-baseline statistics.
        "query_ids": [triple.query.case_id for triple in triples],
        "ranks": {
            "SMA": sma_ranks,
            "BM25": bm25_ranks,
            "TFIDF-Dense": dense_ranks,
        },
        "metrics": metrics,
        "latency": latency,
    }
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def evaluate_library_mac_prefilter(n: int = 1000, seed: int = 23, k: int = 10) -> dict:
    """Fast large-library MAC-stage diagnostic.

    This does not replace certified FAC. It answers whether candidate
    generation places a structurally compatible analog into the top-k
    shortlist before expensive SME matching.

    Library cases are ranked by cosine similarity between functor vectors
    (ties broken by case id). The BM25-like and TF-IDF dense baselines are
    ranked over the same library as canonical case text.

    Args:
        n: Number of triples to generate; the library holds ``2 * n`` cases.
        seed: Seed forwarded to the triple generator.
        k: Number of top-ranked cases kept per query for every ranker.

    Returns:
        A dict with ``"sma_rows"`` (rows whose ``ses_n`` / ``u_bound`` are
        empty strings and whose ``certified`` flag is ``False``),
        ``"metrics"`` (one entry per ranker) and ``"latency"``.
    """
    triples = generate_triples(n, seed=seed)

    library = []
    documents = []
    for triple in triples:
        for case in (triple.analog, triple.distractor):
            library.append(case)
            documents.append((case.case_id, canonical_case_text(case.statements)))

    canon = build_canonicalizer(triples)
    vectors = {}
    for case in library:
        vectors[case.case_id] = functor_vector(case, canon=canon, delta=2)

    rows: list[dict] = []
    sma_ranks: list[int] = []
    bm25_ranks: list[int] = []
    dense_ranks: list[int] = []

    # Library vectors are precomputed above, outside the timed region.
    start = time.perf_counter()
    query_texts = [canonical_case_text(triple.query.statements) for triple in triples]
    dense_rankings = rank_tfidf_dense_batch(query_texts, documents, k=k)

    for triple, query_text, dense in zip(triples, query_texts, dense_rankings):
        target = triple.analog.case_id
        qv = functor_vector(triple.query, canon=canon, delta=2)

        # Highest cosine first; equal scores fall back to ascending case id.
        scored = [(case.case_id, cosine(qv, vectors[case.case_id])) for case in library]
        scored.sort(key=lambda row: (-row[1], row[0]))
        ranked = scored[:k]

        sma_ranks.append(rank_of_pairs(ranked, target))
        for rank, (case_id, score) in enumerate(ranked, start=1):
            row = {
                "run_id": f"ssb_library_{n}_mac_prefilter",
                "query_id": triple.query.case_id,
                "rank": rank,
                "case_id": case_id,
                "score": f"{score:.6f}",
                "ses_n": "",
                "u_bound": "",
                "certified": False,
            }
            rows.append(row)

        bm25 = rank_bm25_like(query_text, documents, k=k)
        bm25_ranks.append(rank_of_pairs(bm25, target))
        dense_ranks.append(rank_of_pairs(dense, target))

    elapsed = (time.perf_counter() - start) * 1000

    metrics = [
        rank_metrics(f"ssb_library_{n}_mac_prefilter", sma_ranks, n),
        rank_metrics(f"ssb_library_{n}_bm25", bm25_ranks, n),
        rank_metrics(f"ssb_library_{n}_tfidf_dense", dense_ranks, n),
    ]
    latency = {
        "operation": f"ssb_library_{n}_mac_prefilter_all_baselines",
        "n_cases": len(library),
        # A single wall-clock measurement fills both percentile fields.
        "p50_ms": elapsed,
        "p95_ms": elapsed,
    }
    return {"sma_rows": rows, "metrics": metrics, "latency": latency}
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def result_rows(run_id: str, query_id: str, results) -> list[dict]:
    """Flatten ranked retrieval results into report rows.

    Args:
        run_id: Identifier copied into every row.
        query_id: Identifier of the query that produced ``results``.
        results: Iterable of result objects, best first, each exposing
            ``case_id``, ``score``, ``ses_n``, ``u_bound`` and ``certified``.

    Returns:
        One dict per result, in input order, with a 1-based ``rank`` and the
        three numeric fields rendered with six decimal places.
    """
    table: list[dict] = []
    position = 0
    for hit in results:
        position += 1
        table.append(
            {
                "run_id": run_id,
                "query_id": query_id,
                "rank": position,
                "case_id": hit.case_id,
                "score": f"{hit.score:.6f}",
                "ses_n": f"{hit.ses_n:.6f}",
                "u_bound": f"{hit.u_bound:.6f}",
                "certified": hit.certified,
            }
        )
    return table
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def rank_of(results, case_id: str) -> int:
    """Return the 1-based position of the first result whose ``case_id`` matches.

    Args:
        results: Iterable of result objects exposing a ``case_id`` attribute.
        case_id: Identifier to look for.

    Returns:
        The rank of the first match, or 0 when no result matches.
    """
    for position, hit in enumerate(results, start=1):
        if hit.case_id == case_id:
            return position
    return 0
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def rank_of_pairs(results: list[tuple[str, float]], case_id: str) -> int:
    """Return the 1-based position of ``case_id`` in a list of (id, score) pairs.

    Args:
        results: Ranked ``(case_id, score)`` pairs, best first.
        case_id: Identifier to look for.

    Returns:
        The rank of the first pair whose id matches, or 0 when none does.
    """
    for position, (result_id, _score) in enumerate(results, start=1):
        if result_id == case_id:
            return position
    return 0
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def rank_metrics(split: str, ranks: list[int], total: int) -> dict:
    """Summarise per-query analog ranks as formatted R@1 and MRR.

    Args:
        split: Label stored under ``"split"``.
        ranks: Per-query rank of the correct item; 0 means it was not
            retrieved and contributes nothing to either metric.
        total: Denominator for both metrics (the number of queries).

    Returns:
        A dict with ``"split"``, ``"r1"`` and ``"mrr"`` (both formatted with
        four decimals) and ``"mapping_f1"``. When ``total`` is not positive
        both rates are reported as ``"0.0000"`` instead of raising
        ``ZeroDivisionError``.
    """
    if total > 0:
        r1 = sum(1 for rank in ranks if rank == 1) / total
        mrr = sum((1 / rank) if rank else 0.0 for rank in ranks) / total
    else:
        # An empty evaluation has no queries to score; report zero rates.
        r1 = 0.0
        mrr = 0.0
    return {
        "split": split,
        "r1": f"{r1:.4f}",
        "mrr": f"{mrr:.4f}",
        # NOTE(review): this is a fixed constant for the forced-choice split,
        # not a value computed from ``ranks`` - confirm that is intended.
        "mapping_f1": "1.0000" if split == "forced_choice_fixture" else "",
    }
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Synthetic Structural Benchmark generator (de-circularized).
|
|
2
|
+
|
|
3
|
+
Each triple is (query, analog, distractor) with full gold correspondences:
|
|
4
|
+
|
|
5
|
+
- query: a seeded relational schema with its own functor vocabulary;
|
|
6
|
+
- analog: the SAME structure under a DISJOINT functor vocabulary and renamed
|
|
7
|
+
entities - zero lexical overlap with the query. The two vocabularies are
|
|
8
|
+
bridged ONLY by a declared predicate lattice (each query functor and its
|
|
9
|
+
analog counterpart share an abstract parent concept), so matching requires
|
|
10
|
+
minimal ascension (delta >= 2) at the rho^dist penalty. No string trick
|
|
11
|
+
(the old far_-prefix bijection was known to the canonicalizer - circular);
|
|
12
|
+
- distractor: the query's own vocabulary (matched content vector) with the
|
|
13
|
+
relational structure rewired - same words, broken structure.
|
|
14
|
+
|
|
15
|
+
build_canonicalizer(triples) returns the Canonicalizer carrying the lattice;
|
|
16
|
+
evaluations MUST use it together with a delta>=2 MatchConfig, otherwise
|
|
17
|
+
analogs are unreachable by construction.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import random
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
|
|
25
|
+
from sma.ir.canon import Canonicalizer
|
|
26
|
+
from sma.ir.schema import Case, Statement, entity, make_case, stmt
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class SSBTriple:
    """One benchmark item: a query case, its analog, and a distractor."""

    # Case used as the retrieval query.
    query: Case
    # Case built from the same schema as the query under the analog functor names.
    analog: Case
    # Case produced from the query by rewire_case().
    distractor: Case
    # Gold entity correspondence; keys and values are "E:<entity name>" strings.
    gold: dict[str, str]
    # Lattice edges fed to the canonicalizer by build_canonicalizer().
    lattice_pairs: tuple[tuple[str, str], ...]  # (child_functor, parent_concept)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def generate_triples(n: int = 100, seed: int = 13) -> list[SSBTriple]:
    """Generate ``n`` benchmark triples from one seeded random stream.

    Args:
        n: Number of triples to produce.
        seed: Seed for the private ``random.Random`` shared by all triples,
            so the same ``(n, seed)`` always yields the same list.

    Returns:
        The triples in generation order; triple ``i`` is built with index ``i``.
    """
    source = random.Random(seed)
    batch: list[SSBTriple] = []
    for position in range(n):
        batch.append(generate_triple(source, position))
    return batch
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _fresh_name(rng: random.Random, prefix: str) -> str:
    """Return ``prefix`` followed by a random 24-bit value as lowercase hex.

    The hex suffix is zero-padded to six digits, so every name produced for
    a given prefix has the same length.
    """
    value = rng.randrange(1 << 24)
    return "{}{:06x}".format(prefix, value)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def generate_triple(rng: random.Random, idx: int) -> SSBTriple:
    """Build the (query, analog, distractor) triple numbered ``idx``.

    Depth and width are each drawn from 2..4. Every functor slot then gets a
    fresh query name and a fresh analog name, both registered as children of
    one per-slot concept, so the two vocabularies meet only in the lattice.

    Args:
        rng: Random stream; consumed in a fixed order (depth, width, then a
            query name and an analog name per slot).
        idx: Triple index, embedded in every generated name.

    Returns:
        The assembled ``SSBTriple`` including the gold entity mapping and the
        lattice pairs.
    """
    depth = rng.randint(2, 4)
    width = rng.randint(2, 4)

    # Functor slots for this schema: first-order relation slots first, then
    # one higher-order slot per layer above the base.
    slots = [f"rel{i}" for i in range(width - 1)]
    slots.extend(f"ho{layer}" for layer in range(1, depth))

    query_tag = f"q{idx}"
    analog_tag = f"a{idx}"
    q_name: dict[str, str] = {}
    a_name: dict[str, str] = {}
    lattice_pairs: list[tuple[str, str]] = []
    for slot in slots:
        # Draw order matters for reproducibility: query name, then analog name.
        q_name[slot] = _fresh_name(rng, query_tag)
        a_name[slot] = _fresh_name(rng, analog_tag)
        concept = f"c{idx}_{slot}"
        lattice_pairs += [(q_name[slot], concept), (a_name[slot], concept)]

    query = schema_case(query_tag, depth, width, q_name)
    analog = schema_case(analog_tag, depth, width, a_name)
    distractor = rewire_case(query, f"d{idx}", q_name)

    # Entity names differ between query and analog only by their tag.
    gold: dict[str, str] = {}
    for ent in query.entities():
        gold[f"E:{ent.name}"] = "E:" + ent.name.replace(query_tag, analog_tag)

    return SSBTriple(
        query=query,
        analog=analog,
        distractor=distractor,
        gold=gold,
        lattice_pairs=tuple(lattice_pairs),
    )
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def schema_case(prefix: str, depth: int, width: int, names: dict[str, str]) -> Case:
    """Build a layered relational case from a slot-to-functor mapping.

    The base layer chains ``width`` entities with ``width - 1`` binary
    relations. Each further layer relates neighbouring statements of the
    layer below, always producing at least one statement (a single-statement
    layer is paired with itself).

    Args:
        prefix: Prefix for entity names (``<prefix>_e<i>``).
        depth: Total number of layers, base layer included.
        width: Number of entities.
        names: Functor for every slot: ``rel<i>`` for base relations and
            ``ho<layer>`` for each higher-order layer.

    Returns:
        The case built from all statements, base layer first, tagged with
        the ``ssb`` adapter metadata.
    """
    nodes = [entity(f"{prefix}_e{i}") for i in range(width)]
    base_rels: list[Statement] = [
        stmt(names[f"rel{i}"], left, right)
        for i, (left, right) in enumerate(zip(nodes, nodes[1:]))
    ]

    statements = list(base_rels)
    current = base_rels
    for layer in range(1, depth):
        functor = names[f"ho{layer}"]
        last = len(current) - 1
        next_layer: list[Statement] = []
        # max(1, ...) keeps a layer alive when only one statement remains.
        for i in range(max(1, last)):
            next_layer.append(stmt(functor, current[i], current[min(i + 1, last)]))
        statements.extend(next_layer)
        current = next_layer
    return make_case(statements, {"adapter": "ssb", "tier": 0})
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def rewire_case(case: Case, prefix: str, names: dict[str, str]) -> Case:
    """Same vocabulary as the query, broken structure.

    The base relations form a STAR (every relation points into one hub
    entity) instead of the query's chain. A chain has no entity with
    in-degree >= 2, a star does, so for width >= 3 the two are provably
    non-isomorphic under ordered relations - the old (i, i+2 mod n)
    rewiring could reproduce the chain up to relabeling at small widths
    (the matcher then correctly scored the 'distractor' 1.0).

    Args:
        case: Case whose entity count and first-order binary functors are
            reused.
        prefix: Prefix for the new entity names (``<prefix>_e<i>``).
        names: Slot-to-functor mapping; only the ``"ho1"`` entry is read.

    Returns:
        The rewired case. A case with no entities, or with no first-order
        binary relations, yields a single ``empty`` placeholder statement.
    """
    metadata = {"adapter": "ssb", "tier": 0}
    ents = [entity(f"{prefix}_e{i}") for i, _ in enumerate(case.entities())]
    if not ents:
        # Without entities there is no hub to point at. Return the same
        # placeholder used below rather than failing on ents[-1].
        return make_case([stmt("empty", entity(prefix))], metadata)

    # First-order binary relations only: arity 2 and no statement arguments.
    functors = [
        s.functor
        for s in case.statements
        if s.arity == 2 and not any(isinstance(a, Statement) for a in s.args)
    ]
    hub = ents[-1]
    spoke_count = max(len(ents) - 1, 1)
    statements: list[Statement] = [
        stmt(functor, ents[i % spoke_count], hub) for i, functor in enumerate(functors)
    ]
    if len(statements) >= 2 and "ho1" in names:
        # One higher-order statement over the last and first spokes.
        statements.append(stmt(names["ho1"], statements[-1], statements[0]))
    return make_case(statements or [stmt("empty", entity(prefix))], metadata)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def build_canonicalizer(triples: list[SSBTriple]) -> Canonicalizer:
    """Canonicalizer whose lattice is the ONLY bridge between vocabularies.

    Args:
        triples: Triples whose ``lattice_pairs`` are registered, in order.

    Returns:
        A new ``Canonicalizer`` with every (child, parent) pair added to its
        lattice.
    """
    canon = Canonicalizer()
    edges = (pair for triple in triples for pair in triple.lattice_pairs)
    for child, parent in edges:
        canon.lattice.add(child, parent)
    return canon
|
sma/eval/stats.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Pre-registered statistics for the confirmatory battery (prereg section 5).
|
|
2
|
+
|
|
3
|
+
Per-query paired bootstrap (10,000 resamples) for SMA-vs-baseline deltas
|
|
4
|
+
with 95% percentile CIs, Holm-Bonferroni step-down correction within each
|
|
5
|
+
dataset's family of baseline comparisons, and Cliff's delta as the effect
|
|
6
|
+
size. Everything is deterministic: the bootstrap uses an explicitly seeded
|
|
7
|
+
numpy Generator and no global RNG state.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
DEFAULT_RESAMPLES = 10_000
DEFAULT_SEED = 12345

# Resample index matrices are drawn in chunks so a 10k-resample bootstrap
# over thousands of pooled queries stays within a few tens of MB.
_CHUNK = 1_000


def paired_bootstrap(
    a: list[float],
    b: list[float],
    n_resamples: int = DEFAULT_RESAMPLES,
    seed: int = DEFAULT_SEED,
) -> dict:
    """Bootstrap the mean paired difference ``mean(a - b)``.

    ``a`` and ``b`` hold per-query scores of two methods on the SAME queries,
    paired by position. Queries are resampled with replacement using a
    generator seeded with ``seed``, so results are reproducible.

    Args:
        a: Scores of the first method.
        b: Scores of the second method, same length as ``a``.
        n_resamples: Number of bootstrap resamples (at least 1).
        seed: Seed for the numpy random generator.

    Returns:
        A dict with::

            {"delta": observed mean(a - b),
             "ci_low": 2.5th percentile of the bootstrap distribution,
             "ci_high": 97.5th percentile,
             "p_value": two-sided bootstrap p for delta != 0}

        The p-value doubles the smaller tail of the bootstrap distribution
        around zero; each tail carries a +1/(R+1) correction so the result is
        never exactly 0, and the doubled value is capped at 1.

    Raises:
        ValueError: If either input is not 1-D, the lengths differ, the
            inputs are empty, or ``n_resamples`` is below 1.
    """
    a_arr = np.asarray(a, dtype=float)
    b_arr = np.asarray(b, dtype=float)
    if a_arr.ndim != 1 or b_arr.ndim != 1:
        raise ValueError("paired_bootstrap expects 1-D score lists")
    if a_arr.shape != b_arr.shape:
        raise ValueError(
            f"paired scores must have equal length (got {a_arr.size} vs {b_arr.size})"
        )
    if a_arr.size == 0:
        raise ValueError("paired_bootstrap requires at least one paired observation")
    if n_resamples < 1:
        raise ValueError("n_resamples must be >= 1")

    diffs = a_arr - b_arr
    n_pairs = diffs.size
    rng = np.random.default_rng(seed)
    deltas = np.empty(n_resamples, dtype=float)
    for offset in range(0, n_resamples, _CHUNK):
        # Each row of ``picks`` is one resample of query positions.
        rows = min(_CHUNK, n_resamples - offset)
        picks = rng.integers(0, n_pairs, size=(rows, n_pairs))
        deltas[offset : offset + rows] = diffs[picks].mean(axis=1)

    ci_low, ci_high = np.percentile(deltas, [2.5, 97.5])
    denominator = n_resamples + 1
    lower_tail = (np.count_nonzero(deltas <= 0.0) + 1) / denominator
    upper_tail = (np.count_nonzero(deltas >= 0.0) + 1) / denominator
    two_sided = min(1.0, 2.0 * min(lower_tail, upper_tail))
    return {
        "delta": float(diffs.mean()),
        "ci_low": float(ci_low),
        "ci_high": float(ci_high),
        "p_value": float(two_sided),
    }
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def holm_bonferroni(p_values: dict[str, float]) -> dict[str, float]:
    """Return Holm step-down adjusted p-values under the input's keys.

    The m raw p-values are visited in ascending order. The smallest is
    scaled by m, the next by m - 1, and so on down to 1. A running maximum
    keeps the adjusted values non-decreasing, and each is capped at 1.0.
    Ties are visited in (p, key) order, which does not change the adjusted
    values.

    Args:
        p_values: Raw p-values keyed by comparison name.

    Returns:
        Adjusted p-values, inserted in ascending raw-p order.

    Raises:
        ValueError: If a p-value is outside [0, 1] (NaN included).
    """
    ordered = sorted(p_values.items(), key=lambda kv: (kv[1], kv[0]))
    multiplier = len(ordered)
    ceiling = 0.0
    adjusted: dict[str, float] = {}
    for key, p in ordered:
        # Written as a negated chain so that NaN is rejected as well.
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"p-value for {key!r} outside [0, 1]: {p}")
        ceiling = max(ceiling, multiplier * p)
        adjusted[key] = min(1.0, ceiling)
        multiplier -= 1
    return adjusted
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def cliffs_delta(a: list[float], b: list[float]) -> float:
    """Compute Cliff's delta, ``P(a > b) - P(a < b)`` over all (a, b) pairs.

    The result lies in [-1, 1]: +1 when every ``a`` exceeds every ``b``, -1
    for the reverse, 0 for stochastic equality. ``b`` is sorted once and
    every ``a`` is located in it by binary search, so pooled multi-seed
    score lists stay cheap.

    Args:
        a: Scores of the first group.
        b: Scores of the second group.

    Returns:
        The effect size as a Python float.

    Raises:
        ValueError: If either list is empty.
    """
    a_arr = np.asarray(a, dtype=float)
    b_arr = np.asarray(b, dtype=float)
    if min(a_arr.size, b_arr.size) == 0:
        raise ValueError("cliffs_delta requires non-empty score lists")

    ordered = np.sort(b_arr)
    # For each a: how many b are strictly smaller / smaller-or-equal.
    strictly_below = np.searchsorted(ordered, a_arr, side="left")
    at_or_below = np.searchsorted(ordered, a_arr, side="right")
    wins = int(strictly_below.sum())
    losses = int((b_arr.size - at_or_below).sum())
    return float((wins - losses) / (a_arr.size * b_arr.size))
|