structuremappingmemory 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. sma/__init__.py +5 -0
  2. sma/__main__.py +5 -0
  3. sma/agent/__init__.py +5 -0
  4. sma/agent/adapter_draft.py +217 -0
  5. sma/agent/api.py +67 -0
  6. sma/agent/comparison.py +591 -0
  7. sma/agent/llm.py +280 -0
  8. sma/agent/policies.py +21 -0
  9. sma/agent/service.py +95 -0
  10. sma/cli.py +65 -0
  11. sma/encoders/__init__.py +38 -0
  12. sma/encoders/agentobs.py +27 -0
  13. sma/encoders/base.py +23 -0
  14. sma/encoders/code_treesitter.py +64 -0
  15. sma/encoders/coverage.py +80 -0
  16. sma/encoders/draft_adapter.py +183 -0
  17. sma/encoders/healthcare.py +207 -0
  18. sma/encoders/logs_drain.py +142 -0
  19. sma/encoders/prose_tier1.py +57 -0
  20. sma/encoders/structured.py +57 -0
  21. sma/encoders/traces.py +45 -0
  22. sma/eval/__init__.py +2 -0
  23. sma/eval/agentic/__init__.py +35 -0
  24. sma/eval/agentic/arms/__init__.py +0 -0
  25. sma/eval/agentic/arms/cyber.py +48 -0
  26. sma/eval/agentic/arms/discovery.py +35 -0
  27. sma/eval/agentic/arms/finance.py +38 -0
  28. sma/eval/agentic/arms/legal.py +74 -0
  29. sma/eval/agentic/arms/medicine.py +45 -0
  30. sma/eval/agentic/harness.py +275 -0
  31. sma/eval/agentic/memories.py +308 -0
  32. sma/eval/agentic/metrics.py +82 -0
  33. sma/eval/agentic_qa/__init__.py +27 -0
  34. sma/eval/agentic_qa/agent.py +383 -0
  35. sma/eval/agentic_qa/metrics.py +239 -0
  36. sma/eval/agentic_qa/pools.py +197 -0
  37. sma/eval/arn.py +65 -0
  38. sma/eval/baselines/__init__.py +6 -0
  39. sma/eval/baselines/bge_dense.py +54 -0
  40. sma/eval/baselines/bm25.py +18 -0
  41. sma/eval/baselines/dense.py +42 -0
  42. sma/eval/baselines/hipporag.py +235 -0
  43. sma/eval/baselines/hybrid_rrf.py +30 -0
  44. sma/eval/baselines/longcontext_llm.py +124 -0
  45. sma/eval/baselines/rerank.py +41 -0
  46. sma/eval/baselines/splade.py +77 -0
  47. sma/eval/baselines/wl_kernel.py +163 -0
  48. sma/eval/bugsinpy.py +358 -0
  49. sma/eval/bugsinpy_families.py +164 -0
  50. sma/eval/crossdomain.py +89 -0
  51. sma/eval/diabetes.py +61 -0
  52. sma/eval/drift_env.py +26 -0
  53. sma/eval/drift_metrics.py +24 -0
  54. sma/eval/family_labels.py +167 -0
  55. sma/eval/fraud_elliptic/__init__.py +29 -0
  56. sma/eval/fraud_elliptic/encoder.py +279 -0
  57. sma/eval/fraud_elliptic/eval.py +269 -0
  58. sma/eval/fraud_elliptic/test_encoder.py +123 -0
  59. sma/eval/ieee_cis.py +66 -0
  60. sma/eval/loghub.py +16 -0
  61. sma/eval/loghub_eval.py +480 -0
  62. sma/eval/longmemeval.py +51 -0
  63. sma/eval/memory_backends/__init__.py +2 -0
  64. sma/eval/memory_backends/base.py +22 -0
  65. sma/eval/memory_backends/context_only.py +14 -0
  66. sma/eval/memory_backends/rag_notes.py +17 -0
  67. sma/eval/memory_backends/shared_llm.py +30 -0
  68. sma/eval/memory_backends/sma_memory.py +54 -0
  69. sma/eval/memory_backends/zep_graphiti.py +33 -0
  70. sma/eval/metrics.py +32 -0
  71. sma/eval/ontology_bench.py +219 -0
  72. sma/eval/report.py +573 -0
  73. sma/eval/ssb_eval.py +216 -0
  74. sma/eval/ssb_generator.py +116 -0
  75. sma/eval/stats.py +108 -0
  76. sma/eval/transfer_eval.py +844 -0
  77. sma/index/__init__.py +15 -0
  78. sma/index/ann.py +21 -0
  79. sma/index/content_vectors.py +60 -0
  80. sma/index/inverted.py +63 -0
  81. sma/index/macfac.py +174 -0
  82. sma/ir/__init__.py +22 -0
  83. sma/ir/canon.py +106 -0
  84. sma/ir/schema.py +165 -0
  85. sma/ir/sexpr.py +86 -0
  86. sma/ir/signatures.py +76 -0
  87. sma/match/__init__.py +20 -0
  88. sma/match/conflicts.py +46 -0
  89. sma/match/engine.py +60 -0
  90. sma/match/explain.py +59 -0
  91. sma/match/infer.py +54 -0
  92. sma/match/kernels.py +54 -0
  93. sma/match/mdl.py +30 -0
  94. sma/match/merge_cpsat.py +77 -0
  95. sma/match/merge_greedy.py +15 -0
  96. sma/match/mh.py +177 -0
  97. sma/match/ses.py +84 -0
  98. sma/match/types.py +115 -0
  99. sma/match/verifier.py +27 -0
  100. sma/ontology/__init__.py +45 -0
  101. sma/ontology/attack.py +134 -0
  102. sma/ontology/cpc.py +69 -0
  103. sma/ontology/graph.py +58 -0
  104. sma/ontology/loader.py +262 -0
  105. sma/ontology/mitre_xml.py +67 -0
  106. sma/ontology/mount.py +101 -0
  107. sma/ontology/rdf_loader.py +75 -0
  108. sma/ontology/registry.py +115 -0
  109. sma/ontology/router.py +69 -0
  110. sma/ontology/usgaap.py +73 -0
  111. sma/sage/__init__.py +6 -0
  112. sma/sage/assimilate.py +12 -0
  113. sma/sage/pools.py +105 -0
  114. sma/sage/probabilities.py +10 -0
  115. sma/store/__init__.py +6 -0
  116. sma/store/lmdb_store.py +78 -0
  117. sma/store/registry.py +26 -0
  118. sma/store/wal.py +26 -0
  119. sma/ui/app.py +642 -0
  120. structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
  121. structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
  122. structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
  123. structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
  124. structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
  125. structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
sma/eval/ssb_eval.py ADDED
@@ -0,0 +1,216 @@
1
+ """SSB retrieval evaluations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+ from dataclasses import dataclass
7
+
8
+ from sma.index.macfac import MacFacIndex
9
+ from sma.index.content_vectors import functor_vector, cosine
10
+ from sma.match.types import MatchConfig
11
+ from sma.eval.ssb_generator import build_canonicalizer
12
+
13
+ # SSB matching crosses disjoint vocabularies bridged only by the generated
14
+ # lattice: delta=2 reaches the shared concept; rho=0.95 is a pre-calibration
15
+ # default (section 8.6 fits it properly), high enough that a full deep system
16
+ # at rho^2 penalty still dominates a flat same-vocabulary distractor.
17
def ssb_config() -> MatchConfig:
    """Return the MatchConfig shared by the SSB evaluations.

    The values are fixed: ``delta=2`` and ``rho=0.95``.
    """
    settings = {"delta": 2, "rho": 0.95}
    return MatchConfig(**settings)
19
+ from sma.ir.sexpr import canonical_case_text
20
+ from sma.eval.baselines.bm25 import rank_bm25_like
21
+ from sma.eval.baselines.dense import rank_tfidf_dense, rank_tfidf_dense_batch
22
+ from sma.eval.ssb_generator import SSBTriple, generate_triples
23
+
24
+
25
@dataclass(frozen=True)
class RetrievalEval:
    """Immutable result bundle for one retrieval evaluation.

    The instance is frozen, so attributes cannot be rebound after
    construction; the list/dict values themselves remain mutable.
    """

    # Label of the evaluation run.
    name: str
    # One dict per retrieved result, as produced by ``result_rows``.
    rows: list[dict]
    # Aggregate metrics dict, as produced by ``rank_metrics``.
    metrics: dict
    # Timing summary for the run.
    latency: dict
    # Rank of the analog for each query (0 means it was not retrieved),
    # in triple order. Additive field consumed by
    # scripts/confirmatory_battery.py for paired statistics.
    ranks: tuple[int, ...] = ()
34
+
35
+
36
def evaluate_forced_choice(n: int = 12, seed: int = 11) -> RetrievalEval:
    """Run the two-way forced-choice SSB fixture.

    For each of ``n`` generated triples a fresh index is built over just
    that triple's analog and distractor, and the query is retrieved with
    ``k=2, shortlist=2``. The rank of the analog (0 when absent) is
    recorded per query.

    The latency entry reports the total wall-clock time of the whole loop
    in both ``p50_ms`` and ``p95_ms``.
    """
    run_name = "forced_choice_fixture"
    triples = generate_triples(n, seed=seed)

    rows: list[dict] = []
    ranks: list[int] = []
    t0 = time.perf_counter()
    for triple in triples:
        # The canonicalizer is built from this triple alone.
        index = MacFacIndex(config=ssb_config(), canon=build_canonicalizer([triple]))
        index.build([triple.analog, triple.distractor])
        hits = index.retrieve(triple.query, k=2, shortlist=2)
        ranks.append(rank_of(hits, triple.analog.case_id))
        rows.extend(result_rows("ssb_forced_choice", triple.query.case_id, hits))
    elapsed_ms = (time.perf_counter() - t0) * 1000

    latency = {
        "operation": "ssb_forced_choice_macfac",
        "n_cases": n * 2,
        "p50_ms": elapsed_ms,
        "p95_ms": elapsed_ms,
    }
    return RetrievalEval(
        name=run_name,
        rows=rows,
        metrics=rank_metrics(run_name, ranks, n),
        latency=latency,
        ranks=tuple(ranks),
    )
56
+
57
+
58
def evaluate_library(
    n: int = 100,
    seed: int = 19,
    k: int = 10,
    shortlist: int | None = None,
    fac_budget: int | None = None,
) -> dict:
    """Evaluate SMA retrieval against BM25 and TF-IDF dense over one library.

    The library holds the analog and the distractor of each of ``n``
    generated triples. Every triple's query is run through the MacFac
    index, the BM25-like ranker and the batched TF-IDF dense ranker, and the
    rank of that triple's analog (0 = miss) is recorded for each method.

    A falsy ``shortlist`` (``None`` or 0) falls back to the full library
    size. The timed region covers the baseline rankers as well as SMA
    retrieval, but not index construction; the total is reported in both
    ``p50_ms`` and ``p95_ms``.
    """
    triples = generate_triples(n, seed=seed)

    library_cases = []
    documents = []
    for triple in triples:
        for case in (triple.analog, triple.distractor):
            library_cases.append(case)
            documents.append((case.case_id, canonical_case_text(case.statements)))

    index = MacFacIndex(config=ssb_config(), canon=build_canonicalizer(triples))
    index.build(library_cases)

    run_id = f"ssb_library_{n}"
    sma_rows: list[dict] = []
    # Per-method analog ranks, in triple order.
    ranks: dict[str, list[int]] = {"SMA": [], "BM25": [], "TFIDF-Dense": []}

    started = time.perf_counter()
    shortlist = shortlist or len(library_cases)

    query_texts = [canonical_case_text(triple.query.statements) for triple in triples]
    dense_rankings = rank_tfidf_dense_batch(query_texts, documents, k=k)

    for triple, query_text, dense_hits in zip(triples, query_texts, dense_rankings):
        sma_hits = index.retrieve(
            triple.query, k=k, shortlist=shortlist, fac_budget=fac_budget
        )
        sma_rows.extend(result_rows(run_id, triple.query.case_id, sma_hits))
        ranks["SMA"].append(rank_of(sma_hits, triple.analog.case_id))

        bm25_hits = rank_bm25_like(query_text, documents, k=k)
        ranks["BM25"].append(rank_of_pairs(bm25_hits, triple.analog.case_id))
        ranks["TFIDF-Dense"].append(rank_of_pairs(dense_hits, triple.analog.case_id))

    elapsed_ms = (time.perf_counter() - started) * 1000

    metrics = [
        rank_metrics(f"ssb_library_{n}_sma", ranks["SMA"], n),
        rank_metrics(f"ssb_library_{n}_bm25", ranks["BM25"], n),
        rank_metrics(f"ssb_library_{n}_tfidf_dense", ranks["TFIDF-Dense"], n),
    ]
    latency = {
        "operation": f"ssb_library_{n}_all_baselines_fac_budget_{fac_budget or 'unbounded'}",
        "n_cases": len(library_cases),
        "p50_ms": elapsed_ms,
        "p95_ms": elapsed_ms,
    }
    return {
        "sma_rows": sma_rows,
        # Additive: the query ids matching the per-query ranks below, for
        # paired SMA-vs-baseline statistics.
        "query_ids": [triple.query.case_id for triple in triples],
        "ranks": ranks,
        "metrics": metrics,
        "latency": latency,
    }
117
+
118
+
119
def evaluate_library_mac_prefilter(n: int = 1000, seed: int = 23, k: int = 10) -> dict:
    """Fast large-library MAC-stage diagnostic.

    This does not replace certified FAC. It answers whether candidate
    generation places a structurally compatible analog into the top-k
    shortlist before expensive SME matching.

    Library cases are ranked by cosine similarity between functor vectors
    (``delta=2``), ties broken by ascending case id, and compared with the
    BM25-like and TF-IDF dense baselines. Emitted rows are never certified
    and leave ``ses_n`` / ``u_bound`` blank. Functor vectors for the library
    are computed before timing starts; the total elapsed time is reported in
    both ``p50_ms`` and ``p95_ms``.
    """
    triples = generate_triples(n, seed=seed)

    library = []
    documents = []
    for triple in triples:
        for case in (triple.analog, triple.distractor):
            library.append(case)
            documents.append((case.case_id, canonical_case_text(case.statements)))

    canon = build_canonicalizer(triples)
    vectors = {}
    for case in library:
        vectors[case.case_id] = functor_vector(case, canon=canon, delta=2)

    run_id = f"ssb_library_{n}_mac_prefilter"
    rows: list[dict] = []
    sma_ranks: list[int] = []
    bm25_ranks: list[int] = []
    dense_ranks: list[int] = []

    started = time.perf_counter()
    query_texts = [canonical_case_text(triple.query.statements) for triple in triples]
    dense_rankings = rank_tfidf_dense_batch(query_texts, documents, k=k)

    for triple, query_text, dense_hits in zip(triples, query_texts, dense_rankings):
        query_vector = functor_vector(triple.query, canon=canon, delta=2)
        scored = [
            (case.case_id, cosine(query_vector, vectors[case.case_id]))
            for case in library
        ]
        # Highest similarity first; case id breaks ties deterministically.
        scored.sort(key=lambda pair: (-pair[1], pair[0]))
        ranked = scored[:k]

        sma_ranks.append(rank_of_pairs(ranked, triple.analog.case_id))
        rows.extend(
            {
                "run_id": run_id,
                "query_id": triple.query.case_id,
                "rank": position,
                "case_id": case_id,
                "score": f"{score:.6f}",
                "ses_n": "",
                "u_bound": "",
                "certified": False,
            }
            for position, (case_id, score) in enumerate(ranked, start=1)
        )

        bm25_hits = rank_bm25_like(query_text, documents, k=k)
        bm25_ranks.append(rank_of_pairs(bm25_hits, triple.analog.case_id))
        dense_ranks.append(rank_of_pairs(dense_hits, triple.analog.case_id))

    elapsed_ms = (time.perf_counter() - started) * 1000

    metrics = [
        rank_metrics(run_id, sma_ranks, n),
        rank_metrics(f"ssb_library_{n}_bm25", bm25_ranks, n),
        rank_metrics(f"ssb_library_{n}_tfidf_dense", dense_ranks, n),
    ]
    latency = {
        "operation": f"ssb_library_{n}_mac_prefilter_all_baselines",
        "n_cases": len(library),
        "p50_ms": elapsed_ms,
        "p95_ms": elapsed_ms,
    }
    return {"sma_rows": rows, "metrics": metrics, "latency": latency}
184
+
185
+
186
def result_rows(run_id: str, query_id: str, results) -> list[dict]:
    """Flatten retrieval results into one report row per result.

    Rows are numbered from rank 1 in the order ``results`` is iterated.
    ``score``, ``ses_n`` and ``u_bound`` are rendered as strings with six
    decimal places; ``certified`` is passed through unchanged.
    """
    rows: list[dict] = []
    for position, hit in enumerate(results, start=1):
        rows.append(
            {
                "run_id": run_id,
                "query_id": query_id,
                "rank": position,
                "case_id": hit.case_id,
                "score": f"{hit.score:.6f}",
                "ses_n": f"{hit.ses_n:.6f}",
                "u_bound": f"{hit.u_bound:.6f}",
                "certified": hit.certified,
            }
        )
    return rows
200
+
201
+
202
def rank_of(results, case_id: str) -> int:
    """Return the 1-based position of ``case_id`` among ``results``.

    Each result is compared through its ``case_id`` attribute; the first
    match wins. Returns 0 when no result matches.
    """
    for position, hit in enumerate(results, start=1):
        if hit.case_id == case_id:
            return position
    return 0
204
+
205
+
206
def rank_of_pairs(results: list[tuple[str, float]], case_id: str) -> int:
    """Return the 1-based position of ``case_id`` in ``(id, score)`` pairs.

    The score component is ignored; the first matching id wins. Returns 0
    when the id does not occur.
    """
    for position, (candidate_id, _) in enumerate(results, start=1):
        if candidate_id == case_id:
            return position
    return 0
208
+
209
+
210
def rank_metrics(split: str, ranks: list[int], total: int) -> dict:
    """Summarise per-query analog ranks into R@1 and MRR for one split.

    Args:
        split: Label of the split; copied into the result.
        ranks: Per-query rank of the analog, where 0 means "not retrieved".
        total: Number of queries used as the denominator.

    Returns:
        A dict with ``split``, ``r1`` and ``mrr`` (both rendered with four
        decimals) and ``mapping_f1``. ``mapping_f1`` is the constant string
        ``"1.0000"`` for the ``forced_choice_fixture`` split and empty
        otherwise; it is not computed from ``ranks``.

    A non-positive ``total`` (an evaluation run with zero queries) yields
    ``"0.0000"`` for both rates instead of raising ``ZeroDivisionError``.
    """
    if total > 0:
        r1 = sum(1 for rank in ranks if rank == 1) / total
        # Rank 0 marks a miss and contributes nothing to the reciprocal sum.
        mrr = sum(1 / rank for rank in ranks if rank) / total
    else:
        r1 = mrr = 0.0
    return {
        "split": split,
        "r1": f"{r1:.4f}",
        "mrr": f"{mrr:.4f}",
        "mapping_f1": "1.0000" if split == "forced_choice_fixture" else "",
    }
@@ -0,0 +1,116 @@
1
+ """Synthetic Structural Benchmark generator (de-circularized).
2
+
3
+ Each triple is (query, analog, distractor) with full gold correspondences:
4
+
5
+ - query: a seeded relational schema with its own functor vocabulary;
6
+ - analog: the SAME structure under a DISJOINT functor vocabulary and renamed
7
+ entities - zero lexical overlap with the query. The two vocabularies are
8
+ bridged ONLY by a declared predicate lattice (each query functor and its
9
+ analog counterpart share an abstract parent concept), so matching requires
10
+ minimal ascension (delta >= 2) at the rho^dist penalty. No string trick
11
+ (the old far_-prefix bijection was known to the canonicalizer - circular);
12
+ - distractor: the query's own vocabulary (matched content vector) with the
13
+ relational structure rewired - same words, broken structure.
14
+
15
+ build_canonicalizer(triples) returns the Canonicalizer carrying the lattice;
16
+ evaluations MUST use it together with a delta>=2 MatchConfig, otherwise
17
+ analogs are unreachable by construction.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import random
23
+ from dataclasses import dataclass
24
+
25
+ from sma.ir.canon import Canonicalizer
26
+ from sma.ir.schema import Case, Statement, entity, make_case, stmt
27
+
28
+
29
@dataclass(frozen=True)
class SSBTriple:
    """One benchmark item: a query with its analog and its distractor."""

    # Case expressed in the query's own functor vocabulary.
    query: Case
    # Case built from the same schema under the analog vocabulary.
    analog: Case
    # Rewired case that reuses the query's functors.
    distractor: Case
    # Gold entity correspondences, "E:<query entity>" -> "E:<analog entity>".
    gold: dict[str, str]
    # Lattice edges as (child_functor, parent_concept) pairs.
    lattice_pairs: tuple[tuple[str, str], ...]
36
+
37
+
38
def generate_triples(n: int = 100, seed: int = 13) -> list[SSBTriple]:
    """Generate ``n`` triples from a single ``random.Random(seed)`` stream.

    Triples are produced in index order 0..n-1 and all draw from the same
    generator, so the output is a deterministic function of ``(n, seed)``.
    """
    rng = random.Random(seed)
    triples: list[SSBTriple] = []
    for idx in range(n):
        triples.append(generate_triple(rng, idx))
    return triples
41
+
42
+
43
def _fresh_name(rng: random.Random, prefix: str) -> str:
    """Return ``prefix`` followed by a random 24-bit value as 6 hex digits."""
    token = rng.randrange(1 << 24)
    return prefix + format(token, "06x")
45
+
46
+
47
def generate_triple(rng: random.Random, idx: int) -> SSBTriple:
    """Build the (query, analog, distractor) triple with index ``idx``.

    Draws a depth and a width in [2, 4], then gives every functor slot a
    query name (prefix ``q{idx}``) and an analog name (prefix ``a{idx}``)
    that are both declared children of the concept ``c{idx}_{slot}``. The
    query and analog share the schema; the distractor is the query rewired
    with the query's own names.
    """
    depth = rng.randint(2, 4)
    width = rng.randint(2, 4)

    # width-1 base-relation slots followed by depth-1 higher-order slots.
    slots = [f"rel{i}" for i in range(width - 1)]
    slots += [f"ho{layer}" for layer in range(1, depth)]

    query_tag = f"q{idx}"
    analog_tag = f"a{idx}"
    q_name: dict[str, str] = {}
    a_name: dict[str, str] = {}
    lattice_pairs: list[tuple[str, str]] = []
    for slot in slots:
        # Draw order matters for reproducibility: query name, then analog name.
        q_functor = _fresh_name(rng, query_tag)
        a_functor = _fresh_name(rng, analog_tag)
        concept = f"c{idx}_{slot}"
        q_name[slot] = q_functor
        a_name[slot] = a_functor
        lattice_pairs.extend(((q_functor, concept), (a_functor, concept)))

    query = schema_case(query_tag, depth, width, q_name)
    analog = schema_case(analog_tag, depth, width, a_name)
    distractor = rewire_case(query, f"d{idx}", q_name)

    # Gold mapping pairs each query entity with the same name under the
    # analog prefix.
    gold: dict[str, str] = {}
    for ent in query.entities():
        gold[f"E:{ent.name}"] = "E:" + ent.name.replace(query_tag, analog_tag)

    return SSBTriple(
        query=query,
        analog=analog,
        distractor=distractor,
        gold=gold,
        lattice_pairs=tuple(lattice_pairs),
    )
70
+
71
+
72
def schema_case(prefix: str, depth: int, width: int, names: dict[str, str]) -> Case:
    """Build a layered relational case over ``width`` entities.

    Layer 0 is a chain: relation ``names["rel{i}"]`` links entity ``i`` to
    entity ``i + 1``. Each layer ``1 .. depth-1`` applies ``names["ho{layer}"]``
    to adjacent statements of the previous layer. Every such layer has at
    least one statement: when the previous layer holds a single statement it
    is related to itself.
    """
    nodes = [entity(f"{prefix}_e{i}") for i in range(width)]

    chain: list[Statement] = [
        stmt(names[f"rel{i}"], left, right)
        for i, (left, right) in enumerate(zip(nodes, nodes[1:]))
    ]

    statements = list(chain)
    current = chain
    for layer in range(1, depth):
        functor = names[f"ho{layer}"]
        last = len(current) - 1
        next_layer: list[Statement] = [
            # min() clamps the partner index for the single-statement case.
            stmt(functor, current[i], current[min(i + 1, last)])
            for i in range(max(1, last))
        ]
        statements.extend(next_layer)
        current = next_layer

    return make_case(statements, {"adapter": "ssb", "tier": 0})
87
+
88
+
89
def rewire_case(case: Case, prefix: str, names: dict[str, str]) -> Case:
    """Same vocabulary as the query, broken structure.

    The base relations form a STAR (every relation points into one hub
    entity) instead of the query's chain. A chain has no entity with
    in-degree >= 2, a star does, so for width >= 3 the two are provably
    non-isomorphic under ordered relations - the old (i, i+2 mod n)
    rewiring could reproduce the chain up to relabeling at small widths
    (the matcher then correctly scored the 'distractor' 1.0).
    """
    ents = [entity(f"{prefix}_e{i}") for i, _ in enumerate(case.entities())]

    # Functors of the binary statements whose arguments are all non-statements.
    functors = []
    for statement in case.statements:
        if statement.arity != 2:
            continue
        if any(isinstance(arg, Statement) for arg in statement.args):
            continue
        functors.append(statement.functor)

    hub = ents[-1]
    spoke_count = max(len(ents) - 1, 1)
    statements: list[Statement] = [
        stmt(functor, ents[i % spoke_count], hub)
        for i, functor in enumerate(functors)
    ]

    # One higher-order link (last base relation -> first) when there are at
    # least two base relations and an "ho1" functor is available.
    if len(statements) >= 2 and "ho1" in names:
        statements.append(stmt(names["ho1"], statements[-1], statements[0]))

    if not statements:
        statements = [stmt("empty", entity(prefix))]
    return make_case(statements, {"adapter": "ssb", "tier": 0})
108
+
109
+
110
def build_canonicalizer(triples: list[SSBTriple]) -> Canonicalizer:
    """Canonicalizer whose lattice is the ONLY bridge between vocabularies.

    Every ``(child, parent)`` pair declared by the given triples is added to
    the lattice of a fresh ``Canonicalizer``, in triple order.
    """
    canon = Canonicalizer()
    edges = (pair for triple in triples for pair in triple.lattice_pairs)
    for child, parent in edges:
        canon.lattice.add(child, parent)
    return canon
sma/eval/stats.py ADDED
@@ -0,0 +1,108 @@
1
+ """Pre-registered statistics for the confirmatory battery (prereg section 5).
2
+
3
+ Per-query paired bootstrap (10,000 resamples) for SMA-vs-baseline deltas
4
+ with 95% percentile CIs, Holm-Bonferroni step-down correction within each
5
+ dataset's family of baseline comparisons, and Cliff's delta as the effect
6
+ size. Everything is deterministic: the bootstrap uses an explicitly seeded
7
+ numpy Generator and no global RNG state.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import numpy as np
13
+
14
DEFAULT_RESAMPLES = 10_000
DEFAULT_SEED = 12345

# Resample index matrices are drawn in chunks so a 10k-resample bootstrap
# over thousands of pooled queries stays within a few tens of MB.
_CHUNK = 1_000


def paired_bootstrap(
    a: list[float],
    b: list[float],
    n_resamples: int = DEFAULT_RESAMPLES,
    seed: int = DEFAULT_SEED,
) -> dict:
    """Paired bootstrap of mean(a - b) over per-query scores.

    ``a`` and ``b`` are per-query scores for two methods on the SAME queries
    (paired by position). Returns::

        {"delta":   observed mean(a - b),
         "ci_low":  2.5th percentile of the bootstrap distribution,
         "ci_high": 97.5th percentile,
         "p_value": two-sided bootstrap p for delta != 0}

    The p-value is the doubled smaller tail of the bootstrap distribution
    around zero, with a +1/(R+1) correction so it is never exactly 0.

    Raises:
        ValueError: if the inputs are not 1-D, differ in length, are empty,
            contain NaN or infinite scores, or if ``n_resamples < 1``.
    """
    a_arr = np.asarray(a, dtype=float)
    b_arr = np.asarray(b, dtype=float)
    if a_arr.ndim != 1 or b_arr.ndim != 1:
        raise ValueError("paired_bootstrap expects 1-D score lists")
    if a_arr.shape != b_arr.shape:
        raise ValueError(
            f"paired scores must have equal length (got {a_arr.size} vs {b_arr.size})"
        )
    if a_arr.size == 0:
        raise ValueError("paired_bootstrap requires at least one paired observation")
    if n_resamples < 1:
        raise ValueError("n_resamples must be >= 1")
    # A NaN score would make every resampled delta NaN; both tail counts
    # below would then be 0 and the function would report p = 2/(R+1), a
    # spuriously "significant" result. Reject non-finite input up front.
    if not (np.isfinite(a_arr).all() and np.isfinite(b_arr).all()):
        raise ValueError("paired_bootstrap requires finite scores (found NaN or inf)")

    diffs = a_arr - b_arr
    rng = np.random.default_rng(seed)
    deltas = np.empty(n_resamples, dtype=float)
    done = 0
    while done < n_resamples:
        # Each row of idx is one resample (with replacement) of the queries.
        size = min(_CHUNK, n_resamples - done)
        idx = rng.integers(0, diffs.size, size=(size, diffs.size))
        deltas[done : done + size] = diffs[idx].mean(axis=1)
        done += size

    ci_low, ci_high = np.percentile(deltas, [2.5, 97.5])
    p_low = (np.count_nonzero(deltas <= 0.0) + 1) / (n_resamples + 1)
    p_high = (np.count_nonzero(deltas >= 0.0) + 1) / (n_resamples + 1)
    return {
        "delta": float(diffs.mean()),
        "ci_low": float(ci_low),
        "ci_high": float(ci_high),
        "p_value": float(min(1.0, 2.0 * min(p_low, p_high))),
    }
73
+
74
+
75
def holm_bonferroni(p_values: dict[str, float]) -> dict[str, float]:
    """Holm step-down adjusted p-values, keyed like the input.

    The m raw p-values are visited in ascending (p, key) order. The smallest
    is multiplied by m, the next by m - 1, and so on down to 1. A running
    maximum keeps the adjusted values monotone, and each is capped at 1.0.
    Tie order does not affect the adjusted values.

    Raises:
        ValueError: when a p-value is not within [0, 1] (NaN included).
    """
    ordered = sorted(p_values.items(), key=lambda item: (item[1], item[0]))
    multiplier = len(ordered)
    adjusted: dict[str, float] = {}
    ceiling = 0.0
    for key, raw in ordered:
        # Written as a negated range test so that NaN is rejected as well.
        if not (0.0 <= raw <= 1.0):
            raise ValueError(f"p-value for {key!r} outside [0, 1]: {raw}")
        ceiling = max(ceiling, multiplier * raw)
        adjusted[key] = min(1.0, ceiling)
        multiplier -= 1
    return adjusted
92
+
93
+
94
def cliffs_delta(a: list[float], b: list[float]) -> float:
    """Standard Cliff's delta in [-1, 1]: P(a > b) - P(a < b) over all pairs.

    +1 means every a exceeds every b; -1 the reverse; 0 means stochastic
    equality. Ties count for neither side. ``b`` is sorted once and each
    ``a`` is located in it by binary search, so pooled multi-seed score
    lists are fine.

    Raises:
        ValueError: when either list is empty.
    """
    xs = np.asarray(a, dtype=float)
    ys = np.asarray(b, dtype=float)
    if min(xs.size, ys.size) == 0:
        raise ValueError("cliffs_delta requires non-empty score lists")

    ys_sorted = np.sort(ys)
    # For each x: how many ys are strictly below it / at or below it.
    strictly_below = np.searchsorted(ys_sorted, xs, side="left")
    at_or_below = np.searchsorted(ys_sorted, xs, side="right")

    wins = int(strictly_below.sum())
    losses = int((ys.size - at_or_below).sum())
    return float((wins - losses) / (xs.size * ys.size))