structuremappingmemory 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sma/__init__.py +5 -0
- sma/__main__.py +5 -0
- sma/agent/__init__.py +5 -0
- sma/agent/adapter_draft.py +217 -0
- sma/agent/api.py +67 -0
- sma/agent/comparison.py +591 -0
- sma/agent/llm.py +280 -0
- sma/agent/policies.py +21 -0
- sma/agent/service.py +95 -0
- sma/cli.py +65 -0
- sma/encoders/__init__.py +38 -0
- sma/encoders/agentobs.py +27 -0
- sma/encoders/base.py +23 -0
- sma/encoders/code_treesitter.py +64 -0
- sma/encoders/coverage.py +80 -0
- sma/encoders/draft_adapter.py +183 -0
- sma/encoders/healthcare.py +207 -0
- sma/encoders/logs_drain.py +142 -0
- sma/encoders/prose_tier1.py +57 -0
- sma/encoders/structured.py +57 -0
- sma/encoders/traces.py +45 -0
- sma/eval/__init__.py +2 -0
- sma/eval/agentic/__init__.py +35 -0
- sma/eval/agentic/arms/__init__.py +0 -0
- sma/eval/agentic/arms/cyber.py +48 -0
- sma/eval/agentic/arms/discovery.py +35 -0
- sma/eval/agentic/arms/finance.py +38 -0
- sma/eval/agentic/arms/legal.py +74 -0
- sma/eval/agentic/arms/medicine.py +45 -0
- sma/eval/agentic/harness.py +275 -0
- sma/eval/agentic/memories.py +308 -0
- sma/eval/agentic/metrics.py +82 -0
- sma/eval/agentic_qa/__init__.py +27 -0
- sma/eval/agentic_qa/agent.py +383 -0
- sma/eval/agentic_qa/metrics.py +239 -0
- sma/eval/agentic_qa/pools.py +197 -0
- sma/eval/arn.py +65 -0
- sma/eval/baselines/__init__.py +6 -0
- sma/eval/baselines/bge_dense.py +54 -0
- sma/eval/baselines/bm25.py +18 -0
- sma/eval/baselines/dense.py +42 -0
- sma/eval/baselines/hipporag.py +235 -0
- sma/eval/baselines/hybrid_rrf.py +30 -0
- sma/eval/baselines/longcontext_llm.py +124 -0
- sma/eval/baselines/rerank.py +41 -0
- sma/eval/baselines/splade.py +77 -0
- sma/eval/baselines/wl_kernel.py +163 -0
- sma/eval/bugsinpy.py +358 -0
- sma/eval/bugsinpy_families.py +164 -0
- sma/eval/crossdomain.py +89 -0
- sma/eval/diabetes.py +61 -0
- sma/eval/drift_env.py +26 -0
- sma/eval/drift_metrics.py +24 -0
- sma/eval/family_labels.py +167 -0
- sma/eval/fraud_elliptic/__init__.py +29 -0
- sma/eval/fraud_elliptic/encoder.py +279 -0
- sma/eval/fraud_elliptic/eval.py +269 -0
- sma/eval/fraud_elliptic/test_encoder.py +123 -0
- sma/eval/ieee_cis.py +66 -0
- sma/eval/loghub.py +16 -0
- sma/eval/loghub_eval.py +480 -0
- sma/eval/longmemeval.py +51 -0
- sma/eval/memory_backends/__init__.py +2 -0
- sma/eval/memory_backends/base.py +22 -0
- sma/eval/memory_backends/context_only.py +14 -0
- sma/eval/memory_backends/rag_notes.py +17 -0
- sma/eval/memory_backends/shared_llm.py +30 -0
- sma/eval/memory_backends/sma_memory.py +54 -0
- sma/eval/memory_backends/zep_graphiti.py +33 -0
- sma/eval/metrics.py +32 -0
- sma/eval/ontology_bench.py +219 -0
- sma/eval/report.py +573 -0
- sma/eval/ssb_eval.py +216 -0
- sma/eval/ssb_generator.py +116 -0
- sma/eval/stats.py +108 -0
- sma/eval/transfer_eval.py +844 -0
- sma/index/__init__.py +15 -0
- sma/index/ann.py +21 -0
- sma/index/content_vectors.py +60 -0
- sma/index/inverted.py +63 -0
- sma/index/macfac.py +174 -0
- sma/ir/__init__.py +22 -0
- sma/ir/canon.py +106 -0
- sma/ir/schema.py +165 -0
- sma/ir/sexpr.py +86 -0
- sma/ir/signatures.py +76 -0
- sma/match/__init__.py +20 -0
- sma/match/conflicts.py +46 -0
- sma/match/engine.py +60 -0
- sma/match/explain.py +59 -0
- sma/match/infer.py +54 -0
- sma/match/kernels.py +54 -0
- sma/match/mdl.py +30 -0
- sma/match/merge_cpsat.py +77 -0
- sma/match/merge_greedy.py +15 -0
- sma/match/mh.py +177 -0
- sma/match/ses.py +84 -0
- sma/match/types.py +115 -0
- sma/match/verifier.py +27 -0
- sma/ontology/__init__.py +45 -0
- sma/ontology/attack.py +134 -0
- sma/ontology/cpc.py +69 -0
- sma/ontology/graph.py +58 -0
- sma/ontology/loader.py +262 -0
- sma/ontology/mitre_xml.py +67 -0
- sma/ontology/mount.py +101 -0
- sma/ontology/rdf_loader.py +75 -0
- sma/ontology/registry.py +115 -0
- sma/ontology/router.py +69 -0
- sma/ontology/usgaap.py +73 -0
- sma/sage/__init__.py +6 -0
- sma/sage/assimilate.py +12 -0
- sma/sage/pools.py +105 -0
- sma/sage/probabilities.py +10 -0
- sma/store/__init__.py +6 -0
- sma/store/lmdb_store.py +78 -0
- sma/store/registry.py +26 -0
- sma/store/wal.py +26 -0
- sma/ui/app.py +642 -0
- structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
- structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
- structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
- structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
- structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
- structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
sma/agent/comparison.py
ADDED
|
@@ -0,0 +1,591 @@
|
|
|
1
|
+
"""Toggleable SMA / BM25 / dense-RAG / knowledge-graph / context-only comparison.
|
|
2
|
+
|
|
3
|
+
The modes mirror the LogHub evaluation baselines so what you see in the UI is
|
|
4
|
+
the same retrieval mathematics that produced reports/triage_metrics.csv:
|
|
5
|
+
|
|
6
|
+
- sma: MAC/FAC retrieval + SME mapping with candidate inferences
|
|
7
|
+
- bm25: lexical retrieval (rank_bm25 BM25Okapi over raw text)
|
|
8
|
+
- dense rag: sentence-transformer embeddings (all-MiniLM-L6-v2), with a
|
|
9
|
+
deterministic TF-IDF fallback when the model is unavailable
|
|
10
|
+
- knowledge graph: deterministic Tier-0 entity graph, entity-overlap +
|
|
11
|
+
neighbor-bonus scoring (KG-PPR proxy)
|
|
12
|
+
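- hybrid (fused): reciprocal-rank fusion of the bm25, dense and sma candidate
lists, with SME alignment receipts on every fused hit
- context only: no retrieval; first k corpus items stuffed into context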
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from collections import defaultdict
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
|
|
20
|
+
from sma.encoders import get_encoder
|
|
21
|
+
from sma.encoders.coverage import coverage_warning, rule_coverage
|
|
22
|
+
from sma.encoders.draft_adapter import DraftAdapter
|
|
23
|
+
from sma.index.macfac import MacFacIndex
|
|
24
|
+
from sma.ir.schema import Case
|
|
25
|
+
from sma.match.engine import match_cases
|
|
26
|
+
from sma.match.explain import alignment_summary
|
|
27
|
+
from sma.match.infer import candidate_inferences
|
|
28
|
+
from sma.match.types import MatchConfig
|
|
29
|
+
|
|
30
|
+
from .llm import LocalOrchestrator, default_deepseek, default_orchestrator
|
|
31
|
+
|
|
32
|
+
# Canonical mode names, in the order they are offered to callers.
MODES = (
    "sma",
    "bm25",
    "dense rag",
    "knowledge graph",
    "hybrid (fused)",
    "context only",
)

# Short spellings accepted in place of a canonical mode name.
MODE_ALIASES = {
    "rag": "dense rag",
    "kg": "knowledge graph",
    "context": "context only",
    "hybrid": "hybrid (fused)",
    "rrf": "hybrid (fused)",
    "fused": "hybrid (fused)",
}

# Keys under which answer-generating orchestrators are registered.
LLM_BACKENDS = ("local", "deepseek")

# Sentence-transformer checkpoint used by the dense mode.
DENSE_MODEL_NAME = "all-MiniLM-L6-v2"

# Process-wide cache for dense_model(): the loaded model, or the reason the
# load failed (so the load is attempted at most once).
_dense_model = None
_dense_model_error: str | None = None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def dense_model():
    """Return the shared sentence-transformer, loading it on first use.

    The load is attempted at most once per process: after a success the model
    is cached in ``_dense_model``; after a failure the error text is cached in
    ``_dense_model_error`` and every later call returns None without retrying.
    """
    global _dense_model, _dense_model_error

    already_resolved = _dense_model is not None or _dense_model_error is not None
    if already_resolved:
        return _dense_model

    try:
        # Imported lazily so the module stays importable without the package.
        from sentence_transformers import SentenceTransformer

        _dense_model = SentenceTransformer(DENSE_MODEL_NAME)
    except Exception as exc:
        # Best-effort: any failure (missing package, download error, ...)
        # is recorded so callers can fall back and report the reason.
        _dense_model_error = f"{type(exc).__name__}: {exc}"
    return _dense_model
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class CorpusItem:
    """One corpus document together with its encoded structural case."""

    # Identifier assigned when the document is added to the framework.
    item_id: str
    # Raw text exactly as it was supplied.
    text: str
    # Name of the adapter/encoder the text was encoded with.
    adapter_id: str
    # Encoded representation of ``text``.
    case: Case
    # Optional caller-supplied label; empty when none was given.
    label: str = ""
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
class ModeResult:
    """Outcome of asking one question in one retrieval mode."""

    # Canonical mode name the evidence was produced with.
    mode: str
    # Text returned by the orchestrator.
    answer: str
    # Evidence rows handed to the orchestrator.
    evidence: list[dict]
    # The orchestrator's ``status`` attribute at the time of the answer.
    llm_status: dict
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class ComparisonFramework:
    """In-memory corpus that answers one question through several retrieval modes.

    Every document is kept as a CorpusItem and also added to a MacFacIndex.
    The BM25, dense and entity-graph modes build their per-corpus structures
    lazily and stamp them with ``_version``; every corpus mutation bumps
    ``_version`` so the next query rebuilds whatever is stale.
    """

    def __init__(self, orchestrator: LocalOrchestrator | None = None):
        """Create an empty framework.

        Args:
            orchestrator: answer generator registered under the "local"
                backend; the module default is used when None.
        """
        self.items: list[CorpusItem] = []
        self.index = MacFacIndex()
        # Explicit None check so a supplied orchestrator is never replaced
        # merely because it happens to evaluate as falsy.
        self.orchestrator = orchestrator if orchestrator is not None else default_orchestrator
        self.orchestrators = {"local": self.orchestrator, "deepseek": default_deepseek}
        # Session flag: when set, logs-adapter encoding (corpus AND queries)
        # goes through the LLM-proposed draft adapter and every SMA evidence
        # row is provenance-stamped as unreviewed.
        self.draft_adapter: DraftAdapter | None = None
        # Per-corpus caches, rebuilt when _version changes.
        self._version = 0
        self._bm25 = None
        self._bm25_version = -1
        self._dense_embeddings = None
        self._dense_version = -1
        self._entity_graphs: list[dict[str, set[str]]] = []
        self._graph_version = -1

    # --- corpus management ----------------------------------------------------

    def clear(self) -> None:
        """Drop every corpus item and start a fresh index with the same config."""
        self.items.clear()
        self.index = MacFacIndex(config=self.index.config)
        self._version += 1

    def set_scorer(self, scorer: str) -> None:
        """Switch the SMA scoring regime (ses, mdl or surprisal) without reindexing.

        The MAC index structures are scorer-independent; only the FAC scoring
        step reads config, and its cache key already includes the scorer.

        Raises:
            ValueError: if ``scorer`` is not one of the accepted names.
        """
        if scorer not in ("ses", "mdl", "surprisal"):
            raise ValueError(f"unknown scorer: {scorer!r}; expected 'ses', 'mdl' or 'surprisal'")
        self.index.config = MatchConfig(scorer=scorer)

    @property
    def draft_note(self) -> str | None:
        """Provenance stamp shown on every SMA evidence row and the chips strip."""
        if self.draft_adapter is None:
            return None
        return f"draft-adapter (LLM-proposed, unreviewed) hash={self.draft_adapter.draft_hash[:8]}"

    def apply_draft_adapter(self, adapter: DraftAdapter) -> int:
        """Re-encode the whole corpus through the draft adapter (texts/labels kept).

        Returns the number of items that were re-encoded.
        """
        self.draft_adapter = adapter
        return self._reencode_all()

    def revert_draft_adapter(self) -> int:
        """Restore base adapters: re-encode every item with its original encoder.

        Returns the number of items that were re-encoded.
        """
        self.draft_adapter = None
        return self._reencode_all()

    def _reencode_all(self) -> int:
        """Rebuild the index by re-encoding each item's stored text in place."""
        self.index = MacFacIndex(config=self.index.config)
        for item in self.items:
            item.case = self._encode(item.text, item.adapter_id)
            self.index.add(item.case)
        self._version += 1
        return len(self.items)

    def _encode(self, text: str, adapter_id: str) -> Case:
        """Encode ``text``; the draft adapter, when set, overrides only "logs"."""
        if self.draft_adapter is not None and adapter_id == "logs":
            return self.draft_adapter.encode(text).case
        return get_encoder(adapter_id).encode(text).case

    def add_document(self, text: str, adapter_id: str = "logs", label: str = "") -> CorpusItem:
        """Encode ``text``, append it to the corpus and the index, and return the item.

        The item id is ``doc_<position>`` based on the corpus size before the
        append.
        """
        case = self._encode(text, adapter_id)
        item = CorpusItem(
            item_id=f"doc_{len(self.items)}",
            text=text,
            adapter_id=adapter_id,
            case=case,
            label=label,
        )
        self.items.append(item)
        self.index.add(case)
        self._version += 1
        return item

    def load_lines(
        self,
        corpus_text: str,
        adapter_id: str = "logs",
        max_items: int = 50,
        single_case: bool = False,
    ) -> list[CorpusItem]:
        """Load raw text as corpus items.

        single_case=True keeps the whole text as ONE incident. Otherwise text
        splits on blank lines into blocks; a single block with no blank lines
        falls back to one item per line (the legacy behavior that quietly
        explodes a pasted session - hence the explicit flag).

        At most ``max_items`` blocks are loaded; the rest are ignored.
        """
        if single_case and corpus_text.strip():
            return [self.add_document(corpus_text.strip(), adapter_id=adapter_id)]
        blocks = split_corpus(corpus_text)
        return [self.add_document(block, adapter_id=adapter_id) for block in blocks[:max_items]]

    # --- question answering ---------------------------------------------------

    def evidence_for(self, question: str, mode: str, adapter_id: str = "logs", k: int = 4) -> tuple[str, list[dict]]:
        """Resolve a mode name and run its retriever; returns (canonical_mode, evidence).

        Mode names are matched case-insensitively, ignoring surrounding
        whitespace, and may be given as one of the MODE_ALIASES keys.

        Raises:
            ValueError: if the resolved name is not a known mode.
        """
        requested = mode.strip().lower()
        canonical = MODE_ALIASES.get(requested, requested)
        # Lambdas defer the work so only the selected retriever runs.
        retrievers = {
            "sma": lambda: self.sma_evidence(question, adapter_id, k),
            "bm25": lambda: self.bm25_evidence(question, k),
            "dense rag": lambda: self.dense_evidence(question, k),
            "knowledge graph": lambda: self.kg_evidence(question, k),
            "hybrid (fused)": lambda: self.hybrid_evidence(question, adapter_id, k),
            "context only": lambda: self.context_evidence(k),
        }
        retriever = retrievers.get(canonical)
        if retriever is None:
            raise ValueError(f"unknown mode: {canonical!r}; expected one of {MODES}")
        return canonical, retriever()

    def ask(
        self,
        question: str,
        mode: str,
        adapter_id: str = "logs",
        k: int = 4,
        llm: str = "local",
        history: list[dict] | None = None,
    ) -> ModeResult:
        """Retrieve evidence in ``mode`` and have the chosen backend answer.

        Raises:
            ValueError: for an unknown ``llm`` backend or an unknown ``mode``.
        """
        # Validate the backend first so a typo does not cost a full retrieval.
        orchestrator = self.orchestrators.get(llm)
        if orchestrator is None:
            raise ValueError(f"unknown llm backend: {llm!r}; expected one of {LLM_BACKENDS}")
        mode, evidence = self.evidence_for(question, mode, adapter_id=adapter_id, k=k)
        answer = orchestrator.answer(question, mode, evidence, history=history)
        return ModeResult(mode=mode, answer=answer, evidence=evidence, llm_status=orchestrator.status)

    def ask_all(self, question: str, adapter_id: str = "logs", k: int = 4,
                modes: tuple[str, ...] | list[str] = MODES, llm: str = "local") -> dict[str, ModeResult]:
        """Ask the same question in every listed mode; keys are the names as given."""
        return {mode: self.ask(question, mode, adapter_id=adapter_id, k=k, llm=llm) for mode in modes}

    # --- shared helpers for the structural modes ------------------------------

    def _query_coverage(self, question: str) -> dict:
        """Rule coverage of the question, including draft-adapter classes if any."""
        extra = self.draft_adapter.rules.classes if self.draft_adapter else None
        return rule_coverage(question, extra_classes=extra)

    def _stamped(self, mode_detail: str) -> str:
        """Append the draft provenance note to ``mode_detail`` when one is active."""
        note = self.draft_note
        return f"{mode_detail}; {note}" if note else mode_detail

    def _retrieve(self, query_case: Case, k: int):
        """Run MAC/FAC retrieval with the interactive budgets."""
        # MAC/FAC budgets keep large corpora interactive: the MAC stage screens
        # everything, full SME mapping runs only on the budgeted shortlist.
        shortlist = min(max(k, len(self.items)), 200)
        fac_budget = 30 if len(self.items) > 100 else None
        return self.index.retrieve(query_case, k=k, shortlist=shortlist, fac_budget=fac_budget)

    def _receipt(self, item: CorpusItem, query_case: Case, coverage: dict, mode_detail: str) -> dict:
        """Trailing evidence fields produced by mapping ``item`` onto the query."""
        gmap = match_cases(item.case, query_case, config=self.index.config)
        inferences = candidate_inferences(gmap)
        return {
            "mode_detail": mode_detail,
            "alignment": alignment_summary(gmap),
            "inferences": [inf.inference_sexpr for inf in inferences[:3]],
            "coverage": coverage,
        }

    def _with_coverage_warning(self, evidence: list[dict], coverage: dict) -> list[dict]:
        """Prepend the tripwire pseudo-row when the coverage check yields a warning."""
        warning = coverage_warning(coverage)
        if warning:
            evidence.insert(0, self._coverage_warning_row(coverage, warning))
        return evidence

    def _coverage_warning_row(self, coverage: dict, warning: str) -> dict:
        """Pseudo-row prepended to SMA evidence when coverage trips 12-R3."""
        return {
            "source_id": "coverage-tripwire",
            "label": "",
            "score": f"{coverage['fraction']:.2f}",
            "text": "",
            "provenance": (
                f"rule_coverage={coverage['covered_lines']}/{coverage['total_lines']} "
                "non-empty lines fired EVENT_CLASS_RULES (blueprint 12-R3)"
            ),
            "mode_detail": "structural coverage tripwire",
            "warning": warning,
            "coverage": coverage,
        }

    # --- mode implementations -------------------------------------------------

    def sma_evidence(self, question: str, adapter_id: str, k: int) -> list[dict]:
        """MAC/FAC retrieval with an SME alignment receipt on every hit."""
        query_case = self._encode(question, adapter_id)
        # Lattice-miss tripwire (blueprint 12-R3): how much of the query's
        # vocabulary the frozen class rules actually cover.
        coverage = self._query_coverage(question)
        mode_detail = self._stamped("SME mapping + MAC/FAC retrieval")
        results = self._retrieve(query_case, k)
        case_to_item = {item.case.case_id: item for item in self.items}
        evidence = []
        for result in results:
            item = case_to_item.get(result.case_id)
            if item is None:
                # Same policy as _sma_ranking: ignore hits with no corpus item
                # instead of failing the whole query with a KeyError.
                continue
            evidence.append(
                {
                    "source_id": item.item_id,
                    "label": item.label,
                    "score": f"{result.ses_n:.4f}",
                    "text": item.text,
                    "provenance": f"case={item.case.case_id}; ses_n={result.ses_n:.4f}; certified={result.certified}",
                    **self._receipt(item, query_case, coverage, mode_detail),
                }
            )
        return self._with_coverage_warning(evidence, coverage)

    # --- candidate rankings shared by hybrid fusion ---------------------------

    def _bm25_ranking(self, question: str, n: int) -> list[tuple[str, float]]:
        """Top-``n`` (item_id, score) pairs by BM25 over lower-cased whitespace tokens."""
        if not self.items:
            # Nothing to rank; also avoids building a BM25 model over no documents.
            return []
        if self._bm25_version != self._version:
            from rank_bm25 import BM25Okapi

            self._bm25 = BM25Okapi([item.text.lower().split() for item in self.items])
            self._bm25_version = self._version
        scores = self._bm25.get_scores(question.lower().split())
        # Ties are broken by item id so the order is deterministic.
        ranked = sorted(zip(self.items, scores), key=lambda row: (-row[1], row[0].item_id))
        return [(item.item_id, float(score)) for item, score in ranked[:n]]

    def _dense_ranking(self, question: str, n: int) -> list[tuple[str, float]]:
        """Top-``n`` (item_id, score) pairs by embedding cosine similarity.

        Uses the TF-IDF ranker from the eval baselines when the
        sentence-transformer model is unavailable.
        """
        if not self.items:
            return []
        model = dense_model()
        if model is None:
            from sma.eval.baselines.dense import rank_tfidf_dense

            return rank_tfidf_dense(
                question, [(item.item_id, item.text) for item in self.items], k=n
            )
        if self._dense_version != self._version:
            self._dense_embeddings = model.encode(
                [item.text for item in self.items], convert_to_tensor=True, show_progress_bar=False
            )
            self._dense_version = self._version
        from sentence_transformers import util

        query_embedding = model.encode(question, convert_to_tensor=True, show_progress_bar=False)
        sims = util.cos_sim(query_embedding, self._dense_embeddings)[0].cpu().tolist()
        ranked = sorted(zip(self.items, sims), key=lambda row: (-row[1], row[0].item_id))
        return [(item.item_id, float(score)) for item, score in ranked[:n]]

    def _sma_ranking(self, query_case: Case, n: int) -> list[tuple[str, float]]:
        """Top-``n`` (item_id, ses_n) pairs from MAC/FAC retrieval."""
        results = self._retrieve(query_case, n)
        case_to_item = {item.case.case_id: item for item in self.items}
        return [
            (case_to_item[result.case_id].item_id, result.ses_n)
            for result in results
            if result.case_id in case_to_item
        ]

    def hybrid_evidence(self, question: str, adapter_id: str, k: int) -> list[dict]:
        """RRF-fused bm25+dense+sma candidates, SME alignment receipts on each.

        Candidate generation is the union of the top-20 from each retriever,
        fused with reciprocal-rank fusion; the fused top-k then get full SME
        receipts (match_cases + alignment_summary) so accountability rides on
        every candidate regardless of which retriever found it.
        """
        if not self.items:
            return []
        from sma.eval.baselines.hybrid_rrf import rrf_fuse

        query_case = self._encode(question, adapter_id)
        coverage = self._query_coverage(question)
        n = 20
        rankings = {
            "bm25": self._bm25_ranking(question, n),
            "dense": self._dense_ranking(question, n),
            "sma": self._sma_ranking(query_case, n),
        }
        fused = rrf_fuse(list(rankings.values()), top_k=k)
        # 1-based rank of each document inside each retriever's own list.
        rank_of = {
            name: {doc_id: rank for rank, (doc_id, _score) in enumerate(ranking, start=1)}
            for name, ranking in rankings.items()
        }
        mode_detail = self._stamped("RRF(bm25+dense+sma) candidates, SME alignment receipts")
        by_id = {item.item_id: item for item in self.items}
        evidence = []
        for doc_id, fused_score in fused:
            item = by_id[doc_id]
            # "-" marks a retriever whose top-n did not contain the document.
            ranks = ", ".join(
                f"{name}={rank_of[name].get(doc_id, '-')}" for name in ("bm25", "dense", "sma")
            )
            evidence.append(
                {
                    "source_id": item.item_id,
                    "label": item.label,
                    "score": f"{fused_score:.4f}",
                    "text": item.text,
                    "provenance": f"rrf={fused_score:.4f}; ranks({ranks}); case={item.case.case_id}",
                    **self._receipt(item, query_case, coverage, mode_detail),
                }
            )
        return self._with_coverage_warning(evidence, coverage)

    def bm25_evidence(self, question: str, k: int) -> list[dict]:
        """Top-``k`` evidence rows by BM25 lexical score."""
        if not self.items:
            return []
        by_id = {item.item_id: item for item in self.items}
        return [
            {
                "source_id": item_id,
                "label": by_id[item_id].label,
                "score": f"{score:.4f}",
                "text": by_id[item_id].text,
                "provenance": f"bm25_okapi={score:.4f}",
                "mode_detail": "BM25 lexical retrieval (rank_bm25)",
            }
            for item_id, score in self._bm25_ranking(question, k)
        ]

    def dense_evidence(self, question: str, k: int) -> list[dict]:
        """Top-``k`` evidence rows by dense similarity, or by the TF-IDF fallback.

        Rows state which path produced them; on the fallback path the recorded
        model-load error is included in ``mode_detail``.
        """
        if not self.items:
            return []
        using_fallback = dense_model() is None
        ranking = self._dense_ranking(question, k)
        if using_fallback:
            mode_detail = (
                "TF-IDF fallback (sentence-transformers unavailable: "
                f"{_dense_model_error})"
            )
        else:
            mode_detail = "Dense RAG (sentence-transformers)"
        by_id = {item.item_id: item for item in self.items}
        evidence = []
        for item_id, score in ranking:
            item = by_id[item_id]
            if using_fallback:
                provenance = f"tfidf_cosine={score:.4f}"
            else:
                provenance = f"dense_cosine={score:.4f}; model={DENSE_MODEL_NAME}"
            evidence.append(
                {
                    "source_id": item.item_id,
                    "label": item.label,
                    "score": f"{score:.4f}",
                    "text": item.text,
                    "provenance": provenance,
                    "mode_detail": mode_detail,
                }
            )
        return evidence

    def kg_evidence(self, question: str, k: int) -> list[dict]:
        """Top-``k`` rows by entity overlap between the question and each item's graph.

        Score = number of question tokens that are graph nodes, plus 0.1 for
        every neighbor of those nodes.
        """
        if not self.items:
            return []
        if self._graph_version != self._version:
            self._entity_graphs = [case_entity_graph(item.case) for item in self.items]
            self._graph_version = self._version
        q_entities = {token.lower() for token in entityish_tokens(question)}
        rows = []
        for item, graph in zip(self.items, self._entity_graphs):
            matched = q_entities & set(graph)
            neighbor_bonus = sum(len(graph[token]) for token in matched)
            score = len(matched) + 0.1 * neighbor_bonus
            rows.append((score, item, sorted(matched)))
        rows.sort(key=lambda row: (-row[0], row[1].item_id))
        return [
            {
                "source_id": item.item_id,
                "label": item.label,
                "score": f"{score:.4f}",
                "text": item.text,
                "provenance": f"entity_overlap_ppr_proxy={score:.4f}; matched={','.join(matched) or 'none'}",
                "mode_detail": "Tier-0 entity graph, overlap + neighbor bonus (KG-PPR proxy)",
            }
            for score, item, matched in rows[:k]
        ]

    def context_evidence(self, k: int) -> list[dict]:
        """The first ``k`` corpus items, unranked (the question is not consulted)."""
        return [
            {
                "source_id": item.item_id,
                "label": item.label,
                "score": "context",
                "text": item.text,
                "provenance": "raw_context_window",
                "mode_detail": "No retrieval; first corpus items stuffed into context",
            }
            for item in self.items[:k]
        ]

    def corpus_table(self) -> list[list[str]]:
        """One display row per item: id, adapter, statement count, short case id, text preview."""
        return [
            [
                item.item_id,
                item.adapter_id,
                str(len(item.case.statements)),
                item.case.case_id[:12],
                # Preview is capped at 160 characters and flattened to one line.
                item.text[:160].replace("\n", " "),
            ]
            for item in self.items
        ]
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def split_corpus(corpus_text: str) -> list[str]:
    """Split raw corpus text into stripped, non-empty blocks.

    Blocks are separated by blank lines. A line that holds only whitespace
    counts as blank, and both LF and CRLF line endings are recognised, so text
    pasted from Windows tools or with stray trailing spaces still splits into
    the intended blocks. When that yields at most one block, the text is
    split into one block per non-empty line instead.

    Returns an empty list for empty or whitespace-only input.
    """
    import re

    # "\n", optional whitespace (spaces, tabs, "\r", further newlines), "\n":
    # matches every run of one or more blank lines.
    pieces = re.split(r"\n\s*\n", corpus_text)
    blocks = [piece.strip() for piece in pieces if piece.strip()]
    if len(blocks) <= 1:
        blocks = [line.strip() for line in corpus_text.splitlines() if line.strip()]
    return blocks
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
def entityish_tokens(text: str) -> list[str]:
    """Return every run of two or more identifier-like characters in ``text``.

    Accepted characters are ASCII letters, digits and ``_ . : / -``; tokens
    are returned in order of appearance with their original case.
    """
    import re

    token_pattern = r"[A-Za-z0-9_.:/-]{2,}"
    return [match.group(0) for match in re.finditer(token_pattern, text)]
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
def case_entity_graph(case: Case) -> dict[str, set[str]]:
    """Build an undirected co-occurrence graph over a case's entity names.

    Two lower-cased entity names are linked when they appear in the same
    expression. A name only becomes a key once it has at least one distinct
    neighbor, so entities that never co-occur with another name are absent.
    """
    graph: dict[str, set[str]] = defaultdict(set)
    for expr in case.expressions():
        names = [entity.name.lower() for entity in expr.entities()]
        for source in names:
            neighbors = {other for other in names if other != source}
            # Guarded so the defaultdict does not create an empty entry.
            if neighbors:
                graph[source] |= neighbors
    return graph
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
def demo_corpus() -> str:
    """Return a six-line sample log, one entry per line with no blank lines."""
    sample_lines = (
        "INFO DataNode blk_123 timeout connecting to 10.0.0.1",
        "WARN DataNode blk_123 retry after timeout",
        "ERROR DataNode blk_123 failed after retry",
        "INFO API service accepted request /checkout",
        "ERROR DB connection timeout caused retry storm in checkout service",
        "WARN worker restarted after queue saturation cleared",
    )
    return "\n".join(sample_lines)
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
def challenge_corpus() -> str:
    """Adversarial incident library for mode comparison (logs adapter).

    Returns eight incidents separated by blank lines, so each block becomes
    one case. Designed traps: context-only grabs the flashy first block;
    BM25/dense are pulled by shared rare tokens; the KG proxy is pulled by
    shared component entities; only the causal anatomy
    (timeout -> retries -> failure) identifies true analogs.
    """
    incidents = (
        # 0: context-only trap - flashy, totally unrelated, sits first.
        (
            "ERROR AuthService invalid signature on admin token",
            "ERROR AuthService possible credential stuffing detected",
            "WARN AuthService source addresses blocked by waf",
        ),
        # 1: full cascade, payment vocabulary (structural target).
        (
            "ERROR PaymentGateway connection timeout to db-shard-7",
            "WARN PaymentGateway retrying transaction batch",
            "WARN PaymentGateway retrying transaction batch",
            "ERROR PaymentGateway transaction failed after repeated retry",
            "ERROR PaymentGateway worker pool exhausted failure",
        ),
        # 2: benign timeout mention, no cascade (surface distractor).
        (
            "INFO SearchService deployment completed successfully",
            "INFO SearchService timeout setting increased to 30s by operator",
            "INFO SearchService cache warmed and serving",
        ),
        # 3: healthy pipeline, BackupAgent (entity/component trap).
        (
            "INFO BackupAgent nightly snapshot started on host 10.0.0.1",
            "INFO BackupAgent snapshot uploaded to object storage",
            "INFO BackupAgent snapshot rotation complete",
        ),
        # 4: transient blip sharing rare tokens (fetch/asset/bundle) with Q4.
        (
            "WARN CacheNode fetch timeout for asset bundle",
            "INFO CacheNode fetch recovered on second attempt",
            "INFO CacheNode asset bundle served normally",
        ),
        # 5: retry storm, ApiEdge vocabulary (structural target for Q4).
        (
            "WARN ApiEdge upstream timeout on route /search",
            "WARN ApiEdge retrying upstream call",
            "WARN ApiEdge retrying upstream call",
            "WARN ApiEdge retrying upstream call",
            "ERROR ApiEdge circuit breaker opened after retry failure",
        ),
        # 6: second cascade, broker vocabulary (alternate structural target).
        (
            "ERROR MsgBroker session timeout waiting for heartbeat ack",
            "WARN MsgBroker consumer retrying fetch offset",
            "ERROR MsgBroker consumer fetch failed",
            "ERROR MsgBroker partition leader election failed",
        ),
        # 7: maintenance window, restart without any failure chain.
        (
            "INFO Scheduler maintenance window opened",
            "INFO Scheduler workers restarted in rolling order",
            "INFO Scheduler maintenance window closed",
        ),
    )
    return "\n\n".join("\n".join(incident) for incident in incidents)
|
|
582
|
+
|
|
583
|
+
|
|
584
|
+
__all__ = [
|
|
585
|
+
"ComparisonFramework",
|
|
586
|
+
"CorpusItem",
|
|
587
|
+
"ModeResult",
|
|
588
|
+
"MODES",
|
|
589
|
+
"demo_corpus",
|
|
590
|
+
"challenge_corpus",
|
|
591
|
+
]
|