structuremappingmemory 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sma/__init__.py +5 -0
- sma/__main__.py +5 -0
- sma/agent/__init__.py +5 -0
- sma/agent/adapter_draft.py +217 -0
- sma/agent/api.py +67 -0
- sma/agent/comparison.py +591 -0
- sma/agent/llm.py +280 -0
- sma/agent/policies.py +21 -0
- sma/agent/service.py +95 -0
- sma/cli.py +65 -0
- sma/encoders/__init__.py +38 -0
- sma/encoders/agentobs.py +27 -0
- sma/encoders/base.py +23 -0
- sma/encoders/code_treesitter.py +64 -0
- sma/encoders/coverage.py +80 -0
- sma/encoders/draft_adapter.py +183 -0
- sma/encoders/healthcare.py +207 -0
- sma/encoders/logs_drain.py +142 -0
- sma/encoders/prose_tier1.py +57 -0
- sma/encoders/structured.py +57 -0
- sma/encoders/traces.py +45 -0
- sma/eval/__init__.py +2 -0
- sma/eval/agentic/__init__.py +35 -0
- sma/eval/agentic/arms/__init__.py +0 -0
- sma/eval/agentic/arms/cyber.py +48 -0
- sma/eval/agentic/arms/discovery.py +35 -0
- sma/eval/agentic/arms/finance.py +38 -0
- sma/eval/agentic/arms/legal.py +74 -0
- sma/eval/agentic/arms/medicine.py +45 -0
- sma/eval/agentic/harness.py +275 -0
- sma/eval/agentic/memories.py +308 -0
- sma/eval/agentic/metrics.py +82 -0
- sma/eval/agentic_qa/__init__.py +27 -0
- sma/eval/agentic_qa/agent.py +383 -0
- sma/eval/agentic_qa/metrics.py +239 -0
- sma/eval/agentic_qa/pools.py +197 -0
- sma/eval/arn.py +65 -0
- sma/eval/baselines/__init__.py +6 -0
- sma/eval/baselines/bge_dense.py +54 -0
- sma/eval/baselines/bm25.py +18 -0
- sma/eval/baselines/dense.py +42 -0
- sma/eval/baselines/hipporag.py +235 -0
- sma/eval/baselines/hybrid_rrf.py +30 -0
- sma/eval/baselines/longcontext_llm.py +124 -0
- sma/eval/baselines/rerank.py +41 -0
- sma/eval/baselines/splade.py +77 -0
- sma/eval/baselines/wl_kernel.py +163 -0
- sma/eval/bugsinpy.py +358 -0
- sma/eval/bugsinpy_families.py +164 -0
- sma/eval/crossdomain.py +89 -0
- sma/eval/diabetes.py +61 -0
- sma/eval/drift_env.py +26 -0
- sma/eval/drift_metrics.py +24 -0
- sma/eval/family_labels.py +167 -0
- sma/eval/fraud_elliptic/__init__.py +29 -0
- sma/eval/fraud_elliptic/encoder.py +279 -0
- sma/eval/fraud_elliptic/eval.py +269 -0
- sma/eval/fraud_elliptic/test_encoder.py +123 -0
- sma/eval/ieee_cis.py +66 -0
- sma/eval/loghub.py +16 -0
- sma/eval/loghub_eval.py +480 -0
- sma/eval/longmemeval.py +51 -0
- sma/eval/memory_backends/__init__.py +2 -0
- sma/eval/memory_backends/base.py +22 -0
- sma/eval/memory_backends/context_only.py +14 -0
- sma/eval/memory_backends/rag_notes.py +17 -0
- sma/eval/memory_backends/shared_llm.py +30 -0
- sma/eval/memory_backends/sma_memory.py +54 -0
- sma/eval/memory_backends/zep_graphiti.py +33 -0
- sma/eval/metrics.py +32 -0
- sma/eval/ontology_bench.py +219 -0
- sma/eval/report.py +573 -0
- sma/eval/ssb_eval.py +216 -0
- sma/eval/ssb_generator.py +116 -0
- sma/eval/stats.py +108 -0
- sma/eval/transfer_eval.py +844 -0
- sma/index/__init__.py +15 -0
- sma/index/ann.py +21 -0
- sma/index/content_vectors.py +60 -0
- sma/index/inverted.py +63 -0
- sma/index/macfac.py +174 -0
- sma/ir/__init__.py +22 -0
- sma/ir/canon.py +106 -0
- sma/ir/schema.py +165 -0
- sma/ir/sexpr.py +86 -0
- sma/ir/signatures.py +76 -0
- sma/match/__init__.py +20 -0
- sma/match/conflicts.py +46 -0
- sma/match/engine.py +60 -0
- sma/match/explain.py +59 -0
- sma/match/infer.py +54 -0
- sma/match/kernels.py +54 -0
- sma/match/mdl.py +30 -0
- sma/match/merge_cpsat.py +77 -0
- sma/match/merge_greedy.py +15 -0
- sma/match/mh.py +177 -0
- sma/match/ses.py +84 -0
- sma/match/types.py +115 -0
- sma/match/verifier.py +27 -0
- sma/ontology/__init__.py +45 -0
- sma/ontology/attack.py +134 -0
- sma/ontology/cpc.py +69 -0
- sma/ontology/graph.py +58 -0
- sma/ontology/loader.py +262 -0
- sma/ontology/mitre_xml.py +67 -0
- sma/ontology/mount.py +101 -0
- sma/ontology/rdf_loader.py +75 -0
- sma/ontology/registry.py +115 -0
- sma/ontology/router.py +69 -0
- sma/ontology/usgaap.py +73 -0
- sma/sage/__init__.py +6 -0
- sma/sage/assimilate.py +12 -0
- sma/sage/pools.py +105 -0
- sma/sage/probabilities.py +10 -0
- sma/store/__init__.py +6 -0
- sma/store/lmdb_store.py +78 -0
- sma/store/registry.py +26 -0
- sma/store/wal.py +26 -0
- sma/ui/app.py +642 -0
- structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
- structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
- structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
- structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
- structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
- structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
"""Retrieval-by-analogy illicit-detection evaluation on the Elliptic graph.
|
|
2
|
+
|
|
3
|
+
Each transaction is encoded as a *case of graph-neighbourhood typology terms*
|
|
4
|
+
(see :mod:`sma.eval.fraud_elliptic.encoder`). The labelled nodes are split into
|
|
5
|
+
a train (index) set and a test set. A memory indexes the train cases; for each
|
|
6
|
+
test node we retrieve its top-k analogs and vote their (known) labels — weighted
|
|
7
|
+
by retrieval confidence — into an illicit score. This is the *same* analogical
|
|
8
|
+
retrieval SMA is built for, now used as a kNN classifier so the metric is
|
|
9
|
+
detection quality (macro-F1, ROC-AUC), not key-recall.
|
|
10
|
+
|
|
11
|
+
Compared memories (frozen, reused read-only from :mod:`sma.eval.agentic`):
|
|
12
|
+
|
|
13
|
+
* ``sma`` — mount the typology lattice; index neighbourhood cases via MacFac;
|
|
14
|
+
* ``dense`` — BGE-small embeddings over the term-name text of each case;
|
|
15
|
+
* ``bm25`` — lexical BM25 over the same term-name text.
|
|
16
|
+
|
|
17
|
+
A ``logreg`` baseline (logistic regression on the raw 166 flat features) is run
|
|
18
|
+
for context — the flat-tabular method that the 4b finance null showed SMA cannot
|
|
19
|
+
beat when there is no cross-record structure.
|
|
20
|
+
|
|
21
|
+
Leak guard: the encoder reads neighbour labels ONLY from the indexed slice of the train split, and a
|
|
22
|
+
node's own class is never emitted. Test nodes are encoded against train-visible
|
|
23
|
+
labels, so no test label ever enters any case.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import random
|
|
29
|
+
from dataclasses import dataclass
|
|
30
|
+
|
|
31
|
+
import numpy as np
|
|
32
|
+
from sklearn.linear_model import LogisticRegression
|
|
33
|
+
from sklearn.metrics import f1_score, roc_auc_score
|
|
34
|
+
from sklearn.preprocessing import StandardScaler
|
|
35
|
+
|
|
36
|
+
from sma.eval.agentic.memories import (
|
|
37
|
+
BM25Memory,
|
|
38
|
+
DenseMemory,
|
|
39
|
+
IndexItem,
|
|
40
|
+
Query,
|
|
41
|
+
SmaMemory,
|
|
42
|
+
)
|
|
43
|
+
from sma.eval.fraud_elliptic.encoder import (
|
|
44
|
+
ILLICIT,
|
|
45
|
+
LICIT,
|
|
46
|
+
EllipticGraph,
|
|
47
|
+
NeighbourhoodEncoder,
|
|
48
|
+
build_typology,
|
|
49
|
+
)
|
|
50
|
+
from sma.ontology import mount
|
|
51
|
+
|
|
52
|
+
# Positive class for every binary metric in this module: the illicit label.
|
|
53
|
+
POS = ILLICIT
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class Split:
    """Node ids of one train/test partition of the labelled transactions."""

    # Ids the evaluation may index and calibrate on.
    train: list[str]
    # Held-out ids that are only ever scored.
    test: list[str]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def stratified_split(g: EllipticGraph, frac_test: float, seed: int,
                     n_max: int | None = None) -> Split:
    """Split the labelled nodes into train/test, stratified by class.

    Illicit and licit ids are shuffled separately with a ``seed``-driven RNG
    and each class is cut at ``frac_test``, so both sides keep the class ratio
    of the (optionally capped) population. The holdout is *not* class-balanced.

    Args:
        g: Graph exposing ``labelled_ids()`` and a ``label`` mapping.
        frac_test: Fraction of each class assigned to the test side.
        seed: Seed for the shuffles; the same seed yields the same split.
        n_max: Optional cap on the total number of nodes used. The cap is
            shared between the classes in proportion to their frequency.

    Returns:
        A ``Split`` whose ``train`` and ``test`` id lists are sorted.
    """
    rng = random.Random(seed)
    labelled = list(g.labelled_ids())
    illicit = [t for t in labelled if g.label[t] == ILLICIT]
    licit = [t for t in labelled if g.label[t] == LICIT]
    # Shuffle order (illicit first, then licit) fixes the RNG stream per seed.
    rng.shuffle(illicit)
    rng.shuffle(licit)

    total = len(illicit) + len(licit)
    # With no labelled nodes there is nothing to cap; skipping avoids 0 / 0.
    if n_max is not None and total:
        # Cap total while preserving the natural class ratio (illicit is ~10%).
        ratio = len(illicit) / total
        n_ill = min(len(illicit), int(round(n_max * ratio)))
        n_lic = min(len(licit), n_max - n_ill)
        illicit, licit = illicit[:n_ill], licit[:n_lic]

    def cut(xs: list[str]) -> tuple[list[str], list[str]]:
        # The first k shuffled ids form the test side, the rest the train side.
        k = int(round(len(xs) * frac_test))
        return xs[k:], xs[:k]  # train, test

    tr_i, te_i = cut(illicit)
    tr_l, te_l = cut(licit)
    return Split(train=sorted(tr_i + tr_l), test=sorted(te_i + te_l))
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _case_text(terms: list[str]) -> str:
|
|
87
|
+
"""Term-name text for the lexical / dense baselines (names == ids here)."""
|
|
88
|
+
return " ".join(t.replace("_", " ") for t in terms)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _build_memories(mounted):
    """Create one new instance of each retrieval memory under comparison.

    Only the SMA memory receives the mounted ontology; the dense and BM25
    memories are constructed without arguments. Order: SMA, dense, BM25.
    """
    sma_memory = SmaMemory(mounted)
    dense_memory = DenseMemory()
    bm25_memory = BM25Memory()
    return [sma_memory, dense_memory, bm25_memory]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _knn_vote(mem, query: Query, train_label: dict[str, str], k: int) -> float:
|
|
97
|
+
"""Confidence-weighted illicit vote over a memory's top-k analogs in [0,1]."""
|
|
98
|
+
res = mem.retrieve(query, k=k)
|
|
99
|
+
if not res:
|
|
100
|
+
return 0.0
|
|
101
|
+
num = 0.0
|
|
102
|
+
den = 0.0
|
|
103
|
+
for r in res:
|
|
104
|
+
lab = train_label.get(r.key)
|
|
105
|
+
if lab is None:
|
|
106
|
+
continue
|
|
107
|
+
w = max(r.score, 1e-9)
|
|
108
|
+
den += w
|
|
109
|
+
if lab == POS:
|
|
110
|
+
num += w
|
|
111
|
+
return num / den if den > 0 else 0.0
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _best_f1_threshold(scores: list[float], labels: list[int]) -> float:
|
|
115
|
+
"""Threshold (on a calibration split) that maximizes macro-F1.
|
|
116
|
+
|
|
117
|
+
The kNN illicit vote is strongly compressed toward 0 by the ~10% illicit
|
|
118
|
+
base rate, so a fixed 0.5 cut makes every method predict all-licit. Each
|
|
119
|
+
method therefore gets its own threshold, chosen on a DISJOINT calibration
|
|
120
|
+
slice of the train split (never on test) by sweeping candidate cuts.
|
|
121
|
+
"""
|
|
122
|
+
s = np.asarray(scores)
|
|
123
|
+
y = np.asarray(labels)
|
|
124
|
+
if len(set(labels)) < 2 or s.size == 0:
|
|
125
|
+
return 0.5
|
|
126
|
+
cands = sorted(set(s.tolist()))
|
|
127
|
+
best_t, best_f1 = 0.5, -1.0
|
|
128
|
+
for c in cands:
|
|
129
|
+
f1 = f1_score(y, (s > c).astype(int), average="macro", zero_division=0)
|
|
130
|
+
if f1 > best_f1:
|
|
131
|
+
best_f1, best_t = f1, float(c)
|
|
132
|
+
return best_t
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def run_elliptic(
    g: EllipticGraph,
    *,
    seeds=(7, 17, 23),
    frac_test: float = 0.3,
    k: int = 15,
    n_max: int | None = 4000,
    calib_frac: float = 0.3,
    include_logreg: bool = True,
) -> dict:
    """Run the retrieval-by-analogy illicit-detection evaluation.

    For every seed the labelled nodes are split into train/test; the train side
    is divided again into a calibration slice (``calib_frac``) and an index
    slice. Index nodes are encoded and stored in each retrieval memory. Every
    calibration and test node is scored by a confidence-weighted kNN vote over
    its top-``k`` analogs. Each method's decision threshold is picked per seed
    on the calibration slice. Scores and predictions are pooled across seeds
    before the metrics are computed.

    Args:
        g: The Elliptic graph (labels, features, neighbourhoods).
        seeds: Seeds; one split/calibration/evaluation round per seed.
        frac_test: Fraction of each class held out for testing.
        k: Number of analogs retrieved per query.
        n_max: Optional cap on the number of labelled nodes used per seed.
        calib_frac: Fraction of the train side used for threshold calibration.
            With an empty calibration slice every method uses threshold 0.5.
        include_logreg: Also run logistic regression on the raw features.

    Returns:
        A dict with keys ``arm``, ``n_test_pooled``, ``n_illicit``, ``k``,
        ``seeds``, ``per_method`` (macro-F1, illicit-F1, ROC-AUC, mean
        threshold and ``n`` per method) and ``primary`` (paired bootstrap of
        per-node accuracy, SMA versus the retrieval baseline with the higher
        macro-F1).
    """
    typ = build_typology()
    mounted = mount(typ)

    method_names = ["sma", "dense", "bm25"] + (["logreg"] if include_logreg else [])
    # Pooled per-node arrays across seeds.
    scores: dict[str, list[float]] = {m: [] for m in method_names}
    preds: dict[str, list[int]] = {m: [] for m in method_names}
    truth: list[int] = []
    thresholds: dict[str, list[float]] = {m: [] for m in method_names}

    for seed in seeds:
        split = stratified_split(g, frac_test=frac_test, seed=seed, n_max=n_max)
        # Carve a calibration slice out of train (disjoint from index and test).
        rng = random.Random(seed * 7919 + 1)
        train_all = list(split.train)
        rng.shuffle(train_all)
        n_cal = int(round(len(train_all) * calib_frac))
        calib_ids = sorted(train_all[:n_cal])
        index_ids = sorted(train_all[n_cal:])

        # Encoder reads neighbour labels ONLY from the indexed train slice.
        index_label = {t: g.label[t] for t in index_ids}
        enc = NeighbourhoodEncoder(graph=g, visible_labels=index_label)

        # Index train cases in every retrieval memory (identical input).
        items = []
        for t in index_ids:
            terms = enc.encode(t)
            items.append(IndexItem(
                key=t, term_ids=frozenset(terms), text=_case_text(terms), meta={"id": t}
            ))
        memories = _build_memories(mounted)
        for mem in memories:
            mem.index(items)

        def knn_scores(node_ids):
            # One illicit score per node and memory, keyed by the memory's name.
            out = {m.name: [] for m in memories}
            for t in node_ids:
                qterms = enc.encode(t)
                query = Query(term_ids=frozenset(qterms), text=_case_text(qterms))
                for mem in memories:
                    out[mem.name].append(_knn_vote(mem, query, index_label, k))
            return out

        # Calibrate each retrieval method's threshold on the calibration slice.
        cal_truth = [1 if g.label[t] == POS else 0 for t in calib_ids]
        cal_scores = knn_scores(calib_ids)
        seed_thresh = {
            m: _best_f1_threshold(cal_scores[m], cal_truth) for m in cal_scores
        }

        # Score + predict the test split.
        test_scores = knn_scores(split.test)
        truth.extend(1 if g.label[t] == POS else 0 for t in split.test)
        for m in cal_scores:
            thr = seed_thresh[m]
            thresholds[m].append(thr)
            scores[m].extend(test_scores[m])
            preds[m].extend(int(s > thr) for s in test_scores[m])

        # Flat logistic-regression baseline on the raw 166 features (context).
        if include_logreg:
            Xtr = np.array([g.feats[t][1:] for t in index_ids])  # drop time at idx0
            ytr = np.array([1 if g.label[t] == POS else 0 for t in index_ids])
            Xte = np.array([g.feats[t][1:] for t in split.test])
            scaler = StandardScaler().fit(Xtr)
            clf = LogisticRegression(max_iter=2000, class_weight="balanced")
            clf.fit(scaler.transform(Xtr), ytr)
            if calib_ids:
                Xcal = np.array([g.feats[t][1:] for t in calib_ids])
                cal_p = clf.predict_proba(scaler.transform(Xcal))[:, 1]
                lr_thr = _best_f1_threshold(list(cal_p), cal_truth)
            else:
                # An empty slice cannot be transformed (it is a 1-D array);
                # use the same 0.5 fallback the retrieval methods get.
                lr_thr = 0.5
            te_p = clf.predict_proba(scaler.transform(Xte))[:, 1]
            thresholds["logreg"].append(lr_thr)
            scores["logreg"].extend(float(p) for p in te_p)
            preds["logreg"].extend(int(p > lr_thr) for p in te_p)

    truth_arr = np.array(truth)
    both_classes_present = len(set(truth)) > 1
    per_method: dict[str, dict] = {}
    for m in method_names:
        s = np.array(scores[m])
        pred = np.array(preds[m])
        try:
            auc = float(roc_auc_score(truth_arr, s)) if both_classes_present else float("nan")
        except ValueError:
            auc = float("nan")
        per_method[m] = {
            "macro_f1": float(f1_score(truth_arr, pred, average="macro", zero_division=0)),
            "illicit_f1": float(f1_score(truth_arr, pred, pos_label=1, zero_division=0)),
            "roc_auc": auc,
            "threshold": float(np.mean(thresholds[m])) if thresholds[m] else 0.5,
            "n": int(len(s)),
        }

    # Primary comparison: SMA vs the retrieval baseline with the higher
    # macro-F1, as a paired bootstrap over per-node correctness (accuracy) of
    # the calibrated predictions.
    from sma.eval.stats import paired_bootstrap

    retrieval_baselines = [m for m in ("dense", "bm25") if m in per_method]
    best = max(retrieval_baselines, key=lambda m: per_method[m]["macro_f1"])
    sma_correct = [1.0 if p == y else 0.0 for p, y in zip(preds["sma"], truth)]
    base_correct = [1.0 if p == y else 0.0 for p, y in zip(preds[best], truth)]
    bs = paired_bootstrap(sma_correct, base_correct)
    primary = {
        "a": "sma", "b": best,
        "delta_acc": bs["delta"], "ci_low": bs["ci_low"],
        "ci_high": bs["ci_high"], "p_value": bs["p_value"],
    }

    return {
        "arm": "fraud_elliptic",
        "n_test_pooled": len(truth),
        "n_illicit": int(truth_arr.sum()),
        "k": k, "seeds": list(seeds),
        "per_method": per_method,
        "primary": primary,
    }
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Unit tests for the Elliptic graph-neighbourhood encoder + typology lattice."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from sma.eval.fraud_elliptic.encoder import (
|
|
6
|
+
ILLICIT,
|
|
7
|
+
LICIT,
|
|
8
|
+
UNKNOWN,
|
|
9
|
+
EllipticGraph,
|
|
10
|
+
NeighbourhoodEncoder,
|
|
11
|
+
_count_class,
|
|
12
|
+
_degree_class,
|
|
13
|
+
_tier,
|
|
14
|
+
build_typology,
|
|
15
|
+
)
|
|
16
|
+
from sma.ontology import mount
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _toy_graph() -> EllipticGraph:
    """Build a six-node graph centred on "C".

    A and B are the predecessors of C; D, E and F are its successors.
    The first entry of every feature row is used as the node's time step.
    """
    feats = {
        "A": [1.0, 0.0, 0.0], "B": [1.0, 0.0, 0.0],
        "C": [2.0, 0.9, -0.9],  # time=2, agg-val high-ish, local-val low-ish
        "D": [3.0, 0.0, 0.0], "E": [3.0, 0.0, 0.0], "F": [3.0, 0.0, 0.0],
    }
    time_step = {node: int(row[0]) for node, row in feats.items()}
    label = {"A": ILLICIT, "B": ILLICIT, "C": ILLICIT, "D": LICIT, "E": UNKNOWN, "F": LICIT}
    preds = {"C": ["A", "B"], "A": [], "B": [], "D": ["C"], "E": ["C"], "F": ["C"]}
    succs = {"C": ["D", "E", "F"], "A": ["C"], "B": ["C"], "D": [], "E": [], "F": []}
    return EllipticGraph(
        time_step=time_step,
        label=label,
        feats=feats,
        preds=preds,
        succs=succs,
    )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_degree_and_count_buckets():
    """Pin the bucket names for representative degrees and neighbour counts."""
    expected_degree = {0: "none", 1: "one", 3: "few", 9: "many"}
    for degree, bucket in expected_degree.items():
        assert _degree_class(degree) == bucket

    expected_count = {0: "none", 2: "some", 5: "many"}
    for count, bucket in expected_count.items():
        assert _count_class(count) == bucket
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_tier():
    """Values below, between and above the two cut points map to low/mid/high."""
    lower, upper = -0.3, 0.3
    for value, tier in ((-1.0, "low"), (0.0, "mid"), (1.0, "high")):
        assert _tier(value, lower, upper) == tier
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def test_encoder_emits_topology_terms():
    """Encoding "C" yields fan-in/fan-out buckets plus temp and value terms."""
    graph = _toy_graph()
    encoder = NeighbourhoodEncoder(graph=graph, visible_labels=graph.label)
    terms = encoder.encode("C")

    # 2 predecessors -> fanIn_few ; 3 successors -> fanOut_few
    for expected in ("fanIn_few", "fanOut_few"):
        assert expected in terms

    # At least one term from each of these descriptor families must appear.
    for prefix in ("temp_", "inVal_", "outVal_"):
        assert any(t.startswith(prefix) for t in terms)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def test_neighbour_label_context_counts_visible_only():
    """With every label visible, both neighbour-label counts land in "some"."""
    graph = _toy_graph()
    # All labels visible: C has 2 illicit predecessors (A,B), 2 licit successors (D,F).
    encoder = NeighbourhoodEncoder(graph=graph, visible_labels=graph.label)
    terms = encoder.encode("C")
    for expected in ("nbrIllicit_some", "nbrLicit_some"):  # 2 of each -> some
        assert expected in terms
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_leak_guard_hides_held_out_neighbour_labels():
    """Labels missing from ``visible_labels`` must not be counted."""
    graph = _toy_graph()
    # Only A and B are visible; D, E and F are held out, so the licit labels
    # of the successors D and F must stay hidden from the encoder.
    visible = {"A": ILLICIT, "B": ILLICIT}
    encoder = NeighbourhoodEncoder(graph=graph, visible_labels=visible)
    terms = encoder.encode("C")
    assert "nbrIllicit_some" in terms
    assert "nbrLicit_none" in terms  # successors' licit labels are not visible
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def test_self_label_never_leaks():
    """A self-loop must not let a node's own label enter its neighbour counts."""
    # Scenario 1: self-loop next to the illicit predecessor A. Skipping the
    # self-loop leaves one illicit neighbour (A) -> "some". On its own this
    # check cannot detect a leak: counting C as well would give 2, also "some".
    g = _toy_graph()
    g.preds["C"] = ["C", "A"]  # self-loop predecessor
    enc = NeighbourhoodEncoder(graph=g, visible_labels=g.label)
    terms = enc.encode("C")
    assert "nbrIllicit_some" in terms

    # Scenario 2: the self-loop is the ONLY predecessor and no successor is
    # illicit (D, F licit; E unknown). If C's own illicit label were counted,
    # a non-"none" nbrIllicit_* bucket would appear.
    g = _toy_graph()
    g.preds["C"] = ["C"]
    enc = NeighbourhoodEncoder(graph=g, visible_labels=g.label)
    terms = enc.encode("C")
    leaked = [
        t for t in terms
        if t.startswith("nbrIllicit_") and t != "nbrIllicit_none"
    ]
    assert not leaked
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def test_typology_lattice_is_mountable_and_acyclic_ascent():
    """The typology has no dangling parents, mounts, and exposes the key is-a edges."""
    typology = build_typology()

    # No dangling is-a edges: each referenced parent is itself a term.
    for tid, term in typology.terms.items():
        for p in term.parents:
            assert p in typology.terms, f"{tid} -> missing parent {p}"

    # Mount first, then read the is-a edges: fanOut_many must reach both its
    # generic parent and illicitTypology.
    mounted = mount(typology)
    edges = list(typology.is_a_edges())
    for parent in ("illicitTypology", "fanOut_any"):
        assert ("fanOut_many", parent) in edges
    assert mounted.canon is not None
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def test_build_case_emits_higher_order_relations():
    """A typed relation between two present terms adds a statement to the case."""
    from sma.ontology.graph import Term

    typology = build_typology()
    # Replace fanOut_many with a copy that also carries a typed flowsTo
    # relation from own topology to neighbour context.
    related_term = Term(
        id="fanOut_many", name="fanOut many",
        parents=("fanOut_any", "illicitTypology"),
        relations=(("flowsTo", "nbrIllicit_many"),),
    )
    typology.terms["fanOut_many"] = related_term

    mounted = mount(typology)
    case = mounted.build_case(["fanOut_many", "nbrIllicit_many"])
    # Two unary term statements plus the relation -> at least three statements.
    assert len(case.statements) >= 3
|
sma/eval/ieee_cis.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""IEEE-CIS Fraud loader + per-transaction artifact builder (4b finance).
|
|
2
|
+
|
|
3
|
+
Real Kaggle transaction-fraud data. Same shape as the healthcare loader: a flat
|
|
4
|
+
CSV-row artifact for the generic structured adapter (the honest 'before') and a
|
|
5
|
+
plain attr=value text for baselines. The 339 anonymized V columns and the label
|
|
6
|
+
are NEVER encoded.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import csv
|
|
11
|
+
import pathlib
|
|
12
|
+
import random
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
|
|
15
|
+
# Drop ids, raw timestamp, the LABEL, and all opaque engineered columns (V*, id_*).
|
|
16
|
+
_DROP_EXACT = {"TransactionID", "TransactionDT", "isFraud"}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _keep(col: str) -> bool:
|
|
20
|
+
return col not in _DROP_EXACT and not col.startswith("V") and not col.startswith("id_")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass(frozen=True)
class Txn:
    """One transaction row: its id, the kept column values, and its label."""

    # Value of the row's TransactionID column.
    tid: str
    # Kept columns only (see _keep), with empty / "NaN" values removed.
    fields: dict[str, str]
    label: str  # "fraud" vs "legit"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _csv_path() -> pathlib.Path:
|
|
31
|
+
return pathlib.Path("data/raw/ieee_cis/train_transaction.csv")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def load_transactions(sample: int = 1500, seed: int = 7, balanced: bool = True,
                      scan_cap: int = 120000) -> list[Txn]:
    """Stream up to ``scan_cap`` rows and return a shuffled sample of them.

    Args:
        sample: Target sample size. In balanced mode ``sample // 2`` rows are
            taken per class, so an odd ``sample`` yields one row fewer.
        seed: Seed for the final shuffle (and the draw in unbalanced mode).
        balanced: If True, take the first ``sample // 2`` fraud and the first
            ``sample // 2`` legit rows in file order (fraud is ~3.5%, so a
            balanced set makes retrieval-by-analogy meaningful) and stop
            reading as soon as both quotas are met. If False, draw ``sample``
            rows uniformly from the scanned window, which keeps the natural
            class ratio.
        scan_cap: Maximum number of CSV rows to read.

    Returns:
        The sampled transactions in seeded random order. The list is shorter
        than requested when the scanned window holds too few rows.
    """
    half = sample // 2
    fraud: list[Txn] = []
    legit: list[Txn] = []
    scanned: list[Txn] = []  # every scanned row; filled only when not balanced
    # newline="" is what the csv module asks for; the encoding is pinned so the
    # read does not depend on the platform default.
    with _csv_path().open(newline="", encoding="utf-8") as fh:
        for i, row in enumerate(csv.DictReader(fh)):
            if i >= scan_cap:
                break
            fields = {k: v for k, v in row.items() if _keep(k) and v not in ("", "NaN")}
            label = "fraud" if row["isFraud"] == "1" else "legit"
            txn = Txn(row["TransactionID"], fields, label)
            if not balanced:
                scanned.append(txn)
                continue
            # Keep at most `half` rows per class: later rows of a full class
            # are never returned, so storing them only costs memory.
            bucket = fraud if label == "fraud" else legit
            if len(bucket) < half:
                bucket.append(txn)
            if len(fraud) >= half and len(legit) >= half:
                break
    rng = random.Random(seed)
    if balanced:
        out = fraud + legit
    else:
        # The previous fraud-first slice returned only fraud rows whenever at
        # least `sample` frauds had been scanned.
        out = rng.sample(scanned, max(0, min(sample, len(scanned))))
    rng.shuffle(out)
    return out
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def row_csv(t: Txn) -> str:
    """Render a transaction as a two-line CSV document (header, then values).

    Columns are the field names in sorted order and each line ends with
    ``"\\n"``. Values are written through ``csv.writer``, so a value that
    contains a comma, quote or newline is quoted instead of corrupting the
    row. Output for plain values is unchanged from simple comma-joining.
    """
    import io

    keys = sorted(t.fields)
    buf = io.StringIO()
    writer = csv.writer(buf, lineterminator="\n")
    writer.writerow(keys)
    writer.writerow([t.fields[k] for k in keys])
    return buf.getvalue()
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def row_text(t: Txn) -> str:
    """Render a transaction as space-separated ``name=value`` pairs.

    Pairs are ordered by field name.
    """
    pairs = [f"{k}={v}" for k, v in sorted(t.fields.items())]
    return " ".join(pairs)
|
sma/eval/loghub.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""LogHub acquisition metadata helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import pathlib
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def load_manifest(path: str | pathlib.Path = "data/manifests/datasets.json") -> dict:
    """Read the dataset manifest at ``path`` and return the parsed JSON.

    The file is decoded as UTF-8 rather than with the platform default
    encoding, so non-ASCII content loads the same on every system.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    return json.loads(pathlib.Path(path).read_text(encoding="utf-8"))
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def loghub_files(path: str | pathlib.Path = "data/manifests/datasets.json") -> dict:
    """Return the ``files`` entry of the manifest's ``loghub_raw`` section."""
    loghub_section = load_manifest(path)["loghub_raw"]
    return loghub_section["files"]
|
|
16
|
+
|