structuremappingmemory 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. sma/__init__.py +5 -0
  2. sma/__main__.py +5 -0
  3. sma/agent/__init__.py +5 -0
  4. sma/agent/adapter_draft.py +217 -0
  5. sma/agent/api.py +67 -0
  6. sma/agent/comparison.py +591 -0
  7. sma/agent/llm.py +280 -0
  8. sma/agent/policies.py +21 -0
  9. sma/agent/service.py +95 -0
  10. sma/cli.py +65 -0
  11. sma/encoders/__init__.py +38 -0
  12. sma/encoders/agentobs.py +27 -0
  13. sma/encoders/base.py +23 -0
  14. sma/encoders/code_treesitter.py +64 -0
  15. sma/encoders/coverage.py +80 -0
  16. sma/encoders/draft_adapter.py +183 -0
  17. sma/encoders/healthcare.py +207 -0
  18. sma/encoders/logs_drain.py +142 -0
  19. sma/encoders/prose_tier1.py +57 -0
  20. sma/encoders/structured.py +57 -0
  21. sma/encoders/traces.py +45 -0
  22. sma/eval/__init__.py +2 -0
  23. sma/eval/agentic/__init__.py +35 -0
  24. sma/eval/agentic/arms/__init__.py +0 -0
  25. sma/eval/agentic/arms/cyber.py +48 -0
  26. sma/eval/agentic/arms/discovery.py +35 -0
  27. sma/eval/agentic/arms/finance.py +38 -0
  28. sma/eval/agentic/arms/legal.py +74 -0
  29. sma/eval/agentic/arms/medicine.py +45 -0
  30. sma/eval/agentic/harness.py +275 -0
  31. sma/eval/agentic/memories.py +308 -0
  32. sma/eval/agentic/metrics.py +82 -0
  33. sma/eval/agentic_qa/__init__.py +27 -0
  34. sma/eval/agentic_qa/agent.py +383 -0
  35. sma/eval/agentic_qa/metrics.py +239 -0
  36. sma/eval/agentic_qa/pools.py +197 -0
  37. sma/eval/arn.py +65 -0
  38. sma/eval/baselines/__init__.py +6 -0
  39. sma/eval/baselines/bge_dense.py +54 -0
  40. sma/eval/baselines/bm25.py +18 -0
  41. sma/eval/baselines/dense.py +42 -0
  42. sma/eval/baselines/hipporag.py +235 -0
  43. sma/eval/baselines/hybrid_rrf.py +30 -0
  44. sma/eval/baselines/longcontext_llm.py +124 -0
  45. sma/eval/baselines/rerank.py +41 -0
  46. sma/eval/baselines/splade.py +77 -0
  47. sma/eval/baselines/wl_kernel.py +163 -0
  48. sma/eval/bugsinpy.py +358 -0
  49. sma/eval/bugsinpy_families.py +164 -0
  50. sma/eval/crossdomain.py +89 -0
  51. sma/eval/diabetes.py +61 -0
  52. sma/eval/drift_env.py +26 -0
  53. sma/eval/drift_metrics.py +24 -0
  54. sma/eval/family_labels.py +167 -0
  55. sma/eval/fraud_elliptic/__init__.py +29 -0
  56. sma/eval/fraud_elliptic/encoder.py +279 -0
  57. sma/eval/fraud_elliptic/eval.py +269 -0
  58. sma/eval/fraud_elliptic/test_encoder.py +123 -0
  59. sma/eval/ieee_cis.py +66 -0
  60. sma/eval/loghub.py +16 -0
  61. sma/eval/loghub_eval.py +480 -0
  62. sma/eval/longmemeval.py +51 -0
  63. sma/eval/memory_backends/__init__.py +2 -0
  64. sma/eval/memory_backends/base.py +22 -0
  65. sma/eval/memory_backends/context_only.py +14 -0
  66. sma/eval/memory_backends/rag_notes.py +17 -0
  67. sma/eval/memory_backends/shared_llm.py +30 -0
  68. sma/eval/memory_backends/sma_memory.py +54 -0
  69. sma/eval/memory_backends/zep_graphiti.py +33 -0
  70. sma/eval/metrics.py +32 -0
  71. sma/eval/ontology_bench.py +219 -0
  72. sma/eval/report.py +573 -0
  73. sma/eval/ssb_eval.py +216 -0
  74. sma/eval/ssb_generator.py +116 -0
  75. sma/eval/stats.py +108 -0
  76. sma/eval/transfer_eval.py +844 -0
  77. sma/index/__init__.py +15 -0
  78. sma/index/ann.py +21 -0
  79. sma/index/content_vectors.py +60 -0
  80. sma/index/inverted.py +63 -0
  81. sma/index/macfac.py +174 -0
  82. sma/ir/__init__.py +22 -0
  83. sma/ir/canon.py +106 -0
  84. sma/ir/schema.py +165 -0
  85. sma/ir/sexpr.py +86 -0
  86. sma/ir/signatures.py +76 -0
  87. sma/match/__init__.py +20 -0
  88. sma/match/conflicts.py +46 -0
  89. sma/match/engine.py +60 -0
  90. sma/match/explain.py +59 -0
  91. sma/match/infer.py +54 -0
  92. sma/match/kernels.py +54 -0
  93. sma/match/mdl.py +30 -0
  94. sma/match/merge_cpsat.py +77 -0
  95. sma/match/merge_greedy.py +15 -0
  96. sma/match/mh.py +177 -0
  97. sma/match/ses.py +84 -0
  98. sma/match/types.py +115 -0
  99. sma/match/verifier.py +27 -0
  100. sma/ontology/__init__.py +45 -0
  101. sma/ontology/attack.py +134 -0
  102. sma/ontology/cpc.py +69 -0
  103. sma/ontology/graph.py +58 -0
  104. sma/ontology/loader.py +262 -0
  105. sma/ontology/mitre_xml.py +67 -0
  106. sma/ontology/mount.py +101 -0
  107. sma/ontology/rdf_loader.py +75 -0
  108. sma/ontology/registry.py +115 -0
  109. sma/ontology/router.py +69 -0
  110. sma/ontology/usgaap.py +73 -0
  111. sma/sage/__init__.py +6 -0
  112. sma/sage/assimilate.py +12 -0
  113. sma/sage/pools.py +105 -0
  114. sma/sage/probabilities.py +10 -0
  115. sma/store/__init__.py +6 -0
  116. sma/store/lmdb_store.py +78 -0
  117. sma/store/registry.py +26 -0
  118. sma/store/wal.py +26 -0
  119. sma/ui/app.py +642 -0
  120. structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
  121. structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
  122. structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
  123. structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
  124. structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
  125. structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,269 @@
1
+ """Retrieval-by-analogy illicit-detection evaluation on the Elliptic graph.
2
+
3
+ Each transaction is encoded as a *case of graph-neighbourhood typology terms*
4
+ (see :mod:`sma.eval.fraud_elliptic.encoder`). The labelled nodes are split into
5
+ a train (index) set and a test set. A memory indexes the train cases; for each
6
+ test node we retrieve its top-k analogs and vote their (known) labels — weighted
7
+ by retrieval confidence — into an illicit score. This is the *same* analogical
8
+ retrieval SMA is built for, now used as a kNN classifier so the metric is
9
+ detection quality (macro-F1, ROC-AUC), not key-recall.
10
+
11
+ Compared memories (frozen, reused read-only from :mod:`sma.eval.agentic`):
12
+
13
+ * ``sma`` — mount the typology lattice; index neighbourhood cases via MacFac;
14
+ * ``dense`` — BGE-small embeddings over the term-name text of each case;
15
+ * ``bm25`` — lexical BM25 over the same term-name text.
16
+
17
+ A ``logreg`` baseline (logistic regression on the raw 166 flat features) is run
18
+ for context — the flat-tabular method that the 4b finance null showed SMA cannot
19
+ beat when there is no cross-record structure.
20
+
21
+ Leak guard: the encoder reads neighbour labels ONLY from the train split, and a
22
+ node's own class is never emitted. Test nodes are encoded against train-visible
23
+ labels, so no test label ever enters any case.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import random
29
+ from dataclasses import dataclass
30
+
31
+ import numpy as np
32
+ from sklearn.linear_model import LogisticRegression
33
+ from sklearn.metrics import f1_score, roc_auc_score
34
+ from sklearn.preprocessing import StandardScaler
35
+
36
+ from sma.eval.agentic.memories import (
37
+ BM25Memory,
38
+ DenseMemory,
39
+ IndexItem,
40
+ Query,
41
+ SmaMemory,
42
+ )
43
+ from sma.eval.fraud_elliptic.encoder import (
44
+ ILLICIT,
45
+ LICIT,
46
+ EllipticGraph,
47
+ NeighbourhoodEncoder,
48
+ build_typology,
49
+ )
50
+ from sma.ontology import mount
51
+
52
# The illicit label is the positive class for every score, vote and metric below.
POS = ILLICIT
54
+
55
+
56
@dataclass
class Split:
    """One train/test partition of labelled transaction ids."""

    # Ids kept for training (indexing and calibration are carved from these).
    train: list[str]
    # Held-out ids that are only scored.
    test: list[str]
60
+
61
+
62
def stratified_split(g: EllipticGraph, frac_test: float, seed: int,
                     n_max: int | None = None) -> Split:
    """Stratified train/test split over labelled nodes (class-balanced holdout).

    Illicit and licit ids are shuffled independently with ``random.Random(seed)``
    and the first ``frac_test`` share of each class goes to the test set, so both
    sides keep the same class ratio.

    Args:
        g: graph exposing ``labelled_ids()`` and a ``label`` mapping.
        frac_test: fraction of each class placed in the test set.
        seed: seed for the shuffles (same seed -> same split).
        n_max: optional cap on the total number of nodes used; the cap is
            applied per class so the observed class ratio is preserved.

    Returns:
        A ``Split`` whose ``train`` and ``test`` lists are sorted and disjoint.
        A graph without labelled nodes yields two empty lists.
    """
    rng = random.Random(seed)
    # Materialize once: both class filters read the same sequence.
    labelled = list(g.labelled_ids())
    illicit = [t for t in labelled if g.label[t] == ILLICIT]
    licit = [t for t in labelled if g.label[t] == LICIT]
    rng.shuffle(illicit)
    rng.shuffle(licit)
    if n_max is not None:
        # Cap total while preserving the natural class ratio (illicit is ~10%).
        total = len(illicit) + len(licit)
        # Guard the empty graph: there is no ratio to preserve and nothing to cap.
        ratio = len(illicit) / total if total else 0.0
        n_ill = min(len(illicit), int(round(n_max * ratio)))
        n_lic = min(len(licit), n_max - n_ill)
        illicit, licit = illicit[:n_ill], licit[:n_lic]

    def cut(xs: list[str]) -> tuple[list[str], list[str]]:
        """Return ``(train, test)`` where test is the leading ``frac_test`` share."""
        k = int(round(len(xs) * frac_test))
        return xs[k:], xs[:k]

    tr_i, te_i = cut(illicit)
    tr_l, te_l = cut(licit)
    return Split(train=sorted(tr_i + tr_l), test=sorted(te_i + te_l))
84
+
85
+
86
def _case_text(terms: list[str]) -> str:
    """Render a case as space-separated words for the lexical / dense baselines.

    Underscores inside each term id become spaces (term names equal ids here).
    """
    words = [term.replace("_", " ") for term in terms]
    return " ".join(words)
89
+
90
+
91
def _build_memories(mounted):
    """Create new SMA, dense and BM25 memories, in that order.

    Only the SMA memory receives the mounted typology; the other two are
    constructed without arguments.
    """
    sma_memory = SmaMemory(mounted)
    dense_memory = DenseMemory()
    bm25_memory = BM25Memory()
    return [sma_memory, dense_memory, bm25_memory]
94
+
95
+
96
def _knn_vote(mem, query: Query, train_label: dict[str, str], k: int) -> float:
    """Score a query as the weighted share of illicit labels among its top-k hits.

    Each retrieved hit votes with weight ``max(score, 1e-9)``; hits whose key has
    no entry in ``train_label`` are ignored. Returns a value in [0, 1], and 0.0
    when nothing usable was retrieved.
    """
    hits = mem.retrieve(query, k=k)
    if not hits:
        return 0.0

    illicit_weight = 0.0
    total_weight = 0.0
    for hit in hits:
        label = train_label.get(hit.key)
        if label is not None:
            # Floor the weight so zero / negative scores still cast a tiny vote.
            weight = max(hit.score, 1e-9)
            total_weight += weight
            if label == POS:
                illicit_weight += weight

    if total_weight > 0:
        return illicit_weight / total_weight
    return 0.0
112
+
113
+
114
def _best_f1_threshold(scores: list[float], labels: list[int]) -> float:
    """Pick the score cut that maximizes macro-F1 on a calibration split.

    The kNN illicit vote is strongly compressed toward 0 by the ~10% illicit
    base rate, so a fixed 0.5 cut makes every method predict all-licit. Each
    method therefore gets its own threshold, chosen on a DISJOINT calibration
    slice of the train split (never on test) by sweeping candidate cuts.

    Every distinct score is tried as a cut (prediction is ``score > cut``); the
    lowest cut reaching the best macro-F1 wins. Returns 0.5 when there are no
    scores or fewer than two distinct labels.
    """
    score_arr = np.asarray(scores)
    label_arr = np.asarray(labels)
    single_class = len(set(labels)) < 2
    if single_class or score_arr.size == 0:
        return 0.5

    chosen_cut = 0.5
    top_f1 = -1.0
    for cut in sorted(set(score_arr.tolist())):
        predicted = (score_arr > cut).astype(int)
        current_f1 = f1_score(label_arr, predicted, average="macro", zero_division=0)
        # Strict comparison: ties keep the earliest (lowest) cut.
        if current_f1 > top_f1:
            top_f1 = current_f1
            chosen_cut = float(cut)
    return chosen_cut
133
+
134
+
135
def run_elliptic(
    g: EllipticGraph,
    *,
    seeds=(7, 17, 23),
    frac_test: float = 0.3,
    k: int = 15,
    n_max: int | None = 4000,
    calib_frac: float = 0.3,
    include_logreg: bool = True,
) -> dict:
    """Run the retrieval-by-analogy illicit-detection evaluation.

    Returns a result dict with per-method pooled macro-F1 / ROC-AUC across seeds
    and a paired-bootstrap comparison (SMA vs best retrieval baseline) computed
    over per-test-node correctness.
    Each method's decision threshold is calibrated per seed on a disjoint slice
    of the train split (``calib_frac``); ROC-AUC is threshold-free.

    Args:
        g: the Elliptic graph (labels, features, adjacency).
        seeds: one split / index / calibration round is run per seed.
        frac_test: per-class fraction held out as the test set.
        k: number of analogs retrieved per query.
        n_max: optional cap on labelled nodes per seed (see ``stratified_split``).
        calib_frac: share of the train split reserved for threshold calibration.
            When the calibration slice is empty every method uses 0.5.
        include_logreg: also fit the flat logistic-regression baseline.

    Returns:
        Dict with keys ``arm``, ``n_test_pooled``, ``n_illicit``, ``k``,
        ``seeds``, ``per_method`` and ``primary``.
    """
    typ = build_typology()
    mounted = mount(typ)

    method_names = ["sma", "dense", "bm25"] + (["logreg"] if include_logreg else [])
    # Pooled per-node arrays across seeds.
    scores: dict[str, list[float]] = {m: [] for m in method_names}
    preds: dict[str, list[int]] = {m: [] for m in method_names}
    truth: list[int] = []
    thresholds: dict[str, list[float]] = {m: [] for m in method_names}

    for seed in seeds:
        split = stratified_split(g, frac_test=frac_test, seed=seed, n_max=n_max)
        # Carve a calibration slice out of train (disjoint from index and test).
        rng = random.Random(seed * 7919 + 1)
        train_all = list(split.train)
        rng.shuffle(train_all)
        n_cal = int(round(len(train_all) * calib_frac))
        calib_ids = sorted(train_all[:n_cal])
        index_ids = sorted(train_all[n_cal:])

        # Encoder reads neighbour labels ONLY from the indexed train split.
        index_label = {t: g.label[t] for t in index_ids}
        enc = NeighbourhoodEncoder(graph=g, visible_labels=index_label)

        # Index train cases in every retrieval memory (identical input).
        items = []
        for t in index_ids:
            terms = enc.encode(t)
            items.append(IndexItem(
                key=t, term_ids=frozenset(terms), text=_case_text(terms), meta={"id": t}
            ))
        memories = _build_memories(mounted)
        for mem in memories:
            mem.index(items)

        def knn_scores(node_ids):
            """Illicit vote of every memory for each node, keyed by memory name."""
            out = {m.name: [] for m in memories}
            for t in node_ids:
                qterms = enc.encode(t)
                query = Query(term_ids=frozenset(qterms), text=_case_text(qterms))
                for mem in memories:
                    out[mem.name].append(_knn_vote(mem, query, index_label, k))
            return out

        # Calibrate each retrieval method's threshold on the calibration slice.
        cal_truth = [1 if g.label[t] == POS else 0 for t in calib_ids]
        cal_scores = knn_scores(calib_ids)
        seed_thresh = {
            m: _best_f1_threshold(cal_scores[m], cal_truth) for m in cal_scores
        }

        # Score + predict the test split.
        test_scores = knn_scores(split.test)
        truth.extend(1 if g.label[t] == POS else 0 for t in split.test)
        for m in cal_scores:
            thr = seed_thresh[m]
            thresholds[m].append(thr)
            scores[m].extend(test_scores[m])
            preds[m].extend(int(s > thr) for s in test_scores[m])

        # Flat logistic-regression baseline on the raw 166 features (context).
        if include_logreg:
            Xtr = np.array([g.feats[t][1:] for t in index_ids])  # drop time at idx0
            ytr = np.array([1 if g.label[t] == POS else 0 for t in index_ids])
            Xte = np.array([g.feats[t][1:] for t in split.test])
            scaler = StandardScaler().fit(Xtr)
            clf = LogisticRegression(max_iter=2000, class_weight="balanced")
            clf.fit(scaler.transform(Xtr), ytr)
            if calib_ids:
                Xcal = np.array([g.feats[t][1:] for t in calib_ids])
                cal_p = clf.predict_proba(scaler.transform(Xcal))[:, 1]
                lr_thr = _best_f1_threshold(list(cal_p), cal_truth)
            else:
                # An empty calibration slice would hand a 1-D empty array to the
                # scaler; use the same 0.5 fallback the retrieval methods get.
                lr_thr = 0.5
            te_p = clf.predict_proba(scaler.transform(Xte))[:, 1]
            thresholds["logreg"].append(lr_thr)
            scores["logreg"].extend(float(p) for p in te_p)
            preds["logreg"].extend(int(p > lr_thr) for p in te_p)

    truth_arr = np.array(truth)
    # ROC-AUC is undefined unless both classes occur in the pooled truth.
    both_classes = len(set(truth)) > 1
    per_method: dict[str, dict] = {}
    for m in method_names:
        s = np.array(scores[m])
        pred = np.array(preds[m])
        try:
            auc = float(roc_auc_score(truth_arr, s)) if both_classes else float("nan")
        except ValueError:
            auc = float("nan")
        per_method[m] = {
            "macro_f1": float(f1_score(truth_arr, pred, average="macro", zero_division=0)),
            "illicit_f1": float(f1_score(truth_arr, pred, pos_label=1, zero_division=0)),
            "roc_auc": auc,
            "threshold": float(np.mean(thresholds[m])) if thresholds[m] else 0.5,
            "n": int(len(s)),
        }

    # Primary comparison: SMA vs the best retrieval baseline (picked by pooled
    # macro-F1), via a paired bootstrap over per-node correctness of the
    # calibrated predictions (i.e. a difference in accuracy).
    from sma.eval.stats import paired_bootstrap

    retrieval_baselines = [m for m in ("dense", "bm25") if m in per_method]
    best = max(retrieval_baselines, key=lambda m: per_method[m]["macro_f1"])
    sma_correct = [1.0 if p == y else 0.0 for p, y in zip(preds["sma"], truth)]
    base_correct = [1.0 if p == y else 0.0 for p, y in zip(preds[best], truth)]
    bs = paired_bootstrap(sma_correct, base_correct)
    primary = {
        "a": "sma", "b": best,
        "delta_acc": bs["delta"], "ci_low": bs["ci_low"],
        "ci_high": bs["ci_high"], "p_value": bs["p_value"],
    }

    return {
        "arm": "fraud_elliptic",
        "n_test_pooled": len(truth),
        "n_illicit": int(truth_arr.sum()),
        "k": k, "seeds": list(seeds),
        "per_method": per_method,
        "primary": primary,
    }
@@ -0,0 +1,123 @@
1
+ """Unit tests for the Elliptic graph-neighbourhood encoder + typology lattice."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from sma.eval.fraud_elliptic.encoder import (
6
+ ILLICIT,
7
+ LICIT,
8
+ UNKNOWN,
9
+ EllipticGraph,
10
+ NeighbourhoodEncoder,
11
+ _count_class,
12
+ _degree_class,
13
+ _tier,
14
+ build_typology,
15
+ )
16
+ from sma.ontology import mount
17
+
18
+
19
def _toy_graph() -> EllipticGraph:
    """Six-node fixture: A and B feed C, and C feeds D, E and F.

    C is the node the tests encode. Feature index 0 is the time step.
    """
    node_feats = {
        "A": [1.0, 0.0, 0.0],
        "B": [1.0, 0.0, 0.0],
        "C": [2.0, 0.9, -0.9],  # time=2, agg-val high-ish, local-val low-ish
        "D": [3.0, 0.0, 0.0],
        "E": [3.0, 0.0, 0.0],
        "F": [3.0, 0.0, 0.0],
    }
    steps = {node: int(row[0]) for node, row in node_feats.items()}
    labels = {
        "A": ILLICIT, "B": ILLICIT, "C": ILLICIT,
        "D": LICIT, "E": UNKNOWN, "F": LICIT,
    }
    predecessors = {
        "C": ["A", "B"], "A": [], "B": [],
        "D": ["C"], "E": ["C"], "F": ["C"],
    }
    successors = {
        "C": ["D", "E", "F"], "A": ["C"], "B": ["C"],
        "D": [], "E": [], "F": [],
    }
    return EllipticGraph(
        time_step=steps,
        label=labels,
        feats=node_feats,
        preds=predecessors,
        succs=successors,
    )
34
+
35
+
36
def test_degree_and_count_buckets():
    """Spot-check the bucket names returned for sample degrees and counts."""
    expected_degree = {0: "none", 1: "one", 3: "few", 9: "many"}
    for degree, bucket in expected_degree.items():
        assert _degree_class(degree) == bucket

    expected_count = {0: "none", 2: "some", 5: "many"}
    for count, bucket in expected_count.items():
        assert _count_class(count) == bucket
44
+
45
+
46
def test_tier():
    """Values below, between and above the (-0.3, 0.3) cuts map to low/mid/high."""
    cases = [(-1.0, "low"), (0.0, "mid"), (1.0, "high")]
    for value, tier in cases:
        assert _tier(value, -0.3, 0.3) == tier
50
+
51
+
52
def test_encoder_emits_topology_terms():
    """Encoding C yields fan-in/fan-out terms plus temp_, inVal_ and outVal_ terms."""
    graph = _toy_graph()
    encoder = NeighbourhoodEncoder(graph=graph, visible_labels=graph.label)
    emitted = encoder.encode("C")
    # 2 predecessors -> fanIn_few ; 3 successors -> fanOut_few
    for topology_term in ("fanIn_few", "fanOut_few"):
        assert topology_term in emitted
    for prefix in ("temp_", "inVal_", "outVal_"):
        assert any(term.startswith(prefix) for term in emitted)
62
+
63
+
64
def test_neighbour_label_context_counts_visible_only():
    """With every label visible, C sees two illicit and two licit neighbours."""
    graph = _toy_graph()
    # All labels visible: C has 2 illicit predecessors (A,B), 2 licit successors (D,F).
    encoder = NeighbourhoodEncoder(graph=graph, visible_labels=graph.label)
    emitted = encoder.encode("C")
    # Two neighbours of a class bucket to "some".
    for context_term in ("nbrIllicit_some", "nbrLicit_some"):
        assert context_term in emitted
71
+
72
+
73
def test_leak_guard_hides_held_out_neighbour_labels():
    """Labels missing from ``visible_labels`` must not be counted as neighbour context."""
    graph = _toy_graph()
    # Only the index split is visible: D, E and F are left out, so the licit
    # labels of successors D and F stay hidden.
    index_only = {"A": ILLICIT, "B": ILLICIT}
    encoder = NeighbourhoodEncoder(graph=graph, visible_labels=index_only)
    emitted = encoder.encode("C")
    assert "nbrIllicit_some" in emitted
    # Successors' licit labels are not visible.
    assert "nbrLicit_none" in emitted
81
+
82
+
83
def test_self_label_never_leaks():
    """A self-loop must not let a node's own label enter its neighbour counts."""
    # Scenario 1: C is listed among its own predecessors, next to A.
    g = _toy_graph()
    g.preds["C"] = ["C", "A"]  # self-loop predecessor
    enc = NeighbourhoodEncoder(graph=g, visible_labels=g.label)
    terms = enc.encode("C")
    # Only A among predecessors is illicit-and-not-self; successors D,F are licit.
    # So illicit count = 1 (A) -> some.
    assert "nbrIllicit_some" in terms

    # Scenario 2: scenario 1 cannot detect a leak by itself, because counting C
    # as well would still land in the "some" bucket. Here C is its ONLY illicit
    # neighbour, so any leak moves the bucket from "none" to "some".
    g = _toy_graph()
    g.preds["C"] = ["C"]
    g.succs["C"] = ["C", "D", "E", "F"]
    # Keep the adjacency consistent: A and B no longer feed C.
    g.succs["A"] = []
    g.succs["B"] = []
    enc = NeighbourhoodEncoder(graph=g, visible_labels=g.label)
    terms = enc.encode("C")
    assert "nbrIllicit_none" in terms
    assert "nbrIllicit_some" not in terms
92
+
93
+
94
def test_typology_lattice_is_mountable_and_acyclic_ascent():
    """The typology has no dangling parents, mounts, and wires fanOut_many upward."""
    typology = build_typology()
    # Every parent referenced must exist as a term (no dangling is-a edges).
    for term_id, term in typology.terms.items():
        for parent in term.parents:
            assert parent in typology.terms, f"{term_id} -> missing parent {parent}"

    # Mount must populate a lattice and let fanOut_many ascend to illicitTypology.
    mounted = mount(typology)
    is_a = list(typology.is_a_edges())
    for expected_edge in (
        ("fanOut_many", "illicitTypology"),
        ("fanOut_many", "fanOut_any"),
    ):
        assert expected_edge in is_a
    assert mounted.canon is not None
106
+
107
+
108
def test_build_case_emits_higher_order_relations():
    """A typed relation between two present terms adds a statement to the case."""
    # When two related descriptor terms are both present and the typology wires a
    # typed relation, mount().build_case must emit the higher-order statement.
    from sma.ontology.graph import Term

    typology = build_typology()
    # Replace fanOut_many with a copy carrying a typed flowsTo relation from own
    # topology to neighbour context.
    related_term = Term(
        id="fanOut_many",
        name="fanOut many",
        parents=("fanOut_any", "illicitTypology"),
        relations=(("flowsTo", "nbrIllicit_many"),),
    )
    typology.terms["fanOut_many"] = related_term

    present_terms = ["fanOut_many", "nbrIllicit_many"]
    built = mount(typology).build_case(present_terms)
    # The case must contain more than the two unary term statements (the relation).
    assert len(built.statements) >= 3
sma/eval/ieee_cis.py ADDED
@@ -0,0 +1,66 @@
1
+ """IEEE-CIS Fraud loader + per-transaction artifact builder (4b finance).
2
+
3
+ Real Kaggle transaction-fraud data. Same shape as the healthcare loader: a flat
4
+ CSV-row artifact for the generic structured adapter (the honest 'before') and a
5
+ plain attr=value text for baselines. The 339 anonymized V columns and the label
6
+ are NEVER encoded.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import csv
11
+ import pathlib
12
+ import random
13
+ from dataclasses import dataclass
14
+
15
# Columns never exposed to any encoder: ids, the raw timestamp and the LABEL.
# Opaque engineered columns (V*, id_*) are filtered by prefix in _keep().
_DROP_EXACT = {"TransactionID", "TransactionDT", "isFraud"}


def _keep(col: str) -> bool:
    """Return True when column ``col`` may appear in a transaction's fields."""
    if col in _DROP_EXACT:
        return False
    return not col.startswith(("V", "id_"))
21
+
22
+
23
@dataclass(frozen=True)
class Txn:
    """One transaction row: its id, the retained columns and its class label."""

    # Value of the row's TransactionID column.
    tid: str
    # Retained column name -> raw string value.
    fields: dict[str, str]
    # "fraud" vs "legit"
    label: str
28
+
29
+
30
def _csv_path() -> pathlib.Path:
    """Relative location of the IEEE-CIS ``train_transaction.csv`` file."""
    return pathlib.Path("data", "raw", "ieee_cis", "train_transaction.csv")
32
+
33
+
34
def load_transactions(sample: int = 1500, seed: int = 7, balanced: bool = True,
                      scan_cap: int = 120000) -> list[Txn]:
    """Stream up to ``scan_cap`` rows and return a seeded sample of transactions.

    With ``balanced=True`` the first ``sample // 2`` fraud rows and the first
    ``sample // 2`` legit rows are collected (fraud is ~3.5%, so a balanced set
    makes retrieval-by-analogy meaningful) and scanning stops as soon as both
    halves are full. With ``balanced=False`` up to ``sample`` rows are drawn
    uniformly at random from the scanned rows, so the natural class ratio of
    the scanned window is kept.

    Args:
        sample: target number of transactions.
        seed: seed for the shuffle / random draw.
        balanced: collect equal numbers of fraud and legit rows.
        scan_cap: maximum number of CSV rows to read.

    Returns:
        Transactions in seeded random order; fewer than ``sample`` if the
        scanned window does not contain enough rows.
    """
    half = sample // 2
    fraud: list[Txn] = []
    legit: list[Txn] = []
    scanned: list[Txn] = []  # file-order rows, only filled when not balanced
    # newline="" is what the csv module requires of the file object; the
    # encoding is pinned so parsing does not depend on the locale.
    with _csv_path().open(newline="", encoding="utf-8") as fh:
        for i, row in enumerate(csv.DictReader(fh)):
            if i >= scan_cap:
                break
            # DictReader yields a None key for surplus cells and None values for
            # short rows; neither is a usable field.
            fields = {
                k: v for k, v in row.items()
                if k is not None and v is not None
                and _keep(k) and v not in ("", "NaN")
            }
            t = Txn(row["TransactionID"], fields,
                    "fraud" if row["isFraud"] == "1" else "legit")
            if not balanced:
                scanned.append(t)
                continue
            bucket = fraud if t.label == "fraud" else legit
            # Only the first `half` rows of each class are ever returned.
            if len(bucket) < half:
                bucket.append(t)
            if len(fraud) >= half and len(legit) >= half:
                break
    rng = random.Random(seed)
    if balanced:
        out = fraud + legit
        rng.shuffle(out)
    else:
        # Draw across both classes at once; concatenating fraud before legit and
        # truncating would return an (almost) all-fraud sample.
        out = rng.sample(scanned, min(max(sample, 0), len(scanned)))
    return out
58
+
59
+
60
def row_csv(t: Txn) -> str:
    """Render a transaction as a two-line CSV document (header row + value row).

    Columns are sorted by name and each line ends with ``"\n"``. Cells go
    through ``csv.writer``, so a value containing a comma, quote or newline is
    quoted instead of silently corrupting the row; plain values are emitted
    unchanged.
    """
    import io  # local import: only this helper needs an in-memory text buffer

    keys = sorted(t.fields)
    buf = io.StringIO()
    writer = csv.writer(buf, lineterminator="\n")
    writer.writerow(keys)
    writer.writerow([t.fields[k] for k in keys])
    return buf.getvalue()
63
+
64
+
65
def row_text(t: Txn) -> str:
    """Render a transaction as space-separated ``name=value`` pairs, sorted by name."""
    ordered_pairs = sorted(t.fields.items())
    rendered = [f"{name}={value}" for name, value in ordered_pairs]
    return " ".join(rendered)
sma/eval/loghub.py ADDED
@@ -0,0 +1,16 @@
1
+ """LogHub acquisition metadata helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import pathlib
7
+
8
+
9
def load_manifest(path: str | pathlib.Path = "data/manifests/datasets.json") -> dict:
    """Read the dataset manifest at ``path`` and return the parsed JSON.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default encoding.

    Raises:
        OSError: if the file cannot be read.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    return json.loads(pathlib.Path(path).read_text(encoding="utf-8"))
11
+
12
+
13
def loghub_files(path: str | pathlib.Path = "data/manifests/datasets.json") -> dict:
    """Return the ``files`` entry of the manifest's ``loghub_raw`` section."""
    return load_manifest(path)["loghub_raw"]["files"]
16
+