structuremappingmemory 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sma/__init__.py +5 -0
- sma/__main__.py +5 -0
- sma/agent/__init__.py +5 -0
- sma/agent/adapter_draft.py +217 -0
- sma/agent/api.py +67 -0
- sma/agent/comparison.py +591 -0
- sma/agent/llm.py +280 -0
- sma/agent/policies.py +21 -0
- sma/agent/service.py +95 -0
- sma/cli.py +65 -0
- sma/encoders/__init__.py +38 -0
- sma/encoders/agentobs.py +27 -0
- sma/encoders/base.py +23 -0
- sma/encoders/code_treesitter.py +64 -0
- sma/encoders/coverage.py +80 -0
- sma/encoders/draft_adapter.py +183 -0
- sma/encoders/healthcare.py +207 -0
- sma/encoders/logs_drain.py +142 -0
- sma/encoders/prose_tier1.py +57 -0
- sma/encoders/structured.py +57 -0
- sma/encoders/traces.py +45 -0
- sma/eval/__init__.py +2 -0
- sma/eval/agentic/__init__.py +35 -0
- sma/eval/agentic/arms/__init__.py +0 -0
- sma/eval/agentic/arms/cyber.py +48 -0
- sma/eval/agentic/arms/discovery.py +35 -0
- sma/eval/agentic/arms/finance.py +38 -0
- sma/eval/agentic/arms/legal.py +74 -0
- sma/eval/agentic/arms/medicine.py +45 -0
- sma/eval/agentic/harness.py +275 -0
- sma/eval/agentic/memories.py +308 -0
- sma/eval/agentic/metrics.py +82 -0
- sma/eval/agentic_qa/__init__.py +27 -0
- sma/eval/agentic_qa/agent.py +383 -0
- sma/eval/agentic_qa/metrics.py +239 -0
- sma/eval/agentic_qa/pools.py +197 -0
- sma/eval/arn.py +65 -0
- sma/eval/baselines/__init__.py +6 -0
- sma/eval/baselines/bge_dense.py +54 -0
- sma/eval/baselines/bm25.py +18 -0
- sma/eval/baselines/dense.py +42 -0
- sma/eval/baselines/hipporag.py +235 -0
- sma/eval/baselines/hybrid_rrf.py +30 -0
- sma/eval/baselines/longcontext_llm.py +124 -0
- sma/eval/baselines/rerank.py +41 -0
- sma/eval/baselines/splade.py +77 -0
- sma/eval/baselines/wl_kernel.py +163 -0
- sma/eval/bugsinpy.py +358 -0
- sma/eval/bugsinpy_families.py +164 -0
- sma/eval/crossdomain.py +89 -0
- sma/eval/diabetes.py +61 -0
- sma/eval/drift_env.py +26 -0
- sma/eval/drift_metrics.py +24 -0
- sma/eval/family_labels.py +167 -0
- sma/eval/fraud_elliptic/__init__.py +29 -0
- sma/eval/fraud_elliptic/encoder.py +279 -0
- sma/eval/fraud_elliptic/eval.py +269 -0
- sma/eval/fraud_elliptic/test_encoder.py +123 -0
- sma/eval/ieee_cis.py +66 -0
- sma/eval/loghub.py +16 -0
- sma/eval/loghub_eval.py +480 -0
- sma/eval/longmemeval.py +51 -0
- sma/eval/memory_backends/__init__.py +2 -0
- sma/eval/memory_backends/base.py +22 -0
- sma/eval/memory_backends/context_only.py +14 -0
- sma/eval/memory_backends/rag_notes.py +17 -0
- sma/eval/memory_backends/shared_llm.py +30 -0
- sma/eval/memory_backends/sma_memory.py +54 -0
- sma/eval/memory_backends/zep_graphiti.py +33 -0
- sma/eval/metrics.py +32 -0
- sma/eval/ontology_bench.py +219 -0
- sma/eval/report.py +573 -0
- sma/eval/ssb_eval.py +216 -0
- sma/eval/ssb_generator.py +116 -0
- sma/eval/stats.py +108 -0
- sma/eval/transfer_eval.py +844 -0
- sma/index/__init__.py +15 -0
- sma/index/ann.py +21 -0
- sma/index/content_vectors.py +60 -0
- sma/index/inverted.py +63 -0
- sma/index/macfac.py +174 -0
- sma/ir/__init__.py +22 -0
- sma/ir/canon.py +106 -0
- sma/ir/schema.py +165 -0
- sma/ir/sexpr.py +86 -0
- sma/ir/signatures.py +76 -0
- sma/match/__init__.py +20 -0
- sma/match/conflicts.py +46 -0
- sma/match/engine.py +60 -0
- sma/match/explain.py +59 -0
- sma/match/infer.py +54 -0
- sma/match/kernels.py +54 -0
- sma/match/mdl.py +30 -0
- sma/match/merge_cpsat.py +77 -0
- sma/match/merge_greedy.py +15 -0
- sma/match/mh.py +177 -0
- sma/match/ses.py +84 -0
- sma/match/types.py +115 -0
- sma/match/verifier.py +27 -0
- sma/ontology/__init__.py +45 -0
- sma/ontology/attack.py +134 -0
- sma/ontology/cpc.py +69 -0
- sma/ontology/graph.py +58 -0
- sma/ontology/loader.py +262 -0
- sma/ontology/mitre_xml.py +67 -0
- sma/ontology/mount.py +101 -0
- sma/ontology/rdf_loader.py +75 -0
- sma/ontology/registry.py +115 -0
- sma/ontology/router.py +69 -0
- sma/ontology/usgaap.py +73 -0
- sma/sage/__init__.py +6 -0
- sma/sage/assimilate.py +12 -0
- sma/sage/pools.py +105 -0
- sma/sage/probabilities.py +10 -0
- sma/store/__init__.py +6 -0
- sma/store/lmdb_store.py +78 -0
- sma/store/registry.py +26 -0
- sma/store/wal.py +26 -0
- sma/ui/app.py +642 -0
- structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
- structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
- structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
- structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
- structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
- structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
sma/eval/drift_env.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Seeded synthetic ops drift environment."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import random
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass(frozen=True)
class DriftStep:
    """One step of the synthetic drift world: the event applied and the resulting state."""

    # Zero-based index of the step within the generated sequence.
    t: int
    # Text describing the change applied at this step, formatted "<service> -> <state>".
    event: str
    # Snapshot of every service's state after the event, keyed by service name.
    ground_truth: dict[str, str]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def generate_drift_world(steps: int = 20, seed: int = 7) -> list[DriftStep]:
    """Build a reproducible timeline of service-state changes.

    A private ``random.Random(seed)`` drives the walk, so the same ``steps``
    and ``seed`` always produce the same timeline. Every service starts in
    state ``"ok"``; each step picks one service and assigns it a freshly drawn
    state (which may equal its current one).

    Returns one :class:`DriftStep` per step, each carrying an independent copy
    of the full service-state map as it stands after that step's event.
    """
    service_names = ["api", "db", "queue", "worker"]
    possible_states = ["ok", "timeout", "saturated", "restarting"]

    rng = random.Random(seed)
    current = dict.fromkeys(service_names, "ok")
    timeline: list[DriftStep] = []
    for tick in range(steps):
        # Draw order (service first, then state) determines the seeded output.
        target = rng.choice(service_names)
        current[target] = rng.choice(possible_states)
        timeline.append(
            DriftStep(
                t=tick,
                event=f"{target} -> {current[target]}",
                ground_truth=current.copy(),
            )
        )
    return timeline
|
|
26
|
+
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Standard concept-drift metrics adapted to the agent-memory setting."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def update_recovery(correctness: list[int], change_idx: int) -> int | None:
    """Sessions after the change until the memory is correct and stays correct.

    ``correctness`` holds one 0/1 outcome per session and ``change_idx`` is the
    session in which the underlying value changed (expected to be >= 0).

    Returns the offset from ``change_idx`` of the first session from which
    every remaining outcome equals 1 (0 = recovered in the change session), or
    ``None`` if the sequence never settles on 1 — including when there are no
    sessions at or after ``change_idx``.
    """
    total = len(correctness)
    # Recovery starts at the beginning of the trailing run of 1s, clipped so it
    # never starts before the change. Walking backwards once finds that point
    # in linear time instead of re-checking the whole suffix per candidate.
    stable_from = total
    while (
        stable_from > change_idx
        and stable_from > 0
        and correctness[stable_from - 1] == 1
    ):
        stable_from -= 1
    if stable_from >= total:
        return None
    return stable_from - change_idx
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def detection_delay(flags: list[bool], change_idx: int) -> int | None:
    """Offset from ``change_idx`` of the first truthy entry in ``flags``.

    Only positions from ``change_idx`` to the end of ``flags`` are examined.
    Returns 0 when the detector fires in the change session itself and
    ``None`` when it never fires in that range.
    """
    offsets = (
        position - change_idx
        for position in range(change_idx, len(flags))
        if flags[position]
    )
    return next(offsets, None)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def staleness_rate(correctness: list[int], change_idx: int) -> float:
    """Share of probes from ``change_idx`` onward whose outcome equals 0.

    Returns ``0.0`` when there are no probes at or after ``change_idx``.
    """
    tail = correctness[change_idx:]
    if not tail:
        return 0.0
    stale = 0
    for outcome in tail:
        if outcome == 0:
            stale += 1
    return stale / len(tail)
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Deterministic failure-family labels for LogHub sessions.
|
|
2
|
+
|
|
3
|
+
The binary Anomaly/Normal label is too coarse to tell whether retrieval
|
|
4
|
+
surfaced the *right kind* of failure. This module derives deterministic
|
|
5
|
+
root-cause "families" so the eval can score family-hit@k: did the top-k
|
|
6
|
+
retrieved sessions share the query's failure family, not merely its
|
|
7
|
+
binary class.
|
|
8
|
+
|
|
9
|
+
HDFS family inventory
|
|
10
|
+
---------------------
|
|
11
|
+
Derived by inspecting all 2,500 anomalous sessions in
|
|
12
|
+
``data/processed/ui_corpus_hdfs.jsonl`` (5,000 sessions, 50/50 split).
|
|
13
|
+
Distinct WARN/ERROR/exception line signatures and their session counts
|
|
14
|
+
(a session may contain several signatures; the rule order below resolves
|
|
15
|
+
co-occurrence by putting rare, specific causes before generic catch-alls):
|
|
16
|
+
|
|
17
|
+
============================== ===== ==========================================
|
|
18
|
+
signature count family assigned
|
|
19
|
+
============================== ===== ==========================================
|
|
20
|
+
BlockInfo not found in 737 DeleteBlockNotFound
|
|
21
|
+
volumeMap (delete error)
|
|
22
|
+
writeBlock ... Could not read 494 StreamReadFailure
|
|
23
|
+
from stream (IOException)
|
|
24
|
+
Got exception while serving 462 ServeBlockException
|
|
25
|
+
Redundant addStoredBlock 138 RedundantAddStoredBlock
|
|
26
|
+
Connection reset by peer 11 ConnectionReset
|
|
27
|
+
PendingReplicationMonitor 8 ReplicationMonitorTimeout
|
|
28
|
+
timed out
|
|
29
|
+
EOFException 7 EOFException
|
|
30
|
+
Interrupt family (Interrupted- 8 Interrupt
|
|
31
|
+
IOException, ClosedByInterrupt-
|
|
32
|
+
Exception, Interrupted receive)
|
|
33
|
+
SocketTimeoutException 3 SocketTimeout
|
|
34
|
+
Broken pipe 2 BrokenPipe
|
|
35
|
+
No route to host 1 NoRouteToHost
|
|
36
|
+
(no failure line at all) 832 other_anomaly (sequence-shape anomalies)
|
|
37
|
+
============================== ===== ==========================================
|
|
38
|
+
|
|
39
|
+
Note: a bare ``java\\.[...](\\w+Exception|\\w+Error)`` regex labels 498 of
|
|
40
|
+
these sessions "IOException", which is a useless catch-all (it covers
|
|
41
|
+
ConnectionReset, BrokenPipe, StreamReadFailure, ...). The rules below
|
|
42
|
+
therefore split IOException by its message and only fall back to the raw
|
|
43
|
+
exception class for classes not already covered.
|
|
44
|
+
|
|
45
|
+
BGL families come from the alert-category column of the raw BGL.log
|
|
46
|
+
(first whitespace token, e.g. KERNDTLB, APPSEV). That column is stripped
|
|
47
|
+
from session *text* by the sampler (label-leak fix), so ground-truth
|
|
48
|
+
families must be read from the raw log, keyed by the sampler's
|
|
49
|
+
``bgl_<node>_<window>`` scheme.
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
from __future__ import annotations
|
|
53
|
+
|
|
54
|
+
import pathlib
|
|
55
|
+
import re
|
|
56
|
+
import zipfile
|
|
57
|
+
from collections import Counter, defaultdict
|
|
58
|
+
|
|
59
|
+
# Fallback pattern for a Java exception/error class name: matches "java."
# followed by a dotted package path and captures the final class name ending
# in "Exception" or "Error" (e.g. "EOFException" from "java.io.EOFException").
_JAVA_EXC_RE = re.compile(r"java\.[a-zA-Z.]*\b(\w+Exception|\w+Error)")

# Ordered (family, needles) rules. hdfs_family walks them top to bottom and
# returns the family of the first rule with any needle found in the session
# text, so ORDER IS SIGNIFICANT. Needles are plain, case-sensitive substrings.
# Rare, specific root causes come first and broad catch-alls last, so a
# session showing "Connection reset by peer" inside an IOException is
# labelled ConnectionReset rather than the generic class name.
_HDFS_RULES: tuple[tuple[str, tuple[str, ...]], ...] = (
    ("ConnectionReset", ("Connection reset",)),
    ("BrokenPipe", ("Broken pipe",)),
    ("NoRouteToHost", ("No route to host", "NoRouteToHostException")),
    ("ReplicationMonitorTimeout", ("PendingReplicationMonitor timed out",)),
    ("SocketTimeout", ("SocketTimeoutException", "millis timeout")),
    ("EOFException", ("EOFException",)),
    (
        "Interrupt",
        (
            "InterruptedIOException",
            "ClosedByInterruptException",
            "Interrupted receiveBlock",
            "interrupt",
        ),
    ),
    ("Checksum", ("checksum", "Checksum")),
    ("StreamReadFailure", ("Could not read from stream",)),
    ("DeleteBlockNotFound", ("BlockInfo not found in volumeMap",)),
    ("ServeBlockException", ("Got exception while serving",)),
    ("RedundantAddStoredBlock", ("Redundant addStoredBlock",)),
)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def hdfs_family(session_text: str, label: str = "Anomaly") -> str:
    """Assign a deterministic failure family to one HDFS block session.

    Resolution order:

    1. ``label == "Normal"`` short-circuits to ``"normal"`` without looking at
       the text (family metrics only score anomalies, and normal sessions may
       still contain benign warning lines).
    2. The first entry of ``_HDFS_RULES`` with any needle occurring as a
       substring of ``session_text`` supplies the family.
    3. Otherwise the first ``java.*Exception`` / ``java.*Error`` class name
       found by ``_JAVA_EXC_RE`` is returned as the family.
    4. Otherwise ``"other_anomaly"`` — an anomalous session with no failure
       text at all.
    """
    if label == "Normal":
        return "normal"

    explicit = next(
        (
            family
            for family, needles in _HDFS_RULES
            if any(needle in session_text for needle in needles)
        ),
        None,
    )
    if explicit is not None:
        return explicit

    generic = _JAVA_EXC_RE.search(session_text)
    return generic.group(1) if generic else "other_anomaly"
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def bgl_family(
    zip_path: pathlib.Path | str, session_keys: set[str] | list[str]
) -> dict[str, str]:
    """Derive BGL session families from the alert-category column of the raw log.

    The ``BGL.log`` member of the archive at ``zip_path`` is streamed line by
    line. A line is used only if it splits (``split(maxsplit=5)``) into at
    least five whitespace-separated fields and its second field parses as an
    integer timestamp. Its session key is ``bgl_<node>_<window>``, where
    ``<node>`` is the fourth field and ``window = timestamp // 60``. The first
    field is the alert category, with ``-`` meaning "no alert".

    For each requested key that has at least one usable line, the family is
    the most frequent category other than ``-`` (ties resolved alphabetically),
    or ``"normal"`` when every line carried ``-``.

    Returns a dict covering only the requested keys that actually appeared in
    the log; keys with no matching lines are left out.
    """
    requested = set(session_keys)
    per_session: dict[str, Counter] = defaultdict(Counter)

    with zipfile.ZipFile(zip_path, "r") as archive, archive.open("BGL.log") as raw:
        for raw_line in raw:
            fields = raw_line.decode("utf-8", errors="ignore").split(maxsplit=5)
            if len(fields) < 5:
                continue
            try:
                minute = int(fields[1]) // 60
            except ValueError:
                # Non-numeric timestamp column: not a parseable log record.
                continue
            key = f"bgl_{fields[3]}_{minute}"
            if key not in requested:
                continue
            per_session[key][fields[0]] += 1

    result: dict[str, str] = {}
    for key, tally in per_session.items():
        # Rank real alert categories by descending count, then by name.
        ranked = sorted(
            ((category, hits) for category, hits in tally.items() if category != "-"),
            key=lambda pair: (-pair[1], pair[0]),
        )
        result[key] = ranked[0][0] if ranked else "normal"
    return result
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Structural-fraud arm: graph-neighbourhood encoding of the Elliptic Bitcoin
|
|
2
|
+
transaction graph for retrieval-by-analogy illicit detection.
|
|
3
|
+
|
|
4
|
+
The flat-tabular finance null (4b) showed SMA has no edge when each record is
|
|
5
|
+
encoded independently: there is no cross-record structure to map. Elliptic is a
|
|
6
|
+
*graph* (≈203k transaction nodes, 166 features, plus a directed bitcoin-flow
|
|
7
|
+
edgelist), so it carries the predecessor/successor topology a single flat row
|
|
8
|
+
lacks. This module encodes each transaction's local neighbourhood as a case of
|
|
9
|
+
**higher-order relations** over a licit/illicit *typology lattice* — fan-in/out
|
|
10
|
+
degree class, in/out value tier, temporal step, and (leak-guarded) neighbour
|
|
11
|
+
label context wired by ``flowsFrom``/``flowsTo`` — so SMA can structure-map an
|
|
12
|
+
illicit-pattern analog where flat/vector methods see only an isolated vector.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from sma.eval.fraud_elliptic.encoder import (
|
|
18
|
+
EllipticGraph,
|
|
19
|
+
NeighbourhoodEncoder,
|
|
20
|
+
build_typology,
|
|
21
|
+
load_elliptic,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"EllipticGraph",
|
|
26
|
+
"NeighbourhoodEncoder",
|
|
27
|
+
"build_typology",
|
|
28
|
+
"load_elliptic",
|
|
29
|
+
]
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
"""Graph-neighbourhood encoder + licit/illicit typology lattice for Elliptic.
|
|
2
|
+
|
|
3
|
+
The Elliptic dataset ships three files:
|
|
4
|
+
|
|
5
|
+
* ``elliptic_txs_features.csv`` — no header; col 0 = txId, col 1 = time-step
|
|
6
|
+
(1..49), cols 2..167 = 166 anonymized features (the first ~93 are local tx
|
|
7
|
+
features, the rest are aggregations of one-hop neighbour features);
|
|
8
|
+
* ``elliptic_txs_classes.csv`` — ``txId,class`` with class in {1 (illicit),
|
|
9
|
+
2 (licit), unknown};
|
|
10
|
+
* ``elliptic_txs_edgelist.csv`` — ``txId1,txId2`` directed bitcoin flows.
|
|
11
|
+
|
|
12
|
+
The encoder turns one transaction into a *case over a typology of
|
|
13
|
+
graph-neighbourhood descriptors*, NOT the 166 flat features:
|
|
14
|
+
|
|
15
|
+
- ``fanIn_*`` / ``fanOut_*`` — predecessor / successor degree class;
|
|
16
|
+
- ``inVal_*`` / ``outVal_*`` — value tier of incoming / outgoing flow,
|
|
17
|
+
read from the local value feature aggregated over neighbours;
|
|
18
|
+
- ``temp_*`` — temporal-step bucket;
|
|
19
|
+
- ``nbrIllicit_*`` / ``nbrLicit_*`` — neighbour *label context*: how many
|
|
20
|
+
predecessors / successors are known-illicit / known-licit. **Label-leak
|
|
21
|
+
guard:** a node's OWN class is never emitted, and neighbour labels are
|
|
22
|
+
only those visible in the indexed (train) split passed to the encoder.
|
|
23
|
+
|
|
24
|
+
These descriptor terms hang off an is-a typology lattice (e.g. ``fanOut_many``
|
|
25
|
+
is_a ``fanOut_any`` is_a ``flowTopology``; ``nbrIllicit_many`` is_a
|
|
26
|
+
``illicitContext`` is_a ``neighbourContext``), so SMA can ascend a too-specific
|
|
27
|
+
observation to a shared ancestor when structure-mapping. Higher-order
|
|
28
|
+
``flowsFrom`` / ``flowsTo`` relations wire the node's own topology descriptor to
|
|
29
|
+
its neighbour-context descriptor, giving the cross-record structure flat-tabular
|
|
30
|
+
encodings discard.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
from __future__ import annotations
|
|
34
|
+
|
|
35
|
+
import csv
|
|
36
|
+
import pathlib
|
|
37
|
+
from dataclasses import dataclass, field
|
|
38
|
+
|
|
39
|
+
from sma.ontology.graph import OntologyGraph, Term
|
|
40
|
+
|
|
41
|
+
# Elliptic class codes (string, as they appear in the CSV).
|
|
42
|
+
ILLICIT, LICIT, UNKNOWN = "1", "2", "unknown"
|
|
43
|
+
|
|
44
|
+
# Feature-column layout in elliptic_txs_features.csv (0-based over the CSV row).
|
|
45
|
+
COL_TXID = 0
|
|
46
|
+
COL_TIME = 1
|
|
47
|
+
# Local features start at col 2. Two anonymized local features used as proxy
|
|
48
|
+
# value channels (the dataset is anonymized; these are stable per-tx scalars).
|
|
49
|
+
COL_LOCAL_VALUE = 2 # first local feature — proxy for a transaction-value channel
|
|
50
|
+
COL_AGG_VALUE = 95 # first aggregated (neighbour) feature — neighbour-value proxy
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _tier(value: float, lo: float, hi: float) -> str:
    """Classify ``value`` against two cut points.

    Returns ``"low"`` when ``value <= lo``, otherwise ``"high"`` when
    ``value >= hi``, otherwise ``"mid"``. The low check takes precedence.
    """
    return "low" if value <= lo else ("high" if value >= hi else "mid")
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _degree_class(deg: int) -> str:
    """Map a node degree to a bucket name.

    0 -> ``"none"``, 1 -> ``"one"``, anything above 4 -> ``"many"``, and every
    other value -> ``"few"``.
    """
    if deg > 4:
        return "many"
    if deg == 0:
        return "none"
    return "one" if deg == 1 else "few"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _count_class(count: int) -> str:
    """Map a neighbour-label count to a bucket name.

    0 -> ``"none"``, anything above 2 -> ``"many"``, every other value ->
    ``"some"``.
    """
    if count > 2:
        return "many"
    return "none" if count == 0 else "some"
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass
class EllipticGraph:
    """Container for the Elliptic transaction graph held fully in memory.

    All mappings are keyed by txId (a string).
    """

    # txId -> integer time-step.
    time_step: dict[str, int]
    # txId -> class code: "1", "2" or "unknown".
    label: dict[str, str]
    # txId -> full feature row, with the time-step at index 0.
    feats: dict[str, list[float]]
    # txId -> txIds with an edge into this transaction.
    preds: dict[str, list[str]] = field(default_factory=dict)
    # txId -> txIds this transaction has an edge to.
    succs: dict[str, list[str]] = field(default_factory=dict)

    def labelled_ids(self) -> list[str]:
        """Return, in sorted order, the txIds whose class is ILLICIT or LICIT."""
        known = [
            txid
            for txid, cls in self.label.items()
            if cls == ILLICIT or cls == LICIT
        ]
        known.sort()
        return known
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def load_elliptic(data_dir: str) -> EllipticGraph:
    """Load the three Elliptic CSVs from ``data_dir`` into an :class:`EllipticGraph`.

    Files read (all as UTF-8):

    * ``elliptic_txs_features.csv`` — read without skipping a header; column 0
      is the txId and every remaining column is parsed as a float. The first
      of those floats is rounded to give the time-step.
    * ``elliptic_txs_classes.csv`` — first row skipped as a header; columns
      ``txId, class``.
    * ``elliptic_txs_edgelist.csv`` — first row skipped as a header; columns
      ``txId1, txId2``, read as a directed edge ``txId1 -> txId2``.

    Rows too short to carry the required columns are skipped in all three
    files. Edges are kept only when both endpoints appear in the features
    file, so every txId in ``feats`` has a (possibly empty) entry in both
    ``preds`` and ``succs``.

    Raises:
        OSError: if any of the three files cannot be opened.
        ValueError: if a feature value is not parseable as a float.
    """
    root = pathlib.Path(data_dir)
    feats_p = root / "elliptic_txs_features.csv"
    classes_p = root / "elliptic_txs_classes.csv"
    edges_p = root / "elliptic_txs_edgelist.csv"

    feats: dict[str, list[float]] = {}
    time_step: dict[str, int] = {}
    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects passed to csv.reader.
    with feats_p.open(encoding="utf-8", newline="") as fh:
        for row in csv.reader(fh):
            # A usable row needs the txId plus at least the time-step column;
            # skipping shorter rows mirrors the class and edge readers below.
            if len(row) < 2:
                continue
            txid = row[COL_TXID]
            vals = [float(x) for x in row[1:]]  # idx 0 = time-step, then the features
            feats[txid] = vals
            time_step[txid] = int(round(vals[0]))

    label: dict[str, str] = {}
    with classes_p.open(encoding="utf-8", newline="") as fh:
        rd = csv.reader(fh)
        next(rd, None)  # header: txId,class
        for row in rd:
            if len(row) >= 2:
                label[row[0]] = row[1]

    preds: dict[str, list[str]] = {t: [] for t in feats}
    succs: dict[str, list[str]] = {t: [] for t in feats}
    with edges_p.open(encoding="utf-8", newline="") as fh:
        rd = csv.reader(fh)
        next(rd, None)  # header: txId1,txId2
        for row in rd:
            if len(row) < 2:
                continue
            src, dst = row[0], row[1]
            # Drop edges touching a transaction that has no feature row.
            if src in succs and dst in preds:
                succs[src].append(dst)  # src -> dst : src flows to dst
                preds[dst].append(src)  # dst has predecessor src

    return EllipticGraph(
        time_step=time_step, label=label, feats=feats, preds=preds, succs=succs
    )
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
# Typology vocabulary -------------------------------------------------------
|
|
140
|
+
# Each tuple is (term_id, parent_id). Roots have parent "".
|
|
141
|
+
_DEGREE_BUCKETS = ("none", "one", "few", "many")
|
|
142
|
+
_TIERS = ("low", "mid", "high")
|
|
143
|
+
_TEMP_BUCKETS = ("early", "mid", "late")
|
|
144
|
+
_COUNT_BUCKETS = ("none", "some", "many")
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def build_typology(name: str = "elliptic_typology") -> OntologyGraph:
    """Assemble the is-a lattice of neighbourhood descriptor terms.

    Terms created, with their final parents:

    * six parentless roots: ``flowTopology``, ``valueProfile``,
      ``temporalProfile``, ``neighbourContext``, ``illicitTypology``,
      ``licitTypology``;
    * ``fanIn_any`` / ``fanOut_any`` under ``flowTopology``, and one
      ``fanIn_<b>`` / ``fanOut_<b>`` per degree bucket under them.
      ``fanOut_many`` and ``fanIn_many`` additionally sit under
      ``illicitTypology``; ``fanOut_one`` additionally under ``licitTypology``;
    * ``inVal_any`` / ``outVal_any`` under ``valueProfile`` with one term per
      value tier;
    * ``temp_any`` under ``temporalProfile`` with one term per temporal bucket;
    * ``illicitContext`` (parents ``neighbourContext``, ``illicitTypology``)
      and ``licitContext`` (parents ``neighbourContext``, ``licitTypology``),
      with ``nbrIllicit_<c>`` / ``nbrLicit_<c>`` per count bucket beneath them.
      The two ``*_none`` terms are re-parented directly to
      ``neighbourContext`` because an absence of labelled neighbours is not a
      cue for either pole.

    Each ``fanOut_<b>`` term also gets a ``flowsTo`` relation, and each
    ``fanIn_<b>`` term a ``flowsFrom`` relation, to every ``nbrIllicit_*`` and
    ``nbrLicit_*`` term.

    Each term's display name is its id with underscores replaced by spaces.
    Returns ``OntologyGraph(name=name, terms=...)``.
    """
    registry: dict[str, Term] = {}

    def define(tid: str, parent: str, nm: str = "") -> None:
        # An empty parent id marks a root term.
        parents = (parent,) if parent else ()
        display = nm or tid.replace("_", " ")
        registry[tid] = Term(id=tid, name=display, parents=parents)

    def reparent(tid: str, *parents: str) -> None:
        # Replace the parents of an already-defined term.
        registry[tid].parents = parents

    def link(src: str, rel: str, dsts: tuple[str, ...]) -> None:
        # Append one (rel, dst) relation per destination to the source term.
        registry[src].relations = registry[src].relations + tuple(
            (rel, dst) for dst in dsts
        )

    # Roots, including the two licit/illicit typology poles.
    for root in (
        "flowTopology",
        "valueProfile",
        "temporalProfile",
        "neighbourContext",
        "illicitTypology",
        "licitTypology",
    ):
        define(root, "")

    # Fan-in / fan-out degree classes.
    define("fanIn_any", "flowTopology")
    define("fanOut_any", "flowTopology")
    for bucket in _DEGREE_BUCKETS:
        define(f"fanIn_{bucket}", "fanIn_any")
        define(f"fanOut_{bucket}", "fanOut_any")
    # Many-way fan-out / fan-in are treated as illicit cues, single fan-out as licit.
    reparent("fanOut_many", "fanOut_any", "illicitTypology")
    reparent("fanIn_many", "fanIn_any", "illicitTypology")
    reparent("fanOut_one", "fanOut_any", "licitTypology")

    # Value tiers for incoming / outgoing flow.
    define("inVal_any", "valueProfile")
    define("outVal_any", "valueProfile")
    for tier in _TIERS:
        define(f"inVal_{tier}", "inVal_any")
        define(f"outVal_{tier}", "outVal_any")

    # Temporal buckets.
    define("temp_any", "temporalProfile")
    for bucket in _TEMP_BUCKETS:
        define(f"temp_{bucket}", "temp_any")

    # Neighbour label context, rolled up into the matching typology pole.
    define("illicitContext", "neighbourContext")
    define("licitContext", "neighbourContext")
    reparent("illicitContext", "neighbourContext", "illicitTypology")
    reparent("licitContext", "neighbourContext", "licitTypology")
    for bucket in _COUNT_BUCKETS:
        define(f"nbrIllicit_{bucket}", "illicitContext")
        define(f"nbrLicit_{bucket}", "licitContext")
    # "No such neighbours" is not itself a cue: attach straight to the context root.
    reparent("nbrIllicit_none", "neighbourContext")
    reparent("nbrLicit_none", "neighbourContext")

    # Higher-order relations from the node's own degree term to its
    # neighbour-context terms. NOTE(review): the original author's note says
    # the case builder only materializes a relation when both endpoint terms
    # occur on the same transaction — that code is not visible here; confirm.
    context_terms = tuple(f"nbrIllicit_{c}" for c in _COUNT_BUCKETS) + tuple(
        f"nbrLicit_{c}" for c in _COUNT_BUCKETS
    )
    for bucket in _DEGREE_BUCKETS:
        link(f"fanOut_{bucket}", "flowsTo", context_terms)
        link(f"fanIn_{bucket}", "flowsFrom", context_terms)

    return OntologyGraph(name=name, terms=registry)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
@dataclass
class NeighbourhoodEncoder:
    """Turn one transaction's local graph neighbourhood into typology term ids.

    ``graph`` supplies edges, feature rows and time-steps. ``visible_labels``
    is the only label map consulted for neighbour context, so passing just the
    train/index split's labels keeps held-out labels out of the encoding; the
    encoded transaction's own label is never looked up. ``lo`` / ``hi`` are
    the cut points used to tier the two value channels.
    """

    graph: EllipticGraph
    visible_labels: dict[str, str]
    lo: float = -0.3
    hi: float = 0.3

    def encode(self, txid: str) -> list[str]:
        """Return the sorted, de-duplicated descriptor term ids for ``txid``.

        Emits one term each for fan-in class, fan-out class, incoming value
        tier, outgoing value tier, temporal bucket, illicit-neighbour count
        class and licit-neighbour count class. Unknown txIds fall back to no
        neighbours, value 0.0 on both channels and time-step 1.
        """
        graph = self.graph
        incoming = graph.preds.get(txid, [])
        outgoing = graph.succs.get(txid, [])

        # Stored rows start at the time-step, i.e. one column to the left of
        # the CSV layout the COL_* constants describe — hence the "- 1".
        row = graph.feats.get(txid, [])
        agg_idx = COL_AGG_VALUE - 1
        local_idx = COL_LOCAL_VALUE - 1
        in_val = row[agg_idx] if agg_idx < len(row) else 0.0
        out_val = row[local_idx] if local_idx < len(row) else 0.0

        step = graph.time_step.get(txid, 1)
        if step > 33:
            temporal = "temp_late"
        elif step > 16:
            temporal = "temp_mid"
        else:
            temporal = "temp_early"

        # Neighbour label context: read visible_labels only, and ignore self-loops.
        illicit_count = 0
        licit_count = 0
        for neighbour in incoming + outgoing:
            if neighbour == txid:
                continue
            seen = self.visible_labels.get(neighbour)
            if seen == ILLICIT:
                illicit_count += 1
            elif seen == LICIT:
                licit_count += 1

        descriptors = {
            f"fanIn_{_degree_class(len(incoming))}",
            f"fanOut_{_degree_class(len(outgoing))}",
            f"inVal_{_tier(in_val, self.lo, self.hi)}",
            f"outVal_{_tier(out_val, self.lo, self.hi)}",
            temporal,
            f"nbrIllicit_{_count_class(illicit_count)}",
            f"nbrLicit_{_count_class(licit_count)}",
        }
        return sorted(descriptors)
|