structuremappingmemory 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. sma/__init__.py +5 -0
  2. sma/__main__.py +5 -0
  3. sma/agent/__init__.py +5 -0
  4. sma/agent/adapter_draft.py +217 -0
  5. sma/agent/api.py +67 -0
  6. sma/agent/comparison.py +591 -0
  7. sma/agent/llm.py +280 -0
  8. sma/agent/policies.py +21 -0
  9. sma/agent/service.py +95 -0
  10. sma/cli.py +65 -0
  11. sma/encoders/__init__.py +38 -0
  12. sma/encoders/agentobs.py +27 -0
  13. sma/encoders/base.py +23 -0
  14. sma/encoders/code_treesitter.py +64 -0
  15. sma/encoders/coverage.py +80 -0
  16. sma/encoders/draft_adapter.py +183 -0
  17. sma/encoders/healthcare.py +207 -0
  18. sma/encoders/logs_drain.py +142 -0
  19. sma/encoders/prose_tier1.py +57 -0
  20. sma/encoders/structured.py +57 -0
  21. sma/encoders/traces.py +45 -0
  22. sma/eval/__init__.py +2 -0
  23. sma/eval/agentic/__init__.py +35 -0
  24. sma/eval/agentic/arms/__init__.py +0 -0
  25. sma/eval/agentic/arms/cyber.py +48 -0
  26. sma/eval/agentic/arms/discovery.py +35 -0
  27. sma/eval/agentic/arms/finance.py +38 -0
  28. sma/eval/agentic/arms/legal.py +74 -0
  29. sma/eval/agentic/arms/medicine.py +45 -0
  30. sma/eval/agentic/harness.py +275 -0
  31. sma/eval/agentic/memories.py +308 -0
  32. sma/eval/agentic/metrics.py +82 -0
  33. sma/eval/agentic_qa/__init__.py +27 -0
  34. sma/eval/agentic_qa/agent.py +383 -0
  35. sma/eval/agentic_qa/metrics.py +239 -0
  36. sma/eval/agentic_qa/pools.py +197 -0
  37. sma/eval/arn.py +65 -0
  38. sma/eval/baselines/__init__.py +6 -0
  39. sma/eval/baselines/bge_dense.py +54 -0
  40. sma/eval/baselines/bm25.py +18 -0
  41. sma/eval/baselines/dense.py +42 -0
  42. sma/eval/baselines/hipporag.py +235 -0
  43. sma/eval/baselines/hybrid_rrf.py +30 -0
  44. sma/eval/baselines/longcontext_llm.py +124 -0
  45. sma/eval/baselines/rerank.py +41 -0
  46. sma/eval/baselines/splade.py +77 -0
  47. sma/eval/baselines/wl_kernel.py +163 -0
  48. sma/eval/bugsinpy.py +358 -0
  49. sma/eval/bugsinpy_families.py +164 -0
  50. sma/eval/crossdomain.py +89 -0
  51. sma/eval/diabetes.py +61 -0
  52. sma/eval/drift_env.py +26 -0
  53. sma/eval/drift_metrics.py +24 -0
  54. sma/eval/family_labels.py +167 -0
  55. sma/eval/fraud_elliptic/__init__.py +29 -0
  56. sma/eval/fraud_elliptic/encoder.py +279 -0
  57. sma/eval/fraud_elliptic/eval.py +269 -0
  58. sma/eval/fraud_elliptic/test_encoder.py +123 -0
  59. sma/eval/ieee_cis.py +66 -0
  60. sma/eval/loghub.py +16 -0
  61. sma/eval/loghub_eval.py +480 -0
  62. sma/eval/longmemeval.py +51 -0
  63. sma/eval/memory_backends/__init__.py +2 -0
  64. sma/eval/memory_backends/base.py +22 -0
  65. sma/eval/memory_backends/context_only.py +14 -0
  66. sma/eval/memory_backends/rag_notes.py +17 -0
  67. sma/eval/memory_backends/shared_llm.py +30 -0
  68. sma/eval/memory_backends/sma_memory.py +54 -0
  69. sma/eval/memory_backends/zep_graphiti.py +33 -0
  70. sma/eval/metrics.py +32 -0
  71. sma/eval/ontology_bench.py +219 -0
  72. sma/eval/report.py +573 -0
  73. sma/eval/ssb_eval.py +216 -0
  74. sma/eval/ssb_generator.py +116 -0
  75. sma/eval/stats.py +108 -0
  76. sma/eval/transfer_eval.py +844 -0
  77. sma/index/__init__.py +15 -0
  78. sma/index/ann.py +21 -0
  79. sma/index/content_vectors.py +60 -0
  80. sma/index/inverted.py +63 -0
  81. sma/index/macfac.py +174 -0
  82. sma/ir/__init__.py +22 -0
  83. sma/ir/canon.py +106 -0
  84. sma/ir/schema.py +165 -0
  85. sma/ir/sexpr.py +86 -0
  86. sma/ir/signatures.py +76 -0
  87. sma/match/__init__.py +20 -0
  88. sma/match/conflicts.py +46 -0
  89. sma/match/engine.py +60 -0
  90. sma/match/explain.py +59 -0
  91. sma/match/infer.py +54 -0
  92. sma/match/kernels.py +54 -0
  93. sma/match/mdl.py +30 -0
  94. sma/match/merge_cpsat.py +77 -0
  95. sma/match/merge_greedy.py +15 -0
  96. sma/match/mh.py +177 -0
  97. sma/match/ses.py +84 -0
  98. sma/match/types.py +115 -0
  99. sma/match/verifier.py +27 -0
  100. sma/ontology/__init__.py +45 -0
  101. sma/ontology/attack.py +134 -0
  102. sma/ontology/cpc.py +69 -0
  103. sma/ontology/graph.py +58 -0
  104. sma/ontology/loader.py +262 -0
  105. sma/ontology/mitre_xml.py +67 -0
  106. sma/ontology/mount.py +101 -0
  107. sma/ontology/rdf_loader.py +75 -0
  108. sma/ontology/registry.py +115 -0
  109. sma/ontology/router.py +69 -0
  110. sma/ontology/usgaap.py +73 -0
  111. sma/sage/__init__.py +6 -0
  112. sma/sage/assimilate.py +12 -0
  113. sma/sage/pools.py +105 -0
  114. sma/sage/probabilities.py +10 -0
  115. sma/store/__init__.py +6 -0
  116. sma/store/lmdb_store.py +78 -0
  117. sma/store/registry.py +26 -0
  118. sma/store/wal.py +26 -0
  119. sma/ui/app.py +642 -0
  120. structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
  121. structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
  122. structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
  123. structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
  124. structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
  125. structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
sma/eval/drift_env.py ADDED
@@ -0,0 +1,26 @@
1
+ """Seeded synthetic ops drift environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import random
6
+ from dataclasses import dataclass
7
+
8
+
9
@dataclass(frozen=True)
class DriftStep:
    """One step of the synthetic ops drift world.

    The dataclass is frozen, so attributes cannot be reassigned after
    construction (the ``ground_truth`` dict object itself remains mutable).
    """

    # Zero-based index of the step, as assigned by generate_drift_world.
    t: int
    # Description of the change; generate_drift_world formats it "<service> -> <state>".
    event: str
    # Snapshot of every service's state after this step's change was applied.
    ground_truth: dict[str, str]
14
+
15
+
16
def generate_drift_world(steps: int = 20, seed: int = 7) -> list[DriftStep]:
    """Generate a reproducible sequence of service-state changes.

    Four services ("api", "db", "queue", "worker") all start in state "ok".
    Every step draws one service and one new state from a private
    ``random.Random(seed)`` and records the event together with a copy of the
    full service-state map.

    Args:
        steps: Number of steps to generate; zero or a negative value yields ``[]``.
        seed: Seed for the private random generator.

    Returns:
        One ``DriftStep`` per step, in order, with ``t`` counting up from 0.
    """
    service_names = ["api", "db", "queue", "worker"]
    possible_states = ["ok", "timeout", "saturated", "restarting"]
    rand = random.Random(seed)
    current = dict.fromkeys(service_names, "ok")
    timeline: list[DriftStep] = []
    for tick in range(steps):
        # Draw order matters for reproducibility: service first, then state.
        target = rand.choice(service_names)
        new_state = rand.choice(possible_states)
        current[target] = new_state
        timeline.append(
            DriftStep(t=tick, event=f"{target} -> {new_state}", ground_truth=current.copy())
        )
    return timeline
26
+
@@ -0,0 +1,24 @@
1
+ """Standard concept-drift metrics adapted to the agent-memory setting."""
2
+ from __future__ import annotations
3
+
4
+
5
def update_recovery(correctness: list[int], change_idx: int) -> int | None:
    """Sessions after the change until the memory returns the NEW value and keeps it.

    A session counts as recovered when its entry is 1 and every later entry
    is 1 as well.

    Args:
        correctness: Per-session scores; an entry equal to 1 means the probe
            returned the new value.
        change_idx: Index of the session in which the change happened. A
            negative value is interpreted like a slice start (counted from the
            end and clamped to 0), matching ``correctness[change_idx:]``.

    Returns:
        The number of sessions between the change and the first session of the
        final all-ones run (0 = recovered in the change session), or ``None``
        if the memory never recovers or there are no sessions at/after the
        change.
    """
    total = len(correctness)
    # Normalise a negative index the way slicing does, so an out-of-range
    # value cannot wrap around and raise IndexError.
    start_idx = max(total + change_idx, 0) if change_idx < 0 else change_idx

    # Walk backwards over the trailing run of 1s once (linear time) instead of
    # re-checking the whole tail for every candidate session.
    first_stable = total
    while first_stable > start_idx and correctness[first_stable - 1] == 1:
        first_stable -= 1

    if first_stable >= total:
        # No trailing run of 1s at or after the change.
        return None
    return first_stable - start_idx
11
+
12
+
13
def detection_delay(flags: list[bool], change_idx: int) -> int | None:
    """Return how many sessions after the change the detector first fires.

    Args:
        flags: Per-session detector output; a truthy entry means it fired.
        change_idx: Index of the session in which the change happened.

    Returns:
        Offset (0 = fired in the change session) of the first truthy flag at
        or after ``change_idx``, or ``None`` if the detector never fires.
    """
    firing_offsets = (
        pos - change_idx for pos in range(change_idx, len(flags)) if flags[pos]
    )
    return next(firing_offsets, None)
19
+
20
+
21
def staleness_rate(correctness: list[int], change_idx: int) -> float:
    """Return the fraction of post-change probes that still returned the OLD value.

    Args:
        correctness: Per-session scores; an entry equal to 0 counts as stale.
        change_idx: Index of the session in which the change happened.

    Returns:
        Share of entries equal to 0 in ``correctness[change_idx:]``, or 0.0
        when that slice is empty.
    """
    tail = correctness[change_idx:]
    if not tail:
        return 0.0
    return tail.count(0) / len(tail)
@@ -0,0 +1,167 @@
1
+ """Deterministic failure-family labels for LogHub sessions.
2
+
3
+ The binary Anomaly/Normal label is too coarse to tell whether retrieval
4
+ surfaced the *right kind* of failure. This module derives deterministic
5
+ root-cause "families" so the eval can score family-hit@k: did the top-k
6
+ retrieved sessions share the query's failure family, not merely its
7
+ binary class.
8
+
9
+ HDFS family inventory
10
+ ---------------------
11
+ Derived by inspecting all 2,500 anomalous sessions in
12
+ ``data/processed/ui_corpus_hdfs.jsonl`` (5,000 sessions, 50/50 split).
13
+ Distinct WARN/ERROR/exception line signatures and their session counts
14
+ (a session may contain several signatures; the rule order below resolves
15
+ co-occurrence by putting rare, specific causes before generic catch-alls):
16
+
17
+ ============================== ===== ==========================================
18
+ signature count family assigned
19
+ ============================== ===== ==========================================
20
+ BlockInfo not found in 737 DeleteBlockNotFound
21
+ volumeMap (delete error)
22
+ writeBlock ... Could not read 494 StreamReadFailure
23
+ from stream (IOException)
24
+ Got exception while serving 462 ServeBlockException
25
+ Redundant addStoredBlock 138 RedundantAddStoredBlock
26
+ Connection reset by peer 11 ConnectionReset
27
+ PendingReplicationMonitor 8 ReplicationMonitorTimeout
28
+ timed out
29
+ EOFException 7 EOFException
30
+ Interrupt family (Interrupted- 8 Interrupt
31
+ IOException, ClosedByInterrupt-
32
+ Exception, Interrupted receive)
33
+ SocketTimeoutException 3 SocketTimeout
34
+ Broken pipe 2 BrokenPipe
35
+ No route to host 1 NoRouteToHost
36
+ (no failure line at all) 832 other_anomaly (sequence-shape anomalies)
37
+ ============================== ===== ==========================================
38
+
39
+ Note: a bare ``java\\.[...](\\w+Exception|\\w+Error)`` regex labels 498 of
40
+ these sessions "IOException", which is a useless catch-all (it covers
41
+ ConnectionReset, BrokenPipe, StreamReadFailure, ...). The rules below
42
+ therefore split IOException by its message and only fall back to the raw
43
+ exception class for classes not already covered.
44
+
45
+ BGL families come from the alert-category column of the raw BGL.log
46
+ (first whitespace token, e.g. KERNDTLB, APPSEV). That column is stripped
47
+ from session *text* by the sampler (label-leak fix), so ground-truth
48
+ families must be read from the raw log, keyed by the sampler's
49
+ ``bgl_<node>_<window>`` scheme.
50
+ """
51
+
52
+ from __future__ import annotations
53
+
54
+ import pathlib
55
+ import re
56
+ import zipfile
57
+ from collections import Counter, defaultdict
58
+
59
# Generic Java exception/error class fallback, e.g. "EOFException".
# Group 1 captures the bare class name without its ``java.`` package prefix.
_JAVA_EXC_RE = re.compile(r"java\.[a-zA-Z.]*\b(\w+Exception|\w+Error)")

# Ordered (family, predicate-substring(s)) rules. First match wins.
# Rare, specific root causes come first; broad catch-alls last, so a
# session showing "Connection reset by peer" inside an IOException is
# labelled ConnectionReset rather than the generic class name.
# hdfs_family tests each needle with a plain, case-sensitive ``in`` check
# against the whole session text; a family matches if ANY of its needles occurs.
_HDFS_RULES: tuple[tuple[str, tuple[str, ...]], ...] = (
    ("ConnectionReset", ("Connection reset",)),
    ("BrokenPipe", ("Broken pipe",)),
    ("NoRouteToHost", ("No route to host", "NoRouteToHostException")),
    ("ReplicationMonitorTimeout", ("PendingReplicationMonitor timed out",)),
    ("SocketTimeout", ("SocketTimeoutException", "millis timeout")),
    ("EOFException", ("EOFException",)),
    (
        "Interrupt",
        (
            "InterruptedIOException",
            "ClosedByInterruptException",
            "Interrupted receiveBlock",
            "interrupt",
        ),
    ),
    ("Checksum", ("checksum", "Checksum")),
    ("StreamReadFailure", ("Could not read from stream",)),
    ("DeleteBlockNotFound", ("BlockInfo not found in volumeMap",)),
    ("ServeBlockException", ("Got exception while serving",)),
    ("RedundantAddStoredBlock", ("Redundant addStoredBlock",)),
)
88
+
89
+
90
def hdfs_family(session_text: str, label: str = "Anomaly") -> str:
    """Return the deterministic failure family of an HDFS block session.

    Resolution order (the first hit decides):
      1. ``label == "Normal"`` short-circuits to ``"normal"`` without looking
         at the text (some normal sessions contain benign warning lines, and
         family metrics only score anomalies).
      2. The ordered ``_HDFS_RULES`` table: the first family with any needle
         occurring as a substring of ``session_text``.
      3. The class name captured by ``_JAVA_EXC_RE`` for any remaining
         ``java.*Exception|Error`` not covered by the explicit rules.
      4. ``"other_anomaly"`` when the text carries no failure marker at all
         (sessions labelled anomalous purely on event-sequence shape).

    Args:
        session_text: Full text of the session.
        label: Binary ground-truth label of the session.

    Returns:
        The family name as a string.
    """
    if label == "Normal":
        return "normal"

    rule_hit = next(
        (
            family
            for family, needles in _HDFS_RULES
            if any(needle in session_text for needle in needles)
        ),
        None,
    )
    if rule_hit is not None:
        return rule_hit

    exc_match = _JAVA_EXC_RE.search(session_text)
    return exc_match.group(1) if exc_match else "other_anomaly"
117
+
118
+
119
def bgl_family(
    zip_path: pathlib.Path | str, session_keys: set[str] | list[str]
) -> dict[str, str]:
    """Derive ground-truth families for BGL sessions from the raw log.

    The ``BGL.log`` member of the archive at ``zip_path`` is streamed line by
    line. Each line is split on whitespace (``split(maxsplit=5)``); lines with
    fewer than 5 fields or a non-integer second field are skipped. The
    session key of a line is ``bgl_<node>_<window>`` where ``<node>`` is the
    fourth field and ``window = unix_timestamp // 60`` — the scheme the
    docstring of this module attributes to ``sample_bgl_stratified`` in
    ``sma.eval.loghub_eval``.

    For every line whose key is in ``session_keys`` the alert-category column
    (first field; ``-`` means non-alert) is tallied. A session's family is its
    most frequent non-``-`` category, ties broken alphabetically; a session
    whose lines are all ``-`` is ``"normal"``.

    Args:
        zip_path: Location of the zip archive containing ``BGL.log``.
        session_keys: Session keys to label.

    Returns:
        Mapping from session key to family. Requested keys with no matching
        log line are omitted.
    """
    requested = set(session_keys)
    per_session: dict[str, Counter] = defaultdict(Counter)

    with zipfile.ZipFile(zip_path, "r") as archive, archive.open("BGL.log") as raw_log:
        for raw in raw_log:
            fields = raw.decode("utf-8", errors="ignore").split(maxsplit=5)
            if len(fields) < 5:
                continue
            try:
                epoch = int(fields[1])
            except ValueError:
                continue
            key = f"bgl_{fields[3]}_{epoch // 60}"
            if key not in requested:
                continue
            per_session[key][fields[0]] += 1

    result: dict[str, str] = {}
    for key, tally in per_session.items():
        # Rank real alert categories by descending count, then by name.
        ranked = sorted(
            (item for item in tally.items() if item[0] != "-"),
            key=lambda item: (-item[1], item[0]),
        )
        result[key] = ranked[0][0] if ranked else "normal"
    return result
@@ -0,0 +1,29 @@
1
+ """Structural-fraud arm: graph-neighbourhood encoding of the Elliptic Bitcoin
2
+ transaction graph for retrieval-by-analogy illicit detection.
3
+
4
+ The flat-tabular finance null (4b) showed SMA has no edge when each record is
5
+ encoded independently: there is no cross-record structure to map. Elliptic is a
6
+ *graph* (≈203k transaction nodes, 166 features, plus a directed bitcoin-flow
7
+ edgelist), so it carries the predecessor/successor topology a single flat row
8
+ lacks. This module encodes each transaction's local neighbourhood as a case of
9
+ **higher-order relations** over a licit/illicit *typology lattice* — fan-in/out
10
+ degree class, in/out value tier, temporal step, and (leak-guarded) neighbour
11
+ label context wired by ``flowsFrom``/``flowsTo`` — so SMA can structure-map an
12
+ illicit-pattern analog where flat/vector methods see only an isolated vector.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from sma.eval.fraud_elliptic.encoder import (
18
+ EllipticGraph,
19
+ NeighbourhoodEncoder,
20
+ build_typology,
21
+ load_elliptic,
22
+ )
23
+
24
# Public API of this package: the names imported above from
# ``sma.eval.fraud_elliptic.encoder``.
__all__ = [
    "EllipticGraph",
    "NeighbourhoodEncoder",
    "build_typology",
    "load_elliptic",
]
@@ -0,0 +1,279 @@
1
+ """Graph-neighbourhood encoder + licit/illicit typology lattice for Elliptic.
2
+
3
+ The Elliptic dataset ships three files:
4
+
5
+ * ``elliptic_txs_features.csv`` — no header; col 0 = txId, col 1 = time-step
6
+ (1..49), cols 2..167 = 166 anonymized features (the first ~93 are local tx
7
+ features, the rest are aggregations of one-hop neighbour features);
8
+ * ``elliptic_txs_classes.csv`` — ``txId,class`` with class in {1 (illicit),
9
+ 2 (licit), unknown};
10
+ * ``elliptic_txs_edgelist.csv`` — ``txId1,txId2`` directed bitcoin flows.
11
+
12
+ The encoder turns one transaction into a *case over a typology of
13
+ graph-neighbourhood descriptors*, NOT the 166 flat features:
14
+
15
+ - ``fanIn_*`` / ``fanOut_*`` — predecessor / successor degree class;
16
+ - ``inVal_*`` / ``outVal_*`` — value tier of incoming / outgoing flow,
17
+ read from the local value feature aggregated over neighbours;
18
+ - ``temp_*`` — temporal-step bucket;
19
+ - ``nbrIllicit_*`` / ``nbrLicit_*`` — neighbour *label context*: how many
20
+ predecessors / successors are known-illicit / known-licit. **Label-leak
21
+ guard:** a node's OWN class is never emitted, and neighbour labels are
22
+ only those visible in the indexed (train) split passed to the encoder.
23
+
24
+ These descriptor terms hang off an is-a typology lattice (e.g. ``fanOut_many``
25
+ is_a ``fanOut_any`` is_a ``flowTopology``; ``nbrIllicit_many`` is_a
26
+ ``illicitContext`` is_a ``neighbourContext``), so SMA can ascend a too-specific
27
+ observation to a shared ancestor when structure-mapping. Higher-order
28
+ ``flowsFrom`` / ``flowsTo`` relations wire the node's own topology descriptor to
29
+ its neighbour-context descriptor, giving the cross-record structure flat-tabular
30
+ encodings discard.
31
+ """
32
+
33
+ from __future__ import annotations
34
+
35
+ import csv
36
+ import pathlib
37
+ from dataclasses import dataclass, field
38
+
39
+ from sma.ontology.graph import OntologyGraph, Term
40
+
41
# Elliptic class codes (string, as they appear in the CSV).
ILLICIT, LICIT, UNKNOWN = "1", "2", "unknown"

# Feature-column layout in elliptic_txs_features.csv (0-based over the CSV row).
# NOTE: load_elliptic stores each row WITHOUT its txId column (``row[1:]``), so
# NeighbourhoodEncoder.encode reads the stored row at ``COL_* - 1``.
COL_TXID = 0
COL_TIME = 1
# Local features start at col 2. Two anonymized local features used as proxy
# value channels (the dataset is anonymized; these are stable per-tx scalars).
COL_LOCAL_VALUE = 2  # first local feature — proxy for a transaction-value channel
COL_AGG_VALUE = 95  # first aggregated (neighbour) feature — neighbour-value proxy
51
+
52
+
53
+ def _tier(value: float, lo: float, hi: float) -> str:
54
+ """Three-way tier label for a z-scored feature: low / mid / high."""
55
+ if value <= lo:
56
+ return "low"
57
+ if value >= hi:
58
+ return "high"
59
+ return "mid"
60
+
61
+
62
+ def _degree_class(deg: int) -> str:
63
+ """Bucket a degree into none / one / few / many."""
64
+ if deg == 0:
65
+ return "none"
66
+ if deg == 1:
67
+ return "one"
68
+ if deg <= 4:
69
+ return "few"
70
+ return "many"
71
+
72
+
73
+ def _count_class(count: int) -> str:
74
+ """Bucket a neighbour-label count into none / some / many."""
75
+ if count == 0:
76
+ return "none"
77
+ if count <= 2:
78
+ return "some"
79
+ return "many"
80
+
81
+
82
@dataclass
class EllipticGraph:
    """In-memory view of the Elliptic transaction graph, keyed by txId."""

    # txId -> time-step.
    time_step: dict[str, int]
    # txId -> class code, one of {"1","2","unknown"}.
    label: dict[str, str]
    # txId -> full feature row (incl. time at idx 0).
    feats: dict[str, list[float]]
    # txId -> predecessors (transactions flowing into it).
    preds: dict[str, list[str]] = field(default_factory=dict)
    # txId -> successors (transactions it flows to).
    succs: dict[str, list[str]] = field(default_factory=dict)

    def labelled_ids(self) -> list[str]:
        """Return, in sorted order, the txIds whose label is illicit or licit."""
        known = [txid for txid, cls in self.label.items() if cls in (ILLICIT, LICIT)]
        known.sort()
        return known
95
+
96
+
97
def load_elliptic(data_dir: str) -> EllipticGraph:
    """Read the three Elliptic CSV files in ``data_dir`` into an :class:`EllipticGraph`.

    * ``elliptic_txs_features.csv`` is read without skipping a header; column
      0 is the txId and the remaining columns are parsed as floats, the first
      of which is rounded to give the time-step.
    * ``elliptic_txs_classes.csv`` and ``elliptic_txs_edgelist.csv`` each have
      their first row skipped as a header.
    * An edge ``a,b`` is kept only if both endpoints appear in the features
      file; it adds ``b`` to ``a``'s successors and ``a`` to ``b``'s
      predecessors.

    Args:
        data_dir: Directory containing the three CSV files.

    Returns:
        The populated graph.
    """
    root = pathlib.Path(data_dir)
    features_file = root / "elliptic_txs_features.csv"
    classes_file = root / "elliptic_txs_classes.csv"
    edges_file = root / "elliptic_txs_edgelist.csv"

    feats: dict[str, list[float]] = {}
    time_step: dict[str, int] = {}
    with features_file.open(encoding="utf-8") as handle:
        for record in csv.reader(handle):
            if not record:
                continue
            tx = record[COL_TXID]
            # Index 0 of the stored row is the time-step, then the 166 features.
            numeric = list(map(float, record[1:]))
            feats[tx] = numeric
            time_step[tx] = int(round(numeric[0]))

    with classes_file.open(encoding="utf-8") as handle:
        class_rows = csv.reader(handle)
        next(class_rows, None)  # header: txId,class
        label: dict[str, str] = {
            record[0]: record[1] for record in class_rows if len(record) >= 2
        }

    preds: dict[str, list[str]] = {tx: [] for tx in feats}
    succs: dict[str, list[str]] = {tx: [] for tx in feats}
    with edges_file.open(encoding="utf-8") as handle:
        edge_rows = csv.reader(handle)
        next(edge_rows, None)  # header: txId1,txId2
        for record in edge_rows:
            if len(record) < 2:
                continue
            src, dst = record[0], record[1]
            if src not in succs or dst not in preds:
                continue
            succs[src].append(dst)  # src -> dst : src flows to dst
            preds[dst].append(src)  # dst has predecessor src

    return EllipticGraph(
        time_step=time_step, label=label, feats=feats, preds=preds, succs=succs
    )
137
+
138
+
139
# Typology vocabulary -------------------------------------------------------
# Bucket suffixes from which build_typology derives term ids by prefixing a
# descriptor family, e.g. ``fanIn_few``, ``inVal_high``, ``temp_late``,
# ``nbrIllicit_some``.
_DEGREE_BUCKETS = ("none", "one", "few", "many")
_TIERS = ("low", "mid", "high")
_TEMP_BUCKETS = ("early", "mid", "late")
_COUNT_BUCKETS = ("none", "some", "many")
145
+
146
+
147
def build_typology(name: str = "elliptic_typology") -> OntologyGraph:
    """Build the licit/illicit graph-neighbourhood typology lattice.

    Returns an :class:`OntologyGraph` whose is-a edges let a specific descriptor
    (e.g. ``fanOut_many``) ascend to a shared ancestor (``fanOut_any`` ->
    ``flowTopology``) during structure-mapping. Mounting this graph populates the
    predicate lattice; ``NeighbourhoodEncoder`` emits cases over these term ids.

    Args:
        name: Name given to the returned ``OntologyGraph``.

    Returns:
        An ``OntologyGraph`` holding every term created below, keyed by term id.
    """
    terms: dict[str, Term] = {}

    def add(tid: str, parent: str, nm: str = "") -> None:
        # Register term ``tid`` with a single parent ("" = root, no parents).
        # The display name defaults to the id with underscores turned into spaces.
        terms[tid] = Term(id=tid, name=nm or tid.replace("_", " "),
                          parents=(parent,) if parent else ())

    # Roots of the typology.
    add("flowTopology", "")
    add("valueProfile", "")
    add("temporalProfile", "")
    add("neighbourContext", "")
    # Two licit/illicit typology poles: descriptor families subsume into these so
    # an "illicit-looking neighbourhood" can match across distinct surface forms.
    add("illicitTypology", "")
    add("licitTypology", "")

    # fan-in / fan-out degree classes.
    add("fanIn_any", "flowTopology")
    add("fanOut_any", "flowTopology")
    for b in _DEGREE_BUCKETS:
        add(f"fanIn_{b}", "fanIn_any")
        add(f"fanOut_{b}", "fanOut_any")
    # High fan-out / high fan-in are illicit-typology cues (layering / dispersal).
    # NOTE: these lines reassign ``parents`` on existing Term objects, so they
    # must run after the loop above and assume Term allows attribute assignment.
    terms["fanOut_many"].parents = ("fanOut_any", "illicitTypology")
    terms["fanIn_many"].parents = ("fanIn_any", "illicitTypology")
    terms["fanOut_one"].parents = ("fanOut_any", "licitTypology")

    # value tiers (incoming / outgoing).
    add("inVal_any", "valueProfile")
    add("outVal_any", "valueProfile")
    for t in _TIERS:
        add(f"inVal_{t}", "inVal_any")
        add(f"outVal_{t}", "outVal_any")

    # temporal buckets.
    add("temp_any", "temporalProfile")
    for b in _TEMP_BUCKETS:
        add(f"temp_{b}", "temp_any")

    # neighbour label context — known illicit / licit predecessors & successors.
    add("illicitContext", "neighbourContext")
    add("licitContext", "neighbourContext")
    # illicitContext rolls up into the illicit typology pole.
    terms["illicitContext"].parents = ("neighbourContext", "illicitTypology")
    terms["licitContext"].parents = ("neighbourContext", "licitTypology")
    for c in _COUNT_BUCKETS:
        add(f"nbrIllicit_{c}", "illicitContext")
        add(f"nbrLicit_{c}", "licitContext")
    # "no illicit neighbours" is not itself an illicit cue: re-parent to context.
    # The same re-parenting is applied to "no licit neighbours".
    terms["nbrIllicit_none"].parents = ("neighbourContext",)
    terms["nbrLicit_none"].parents = ("neighbourContext",)

    # Typed higher-order relations: a node's own out-topology *flowsTo* its
    # successor/neighbour label context, and its in-topology *flowsFrom* its
    # predecessor context. When both endpoints co-occur in a case,
    # mount().build_case emits ``flowsTo(fanOut(subj), nbrIllicit(subj))`` — the
    # cross-record structure SMA maps and flat encodings discard. Relations are
    # declared on the degree-bucket terms (one endpoint) toward the context
    # buckets; build_case only materializes a relation when BOTH terms are
    # present on the same transaction.
    def wire(src: str, rel: str, dsts: tuple[str, ...]) -> None:
        # Append one (rel, destination) pair per destination to the source
        # term's existing ``relations`` tuple.
        terms[src].relations = terms[src].relations + tuple((rel, d) for d in dsts)

    illicit_ctx = tuple(f"nbrIllicit_{c}" for c in _COUNT_BUCKETS)
    licit_ctx = tuple(f"nbrLicit_{c}" for c in _COUNT_BUCKETS)
    for b in _DEGREE_BUCKETS:
        # Every degree bucket is wired to all six neighbour-context buckets.
        wire(f"fanOut_{b}", "flowsTo", illicit_ctx + licit_ctx)
        wire(f"fanIn_{b}", "flowsFrom", illicit_ctx + licit_ctx)

    return OntologyGraph(name=name, terms=terms)
225
+
226
+
227
@dataclass
class NeighbourhoodEncoder:
    """Turn a transaction's local graph neighbourhood into typology term ids.

    ``visible_labels`` is the only label map consulted for neighbour context —
    pass the train/index split's labels so that encoding a test node never
    reads held-out labels. The encoded node's OWN class is never emitted.
    ``lo``/``hi`` are the z-score cut points handed to ``_tier``.
    """

    graph: EllipticGraph
    visible_labels: dict[str, str]
    lo: float = -0.3
    hi: float = 0.3

    def encode(self, txid: str) -> list[str]:
        """Return the sorted, de-duplicated typology term ids for ``txid``.

        Emits one term each for fan-in class, fan-out class, incoming value
        tier, outgoing value tier, temporal bucket, illicit-neighbour count
        class and licit-neighbour count class. Unknown txIds fall back to
        empty neighbour lists, 0.0 value channels and time-step 1.
        """
        graph = self.graph
        incoming = graph.preds.get(txid, [])
        outgoing = graph.succs.get(txid, [])
        row = graph.feats.get(txid, [])

        def channel(csv_col: int) -> float:
            # The stored row starts at the time-step (CSV col 1), hence the -1.
            idx = csv_col - 1
            return row[idx] if len(row) > idx else 0.0

        step = graph.time_step.get(txid, 1)
        if step <= 16:
            era = "early"
        elif step <= 33:
            era = "mid"
        else:
            era = "late"

        # Neighbour LABEL context — leak-guarded: only visible_labels, never self.
        neighbour_labels = [
            self.visible_labels.get(nbr) for nbr in incoming + outgoing if nbr != txid
        ]
        illicit_seen = neighbour_labels.count(ILLICIT)
        licit_seen = neighbour_labels.count(LICIT)

        descriptors = {
            f"fanIn_{_degree_class(len(incoming))}",
            f"fanOut_{_degree_class(len(outgoing))}",
            f"inVal_{_tier(channel(COL_AGG_VALUE), self.lo, self.hi)}",
            f"outVal_{_tier(channel(COL_LOCAL_VALUE), self.lo, self.hi)}",
            f"temp_{era}",
            f"nbrIllicit_{_count_class(illicit_seen)}",
            f"nbrLicit_{_count_class(licit_seen)}",
        }
        return sorted(descriptors)