structuremappingmemory 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sma/__init__.py +5 -0
- sma/__main__.py +5 -0
- sma/agent/__init__.py +5 -0
- sma/agent/adapter_draft.py +217 -0
- sma/agent/api.py +67 -0
- sma/agent/comparison.py +591 -0
- sma/agent/llm.py +280 -0
- sma/agent/policies.py +21 -0
- sma/agent/service.py +95 -0
- sma/cli.py +65 -0
- sma/encoders/__init__.py +38 -0
- sma/encoders/agentobs.py +27 -0
- sma/encoders/base.py +23 -0
- sma/encoders/code_treesitter.py +64 -0
- sma/encoders/coverage.py +80 -0
- sma/encoders/draft_adapter.py +183 -0
- sma/encoders/healthcare.py +207 -0
- sma/encoders/logs_drain.py +142 -0
- sma/encoders/prose_tier1.py +57 -0
- sma/encoders/structured.py +57 -0
- sma/encoders/traces.py +45 -0
- sma/eval/__init__.py +2 -0
- sma/eval/agentic/__init__.py +35 -0
- sma/eval/agentic/arms/__init__.py +0 -0
- sma/eval/agentic/arms/cyber.py +48 -0
- sma/eval/agentic/arms/discovery.py +35 -0
- sma/eval/agentic/arms/finance.py +38 -0
- sma/eval/agentic/arms/legal.py +74 -0
- sma/eval/agentic/arms/medicine.py +45 -0
- sma/eval/agentic/harness.py +275 -0
- sma/eval/agentic/memories.py +308 -0
- sma/eval/agentic/metrics.py +82 -0
- sma/eval/agentic_qa/__init__.py +27 -0
- sma/eval/agentic_qa/agent.py +383 -0
- sma/eval/agentic_qa/metrics.py +239 -0
- sma/eval/agentic_qa/pools.py +197 -0
- sma/eval/arn.py +65 -0
- sma/eval/baselines/__init__.py +6 -0
- sma/eval/baselines/bge_dense.py +54 -0
- sma/eval/baselines/bm25.py +18 -0
- sma/eval/baselines/dense.py +42 -0
- sma/eval/baselines/hipporag.py +235 -0
- sma/eval/baselines/hybrid_rrf.py +30 -0
- sma/eval/baselines/longcontext_llm.py +124 -0
- sma/eval/baselines/rerank.py +41 -0
- sma/eval/baselines/splade.py +77 -0
- sma/eval/baselines/wl_kernel.py +163 -0
- sma/eval/bugsinpy.py +358 -0
- sma/eval/bugsinpy_families.py +164 -0
- sma/eval/crossdomain.py +89 -0
- sma/eval/diabetes.py +61 -0
- sma/eval/drift_env.py +26 -0
- sma/eval/drift_metrics.py +24 -0
- sma/eval/family_labels.py +167 -0
- sma/eval/fraud_elliptic/__init__.py +29 -0
- sma/eval/fraud_elliptic/encoder.py +279 -0
- sma/eval/fraud_elliptic/eval.py +269 -0
- sma/eval/fraud_elliptic/test_encoder.py +123 -0
- sma/eval/ieee_cis.py +66 -0
- sma/eval/loghub.py +16 -0
- sma/eval/loghub_eval.py +480 -0
- sma/eval/longmemeval.py +51 -0
- sma/eval/memory_backends/__init__.py +2 -0
- sma/eval/memory_backends/base.py +22 -0
- sma/eval/memory_backends/context_only.py +14 -0
- sma/eval/memory_backends/rag_notes.py +17 -0
- sma/eval/memory_backends/shared_llm.py +30 -0
- sma/eval/memory_backends/sma_memory.py +54 -0
- sma/eval/memory_backends/zep_graphiti.py +33 -0
- sma/eval/metrics.py +32 -0
- sma/eval/ontology_bench.py +219 -0
- sma/eval/report.py +573 -0
- sma/eval/ssb_eval.py +216 -0
- sma/eval/ssb_generator.py +116 -0
- sma/eval/stats.py +108 -0
- sma/eval/transfer_eval.py +844 -0
- sma/index/__init__.py +15 -0
- sma/index/ann.py +21 -0
- sma/index/content_vectors.py +60 -0
- sma/index/inverted.py +63 -0
- sma/index/macfac.py +174 -0
- sma/ir/__init__.py +22 -0
- sma/ir/canon.py +106 -0
- sma/ir/schema.py +165 -0
- sma/ir/sexpr.py +86 -0
- sma/ir/signatures.py +76 -0
- sma/match/__init__.py +20 -0
- sma/match/conflicts.py +46 -0
- sma/match/engine.py +60 -0
- sma/match/explain.py +59 -0
- sma/match/infer.py +54 -0
- sma/match/kernels.py +54 -0
- sma/match/mdl.py +30 -0
- sma/match/merge_cpsat.py +77 -0
- sma/match/merge_greedy.py +15 -0
- sma/match/mh.py +177 -0
- sma/match/ses.py +84 -0
- sma/match/types.py +115 -0
- sma/match/verifier.py +27 -0
- sma/ontology/__init__.py +45 -0
- sma/ontology/attack.py +134 -0
- sma/ontology/cpc.py +69 -0
- sma/ontology/graph.py +58 -0
- sma/ontology/loader.py +262 -0
- sma/ontology/mitre_xml.py +67 -0
- sma/ontology/mount.py +101 -0
- sma/ontology/rdf_loader.py +75 -0
- sma/ontology/registry.py +115 -0
- sma/ontology/router.py +69 -0
- sma/ontology/usgaap.py +73 -0
- sma/sage/__init__.py +6 -0
- sma/sage/assimilate.py +12 -0
- sma/sage/pools.py +105 -0
- sma/sage/probabilities.py +10 -0
- sma/store/__init__.py +6 -0
- sma/store/lmdb_store.py +78 -0
- sma/store/registry.py +26 -0
- sma/store/wal.py +26 -0
- sma/ui/app.py +642 -0
- structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
- structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
- structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
- structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
- structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
- structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
sma/match/mh.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""Match hypothesis seeding and support closure."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections import defaultdict, deque
|
|
6
|
+
|
|
7
|
+
from sma.ir.canon import Canonicalizer, default_canonicalizer
|
|
8
|
+
from sma.ir.schema import Entity, Statement
|
|
9
|
+
from sma.ir.sexpr import dumps_node
|
|
10
|
+
|
|
11
|
+
from .types import MatchConfig, MatchHypothesis
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def seed_expression_mhs(
    base_exprs: tuple[Statement, ...],
    target_exprs: tuple[Statement, ...],
    config: MatchConfig | None = None,
    canon: Canonicalizer | None = None,
) -> tuple[MatchHypothesis, ...]:
    """Seed statement-level match hypotheses between two expression sets.

    Statements on each side are grouped by ``(canonical functor, arity)``.
    Groups that share a signature are paired directly. When
    ``config.delta > 0``, groups of equal arity but different canonical
    functor are paired as well, provided ``canon.compatible`` accepts the two
    functors; those hypotheses carry the ascension, ancestor and distance it
    returns. Every group pairing goes through ``_capped_pairs`` with
    ``config.mh_group_cap``.

    Args:
        base_exprs: Statements of the base case.
        target_exprs: Statements of the target case.
        config: Matcher settings; a default ``MatchConfig`` when omitted.
        canon: Canonicalizer; ``default_canonicalizer()`` when omitted.

    Returns:
        All seeded hypotheses: same-signature pairs first, then ascension
        pairs.
    """
    config = config or MatchConfig()
    canon = canon or default_canonicalizer()
    base_groups = _group_by_signature(base_exprs, canon)
    target_groups = _group_by_signature(target_exprs, canon)

    seeds: list[MatchHypothesis] = []

    # Pass 1: identical canonical signature on both sides.
    for signature, base_members in base_groups.items():
        target_members = target_groups.get(signature)
        if not target_members:
            continue
        seeds.extend(
            MatchHypothesis(b, t)
            for b, t in _capped_pairs(
                base_members, target_members, config.mh_group_cap
            )
        )

    # Pass 2: minimal ascension across canonical groups, also capped per
    # group pair. Skipped entirely unless delta is positive.
    if config.delta > 0:
        for (b_functor, b_arity), base_members in base_groups.items():
            for (t_functor, t_arity), target_members in target_groups.items():
                # Equal functors were handled in pass 1; unequal arities can
                # never align argument-by-argument.
                if b_arity != t_arity or b_functor == t_functor:
                    continue
                ok, asc, ancestor, dist = canon.compatible(
                    b_functor, t_functor, delta=config.delta, rho=config.rho
                )
                if not ok:
                    continue
                for b, t in _capped_pairs(
                    base_members, target_members, config.mh_group_cap
                ):
                    seeds.append(MatchHypothesis(b, t, asc, ancestor, dist))

    return tuple(seeds)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _group_by_signature(
|
|
47
|
+
exprs: tuple[Statement, ...], canon: Canonicalizer
|
|
48
|
+
) -> dict[tuple[str, int], list[Statement]]:
|
|
49
|
+
groups: dict[tuple[str, int], list[Statement]] = defaultdict(list)
|
|
50
|
+
for expr in exprs:
|
|
51
|
+
groups[(canon.canonical(expr.functor), expr.arity)].append(expr)
|
|
52
|
+
return groups
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _capped_pairs(
|
|
56
|
+
bases: list[Statement], targets: list[Statement], cap: int
|
|
57
|
+
) -> list[tuple[Statement, Statement]]:
|
|
58
|
+
"""Deterministic U-ordered pair selection within one functor group.
|
|
59
|
+
|
|
60
|
+
Small groups keep the full cross product. Large groups are capped:
|
|
61
|
+
bit-identical statements pair first (they carry the highest achievable
|
|
62
|
+
match score), then a band around the canonical sort order fills the rest.
|
|
63
|
+
"""
|
|
64
|
+
if len(bases) * len(targets) <= cap:
|
|
65
|
+
return [(b, t) for b in bases for t in targets]
|
|
66
|
+
|
|
67
|
+
base_text = {id(b): dumps_node(b) for b in bases}
|
|
68
|
+
target_text = {id(t): dumps_node(t) for t in targets}
|
|
69
|
+
sorted_bases = sorted(bases, key=lambda s: base_text[id(s)])
|
|
70
|
+
sorted_targets = sorted(targets, key=lambda s: target_text[id(s)])
|
|
71
|
+
|
|
72
|
+
pairs: list[tuple[Statement, Statement]] = []
|
|
73
|
+
used: set[tuple[int, int]] = set()
|
|
74
|
+
|
|
75
|
+
by_text: dict[str, deque] = defaultdict(deque)
|
|
76
|
+
for t in sorted_targets:
|
|
77
|
+
by_text[target_text[id(t)]].append(t)
|
|
78
|
+
for b in sorted_bases:
|
|
79
|
+
queue = by_text.get(base_text[id(b)])
|
|
80
|
+
if queue:
|
|
81
|
+
t = queue.popleft()
|
|
82
|
+
pairs.append((b, t))
|
|
83
|
+
used.add((id(b), id(t)))
|
|
84
|
+
if len(pairs) >= cap:
|
|
85
|
+
return pairs
|
|
86
|
+
|
|
87
|
+
n_targets = len(sorted_targets)
|
|
88
|
+
for offset in range(n_targets):
|
|
89
|
+
for i, b in enumerate(sorted_bases):
|
|
90
|
+
for j in ((i + offset, i - offset) if offset else (i,)):
|
|
91
|
+
if 0 <= j < n_targets:
|
|
92
|
+
t = sorted_targets[j]
|
|
93
|
+
if (id(b), id(t)) not in used:
|
|
94
|
+
used.add((id(b), id(t)))
|
|
95
|
+
pairs.append((b, t))
|
|
96
|
+
if len(pairs) >= cap:
|
|
97
|
+
return pairs
|
|
98
|
+
return pairs
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# Entity types whose names act as CONSTANTS rather than variables (blueprint
# 2.1: entities and constants are distinct vocabulary classes). A
# template-name entity denotes itself, so pairing count(template_A, 3) with
# count(template_B, 2) is vacuous shape-matching, not analogy - it was the
# root cause of the Liberty haystack failure (a generic bookkeeping skeleton
# matching any session against any other).
# Integers are deliberately NOT listed as constants: count(template_X, 3) vs
# count(template_X, 5) is a legitimate analogy (same burst, different size);
# the template-name constraint alone blocks the vacuous cross-template case.
CONSTANT_ENTITY_TYPES = frozenset({"event_type"})


def constants_compatible(b_ent: Entity, t_ent: Entity) -> bool:
    """Return whether two entities may be placed in correspondence.

    The name-equality constraint applies only when BOTH entities have a type
    listed in ``CONSTANT_ENTITY_TYPES``; every other pairing is allowed.
    """
    both_constant = (
        b_ent.type in CONSTANT_ENTITY_TYPES
        and t_ent.type in CONSTANT_ENTITY_TYPES
    )
    if not both_constant:
        return True
    # Constants denote themselves, so they only match an equal name.
    return b_ent.name == t_ent.name
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def support_closure(
    root: MatchHypothesis,
    canon: Canonicalizer | None = None,
    delta: int = 0,
    rho: float = 1.0,
) -> tuple[MatchHypothesis, ...] | None:
    """Downward closure of a root MH; None when structurally impossible.

    SME parallel connectivity: argument correspondences must themselves be
    LEGAL match hypotheses. A statement-argument pair with incompatible
    functors invalidates the whole kernel (previously it was silently
    admitted, letting higher-order parents like `before` manufacture
    cross-template "matches" that surprisal weighting then amplified - the
    Liberty ses_n>1 anomaly). Compatibility = canonical identity, or lattice
    ascension within delta at rho^dist penalty. Unequal constants likewise
    invalidate.

    Args:
        root: Hypothesis whose argument structure is expanded.
        canon: Canonicalizer; ``default_canonicalizer()`` when omitted.
        delta: Forwarded to ``canon.compatible`` for statement arguments.
        rho: Forwarded to ``canon.compatible`` for statement arguments.

    Returns:
        The root followed by its descendants in depth-first pre-order, each
        distinct ``key`` once; ``None`` if any correspondence is illegal
        (arity mismatch, incompatible constants or functors, or a statement
        paired with an entity).
    """
    canon = canon or default_canonicalizer()
    collected: list[MatchHypothesis] = []
    visited: set[tuple[str, str]] = set()
    illegal = False

    def expand(hyp: MatchHypothesis) -> None:
        nonlocal illegal
        if illegal or hyp.key in visited:
            return
        visited.add(hyp.key)
        collected.append(hyp)

        base_node, target_node = hyp.base, hyp.target
        if not (
            isinstance(base_node, Statement)
            and isinstance(target_node, Statement)
        ):
            # Entity-level hypotheses have no arguments to descend into.
            return
        if base_node.arity != target_node.arity:
            illegal = True
            return

        for b_arg, t_arg in zip(base_node.args, target_node.args, strict=True):
            if isinstance(b_arg, Entity) and isinstance(t_arg, Entity):
                if not constants_compatible(b_arg, t_arg):
                    illegal = True
                    return
                expand(MatchHypothesis(b_arg, t_arg))
                continue

            if isinstance(b_arg, Statement) and isinstance(t_arg, Statement):
                ok, asc, ancestor, dist = canon.compatible(
                    b_arg.functor, t_arg.functor, delta=delta, rho=rho
                )
                if not ok:
                    illegal = True
                    return
                # Each MH pays ITS OWN ascension penalty only - the parent's
                # penalty lives in the parent's weight. Multiplying down the
                # chain (a previous bug) punished deep systems exponentially,
                # the opposite of systematicity.
                expand(
                    MatchHypothesis(
                        b_arg,
                        t_arg,
                        ascension=asc,
                        ancestor=ancestor,
                        distance=dist,
                    )
                )
                continue

            # Statement paired with entity (or vice versa): illegal.
            illegal = True
            return

    expand(root)
    if illegal:
        return None
    return tuple(collected)
|
sma/match/ses.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Structural evaluation score with trickle-down support.
|
|
2
|
+
|
|
3
|
+
Two weighting regimes share this code path:
|
|
4
|
+
- SES (default): every match hypothesis carries unit base weight sigma_0 = 1.
|
|
5
|
+
- surprisal-SES (score-v2 candidate, ADR-004 upgrade path): statement MHs
|
|
6
|
+
carry sigma_0 = corpus surprisal of their canonical functor (-log2 p), so
|
|
7
|
+
rare shared structure counts more while systematicity still compounds via
|
|
8
|
+
trickle-down. With cost_fn=None this reduces exactly to SES.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from typing import Callable
|
|
14
|
+
|
|
15
|
+
from sma.ir.schema import Statement
|
|
16
|
+
|
|
17
|
+
from .types import GMap, MatchHypothesis, node_key
|
|
18
|
+
|
|
19
|
+
CostFn = Callable[[MatchHypothesis], float]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def structural_evaluation(
    hypotheses: tuple[MatchHypothesis, ...],
    gamma: float = 0.25,
    cost_fn: CostFn | None = None,
) -> float:
    """Sum the trickle-down structural scores of a set of match hypotheses.

    A hypothesis scores ``weight * ascension + gamma * (sum of its parents'
    scores)``. A parent is a statement hypothesis in the set one of whose
    aligned argument pairs is the child. ``weight`` is ``1.0`` when
    ``cost_fn`` is ``None``, otherwise ``cost_fn(mh)``.

    Args:
        hypotheses: Hypotheses to evaluate. When several share a ``key`` the
            last one is the one scored.
        gamma: Fraction of each parent's score passed down to a child.
        cost_fn: Optional per-hypothesis base weight.

    Returns:
        Total score over all distinct hypothesis keys (``0`` for no input).
    """
    by_key: dict[tuple[str, str], MatchHypothesis] = {}
    parents: dict[tuple[str, str], list[tuple[str, str]]] = {}
    for mh in hypotheses:
        by_key[mh.key] = mh
        parents[mh.key] = []

    # Record, for every argument pair that is itself a hypothesis in the set,
    # which statement hypotheses sit directly above it.
    for mh in hypotheses:
        base, target = mh.base, mh.target
        if isinstance(base, Statement) and isinstance(target, Statement):
            for b_arg, t_arg in zip(base.args, target.args, strict=True):
                supporters = parents.get((node_key(b_arg), node_key(t_arg)))
                if supporters is not None:
                    supporters.append(mh.key)

    def own_score(key: tuple[str, str]) -> float:
        mh = by_key[key]
        base_weight = 1.0 if cost_fn is None else cost_fn(mh)
        return base_weight * mh.ascension

    memo: dict[tuple[str, str], float] = {}

    def score(
        key: tuple[str, str],
        stack: frozenset[tuple[str, str]] = frozenset(),
    ) -> float:
        if key in memo:
            return memo[key]
        if key in stack:
            # Already being evaluated further up this recursion: contribute
            # only the local term so the recursion terminates.
            return own_score(key)
        inherited = 0
        for parent in parents.get(key, ()):
            inherited += score(parent, stack | {key})
        value = own_score(key) + gamma * inherited
        memo[key] = value
        return value

    total = 0
    for key in by_key:
        total += score(key)
    return total
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def self_score(case, gamma: float = 0.25, cost_fn: CostFn | None = None) -> float:
    """Score a case matched against itself.

    Builds an identity hypothesis for every expression returned by
    ``case.expressions()`` and for every entity returned by each expression's
    ``entities()``, de-duplicates them by ``key``, and evaluates the result
    with ``structural_evaluation``.
    """
    unique: dict[tuple[str, str], MatchHypothesis] = {}
    for expr in case.expressions():
        identity = MatchHypothesis(expr, expr)
        unique[identity.key] = identity
        for entity in expr.entities():
            entity_identity = MatchHypothesis(entity, entity)
            unique[entity_identity.key] = entity_identity
    return structural_evaluation(
        tuple(unique.values()), gamma=gamma, cost_fn=cost_fn
    )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def normalize_score(
|
|
66
|
+
score: float,
|
|
67
|
+
base,
|
|
68
|
+
target,
|
|
69
|
+
gamma: float = 0.25,
|
|
70
|
+
cost_fn: CostFn | None = None,
|
|
71
|
+
normalization: str = "max",
|
|
72
|
+
) -> float:
|
|
73
|
+
# Same weights in numerator and denominators keep ses_n scale-free.
|
|
74
|
+
self_base = self_score(base, gamma, cost_fn=cost_fn)
|
|
75
|
+
self_target = self_score(target, gamma, cost_fn=cost_fn)
|
|
76
|
+
if normalization == "min":
|
|
77
|
+
denom = min(self_base, self_target)
|
|
78
|
+
elif normalization == "sqrt":
|
|
79
|
+
denom = (self_base * self_target) ** 0.5
|
|
80
|
+
elif normalization == "target":
|
|
81
|
+
denom = self_target
|
|
82
|
+
else: # "max", blueprint 2.3
|
|
83
|
+
denom = max(self_base, self_target)
|
|
84
|
+
return score / max(denom, 1e-9)
|
sma/match/types.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Shared matcher dataclasses."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from functools import cached_property
|
|
7
|
+
|
|
8
|
+
from sma.ir.schema import Case, Entity, Node, Statement
|
|
9
|
+
from sma.ir.sexpr import dumps_node
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def node_key(node: Node) -> str:
    """Return a string key for *node*: its serialized form behind a kind tag.

    Entities are tagged ``E:``; every other node is tagged ``S:``.
    """
    if isinstance(node, Entity):
        return f"E:{dumps_node(node)}"
    return f"S:{dumps_node(node)}"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
class MatchHypothesis:
    """A proposed correspondence between one base node and one target node."""

    # The two nodes placed in correspondence.
    base: Node
    target: Node
    # Multiplicative score factor; 1.0 unless another value is supplied.
    ascension: float = 1.0
    # Ancestor reported alongside a non-default ascension, if any.
    ancestor: str | None = None
    # Distance reported alongside a non-default ascension.
    distance: int = 0

    # node_key serializes the whole expression tree, and these keys are read
    # O(kernels^2) times during merge — they must be computed once per
    # instance, hence cached_property rather than property.
    @cached_property
    def base_key(self) -> str:
        """Key of the base node."""
        return node_key(self.base)

    @cached_property
    def target_key(self) -> str:
        """Key of the target node."""
        return node_key(self.target)

    @cached_property
    def key(self) -> tuple[str, str]:
        """``(base_key, target_key)`` pair identifying this hypothesis."""
        return self.base_key, self.target_key
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class Kernel:
    """A root hypothesis together with the hypotheses grouped under it."""

    root: MatchHypothesis
    hypotheses: tuple[MatchHypothesis, ...]
    weight: float = 0.0

    @cached_property
    def bindings(self) -> dict[str, str]:
        """Map each hypothesis' base key to its target key (computed once)."""
        forward: dict[str, str] = {}
        for mh in self.hypotheses:
            forward[mh.base_key] = mh.target_key
        return forward

    @cached_property
    def reverse_bindings(self) -> dict[str, str]:
        """Inverse of ``bindings``: target key to base key (computed once)."""
        backward: dict[str, str] = {}
        for base_key, target_key in self.bindings.items():
            backward[target_key] = base_key
        return backward
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class GMap:
    """Result of mapping a base case onto a target case."""

    base: Case
    target: Case
    hypotheses: tuple[MatchHypothesis, ...]
    kernels: tuple[Kernel, ...]
    score: float
    normalized_score: float
    scorer: str = "ses"
    optimality_gap: float | None = None

    @property
    def correspondences(self) -> list[dict[str, str | float | int | None]]:
        """One plain dict per hypothesis, in hypothesis order.

        Each dict has the keys ``base``, ``target``, ``ascension``,
        ``ancestor`` and ``distance``.
        """
        rows: list[dict[str, str | float | int | None]] = []
        for mh in self.hypotheses:
            rows.append(
                {
                    "base": mh.base_key,
                    "target": mh.target_key,
                    "ascension": mh.ascension,
                    "ancestor": mh.ancestor,
                    "distance": mh.distance,
                }
            )
        return rows
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass(frozen=True)
class CandidateInference:
    """Immutable record of one candidate inference and its provenance."""

    # The inference as an s-expression string.
    inference_sexpr: str
    # Ids of the two cases the inference was drawn between.
    base_case_id: str
    target_case_id: str
    # Score attached to the inference.
    ses_n: float
    # Optional string tuples describing the inference; all default to empty.
    support: tuple[str, ...] = ()
    skolems: tuple[str, ...] = ()
    ascensions: tuple[str, ...] = ()
    # Status label; "hypothetical" unless set otherwise.
    status: str = "hypothetical"
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@dataclass
class MatchConfig:
    """Tunable settings for the matcher."""

    gamma: float = 0.25
    # Frozen at prereg-v1 (calibration grid; inert when delta=0).
    rho: float = 0.95
    delta: int = 0
    # "ses" | "mdl" | "surprisal" (score-v2, ADR-005).
    scorer: str = "surprisal"
    # Normalization of the structural score: "max" (blueprint 2.3),
    # "min" (10.2 tripwire), "sqrt" (geometric mean, cosine-style symmetric),
    # "target" (query-relative; ranking == raw-score ordering per query).
    # Frozen to "max" at prereg-v1 (calibration grid: beats target on family
    # and LOO-haystack validation). Registered caveat: out-of-corpus haystack
    # probes use hybrid fused as the production posture.
    normalization: str = "max"
    # Corpus surprisal per canonical functor (-log2 p), supplied by the index
    # for scorer="surprisal"; None means unit weights (identical to "ses").
    functor_costs: dict | None = None
    exact_kernel_limit: int = 60
    cpsat_time_ms: int = 20
    # Tripwire response from blueprint section 10.2: cap MH pairs per functor
    # group (U-ordered: identical statements first) so sessions with many
    # repeated event types cannot explode the kernel count quadratically.
    mh_group_cap: int = 128
    # Free-form extras; each instance gets its own dict.
    metadata: dict = field(default_factory=dict)
|
|
115
|
+
|
sma/match/verifier.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Inference verifier."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from sma.ir.sexpr import loads_statement
|
|
8
|
+
from sma.ir.signatures import SignatureRegistry
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
class VerificationResult:
    """Immutable outcome of verifying one inference."""

    # Outcome label.
    status: str
    # Human-readable explanations for the outcome; empty when none apply.
    reasons: tuple[str, ...] = ()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def verify_inference(inference_sexpr: str, registry: SignatureRegistry | None = None) -> VerificationResult:
    """Parse and validate an inference s-expression.

    Args:
        inference_sexpr: The inference to check, as an s-expression string.
        registry: Signature registry used for validation;
            ``SignatureRegistry.with_defaults()`` when omitted.

    Returns:
        ``"type_fail"`` with the exception text when parsing or validation
        raises; ``"hypothetical"`` when the text contains
        ``AnalogySkolemFn_``; otherwise ``"pass"``.
    """
    registry = registry or SignatureRegistry.with_defaults()
    try:
        registry.validate_statement(loads_statement(inference_sexpr))
    except Exception as exc:
        # Any parse or validation failure is reported, never propagated.
        return VerificationResult("type_fail", (str(exc),))

    if "AnalogySkolemFn_" in inference_sexpr:
        status, reasons = "hypothetical", ("contains analogy skolems",)
    else:
        status, reasons = "pass", ()
    return VerificationResult(status, reasons)
|
|
27
|
+
|
sma/ontology/__init__.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Universal OWL/OBO ontology loader, mounter, registry, and router for SMA-1.
|
|
2
|
+
|
|
3
|
+
This package generalizes the hand-rolled HPO mount in
|
|
4
|
+
``scripts/rare_disease_test.py`` into a reusable pipeline: parse any OBO/OWL
|
|
5
|
+
ontology into a normalized :class:`OntologyGraph`, mount it onto a
|
|
6
|
+
``Canonicalizer`` (is-a edges become the predicate lattice), build a
|
|
7
|
+
``MacFacIndex`` over cases, and retrieve by structural analogy. A
|
|
8
|
+
:class:`OntologyRegistry` caches mounted ontologies and a :class:`DomainRouter`
|
|
9
|
+
selects which ontology a query belongs to. See ``sma/ontology/README.md``.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from .attack import load_attack_stix
|
|
15
|
+
from .cpc import load_cpc
|
|
16
|
+
from .mitre_xml import load_capec, load_cwe, load_mitre_xml
|
|
17
|
+
from .rdf_loader import load_rdflib
|
|
18
|
+
from .usgaap import load_usgaap
|
|
19
|
+
from .graph import OntologyGraph, Term
|
|
20
|
+
from .loader import fid, load_obo, load_ontology, load_owl, load_owl_dir
|
|
21
|
+
from .mount import MountedOntology, mount
|
|
22
|
+
from .registry import OntologyEntry, OntologyRegistry
|
|
23
|
+
from .router import DomainRouter
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"OntologyGraph",
|
|
27
|
+
"Term",
|
|
28
|
+
"load_obo",
|
|
29
|
+
"load_owl",
|
|
30
|
+
"load_owl_dir",
|
|
31
|
+
"load_ontology",
|
|
32
|
+
"fid",
|
|
33
|
+
"MountedOntology",
|
|
34
|
+
"mount",
|
|
35
|
+
"OntologyRegistry",
|
|
36
|
+
"OntologyEntry",
|
|
37
|
+
"DomainRouter",
|
|
38
|
+
"load_attack_stix",
|
|
39
|
+
"load_cpc",
|
|
40
|
+
"load_capec",
|
|
41
|
+
"load_cwe",
|
|
42
|
+
"load_mitre_xml",
|
|
43
|
+
"load_rdflib",
|
|
44
|
+
"load_usgaap",
|
|
45
|
+
]
|
sma/ontology/attack.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Load MITRE ATT&CK (STIX 2.1 JSON) into the normalized :class:`OntologyGraph`.
|
|
2
|
+
|
|
3
|
+
ATT&CK ships as a STIX bundle (``mitre/cti`` ``enterprise-attack.json``), not
|
|
4
|
+
OBO/OWL, so it needs a dedicated parser. It maps cleanly onto the same shape the
|
|
5
|
+
rest of the ontology package consumes:
|
|
6
|
+
|
|
7
|
+
* ``attack-pattern`` objects become technique terms, keyed by their ATT&CK
|
|
8
|
+
``external_id`` (e.g. ``"T1059"`` or sub-technique ``"T1059.001"``).
|
|
9
|
+
* ``x-mitre-tactic`` objects become tactic terms, keyed by their ``shortname``.
|
|
10
|
+
* A sub-technique ``T1059.001`` gets is_a parent ``T1059`` (split on ``"."``);
|
|
11
|
+
this is corroborated by ``relationship`` objects of type ``subtechnique-of``.
|
|
12
|
+
* A technique's ``kill_chain_phases`` and STIX ``uses``/``mitigates``
|
|
13
|
+
relationships become typed relations between the mapped external ids.
|
|
14
|
+
|
|
15
|
+
Revoked or ``x_mitre_deprecated`` objects are marked obsolete.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
from .graph import OntologyGraph, Term
|
|
24
|
+
|
|
25
|
+
#: ATT&CK download URL (kept here so the demo can surface it without fetching).
|
|
26
|
+
ATTACK_STIX_URL = (
|
|
27
|
+
"https://raw.githubusercontent.com/mitre/cti/master/"
|
|
28
|
+
"enterprise-attack/enterprise-attack.json"
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _external_id(obj: dict) -> str:
|
|
33
|
+
"""Return the ATT&CK external_id (e.g. ``T1059``) for a STIX object, or ``""``."""
|
|
34
|
+
for ref in obj.get("external_references", ()):
|
|
35
|
+
if ref.get("source_name") == "mitre-attack" and ref.get("external_id"):
|
|
36
|
+
return ref["external_id"]
|
|
37
|
+
return ""
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _is_obsolete(obj: dict) -> bool:
|
|
41
|
+
"""True if the STIX object is revoked or marked deprecated."""
|
|
42
|
+
return bool(obj.get("revoked")) or bool(obj.get("x_mitre_deprecated"))
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def load_attack_stix(path: str, name: str = "attack") -> OntologyGraph:
    """Parse an ATT&CK STIX 2.1 bundle into an :class:`OntologyGraph`.

    Techniques (``attack-pattern``) and tactics (``x-mitre-tactic``) become
    terms; sub-technique is_a edges, kill-chain ``accomplishes`` links, and
    ``uses``/``mitigates`` relationships become parents/typed relations between
    the resolved external ids.

    Relationship endpoints are resolved only against the technique and tactic
    objects collected in the first pass; a relationship whose source or target
    is any other object, or is missing, is skipped. Parents and relation
    targets that are not themselves terms are dropped.

    Args:
        path: Path to the STIX bundle JSON file (read as UTF-8).
        name: Name for the resulting graph; an empty name falls back to the
            file stem of ``path``.

    Returns:
        The graph, with ``version`` taken from the bundle's ``spec_version``
        (``""`` when absent).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as handle:
        bundle = json.load(handle)

    version = str(bundle.get("spec_version", "") or "")
    objects = bundle.get("objects", [])

    terms: dict[str, Term] = {}
    # STIX object 'id' -> our term id (external_id / tactic shortname), so that
    # 'relationship' objects (which reference STIX ids) can resolve endpoints.
    stix_to_term: dict[str, str] = {}
    # Accumulate parents/relations per term id before constructing Term records.
    parents: dict[str, set[str]] = {}
    relations: dict[str, set[tuple[str, str]]] = {}
    obsolete: dict[str, bool] = {}
    names: dict[str, str] = {}

    # --- First pass: collect technique + tactic terms. --------------------- #
    for obj in objects:
        otype = obj.get("type")
        if otype == "attack-pattern":
            tid = _external_id(obj)
        elif otype == "x-mitre-tactic":
            tid = obj.get("x_mitre_shortname") or _external_id(obj)
        else:
            continue
        if not tid:
            continue

        # Register only real STIX ids. Filing id-less objects under "" would
        # let a relationship with a missing source_ref/target_ref (which also
        # looks up "") resolve to an unrelated term.
        stix_id = obj.get("id")
        if stix_id:
            stix_to_term[stix_id] = tid
        names[tid] = obj.get("name", "")
        obsolete[tid] = _is_obsolete(obj)
        parents.setdefault(tid, set())
        relations.setdefault(tid, set())

        if otype != "attack-pattern":
            continue
        # Sub-technique is_a parent derived by splitting the id on ".".
        if "." in tid:
            parents[tid].add(tid.split(".", 1)[0])
        # kill_chain_phases -> ("accomplishes", tactic_shortname)
        for phase in obj.get("kill_chain_phases", ()):
            if phase.get("kill_chain_name") == "mitre-attack":
                pname = phase.get("phase_name")
                if pname:
                    relations[tid].add(("accomplishes", pname))

    # --- Second pass: STIX relationship objects. --------------------------- #
    for obj in objects:
        if obj.get("type") != "relationship":
            continue
        if _is_obsolete(obj):
            continue
        rtype = obj.get("relationship_type")
        src = stix_to_term.get(obj.get("source_ref", ""))
        tgt = stix_to_term.get(obj.get("target_ref", ""))
        if not src or not tgt:
            continue
        if rtype == "subtechnique-of":
            # Corroborates (and is the source of truth for) the is_a edge.
            parents.setdefault(src, set()).add(tgt)
        elif rtype in ("uses", "mitigates"):
            relations.setdefault(src, set()).add((rtype, tgt))

    # --- Materialize Term records (parents/relations to resolvable ids). --- #
    for tid in names:
        ps = tuple(sorted(p for p in parents.get(tid, ()) if p in names))
        rs = tuple(sorted(
            (rel, obj_id) for rel, obj_id in relations.get(tid, ())
            if obj_id in names
        ))
        terms[tid] = Term(
            id=tid,
            name=names[tid],
            parents=ps,
            relations=rs,
            obsolete=obsolete.get(tid, False),
        )

    if not name:
        name = Path(path).stem
    return OntologyGraph(name=name, version=version, terms=terms)
|
sma/ontology/cpc.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Loader for the Cooperative Patent Classification (CPC) scheme XML.
|
|
2
|
+
|
|
3
|
+
CPC ships as one XML file per subclass (``cpc-scheme-A01B.xml`` ...), each a tree
|
|
4
|
+
of nested ``<classification-item>`` elements. The nesting IS the is-a hierarchy:
|
|
5
|
+
a classification-item nested inside another is a narrower category of it. We map
|
|
6
|
+
each item's ``<classification-symbol>`` to a :class:`Term` id, its
|
|
7
|
+
``<class-title>`` text to the name, and the enclosing item's symbol to its is-a
|
|
8
|
+
parent. This yields the deep (~250k node) golden taxonomy for the legal/IP arm.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import xml.etree.ElementTree as ET
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from .graph import OntologyGraph, Term
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _local(tag: str) -> str:
|
|
19
|
+
return tag.rsplit("}", 1)[-1] if "}" in tag else tag
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _title(item: ET.Element) -> str:
    """Concatenate the text fragments under an item's direct <class-title>.

    Only the first direct ``<class-title>`` child is used. The non-blank,
    stripped text of every ``<text>`` element within it is joined with
    ``"; "``. Returns ``""`` when the item has no ``<class-title>`` child.
    """
    for child in item:
        if _local(child.tag) != "class-title":
            continue
        fragments = []
        for node in child.iter():
            if _local(node.tag) != "text":
                continue
            text = node.text
            if text and text.strip():
                fragments.append(text.strip())
        return "; ".join(fragments)
    return ""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _walk(item: ET.Element, parent_symbol: str, terms: dict[str, Term]) -> None:
    """Record *item* and, recursively, its nested classification items.

    The item's id is the stripped text of its first direct
    ``<classification-symbol>`` child. A new symbol is stored with
    ``parent_symbol`` as its only parent (none when it is empty). A symbol
    seen before gains ``parent_symbol`` as an additional parent if it is not
    already listed. Items without a symbol are skipped, and their children
    inherit ``parent_symbol``.

    Args:
        item: A ``<classification-item>`` element.
        parent_symbol: Symbol of the nearest enclosing item that has one, or
            ``""`` at the top level.
        terms: Accumulator of terms keyed by symbol, updated in place.
    """
    symbol_node = next(
        (c for c in item if _local(c.tag) == "classification-symbol"), None
    )
    symbol = "" if symbol_node is None else (symbol_node.text or "").strip()

    if symbol:
        known = terms.get(symbol)
        if known is None:
            terms[symbol] = Term(
                id=symbol,
                name=_title(item),
                parents=(parent_symbol,) if parent_symbol else (),
            )
        elif parent_symbol and parent_symbol not in known.parents:
            # Keep existing parents in order and append the new one.
            merged = tuple(dict.fromkeys((*known.parents, parent_symbol)))
            terms[symbol] = Term(
                id=symbol, name=known.name or _title(item), parents=merged
            )

    next_parent = symbol or parent_symbol
    for nested in item:
        if _local(nested.tag) == "classification-item":
            _walk(nested, next_parent, terms)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def load_cpc(path: str, name: str = "cpc") -> OntologyGraph:
    """Load the CPC scheme from a directory of cpc-scheme-*.xml files (or one file).

    Files that fail to parse as XML are skipped. The graph version is the
    first non-empty ``publication-date`` attribute found on a scheme root.

    Args:
        path: A directory containing ``cpc-scheme-*.xml`` files, or the path
            of a single scheme file.
        name: Name for the resulting graph.

    Returns:
        A graph whose terms are the classification items found.
    """
    source = Path(path)
    if source.is_dir():
        scheme_files = sorted(source.glob("cpc-scheme-*.xml"))
    else:
        scheme_files = [source]

    terms: dict[str, Term] = {}
    version = ""
    for scheme_file in scheme_files:
        try:
            scheme = ET.parse(scheme_file).getroot()
        except ET.ParseError:
            # Best effort: one malformed file must not abort the whole load.
            continue
        version = version or scheme.get("publication-date", "")
        for top_level in scheme:
            if _local(top_level.tag) == "classification-item":
                _walk(top_level, "", terms)
    return OntologyGraph(name=name, version=version, terms=terms)
|