structuremappingmemory 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. sma/__init__.py +5 -0
  2. sma/__main__.py +5 -0
  3. sma/agent/__init__.py +5 -0
  4. sma/agent/adapter_draft.py +217 -0
  5. sma/agent/api.py +67 -0
  6. sma/agent/comparison.py +591 -0
  7. sma/agent/llm.py +280 -0
  8. sma/agent/policies.py +21 -0
  9. sma/agent/service.py +95 -0
  10. sma/cli.py +65 -0
  11. sma/encoders/__init__.py +38 -0
  12. sma/encoders/agentobs.py +27 -0
  13. sma/encoders/base.py +23 -0
  14. sma/encoders/code_treesitter.py +64 -0
  15. sma/encoders/coverage.py +80 -0
  16. sma/encoders/draft_adapter.py +183 -0
  17. sma/encoders/healthcare.py +207 -0
  18. sma/encoders/logs_drain.py +142 -0
  19. sma/encoders/prose_tier1.py +57 -0
  20. sma/encoders/structured.py +57 -0
  21. sma/encoders/traces.py +45 -0
  22. sma/eval/__init__.py +2 -0
  23. sma/eval/agentic/__init__.py +35 -0
  24. sma/eval/agentic/arms/__init__.py +0 -0
  25. sma/eval/agentic/arms/cyber.py +48 -0
  26. sma/eval/agentic/arms/discovery.py +35 -0
  27. sma/eval/agentic/arms/finance.py +38 -0
  28. sma/eval/agentic/arms/legal.py +74 -0
  29. sma/eval/agentic/arms/medicine.py +45 -0
  30. sma/eval/agentic/harness.py +275 -0
  31. sma/eval/agentic/memories.py +308 -0
  32. sma/eval/agentic/metrics.py +82 -0
  33. sma/eval/agentic_qa/__init__.py +27 -0
  34. sma/eval/agentic_qa/agent.py +383 -0
  35. sma/eval/agentic_qa/metrics.py +239 -0
  36. sma/eval/agentic_qa/pools.py +197 -0
  37. sma/eval/arn.py +65 -0
  38. sma/eval/baselines/__init__.py +6 -0
  39. sma/eval/baselines/bge_dense.py +54 -0
  40. sma/eval/baselines/bm25.py +18 -0
  41. sma/eval/baselines/dense.py +42 -0
  42. sma/eval/baselines/hipporag.py +235 -0
  43. sma/eval/baselines/hybrid_rrf.py +30 -0
  44. sma/eval/baselines/longcontext_llm.py +124 -0
  45. sma/eval/baselines/rerank.py +41 -0
  46. sma/eval/baselines/splade.py +77 -0
  47. sma/eval/baselines/wl_kernel.py +163 -0
  48. sma/eval/bugsinpy.py +358 -0
  49. sma/eval/bugsinpy_families.py +164 -0
  50. sma/eval/crossdomain.py +89 -0
  51. sma/eval/diabetes.py +61 -0
  52. sma/eval/drift_env.py +26 -0
  53. sma/eval/drift_metrics.py +24 -0
  54. sma/eval/family_labels.py +167 -0
  55. sma/eval/fraud_elliptic/__init__.py +29 -0
  56. sma/eval/fraud_elliptic/encoder.py +279 -0
  57. sma/eval/fraud_elliptic/eval.py +269 -0
  58. sma/eval/fraud_elliptic/test_encoder.py +123 -0
  59. sma/eval/ieee_cis.py +66 -0
  60. sma/eval/loghub.py +16 -0
  61. sma/eval/loghub_eval.py +480 -0
  62. sma/eval/longmemeval.py +51 -0
  63. sma/eval/memory_backends/__init__.py +2 -0
  64. sma/eval/memory_backends/base.py +22 -0
  65. sma/eval/memory_backends/context_only.py +14 -0
  66. sma/eval/memory_backends/rag_notes.py +17 -0
  67. sma/eval/memory_backends/shared_llm.py +30 -0
  68. sma/eval/memory_backends/sma_memory.py +54 -0
  69. sma/eval/memory_backends/zep_graphiti.py +33 -0
  70. sma/eval/metrics.py +32 -0
  71. sma/eval/ontology_bench.py +219 -0
  72. sma/eval/report.py +573 -0
  73. sma/eval/ssb_eval.py +216 -0
  74. sma/eval/ssb_generator.py +116 -0
  75. sma/eval/stats.py +108 -0
  76. sma/eval/transfer_eval.py +844 -0
  77. sma/index/__init__.py +15 -0
  78. sma/index/ann.py +21 -0
  79. sma/index/content_vectors.py +60 -0
  80. sma/index/inverted.py +63 -0
  81. sma/index/macfac.py +174 -0
  82. sma/ir/__init__.py +22 -0
  83. sma/ir/canon.py +106 -0
  84. sma/ir/schema.py +165 -0
  85. sma/ir/sexpr.py +86 -0
  86. sma/ir/signatures.py +76 -0
  87. sma/match/__init__.py +20 -0
  88. sma/match/conflicts.py +46 -0
  89. sma/match/engine.py +60 -0
  90. sma/match/explain.py +59 -0
  91. sma/match/infer.py +54 -0
  92. sma/match/kernels.py +54 -0
  93. sma/match/mdl.py +30 -0
  94. sma/match/merge_cpsat.py +77 -0
  95. sma/match/merge_greedy.py +15 -0
  96. sma/match/mh.py +177 -0
  97. sma/match/ses.py +84 -0
  98. sma/match/types.py +115 -0
  99. sma/match/verifier.py +27 -0
  100. sma/ontology/__init__.py +45 -0
  101. sma/ontology/attack.py +134 -0
  102. sma/ontology/cpc.py +69 -0
  103. sma/ontology/graph.py +58 -0
  104. sma/ontology/loader.py +262 -0
  105. sma/ontology/mitre_xml.py +67 -0
  106. sma/ontology/mount.py +101 -0
  107. sma/ontology/rdf_loader.py +75 -0
  108. sma/ontology/registry.py +115 -0
  109. sma/ontology/router.py +69 -0
  110. sma/ontology/usgaap.py +73 -0
  111. sma/sage/__init__.py +6 -0
  112. sma/sage/assimilate.py +12 -0
  113. sma/sage/pools.py +105 -0
  114. sma/sage/probabilities.py +10 -0
  115. sma/store/__init__.py +6 -0
  116. sma/store/lmdb_store.py +78 -0
  117. sma/store/registry.py +26 -0
  118. sma/store/wal.py +26 -0
  119. sma/ui/app.py +642 -0
  120. structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
  121. structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
  122. structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
  123. structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
  124. structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
  125. structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
sma/match/mh.py ADDED
@@ -0,0 +1,177 @@
1
+ """Match hypothesis seeding and support closure."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections import defaultdict, deque
6
+
7
+ from sma.ir.canon import Canonicalizer, default_canonicalizer
8
+ from sma.ir.schema import Entity, Statement
9
+ from sma.ir.sexpr import dumps_node
10
+
11
+ from .types import MatchConfig, MatchHypothesis
12
+
13
+
14
def seed_expression_mhs(
    base_exprs: tuple[Statement, ...],
    target_exprs: tuple[Statement, ...],
    config: MatchConfig | None = None,
    canon: Canonicalizer | None = None,
) -> tuple[MatchHypothesis, ...]:
    """Seed statement-level match hypotheses between two expression sets.

    Statements on each side are bucketed by ``(canonical functor, arity)``.
    Buckets with the same signature are paired first (capped per bucket by
    ``config.mh_group_cap``). When ``config.delta > 0``, buckets with equal
    arity but different canonical functors are additionally paired whenever
    ``canon.compatible`` accepts them; those hypotheses carry the ascension
    factor, ancestor and distance that ``canon.compatible`` returned.

    Args:
        base_exprs: Statements of the base case.
        target_exprs: Statements of the target case.
        config: Matcher settings; a default ``MatchConfig()`` when omitted.
        canon: Canonicalizer; ``default_canonicalizer()`` when omitted.

    Returns:
        All seeded hypotheses, identical-signature pairs first.
    """
    cfg = config or MatchConfig()
    canonicalizer = canon or default_canonicalizer()
    grouped_base = _group_by_signature(base_exprs, canonicalizer)
    grouped_target = _group_by_signature(target_exprs, canonicalizer)

    seeded: list[MatchHypothesis] = []

    # Pass 1: buckets whose canonical signature is identical on both sides.
    for signature, base_members in grouped_base.items():
        target_members = grouped_target.get(signature)
        if not target_members:
            continue
        seeded.extend(
            MatchHypothesis(b, t)
            for b, t in _capped_pairs(base_members, target_members, cfg.mh_group_cap)
        )

    if cfg.delta <= 0:
        return tuple(seeded)

    # Pass 2: minimal ascension across distinct canonical functors of equal
    # arity; each bucket pair is capped the same way as in pass 1.
    for (base_functor, base_arity), base_members in grouped_base.items():
        for (target_functor, target_arity), target_members in grouped_target.items():
            if base_arity != target_arity or base_functor == target_functor:
                continue
            ok, asc, ancestor, dist = canonicalizer.compatible(
                base_functor, target_functor, delta=cfg.delta, rho=cfg.rho
            )
            if not ok:
                continue
            seeded.extend(
                MatchHypothesis(b, t, asc, ancestor, dist)
                for b, t in _capped_pairs(base_members, target_members, cfg.mh_group_cap)
            )
    return tuple(seeded)
44
+
45
+
46
def _group_by_signature(
    exprs: tuple[Statement, ...], canon: Canonicalizer
) -> dict[tuple[str, int], list[Statement]]:
    """Bucket statements by ``(canonical functor, arity)``.

    Statements keep their input order inside each bucket. The returned
    mapping is a ``defaultdict(list)``.
    """
    buckets: dict[tuple[str, int], list[Statement]] = defaultdict(list)
    for statement in exprs:
        signature = (canon.canonical(statement.functor), statement.arity)
        buckets[signature].append(statement)
    return buckets
53
+
54
+
55
def _capped_pairs(
    bases: list[Statement], targets: list[Statement], cap: int
) -> list[tuple[Statement, Statement]]:
    """Deterministic U-ordered pair selection within one functor group.

    Small groups (``len(bases) * len(targets) <= cap``) keep the full cross
    product. Larger groups are capped at ``cap`` pairs, chosen in this order:

    1. Statements whose serialized text is identical pair first, one-to-one
       in sorted order (they carry the highest achievable match score).
    2. A band around the diagonal of the two sorted lists fills the rest:
       offset 0 pairs ``bases[i]`` with ``targets[i]``, offset 1 adds
       ``targets[i + 1]`` and ``targets[i - 1]``, and so on.

    The band offsets run up to ``max(len(bases), len(targets)) - 1`` so every
    (base, target) combination is reachable. Limiting the offsets to the
    number of targets left high-index bases without any valid partner when
    bases outnumbered targets, so far fewer than ``cap`` pairs came back
    (e.g. a single pair for 200 bases against 1 target).

    Args:
        bases: Base statements of one signature group.
        targets: Target statements of the matching group.
        cap: Maximum number of pairs to return for a large group.

    Returns:
        Selected ``(base, target)`` pairs in selection order.
    """
    if len(bases) * len(targets) <= cap:
        return [(b, t) for b in bases for t in targets]

    # Serialize each statement once; the text is both the sort key and the
    # identity test for "bit-identical" statements.
    base_text = {id(b): dumps_node(b) for b in bases}
    target_text = {id(t): dumps_node(t) for t in targets}
    sorted_bases = sorted(bases, key=lambda s: base_text[id(s)])
    sorted_targets = sorted(targets, key=lambda s: target_text[id(s)])

    pairs: list[tuple[Statement, Statement]] = []
    used: set[tuple[int, int]] = set()

    # Stage 1: identical statements, consumed one-to-one per text.
    by_text: dict[str, deque] = defaultdict(deque)
    for t in sorted_targets:
        by_text[target_text[id(t)]].append(t)
    for b in sorted_bases:
        queue = by_text.get(base_text[id(b)])
        if queue:
            t = queue.popleft()
            pairs.append((b, t))
            used.add((id(b), id(t)))
            if len(pairs) >= cap:
                return pairs

    # Stage 2: widening diagonal band over the sorted orders.
    n_bases = len(sorted_bases)
    n_targets = len(sorted_targets)
    for offset in range(max(n_bases, n_targets)):
        for i, b in enumerate(sorted_bases):
            for j in ((i + offset, i - offset) if offset else (i,)):
                if 0 <= j < n_targets:
                    t = sorted_targets[j]
                    if (id(b), id(t)) not in used:
                        used.add((id(b), id(t)))
                        pairs.append((b, t))
                        if len(pairs) >= cap:
                            return pairs
    return pairs
99
+
100
+
101
# Entity types whose names are CONSTANTS, not variables (blueprint 2.1:
# entities/constants are distinct vocabulary classes). A template-name
# entity denotes itself; pairing count(template_A, 3) with
# count(template_B, 2) is vacuous shape-matching, not analogy - it was the
# root cause of the Liberty haystack failure (generic bookkeeping skeleton
# matching any session against any other).
# Integers are deliberately NOT constants: count(template_X, 3) vs
# count(template_X, 5) is a legitimate analogy (same burst, different size);
# the template-name constraint alone blocks the vacuous cross-template case.
# NOTE(review): presumably template-name entities are the ones typed
# "event_type" - confirm against the encoders.
CONSTANT_ENTITY_TYPES = frozenset({"event_type"})
111
+
112
+
113
def constants_compatible(b_ent: Entity, t_ent: Entity) -> bool:
    """Return whether two entities may be placed in correspondence.

    Only when BOTH entities have a type listed in ``CONSTANT_ENTITY_TYPES``
    must their names be equal; every other combination is compatible.
    """
    both_constant = (
        b_ent.type in CONSTANT_ENTITY_TYPES and t_ent.type in CONSTANT_ENTITY_TYPES
    )
    if not both_constant:
        return True
    return b_ent.name == t_ent.name
117
+
118
+
119
def support_closure(
    root: MatchHypothesis,
    canon: Canonicalizer | None = None,
    delta: int = 0,
    rho: float = 1.0,
) -> tuple[MatchHypothesis, ...] | None:
    """Downward closure of a root MH; None when structurally impossible.

    SME parallel connectivity: argument correspondences must themselves be
    LEGAL match hypotheses. A statement-argument pair with incompatible
    functors invalidates the whole kernel (previously it was silently
    admitted, letting higher-order parents like `before` manufacture
    cross-template "matches" that surprisal weighting then amplified - the
    Liberty ses_n>1 anomaly). Compatibility is decided by
    ``canon.compatible`` (called with ``delta`` and ``rho``). Unequal
    constants, mismatched arities and statement/entity argument pairs
    likewise invalidate.

    Returns:
        The hypotheses reachable from ``root`` in depth-first pre-order
        (each key once), or ``None`` if any correspondence was illegal.
    """
    canonicalizer = canon or default_canonicalizer()
    closure: list[MatchHypothesis] = []
    visited: set[tuple[str, str]] = set()
    illegal = False

    def visit(mh: MatchHypothesis) -> None:
        nonlocal illegal
        if illegal or mh.key in visited:
            return
        visited.add(mh.key)
        closure.append(mh)

        base_node, target_node = mh.base, mh.target
        # Entity-level hypotheses are leaves: nothing to descend into.
        if not (isinstance(base_node, Statement) and isinstance(target_node, Statement)):
            return
        if base_node.arity != target_node.arity:
            illegal = True
            return

        for b_arg, t_arg in zip(base_node.args, target_node.args, strict=True):
            if isinstance(b_arg, Entity) and isinstance(t_arg, Entity):
                if constants_compatible(b_arg, t_arg):
                    visit(MatchHypothesis(b_arg, t_arg))
                    continue
            elif isinstance(b_arg, Statement) and isinstance(t_arg, Statement):
                ok, asc, ancestor, dist = canonicalizer.compatible(
                    b_arg.functor, t_arg.functor, delta=delta, rho=rho
                )
                if ok:
                    # Each MH pays ITS OWN ascension penalty only - the
                    # parent's penalty lives in the parent's weight.
                    # Multiplying down the chain (a previous bug) punished
                    # deep systems exponentially, the opposite of
                    # systematicity.
                    visit(MatchHypothesis(b_arg, t_arg, ascension=asc,
                                          ancestor=ancestor, distance=dist))
                    continue
            # Reached for: unequal constants, incompatible functors, or a
            # statement paired with an entity (or vice versa).
            illegal = True
            return

    visit(root)
    if illegal:
        return None
    return tuple(closure)
sma/match/ses.py ADDED
@@ -0,0 +1,84 @@
1
+ """Structural evaluation score with trickle-down support.
2
+
3
+ Two weighting regimes share this code path:
4
+ - SES (default): every match hypothesis carries unit base weight sigma_0 = 1.
5
+ - surprisal-SES (score-v2 candidate, ADR-004 upgrade path): statement MHs
6
+ carry sigma_0 = corpus surprisal of their canonical functor (-log2 p), so
7
+ rare shared structure counts more while systematicity still compounds via
8
+ trickle-down. With cost_fn=None this reduces exactly to SES.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from typing import Callable
14
+
15
+ from sma.ir.schema import Statement
16
+
17
+ from .types import GMap, MatchHypothesis, node_key
18
+
19
# Callback mapping one match hypothesis to its base weight (sigma_0);
# structural_evaluation uses unit weight when no callback is given.
CostFn = Callable[[MatchHypothesis], float]
20
+
21
+
22
def structural_evaluation(
    hypotheses: tuple[MatchHypothesis, ...],
    gamma: float = 0.25,
    cost_fn: CostFn | None = None,
) -> float:
    """Sum the trickle-down structural scores of a set of match hypotheses.

    Each hypothesis scores ``weight * ascension + gamma * sum(parent scores)``,
    where its parents are the statement hypotheses in the set that have it as
    an aligned argument pair. ``weight`` is 1.0 when ``cost_fn`` is None,
    otherwise ``cost_fn(mh)``.

    Hypotheses are identified by ``mh.key``; if the same key occurs more than
    once the last occurrence is used and it is counted ONCE. Parent edges are
    built from that de-duplicated view as well - building them from the raw
    input made a duplicated parent contribute its trickle-down share twice.

    Args:
        hypotheses: Match hypotheses to evaluate.
        gamma: Fraction of each parent's score passed down to a child.
        cost_fn: Optional per-hypothesis base weight.

    Returns:
        The total score over all distinct hypothesis keys.

    Raises:
        ValueError: From ``zip(..., strict=True)`` if a statement hypothesis
            pairs statements with different numbers of arguments.
    """
    by_key = {mh.key: mh for mh in hypotheses}
    parents: dict[tuple[str, str], list[tuple[str, str]]] = {key: [] for key in by_key}
    for mh in by_key.values():
        if not isinstance(mh.base, Statement) or not isinstance(mh.target, Statement):
            continue
        for b_arg, t_arg in zip(mh.base.args, mh.target.args, strict=True):
            child_key = (node_key(b_arg), node_key(t_arg))
            # Only argument pairs that are themselves hypotheses get support.
            if child_key in parents:
                parents[child_key].append(mh.key)

    def weight(mh: MatchHypothesis) -> float:
        return 1.0 if cost_fn is None else cost_fn(mh)

    memo: dict[tuple[str, str], float] = {}

    def score(key: tuple[str, str], stack: frozenset[tuple[str, str]] = frozenset()) -> float:
        if key in memo:
            return memo[key]
        if key in stack:
            # Cycle guard: contribute only the node's own base score and do
            # not memoize this partial value.
            return weight(by_key[key]) * by_key[key].ascension
        parent_score = sum(score(parent, stack | {key}) for parent in parents.get(key, ()))
        value = weight(by_key[key]) * by_key[key].ascension + gamma * parent_score
        memo[key] = value
        return value

    return sum(score(key) for key in by_key)
53
+
54
+
55
def self_score(case, gamma: float = 0.25, cost_fn: CostFn | None = None) -> float:
    """Score a case matched against itself.

    Every expression of ``case`` and every entity of each expression is
    paired with itself; the distinct hypotheses (by key) are then passed to
    ``structural_evaluation`` with the same ``gamma`` and ``cost_fn``.
    """
    identity_mhs: dict[tuple[str, str], MatchHypothesis] = {}
    for expr in case.expressions():
        expr_mh = MatchHypothesis(expr, expr)
        identity_mhs[expr_mh.key] = expr_mh
        for entity in expr.entities():
            entity_mh = MatchHypothesis(entity, entity)
            identity_mhs[entity_mh.key] = entity_mh
    return structural_evaluation(tuple(identity_mhs.values()), gamma=gamma, cost_fn=cost_fn)
63
+
64
+
65
+ def normalize_score(
66
+ score: float,
67
+ base,
68
+ target,
69
+ gamma: float = 0.25,
70
+ cost_fn: CostFn | None = None,
71
+ normalization: str = "max",
72
+ ) -> float:
73
+ # Same weights in numerator and denominators keep ses_n scale-free.
74
+ self_base = self_score(base, gamma, cost_fn=cost_fn)
75
+ self_target = self_score(target, gamma, cost_fn=cost_fn)
76
+ if normalization == "min":
77
+ denom = min(self_base, self_target)
78
+ elif normalization == "sqrt":
79
+ denom = (self_base * self_target) ** 0.5
80
+ elif normalization == "target":
81
+ denom = self_target
82
+ else: # "max", blueprint 2.3
83
+ denom = max(self_base, self_target)
84
+ return score / max(denom, 1e-9)
sma/match/types.py ADDED
@@ -0,0 +1,115 @@
1
+ """Shared matcher dataclasses."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from functools import cached_property
7
+
8
+ from sma.ir.schema import Case, Entity, Node, Statement
9
+ from sma.ir.sexpr import dumps_node
10
+
11
+
12
def node_key(node: Node) -> str:
    """Return a string key for a node: ``E:`` or ``S:`` plus its serialization.

    Entities get the ``E`` prefix, everything else ``S``; the remainder is
    ``dumps_node(node)``.
    """
    if isinstance(node, Entity):
        tag = "E"
    else:
        tag = "S"
    return f"{tag}:{dumps_node(node)}"
15
+
16
+
17
@dataclass(frozen=True)
class MatchHypothesis:
    """A proposed correspondence between one base node and one target node.

    ``ascension``, ``ancestor`` and ``distance`` default to 1.0 / None / 0;
    the matcher fills them with the values returned by the canonicalizer's
    compatibility check when the two functors differ.
    """

    base: Node
    target: Node
    ascension: float = 1.0
    ancestor: str | None = None
    distance: int = 0

    # node_key serializes the whole expression tree, and these keys are read
    # O(kernels^2) times during merge — they must be computed once per instance.
    # NOTE(review): cached_property stores its value in the instance __dict__,
    # so this class presumably must not be given __slots__ - confirm before
    # adding slots=True.
    @cached_property
    def base_key(self) -> str:
        """Serialized key of the base node (computed once, then cached)."""
        return node_key(self.base)

    @cached_property
    def target_key(self) -> str:
        """Serialized key of the target node (computed once, then cached)."""
        return node_key(self.target)

    @cached_property
    def key(self) -> tuple[str, str]:
        """``(base_key, target_key)`` pair identifying this hypothesis."""
        return (self.base_key, self.target_key)
38
+
39
+
40
@dataclass
class Kernel:
    """A root match hypothesis together with its supporting hypotheses.

    ``bindings`` and ``reverse_bindings`` are computed on first access and
    cached; they are not refreshed if ``hypotheses`` is reassigned later.
    """

    root: MatchHypothesis
    hypotheses: tuple[MatchHypothesis, ...]
    weight: float = 0.0

    @cached_property
    def bindings(self) -> dict[str, str]:
        """Map each hypothesis' base key to its target key."""
        forward: dict[str, str] = {}
        for hypothesis in self.hypotheses:
            forward[hypothesis.base_key] = hypothesis.target_key
        return forward

    @cached_property
    def reverse_bindings(self) -> dict[str, str]:
        """Map each bound target key back to its base key."""
        backward: dict[str, str] = {}
        for base_key, target_key in self.bindings.items():
            backward[target_key] = base_key
        return backward
53
+
54
+
55
@dataclass
class GMap:
    """Result of matching a base case against a target case.

    Holds the selected hypotheses and kernels together with the raw and
    normalized scores, the name of the scorer that produced them, and an
    optional optimality gap.
    """

    base: Case
    target: Case
    hypotheses: tuple[MatchHypothesis, ...]
    kernels: tuple[Kernel, ...]
    score: float
    normalized_score: float
    scorer: str = "ses"
    optimality_gap: float | None = None

    @property
    def correspondences(self) -> list[dict[str, str | float | int | None]]:
        """Return one plain-dict record per hypothesis, in hypothesis order."""
        records: list[dict[str, str | float | int | None]] = []
        for mh in self.hypotheses:
            records.append(
                dict(
                    base=mh.base_key,
                    target=mh.target_key,
                    ascension=mh.ascension,
                    ancestor=mh.ancestor,
                    distance=mh.distance,
                )
            )
        return records
78
+
79
+
80
@dataclass(frozen=True)
class CandidateInference:
    """Immutable record of one candidate inference and its provenance."""

    # Serialized s-expression of the inferred statement.
    inference_sexpr: str
    # Identifiers of the two cases the inference was drawn between.
    base_case_id: str
    target_case_id: str
    # NOTE(review): presumably the normalized structural score of the
    # mapping that produced this inference - confirm against the producer.
    ses_n: float
    support: tuple[str, ...] = ()
    skolems: tuple[str, ...] = ()
    ascensions: tuple[str, ...] = ()
    # Defaults to "hypothetical" until something sets another status.
    status: str = "hypothetical"
90
+
91
+
92
@dataclass
class MatchConfig:
    """Tunable settings for the matcher (scoring, normalization, caps)."""

    # Trickle-down factor; same default as the ses scoring functions.
    gamma: float = 0.25
    rho: float = 0.95  # frozen at prereg-v1 (calibration grid; inert when delta=0)
    # Ascension budget; 0 disables cross-functor seeding in seed_expression_mhs.
    delta: int = 0
    # NOTE(review): this cites ADR-005 while the ses module docstring cites
    # ADR-004 for surprisal scoring - confirm which reference is current.
    scorer: str = "surprisal"  # "ses" | "mdl" | "surprisal" (score-v2, ADR-005)
    # Normalization of the structural score: "max" (blueprint 2.3),
    # "min" (10.2 tripwire), "sqrt" (geometric mean, cosine-style symmetric),
    # "target" (query-relative; ranking == raw-score ordering per query).
    # Frozen to "max" at prereg-v1 (calibration grid: beats target on family
    # and LOO-haystack validation). Registered caveat: out-of-corpus haystack
    # probes use hybrid fused as the production posture.
    normalization: str = "max"
    # Corpus surprisal per canonical functor (-log2 p), supplied by the index
    # for scorer="surprisal"; None means unit weights (identical to "ses").
    functor_costs: dict | None = None
    exact_kernel_limit: int = 60
    cpsat_time_ms: int = 20
    # Tripwire response from blueprint section 10.2: cap MH pairs per functor
    # group (U-ordered: identical statements first) so sessions with many
    # repeated event types cannot explode the kernel count quadratically.
    mh_group_cap: int = 128
    # Free-form extras; default_factory gives every instance its own dict.
    metadata: dict = field(default_factory=dict)
115
+
sma/match/verifier.py ADDED
@@ -0,0 +1,27 @@
1
+ """Inference verifier."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+ from sma.ir.sexpr import loads_statement
8
+ from sma.ir.signatures import SignatureRegistry
9
+
10
+
11
@dataclass(frozen=True)
class VerificationResult:
    """Immutable outcome of verifying one inference."""

    # verify_inference produces "type_fail", "hypothetical" or "pass".
    status: str
    # Human-readable explanations for the status; empty when there are none.
    reasons: tuple[str, ...] = ()
15
+
16
+
17
def verify_inference(inference_sexpr: str, registry: SignatureRegistry | None = None) -> VerificationResult:
    """Parse and signature-check an inferred statement.

    Args:
        inference_sexpr: Serialized s-expression of the inference.
        registry: Signature registry to validate against;
            ``SignatureRegistry.with_defaults()`` when omitted.

    Returns:
        ``"type_fail"`` (with the exception text as the reason) if parsing or
        validation raises; ``"hypothetical"`` if the text contains
        ``AnalogySkolemFn_``; otherwise ``"pass"``.
    """
    active_registry = registry or SignatureRegistry.with_defaults()
    try:
        parsed = loads_statement(inference_sexpr)
        active_registry.validate_statement(parsed)
    except Exception as exc:
        # Any parse/validation failure is reported, not propagated.
        return VerificationResult("type_fail", (str(exc),))
    else:
        has_skolem = "AnalogySkolemFn_" in inference_sexpr
    if has_skolem:
        return VerificationResult("hypothetical", ("contains analogy skolems",))
    return VerificationResult("pass", ())
27
+
sma/ontology/__init__.py ADDED
@@ -0,0 +1,45 @@
1
+ """Universal OWL/OBO ontology loader, mounter, registry, and router for SMA-1.
2
+
3
+ This package generalizes the hand-rolled HPO mount in
4
+ ``scripts/rare_disease_test.py`` into a reusable pipeline: parse any OBO/OWL
5
+ ontology into a normalized :class:`OntologyGraph`, mount it onto a
6
+ ``Canonicalizer`` (is-a edges become the predicate lattice), build a
7
+ ``MacFacIndex`` over cases, and retrieve by structural analogy. A
8
+ :class:`OntologyRegistry` caches mounted ontologies and a :class:`DomainRouter`
9
+ selects which ontology a query belongs to. See ``sma/ontology/README.md``.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from .attack import load_attack_stix
15
+ from .cpc import load_cpc
16
+ from .mitre_xml import load_capec, load_cwe, load_mitre_xml
17
+ from .rdf_loader import load_rdflib
18
+ from .usgaap import load_usgaap
19
+ from .graph import OntologyGraph, Term
20
+ from .loader import fid, load_obo, load_ontology, load_owl, load_owl_dir
21
+ from .mount import MountedOntology, mount
22
+ from .registry import OntologyEntry, OntologyRegistry
23
+ from .router import DomainRouter
24
+
25
# Public API of the package: every name listed here is imported above.
__all__ = [
    # Graph model
    "OntologyGraph",
    "Term",
    # OBO/OWL loaders
    "load_obo",
    "load_owl",
    "load_owl_dir",
    "load_ontology",
    "fid",
    # Mounting, registry and routing
    "MountedOntology",
    "mount",
    "OntologyRegistry",
    "OntologyEntry",
    "DomainRouter",
    # Format-specific loaders
    "load_attack_stix",
    "load_cpc",
    "load_capec",
    "load_cwe",
    "load_mitre_xml",
    "load_rdflib",
    "load_usgaap",
]
sma/ontology/attack.py ADDED
@@ -0,0 +1,134 @@
1
+ """Load MITRE ATT&CK (STIX 2.1 JSON) into the normalized :class:`OntologyGraph`.
2
+
3
+ ATT&CK ships as a STIX bundle (``mitre/cti`` ``enterprise-attack.json``), not
4
+ OBO/OWL, so it needs a dedicated parser. It maps cleanly onto the same shape the
5
+ rest of the ontology package consumes:
6
+
7
+ * ``attack-pattern`` objects become technique terms, keyed by their ATT&CK
8
+ ``external_id`` (e.g. ``"T1059"`` or sub-technique ``"T1059.001"``).
9
+ * ``x-mitre-tactic`` objects become tactic terms, keyed by their ``shortname``.
10
+ * A sub-technique ``T1059.001`` gets is_a parent ``T1059`` (split on ``"."``);
11
+ this is corroborated by ``relationship`` objects of type ``subtechnique-of``.
12
+ * A technique's ``kill_chain_phases`` and STIX ``uses``/``mitigates``
13
+ relationships become typed relations between the mapped external ids.
14
+
15
+ Revoked or ``x_mitre_deprecated`` objects are marked obsolete.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import json
21
+ from pathlib import Path
22
+
23
+ from .graph import OntologyGraph, Term
24
+
25
#: ATT&CK download URL (kept here so the demo can surface it without fetching).
#: Nothing in this module downloads it; load_attack_stix reads a local path.
ATTACK_STIX_URL = (
    "https://raw.githubusercontent.com/mitre/cti/master/"
    "enterprise-attack/enterprise-attack.json"
)
30
+
31
+
32
+ def _external_id(obj: dict) -> str:
33
+ """Return the ATT&CK external_id (e.g. ``T1059``) for a STIX object, or ``""``."""
34
+ for ref in obj.get("external_references", ()):
35
+ if ref.get("source_name") == "mitre-attack" and ref.get("external_id"):
36
+ return ref["external_id"]
37
+ return ""
38
+
39
+
40
+ def _is_obsolete(obj: dict) -> bool:
41
+ """True if the STIX object is revoked or marked deprecated."""
42
+ return bool(obj.get("revoked")) or bool(obj.get("x_mitre_deprecated"))
43
+
44
+
45
def load_attack_stix(path: str, name: str = "attack") -> OntologyGraph:
    """Parse an ATT&CK STIX 2.1 bundle into an :class:`OntologyGraph`.

    Techniques (``attack-pattern``) and tactics (``x-mitre-tactic``) become
    terms. Sub-technique is_a parents come from the dotted id and from
    ``subtechnique-of`` relationships; kill-chain phases become
    ``("accomplishes", tactic)`` relations; ``uses``/``mitigates``
    relationships become typed relations when BOTH endpoints resolve to a
    collected term. Parents and relations pointing at ids that are not
    collected terms are dropped.

    Args:
        path: Path of the STIX bundle JSON file (read as UTF-8).
        name: Graph name; the file stem is used when this is empty.

    Returns:
        The graph, with ``version`` taken from the bundle's ``spec_version``.
    """
    bundle = json.loads(Path(path).read_text(encoding="utf-8"))

    spec_version = str(bundle.get("spec_version", "") or "")
    stix_objects = bundle.get("objects", [])

    # STIX object 'id' -> our term id (external_id / tactic shortname), so
    # relationship objects (which reference STIX ids) can resolve endpoints.
    term_of_stix: dict[str, str] = {}
    # Per-term accumulators, filled before any Term record is built.
    is_a: dict[str, set[str]] = {}
    typed: dict[str, set[tuple[str, str]]] = {}
    retired: dict[str, bool] = {}
    labels: dict[str, str] = {}

    def register(stix_obj: dict, term_id: str) -> None:
        """Record the bookkeeping shared by technique and tactic objects."""
        term_of_stix[stix_obj.get("id", "")] = term_id
        labels[term_id] = stix_obj.get("name", "")
        retired[term_id] = _is_obsolete(stix_obj)
        is_a.setdefault(term_id, set())
        typed.setdefault(term_id, set())

    # --- First pass: collect technique + tactic terms. --------------------- #
    for stix_obj in stix_objects:
        kind = stix_obj.get("type")
        if kind == "attack-pattern":
            technique_id = _external_id(stix_obj)
            if not technique_id:
                continue
            register(stix_obj, technique_id)
            # Sub-technique is_a parent derived by splitting the id on ".".
            if "." in technique_id:
                is_a[technique_id].add(technique_id.split(".", 1)[0])
            # kill_chain_phases -> ("accomplishes", tactic_shortname)
            for phase in stix_obj.get("kill_chain_phases", ()):
                if phase.get("kill_chain_name") != "mitre-attack":
                    continue
                tactic = phase.get("phase_name")
                if tactic:
                    typed[technique_id].add(("accomplishes", tactic))
        elif kind == "x-mitre-tactic":
            shortname = stix_obj.get("x_mitre_shortname") or _external_id(stix_obj)
            if not shortname:
                continue
            register(stix_obj, shortname)

    # --- Second pass: STIX relationship objects. --------------------------- #
    for stix_obj in stix_objects:
        if stix_obj.get("type") != "relationship" or _is_obsolete(stix_obj):
            continue
        relationship = stix_obj.get("relationship_type")
        source = term_of_stix.get(stix_obj.get("source_ref", ""))
        target = term_of_stix.get(stix_obj.get("target_ref", ""))
        if not (source and target):
            continue
        if relationship == "subtechnique-of":
            # Corroborates (and is the source of truth for) the is_a edge.
            is_a.setdefault(source, set()).add(target)
        elif relationship in ("uses", "mitigates"):
            typed.setdefault(source, set()).add((relationship, target))

    # --- Materialize Term records (parents/relations to resolvable ids). --- #
    terms: dict[str, Term] = {}
    for term_id, label in labels.items():
        known_parents = sorted(p for p in is_a.get(term_id, ()) if p in labels)
        known_relations = sorted(
            (rel, obj_id) for rel, obj_id in typed.get(term_id, ())
            if obj_id in labels
        )
        terms[term_id] = Term(
            id=term_id,
            name=label,
            parents=tuple(known_parents),
            relations=tuple(known_relations),
            obsolete=retired.get(term_id, False),
        )

    graph_name = name or Path(path).stem
    return OntologyGraph(name=graph_name, version=spec_version, terms=terms)
sma/ontology/cpc.py ADDED
@@ -0,0 +1,69 @@
1
+ """Loader for the Cooperative Patent Classification (CPC) scheme XML.
2
+
3
+ CPC ships as one XML file per subclass (``cpc-scheme-A01B.xml`` ...), each a tree
4
+ of nested ``<classification-item>`` elements. The nesting IS the is-a hierarchy:
5
+ a classification-item nested inside another is a narrower category of it. We map
6
+ each item's ``<classification-symbol>`` to a :class:`Term` id, its
7
+ ``<class-title>`` text to the name, and the enclosing item's symbol to its is-a
8
+ parent. This yields the deep (~250k node) golden taxonomy for the legal/IP arm.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import xml.etree.ElementTree as ET
13
+ from pathlib import Path
14
+
15
+ from .graph import OntologyGraph, Term
16
+
17
+
18
+ def _local(tag: str) -> str:
19
+ return tag.rsplit("}", 1)[-1] if "}" in tag else tag
20
+
21
+
22
def _title(item: ET.Element) -> str:
    """Concatenate the text fragments under an item's direct <class-title>.

    Only the first direct ``class-title`` child is used. The stripped,
    non-empty texts of every ``text`` element beneath it are joined with
    ``"; "``. Returns ``""`` when the item has no ``class-title`` child.
    """
    for child in item:
        if _local(child.tag) != "class-title":
            continue
        fragments: list[str] = []
        for node in child.iter():
            if _local(node.tag) != "text":
                continue
            if node.text and node.text.strip():
                fragments.append(node.text.strip())
        return "; ".join(fragments)
    return ""
30
+
31
+
32
def _walk(item: ET.Element, parent_symbol: str, terms: dict[str, Term]) -> None:
    """Record one classification item and recurse into its nested items.

    The item's symbol is the stripped text of its first direct
    ``classification-symbol`` child. A new symbol is stored as a ``Term``
    whose only parent is ``parent_symbol`` (if any); a symbol seen before
    gains ``parent_symbol`` as an extra parent. Items without a symbol are
    skipped, and their children inherit ``parent_symbol``.
    """
    symbol = next(
        (
            (child.text or "").strip()
            for child in item
            if _local(child.tag) == "classification-symbol"
        ),
        "",
    )

    if symbol:
        known = terms.get(symbol)
        if known is None:
            first_parents = (parent_symbol,) if parent_symbol else ()
            terms[symbol] = Term(id=symbol, name=_title(item), parents=first_parents)
        elif parent_symbol and parent_symbol not in known.parents:
            # dict.fromkeys keeps first-seen order while dropping repeats.
            merged_parents = tuple(dict.fromkeys((*known.parents, parent_symbol)))
            terms[symbol] = Term(id=symbol, name=known.name or _title(item),
                                 parents=merged_parents)

    inherited = symbol or parent_symbol
    nested_items = [c for c in item if _local(c.tag) == "classification-item"]
    for nested in nested_items:
        _walk(nested, inherited, terms)
50
+
51
+
52
def load_cpc(path: str, name: str = "cpc") -> OntologyGraph:
    """Load the CPC scheme from a directory of cpc-scheme-*.xml files (or one file).

    Files are processed in sorted order. A file that is not well-formed XML
    is skipped, and a warning is logged so the missing subtree is not lost
    silently. ``version`` is the ``publication-date`` attribute of the first
    parsed root element that provides a non-empty one.

    Args:
        path: A directory containing ``cpc-scheme-*.xml`` files, or the path
            of a single scheme file.
        name: Name given to the resulting graph.

    Returns:
        The graph built from every ``classification-item`` found.
    """
    import logging  # function-scope import: the module does not import it

    logger = logging.getLogger(__name__)
    root_path = Path(path)
    files = sorted(root_path.glob("cpc-scheme-*.xml")) if root_path.is_dir() else [root_path]
    terms: dict[str, Term] = {}
    version = ""
    for f in files:
        try:
            tree = ET.parse(f)
        except ET.ParseError as exc:
            # Best-effort load: keep going, but leave a trace of what was lost.
            logger.warning("Skipping unparseable CPC scheme file %s: %s", f, exc)
            continue
        scheme = tree.getroot()
        if not version:
            version = scheme.get("publication-date", "")
        for child in scheme:
            if _local(child.tag) == "classification-item":
                _walk(child, "", terms)
    return OntologyGraph(name=name, version=version, terms=terms)