structuremappingmemory 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sma/__init__.py +5 -0
- sma/__main__.py +5 -0
- sma/agent/__init__.py +5 -0
- sma/agent/adapter_draft.py +217 -0
- sma/agent/api.py +67 -0
- sma/agent/comparison.py +591 -0
- sma/agent/llm.py +280 -0
- sma/agent/policies.py +21 -0
- sma/agent/service.py +95 -0
- sma/cli.py +65 -0
- sma/encoders/__init__.py +38 -0
- sma/encoders/agentobs.py +27 -0
- sma/encoders/base.py +23 -0
- sma/encoders/code_treesitter.py +64 -0
- sma/encoders/coverage.py +80 -0
- sma/encoders/draft_adapter.py +183 -0
- sma/encoders/healthcare.py +207 -0
- sma/encoders/logs_drain.py +142 -0
- sma/encoders/prose_tier1.py +57 -0
- sma/encoders/structured.py +57 -0
- sma/encoders/traces.py +45 -0
- sma/eval/__init__.py +2 -0
- sma/eval/agentic/__init__.py +35 -0
- sma/eval/agentic/arms/__init__.py +0 -0
- sma/eval/agentic/arms/cyber.py +48 -0
- sma/eval/agentic/arms/discovery.py +35 -0
- sma/eval/agentic/arms/finance.py +38 -0
- sma/eval/agentic/arms/legal.py +74 -0
- sma/eval/agentic/arms/medicine.py +45 -0
- sma/eval/agentic/harness.py +275 -0
- sma/eval/agentic/memories.py +308 -0
- sma/eval/agentic/metrics.py +82 -0
- sma/eval/agentic_qa/__init__.py +27 -0
- sma/eval/agentic_qa/agent.py +383 -0
- sma/eval/agentic_qa/metrics.py +239 -0
- sma/eval/agentic_qa/pools.py +197 -0
- sma/eval/arn.py +65 -0
- sma/eval/baselines/__init__.py +6 -0
- sma/eval/baselines/bge_dense.py +54 -0
- sma/eval/baselines/bm25.py +18 -0
- sma/eval/baselines/dense.py +42 -0
- sma/eval/baselines/hipporag.py +235 -0
- sma/eval/baselines/hybrid_rrf.py +30 -0
- sma/eval/baselines/longcontext_llm.py +124 -0
- sma/eval/baselines/rerank.py +41 -0
- sma/eval/baselines/splade.py +77 -0
- sma/eval/baselines/wl_kernel.py +163 -0
- sma/eval/bugsinpy.py +358 -0
- sma/eval/bugsinpy_families.py +164 -0
- sma/eval/crossdomain.py +89 -0
- sma/eval/diabetes.py +61 -0
- sma/eval/drift_env.py +26 -0
- sma/eval/drift_metrics.py +24 -0
- sma/eval/family_labels.py +167 -0
- sma/eval/fraud_elliptic/__init__.py +29 -0
- sma/eval/fraud_elliptic/encoder.py +279 -0
- sma/eval/fraud_elliptic/eval.py +269 -0
- sma/eval/fraud_elliptic/test_encoder.py +123 -0
- sma/eval/ieee_cis.py +66 -0
- sma/eval/loghub.py +16 -0
- sma/eval/loghub_eval.py +480 -0
- sma/eval/longmemeval.py +51 -0
- sma/eval/memory_backends/__init__.py +2 -0
- sma/eval/memory_backends/base.py +22 -0
- sma/eval/memory_backends/context_only.py +14 -0
- sma/eval/memory_backends/rag_notes.py +17 -0
- sma/eval/memory_backends/shared_llm.py +30 -0
- sma/eval/memory_backends/sma_memory.py +54 -0
- sma/eval/memory_backends/zep_graphiti.py +33 -0
- sma/eval/metrics.py +32 -0
- sma/eval/ontology_bench.py +219 -0
- sma/eval/report.py +573 -0
- sma/eval/ssb_eval.py +216 -0
- sma/eval/ssb_generator.py +116 -0
- sma/eval/stats.py +108 -0
- sma/eval/transfer_eval.py +844 -0
- sma/index/__init__.py +15 -0
- sma/index/ann.py +21 -0
- sma/index/content_vectors.py +60 -0
- sma/index/inverted.py +63 -0
- sma/index/macfac.py +174 -0
- sma/ir/__init__.py +22 -0
- sma/ir/canon.py +106 -0
- sma/ir/schema.py +165 -0
- sma/ir/sexpr.py +86 -0
- sma/ir/signatures.py +76 -0
- sma/match/__init__.py +20 -0
- sma/match/conflicts.py +46 -0
- sma/match/engine.py +60 -0
- sma/match/explain.py +59 -0
- sma/match/infer.py +54 -0
- sma/match/kernels.py +54 -0
- sma/match/mdl.py +30 -0
- sma/match/merge_cpsat.py +77 -0
- sma/match/merge_greedy.py +15 -0
- sma/match/mh.py +177 -0
- sma/match/ses.py +84 -0
- sma/match/types.py +115 -0
- sma/match/verifier.py +27 -0
- sma/ontology/__init__.py +45 -0
- sma/ontology/attack.py +134 -0
- sma/ontology/cpc.py +69 -0
- sma/ontology/graph.py +58 -0
- sma/ontology/loader.py +262 -0
- sma/ontology/mitre_xml.py +67 -0
- sma/ontology/mount.py +101 -0
- sma/ontology/rdf_loader.py +75 -0
- sma/ontology/registry.py +115 -0
- sma/ontology/router.py +69 -0
- sma/ontology/usgaap.py +73 -0
- sma/sage/__init__.py +6 -0
- sma/sage/assimilate.py +12 -0
- sma/sage/pools.py +105 -0
- sma/sage/probabilities.py +10 -0
- sma/store/__init__.py +6 -0
- sma/store/lmdb_store.py +78 -0
- sma/store/registry.py +26 -0
- sma/store/wal.py +26 -0
- sma/ui/app.py +642 -0
- structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
- structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
- structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
- structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
- structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
- structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
sma/ontology/graph.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Normalized ontology graph contract shared across the ontology package.
|
|
2
|
+
|
|
3
|
+
The graph is a small, serializable representation of an OBO/OWL ontology: a
|
|
4
|
+
flat map of term ids to :class:`Term` records, with helpers that yield the
|
|
5
|
+
is-a and typed-relation edges actually used to build the predicate lattice and
|
|
6
|
+
higher-order case statements. Obsolete terms (and any edge touching one) are
|
|
7
|
+
skipped by the edge iterators.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Iterator
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class Term:
    """One node of a normalized ontology.

    The record carries the term's canonical prefixed identifier (for example
    ``"HP:0001250"``), its label, the ids of its is_a parents, and its typed
    relations as ``(rel_type, target_id)`` pairs such as
    ``("part_of", "GO:0005634")``.
    """

    # Canonical prefixed identifier of the term.
    id: str
    # Label of the term; defaults to the empty string.
    name: str = ""
    # Ids of the is_a parents, in source order.
    parents: tuple[str, ...] = ()
    # Typed relations as (rel_type, target_id) pairs.
    relations: tuple[tuple[str, str], ...] = ()
    # Obsolete terms are skipped by the OntologyGraph edge iterators.
    obsolete: bool = False
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class OntologyGraph:
    """Flat, normalized ontology: a name, a version and an id -> term map."""

    name: str
    version: str = ""
    terms: dict[str, Term] = field(default_factory=dict)

    def active_terms(self) -> dict[str, Term]:
        """Return a new dict holding every term that is not flagged obsolete."""
        live: dict[str, Term] = {}
        for term_id, record in self.terms.items():
            if record.obsolete:
                continue
            live[term_id] = record
        return live

    def is_a_edges(self) -> Iterator[tuple[str, str]]:
        """Yield ``(child_id, parent_id)`` pairs whose two ends are both active."""
        live = self.active_terms()
        for child, record in live.items():
            # A parent id that is unknown or obsolete is absent from ``live``.
            yield from (
                (child, parent) for parent in record.parents if parent in live
            )

    def typed_relations(self) -> Iterator[tuple[str, str, str]]:
        """Yield ``(subj_id, rel_type, obj_id)`` triples between active terms."""
        live = self.active_terms()
        for subject, record in live.items():
            for relation, target in record.relations:
                if target not in live:
                    continue
                yield subject, relation, target
|
sma/ontology/loader.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
"""Universal OBO/OWL ontology loaders into the normalized :class:`OntologyGraph`.
|
|
2
|
+
|
|
3
|
+
``load_obo`` parses the OBO flat-file ``[Term]`` blocks; ``load_owl`` parses the
|
|
4
|
+
common RDF/XML subset using the stdlib ``xml.etree`` only (rdflib is not a
|
|
5
|
+
dependency). ``load_ontology`` dispatches on the file extension.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import xml.etree.ElementTree as ET
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from .graph import OntologyGraph, Term
|
|
14
|
+
|
|
15
|
+
# Prefixes whose OWL ids are encoded with an underscore. Ids starting with one
# of these are restored to the colon form (e.g. ``HP_0001250`` -> ``HP:0001250``).
_KNOWN_PREFIXES = (
    "HP",
    "GO",
    "MONDO",
    "MP",
    "CHEBI",
    "UBERON",
    "CL",
    "DOID",
    "SO",
    "PATO",
    "NCBITaxon",
    "EFO",
    "ORDO",
    "OMIM",
    "TST",
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def fid(term_id: str) -> str:
    """Return ``term_id`` with every colon replaced by an underscore.

    This gives a functor-safe id, e.g. ``HP:0001250`` -> ``HP_0001250``.
    """
    return "_".join(term_id.split(":"))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def load_ontology(path: str, name: str = "") -> OntologyGraph:
    """Pick a loader from the file extension and return the parsed graph.

    ``.obo`` is handled by :func:`load_obo`; ``.owl``, ``.owl.xml``, ``.rdf``
    and ``.xml`` are handled by :func:`load_owl`. The extension comparison is
    case-insensitive.

    Raises:
        ValueError: if the extension is none of the above.
    """
    owl_suffixes = (".owl", ".owl.xml", ".rdf", ".xml")
    lowered = str(path).lower()
    if lowered.endswith(".obo"):
        loader = load_obo
    elif lowered.endswith(owl_suffixes):
        loader = load_owl
    else:
        raise ValueError(f"Unrecognized ontology extension: {path}")
    return loader(path, name=name)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# --------------------------------------------------------------------------- #
|
|
39
|
+
# OBO
|
|
40
|
+
# --------------------------------------------------------------------------- #
|
|
41
|
+
def load_obo(path: str, name: str = "") -> OntologyGraph:
    """Read the ``[Term]`` stanzas of an OBO flat file into an :class:`OntologyGraph`.

    Only ``id``, ``name``, ``is_a``, ``relationship`` and ``is_obsolete`` tags
    are read inside a ``[Term]`` stanza; other stanza types are ignored. A term
    is kept only when its id contains a colon. ``data-version`` and the first
    ``ontology`` tag seen outside ``[Term]`` stanzas supply the graph version
    and the fallback name. When ``name`` is empty it defaults to that
    ``ontology`` value, or else the file stem, with a trailing ``.obo`` removed.
    """
    data_version = ""
    ontology_tag = ""
    collected: dict[str, Term] = {}

    def blank() -> dict:
        # Fresh accumulator for the stanza currently being read.
        return {"id": "", "name": "", "parents": [], "relations": [], "obsolete": False}

    def store(stanza: dict) -> None:
        # Keep the stanza only if it declared a prefixed (colon-bearing) id.
        term_id = stanza["id"]
        if not term_id or ":" not in term_id:
            return
        collected[term_id] = Term(
            id=term_id,
            name=stanza["name"],
            parents=tuple(stanza["parents"]),
            relations=tuple(stanza["relations"]),
            obsolete=stanza["obsolete"],
        )

    stanza = blank()
    inside_term = False
    with open(path, "r", encoding="utf-8") as source:
        for raw_line in source:
            text = raw_line.rstrip("\n").strip()

            if text.startswith("[") and text.endswith("]"):
                # Stanza header: close the pending term, then note whether the
                # new stanza is a [Term] (others, like [Typedef], are skipped).
                if inside_term:
                    store(stanza)
                    stanza = blank()
                inside_term = text == "[Term]"
                continue

            tag, colon, remainder = text.partition(":")
            if not colon:
                continue  # not a "tag: value" line
            value = remainder.strip()

            if not inside_term:
                # Header region, or a stanza type that is not [Term].
                if tag == "data-version":
                    data_version = value
                elif tag == "ontology" and not ontology_tag:
                    ontology_tag = value
                continue

            if tag == "id":
                stanza["id"] = value
            elif tag == "name":
                stanza["name"] = value
            elif tag == "is_a":
                # Drop the trailing "! comment", then take the first token.
                head = value.split("!", 1)[0].split()
                if head and ":" in head[0]:
                    stanza["parents"].append(head[0])
            elif tag == "relationship":
                # "<rel_type> <target_id> ! comment"
                pieces = value.split("!", 1)[0].split()
                if len(pieces) >= 2 and ":" in pieces[1]:
                    stanza["relations"].append((pieces[0], pieces[1]))
            elif tag == "is_obsolete":
                if value.lower() == "true":
                    stanza["obsolete"] = True

    # The last stanza has no following header to trigger the store.
    if inside_term:
        store(stanza)

    if not name:
        name = ontology_tag or Path(path).stem
        name = name.removesuffix(".obo")
    return OntologyGraph(name=name, version=data_version, terms=collected)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# --------------------------------------------------------------------------- #
|
|
118
|
+
# OWL / RDF-XML
|
|
119
|
+
# --------------------------------------------------------------------------- #
|
|
120
|
+
def _local(tag: str) -> str:
    """Return the part of an etree tag after its last ``}``.

    ``{uri}label`` becomes ``label``; a tag without ``}`` is returned unchanged.
    """
    return tag.rpartition("}")[2]
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _attr(elem: ET.Element, local_name: str) -> str | None:
    """Look up an attribute of ``elem`` by its namespace-stripped name.

    Returns the value of the first attribute (in ``elem.attrib`` order) whose
    name, with any ``{namespace}`` prefix removed, equals ``local_name``, or
    ``None`` when no attribute matches.
    """
    matches = (
        value
        for key, value in elem.attrib.items()
        # Same namespace stripping as ``_local``: keep what follows the last "}".
        if key.rpartition("}")[2] == local_name
    )
    return next(matches, None)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _term_id_from_iri(iri: str) -> str:
    """Turn an IRI into a compact term id.

    The id is whatever follows the last ``#`` and then the last ``/``. If that
    fragment is ``PREFIX_rest`` with ``PREFIX`` in ``_KNOWN_PREFIXES`` and a
    non-empty ``rest``, the first underscore becomes a colon
    (``HP_0001250`` -> ``HP:0001250``); otherwise the fragment is returned
    unchanged.
    """
    fragment = iri.rpartition("#")[2].rpartition("/")[2]
    prefix, underscore, rest = fragment.partition("_")
    if underscore and rest and prefix in _KNOWN_PREFIXES:
        return f"{prefix}:{rest}"
    return fragment
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _restriction_relations(superclass: ET.Element) -> list[tuple[str, str]]:
    """Return ``(rel_type, target_id)`` pairs for restrictions under ``superclass``.

    Each ``Restriction`` element found while iterating ``superclass``
    contributes one pair when it has both an ``onProperty`` and a
    ``someValuesFrom`` child carrying a ``resource`` attribute.
    """
    pairs: list[tuple[str, str]] = []
    for restr in superclass.iter():
        if _local(restr.tag) != "Restriction":
            continue
        rel_type = ""
        target = ""
        for part in restr:
            part_name = _local(part.tag)
            if part_name == "onProperty":
                res = _attr(part, "resource")
                if res:
                    rel_type = _local(_term_id_from_iri(res))
            elif part_name == "someValuesFrom":
                res = _attr(part, "resource")
                if res:
                    target = _term_id_from_iri(res)
        if rel_type and target:
            pairs.append((rel_type, target))
    return pairs


def load_owl(path: str, name: str = "") -> OntologyGraph:
    """Parse the common RDF/XML subset of an OWL ontology (stdlib etree only).

    Every element whose local tag is ``Class`` and that carries an ``about``
    or ``ID`` attribute becomes a :class:`Term`:

    * the first non-empty ``label`` child is the name;
    * a ``deprecated`` child with text ``true`` marks it obsolete;
    * ``subClassOf`` with a ``resource`` attribute adds an is_a parent;
    * ``subClassOf`` without one is searched for restrictions, which become
      typed relations.

    The same class id can occur in several ``Class`` elements (axioms split
    over blocks, or a class referenced again inside another class's
    restriction). Those occurrences are merged instead of the last one
    replacing the others: the first non-empty label is kept, parents and
    relations are unioned in first-seen order, and the term is obsolete if any
    occurrence says so.

    The version is the first ``versionIRI`` resource or ``versionInfo`` text
    found under an ``Ontology`` element. When ``name`` is empty it defaults to
    the file stem with a trailing ``.owl`` removed.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
    """
    root = ET.parse(path).getroot()

    version = ""
    terms: dict[str, Term] = {}

    for elem in root.iter():
        local = _local(elem.tag)

        if local == "Ontology":
            for child in elem:
                clocal = _local(child.tag)
                if clocal == "versionIRI":
                    res = _attr(child, "resource")
                    if res:
                        version = version or res
                elif clocal == "versionInfo":
                    if child.text and child.text.strip():
                        version = version or child.text.strip()
            continue

        if local != "Class":
            continue

        about = _attr(elem, "about") or _attr(elem, "ID")
        if not about:
            continue  # anonymous class expression
        term_id = _term_id_from_iri(about)

        term_name = ""
        parents: list[str] = []
        relations: list[tuple[str, str]] = []
        obsolete = False

        for child in elem:
            clocal = _local(child.tag)
            if clocal == "label":
                if child.text and not term_name:
                    term_name = child.text.strip()
            elif clocal == "deprecated":
                if child.text and child.text.strip().lower() == "true":
                    obsolete = True
            elif clocal == "subClassOf":
                resource = _attr(child, "resource")
                if resource:
                    parents.append(_term_id_from_iri(resource))
                else:
                    # Anonymous superclass: look for owl:Restriction elements.
                    relations.extend(_restriction_relations(child))

        previous = terms.get(term_id)
        if previous is None:
            terms[term_id] = Term(
                id=term_id,
                name=term_name,
                parents=tuple(parents),
                relations=tuple(relations),
                obsolete=obsolete,
            )
        else:
            # Repeated occurrence: accumulate rather than overwrite, so a bare
            # reference cannot erase an earlier full definition.
            terms[term_id] = Term(
                id=term_id,
                name=previous.name or term_name,
                parents=tuple(dict.fromkeys((*previous.parents, *parents))),
                relations=tuple(dict.fromkeys((*previous.relations, *relations))),
                obsolete=previous.obsolete or obsolete,
            )

    if not name:
        name = Path(path).stem
        name = name.removesuffix(".owl")
    return OntologyGraph(name=name, version=version, terms=terms)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def load_owl_dir(root: str, name: str = "", pattern: str = "*.rdf") -> OntologyGraph:
    """Load a multi-file OWL/RDF ontology (e.g. LKIF, FIBO) as one merged graph.

    If ``root`` is a directory, every file matching ``pattern`` below it
    (recursively, in sorted path order) is parsed with :func:`load_owl`;
    otherwise ``root`` itself is parsed. Files that raise ``ET.ParseError`` are
    skipped. The version is the first non-empty one reported by a file.

    When the same term id appears in more than one file, the records are
    combined: the first non-empty name is kept, parents and relations are
    unioned in first-seen order, and the term is obsolete only if both records
    say so. When ``name`` is empty it defaults to the stem of ``root``.
    """
    base = Path(root)
    if base.is_dir():
        modules = sorted(base.rglob(pattern))
    else:
        modules = [base]

    combined: dict[str, Term] = {}
    version = ""
    for module in modules:
        try:
            graph = load_owl(str(module), name=name)
        except ET.ParseError:
            # Not RDF/XML, or malformed: leave this module out.
            continue
        if not version:
            version = graph.version
        for term_id, incoming in graph.terms.items():
            existing = combined.get(term_id)
            if existing is None:
                combined[term_id] = incoming
            else:
                combined[term_id] = Term(
                    id=term_id,
                    name=existing.name or incoming.name,
                    parents=tuple(dict.fromkeys((*existing.parents, *incoming.parents))),
                    relations=tuple(
                        dict.fromkeys((*existing.relations, *incoming.relations))
                    ),
                    obsolete=existing.obsolete and incoming.obsolete,
                )

    return OntologyGraph(name=name or base.stem, version=version, terms=combined)
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Loaders for MITRE CAPEC (attack patterns) and CWE (software weaknesses) XML.
|
|
2
|
+
|
|
3
|
+
Both ship as a flat list of entries with explicit ``ChildOf`` relations that form
|
|
4
|
+
a deep is-a hierarchy, plus other typed relations (CanPrecede/CanFollow/PeerOf,
|
|
5
|
+
and CAPEC->CWE ``exploits`` links). These enrich the cyber domain beyond ATT&CK's
|
|
6
|
+
shallow tactic/technique tree with real subsumption depth and cross-links.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import xml.etree.ElementTree as ET
|
|
11
|
+
|
|
12
|
+
from .graph import OntologyGraph, Term
|
|
13
|
+
|
|
14
|
+
# XML vocabulary per supported ``kind``.
# Each value is (entry_tag, id_prefix, related_tag, id_attr).
_KINDS = dict(
    capec=("Attack_Pattern", "CAPEC", "Related_Attack_Pattern", "CAPEC_ID"),
    cwe=("Weakness", "CWE", "Related_Weakness", "CWE_ID"),
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _local(tag: str) -> str:
    """Strip a leading ``{namespace}`` from an etree tag, if there is one."""
    _, brace, tail = tag.rpartition("}")
    return tail if brace else tag
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def load_mitre_xml(path: str, kind: str, name: str = "") -> OntologyGraph:
    """Load a MITRE CAPEC or CWE XML catalogue into an :class:`OntologyGraph`.

    ``kind`` selects the vocabulary from ``_KINDS`` (``"capec"`` or ``"cwe"``).
    Each entry element with an ``ID`` attribute becomes a term
    ``<PREFIX>-<ID>`` named after its ``Name`` attribute. Among the entry's
    related elements, ``Nature="ChildOf"`` gives an is_a parent and any other
    non-empty ``Nature`` gives a typed relation of that name. For CAPEC,
    ``Related_Weakness`` elements additionally give ``("exploits", "CWE-<id>")``
    relations. Entries whose ``Status`` is ``deprecated`` or ``obsolete``
    (case-insensitive) are flagged obsolete. The graph version is the root
    element's ``Version`` attribute; ``name`` defaults to ``kind``.

    Raises:
        KeyError: if ``kind`` is not a key of ``_KINDS``.
    """
    entry_tag, prefix, related_tag, id_attr = _KINDS[kind]
    root = ET.parse(path).getroot()

    terms: dict[str, Term] = {}
    for entry in root.iter():
        if _local(entry.tag) != entry_tag:
            continue
        entry_id = entry.get("ID")
        if not entry_id:
            continue

        status = (entry.get("Status", "") or "").lower()
        parents: list[str] = []
        relations: list[tuple[str, str]] = []
        for node in entry.iter():
            tag = _local(node.tag)
            if tag == related_tag:
                target = node.get(id_attr)
                if not target:
                    continue
                nature = node.get("Nature", "")
                if nature == "ChildOf":
                    parents.append(f"{prefix}-{target}")
                elif nature:
                    relations.append((nature, f"{prefix}-{target}"))
            elif kind == "capec" and tag == "Related_Weakness":
                # Cross-catalogue link from an attack pattern to a weakness.
                cwe_id = node.get("CWE_ID")
                if cwe_id:
                    relations.append(("exploits", f"CWE-{cwe_id}"))

        term_id = f"{prefix}-{entry_id}"
        terms[term_id] = Term(
            id=term_id,
            name=entry.get("Name", ""),
            # dict.fromkeys removes duplicates while keeping first-seen order.
            parents=tuple(dict.fromkeys(parents)),
            relations=tuple(dict.fromkeys(relations)),
            obsolete=status in ("deprecated", "obsolete"),
        )

    return OntologyGraph(name=name or kind, version=root.get("Version", ""), terms=terms)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def load_capec(path: str, name: str = "capec") -> OntologyGraph:
    """Load a CAPEC XML file; shorthand for ``load_mitre_xml(path, "capec", ...)``."""
    return load_mitre_xml(path, kind="capec", name=name)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def load_cwe(path: str, name: str = "cwe") -> OntologyGraph:
    """Load a CWE XML file; shorthand for ``load_mitre_xml(path, "cwe", ...)``."""
    return load_mitre_xml(path, kind="cwe", name=name)
|
sma/ontology/mount.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""Mount a normalized :class:`OntologyGraph` onto SMA's matching machinery.
|
|
2
|
+
|
|
3
|
+
Mounting lifts an ontology's is_a hierarchy into a :class:`Canonicalizer`
|
|
4
|
+
predicate lattice (so structurally-distinct-but-related terms can ascend to a
|
|
5
|
+
shared ancestor during matching) and provides the case/index builders that turn
|
|
6
|
+
a set of present terms into an SMA :class:`Case`.
|
|
7
|
+
|
|
8
|
+
A term ``T`` present on a subject becomes the statement ``fid(T)(subject)``;
|
|
9
|
+
each typed relation ``(s, rel, o)`` whose *both* endpoints are present becomes
|
|
10
|
+
the higher-order statement ``rel(fid(s)(subject), fid(o)(subject))``.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from typing import Any, Iterable, Mapping
|
|
17
|
+
|
|
18
|
+
from sma.index.macfac import MacFacIndex
|
|
19
|
+
from sma.ir.canon import Canonicalizer
|
|
20
|
+
from sma.ir.schema import Case, entity, make_case, stmt
|
|
21
|
+
from sma.match.types import MatchConfig
|
|
22
|
+
|
|
23
|
+
from .graph import OntologyGraph
|
|
24
|
+
from .loader import fid
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _default_config() -> MatchConfig:
    """Return the config used when a caller mounts without one.

    The values are ``delta=2`` (ascend up to two is_a hops) and ``rho=0.95``.
    """
    defaults = {"delta": 2, "rho": 0.95}
    return MatchConfig(**defaults)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class MountedOntology:
    """An :class:`OntologyGraph` paired with a :class:`Canonicalizer` and a config."""

    graph: OntologyGraph
    canon: Canonicalizer
    config: MatchConfig

    def build_case(
        self,
        term_ids: Iterable[str],
        subject: str = "subject",
        metadata: Mapping[str, Any] | None = None,
    ) -> Case:
        """Turn the terms present on a subject into a :class:`Case`.

        Ids that are not keys of ``graph.terms`` are ignored. Every remaining
        id ``t`` yields the statement ``fid(t)(subject)``. Every typed relation
        ``(s, rel, o)`` from ``graph.typed_relations()`` whose two endpoints
        are both present yields the higher-order statement
        ``rel(fid(s)(subject), fid(o)(subject))``.
        """
        known = self.graph.terms
        present = [tid for tid in term_ids if tid in known]
        present_lookup = set(present)
        subj = entity(subject, subject)

        statements = [stmt(fid(tid), subj) for tid in present]
        statements.extend(
            stmt(rel, stmt(fid(s), subj), stmt(fid(o), subj))
            for s, rel, o in self.graph.typed_relations()
            if s in present_lookup and o in present_lookup
        )
        return make_case(statements, metadata=metadata)

    def build_index(
        self,
        records: Iterable[tuple[str, Iterable[str], Mapping[str, Any] | None]],
        config: MatchConfig | None = None,
    ) -> MacFacIndex:
        """Index a collection of ``(key, term_ids, metadata)`` records.

        One case is built per record, with the record's ``key`` copied into the
        case metadata under ``"key"``. The returned :class:`MacFacIndex` uses
        ``config`` (or this ontology's own config) and gets a ``key_of``
        attribute mapping ``case_id -> key``, so a retrieval result can be
        traced back to its record.
        """
        cases: list[Case] = []
        key_of: dict[str, str] = {}
        for key, term_ids, metadata in records:
            # Copy so the caller's mapping is not modified.
            tagged = {**(metadata or {}), "key": key}
            built = self.build_case(term_ids, metadata=tagged)
            cases.append(built)
            key_of[built.case_id] = key

        index = MacFacIndex(config=config or self.config, canon=self.canon)
        index.build(cases)
        index.key_of = key_of
        return index
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def mount(graph: OntologyGraph, config: MatchConfig | None = None) -> MountedOntology:
    """Bind ``graph`` to a new :class:`Canonicalizer` built from its is_a edges.

    Each ``(child, parent)`` pair from :meth:`OntologyGraph.is_a_edges` is
    added to the canonicalizer's lattice as ``fid(child) -> fid(parent)``.
    Without a ``config`` the default ``MatchConfig(delta=2, rho=0.95)`` (ascend
    up to two hops) is used.
    """
    canon = Canonicalizer()
    add_edge = canon.lattice.add
    for child, parent in graph.is_a_edges():
        add_edge(fid(child), fid(parent))

    if not config:
        config = _default_config()
    return MountedOntology(graph=graph, canon=canon, config=config)
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Complete RDF loader using rdflib (for ontologies our xml.etree subset misses).
|
|
2
|
+
|
|
3
|
+
Unlike :func:`sma.ontology.loader.load_owl` (a stdlib RDF/XML subset), this uses
|
|
4
|
+
rdflib to fully parse OWL — including Turtle, restrictions, and the complete class
|
|
5
|
+
graph — for ontologies like FIBO whose power lives in typed relations expressed
|
|
6
|
+
via ``owl:Restriction``. Falls back gracefully if rdflib is absent.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from .graph import OntologyGraph, Term
|
|
13
|
+
from .loader import _KNOWN_PREFIXES
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _compact(uri: str) -> str:
    """Shorten a URI to a compact term id.

    Keeps what follows the last ``#`` and then the last ``/``. A fragment of
    the form ``PREFIX_rest`` with ``PREFIX`` in ``_KNOWN_PREFIXES`` and a
    non-empty ``rest`` is returned as ``PREFIX:rest``; anything else is
    returned as is.
    """
    fragment = str(uri).rpartition("#")[2].rpartition("/")[2]
    prefix, underscore, rest = fragment.partition("_")
    if underscore and rest and prefix in _KNOWN_PREFIXES:
        return f"{prefix}:{rest}"
    return fragment
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def load_rdflib(path: str, name: str = "", fmt: str | None = None,
                pattern: str = "*.rdf") -> OntologyGraph:
    """Load an OWL/RDF ontology (file, Turtle, or a directory of RDF files) via rdflib.

    If ``path`` is a directory, every file matching ``pattern`` below it
    (recursively, in sorted order) is parsed into one rdflib graph; otherwise
    ``path`` itself is parsed. ``fmt`` is passed to rdflib as the parse format
    when given. Files that fail to parse are skipped (best effort) and
    reported at debug level.

    Every named ``owl:Class`` / ``rdfs:Class`` subject becomes a
    :class:`Term`. Classes are visited in sorted IRI order, so the resulting
    term order, and the winner when two IRIs compact to the same id, is the
    same on every run. Named ``rdfs:subClassOf`` objects become parents.
    Blank-node superclasses with ``owl:onProperty`` and a named
    ``owl:someValuesFrom`` / ``owl:allValuesFrom`` target become typed
    relations. When ``name`` is empty it defaults to the stem of ``path``.

    Raises:
        ImportError: if rdflib is not installed.
    """
    import logging

    import rdflib
    from rdflib import OWL, RDF, RDFS
    from rdflib.namespace import DC, DCTERMS

    # FIBO metadata carries malformed xsd:dateTime literals; rdflib logs a
    # traceback per occurrence but parses fine. Silence that noise.
    logging.getLogger("rdflib.term").setLevel(logging.CRITICAL)
    log = logging.getLogger(__name__)

    g = rdflib.Graph()
    root = Path(path)
    files = sorted(root.rglob(pattern)) if root.is_dir() else [root]
    for f in files:
        try:
            if fmt:
                g.parse(str(f), format=fmt)
            else:
                g.parse(str(f))
        except Exception:
            # Deliberate best effort: one unreadable module must not abort the
            # whole load. Leave a trace so skipped files can be found.
            log.debug("Skipping unparseable RDF file %s", f, exc_info=True)
            continue

    # Named owl:Class (and rdfs:Class) subjects only. Sorting by IRI makes the
    # output independent of set iteration order.
    classes = set(g.subjects(RDF.type, OWL.Class)) | set(g.subjects(RDF.type, RDFS.Class))
    named_classes = sorted(
        (cls for cls in classes if not isinstance(cls, rdflib.BNode)), key=str
    )

    terms: dict[str, Term] = {}
    for cls in named_classes:
        tid = _compact(cls)
        label = g.value(cls, RDFS.label) or g.value(cls, DCTERMS.title) or g.value(cls, DC.title)
        parents: list[str] = []
        relations: list[tuple[str, str]] = []
        for sup in g.objects(cls, RDFS.subClassOf):
            if isinstance(sup, rdflib.BNode):
                # owl:Restriction -> typed relation (onProperty + some/allValuesFrom).
                prop = g.value(sup, OWL.onProperty)
                tgt = g.value(sup, OWL.someValuesFrom) or g.value(sup, OWL.allValuesFrom)
                if prop is not None and tgt is not None and not isinstance(tgt, rdflib.BNode):
                    relations.append((_compact(prop), _compact(tgt)))
            else:
                parents.append(_compact(sup))

        # Compare the lexical form instead of relying on the truthiness of the
        # returned node: an untyped "false" literal is a non-empty string and
        # would otherwise mark the class obsolete.
        deprecated = g.value(cls, OWL.deprecated)
        obsolete = deprecated is not None and str(deprecated).strip().lower() in ("true", "1")

        terms[tid] = Term(id=tid, name=str(label) if label else "",
                          parents=tuple(dict.fromkeys(parents)),
                          relations=tuple(dict.fromkeys(relations)), obsolete=obsolete)

    # First owl:versionIRI found, if any.
    version = next((str(o) for o in g.objects(None, OWL.versionIRI)), "")
    if not name:
        name = root.stem
    return OntologyGraph(name=name, version=version, terms=terms)
|
sma/ontology/registry.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""A small registry of named ontologies that loads and mounts them on demand.
|
|
2
|
+
|
|
3
|
+
Each :class:`OntologyEntry` records where an ontology lives on disk and how to
|
|
4
|
+
parse it; :meth:`OntologyRegistry.get` lazily loads the file into an
|
|
5
|
+
:class:`~sma.ontology.graph.OntologyGraph`, mounts it (building the predicate
|
|
6
|
+
lattice + match index machinery), and caches the result so repeated lookups are
|
|
7
|
+
cheap.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import TYPE_CHECKING
|
|
15
|
+
|
|
16
|
+
from .loader import load_ontology
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING: # pragma: no cover - typing only
|
|
19
|
+
from .mount import MountedOntology
|
|
20
|
+
from sma.match.types import MatchConfig
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _infer_format(path: str) -> str:
    """Map a path's extension (case-insensitively) to a format name.

    ``.obo`` -> ``"obo"``; ``.owl``/``.owl.xml``/``.rdf``/``.xml`` -> ``"owl"``.

    Raises:
        ValueError: if the extension is none of the above.
    """
    by_format = (
        ("obo", (".obo",)),
        ("owl", (".owl", ".owl.xml", ".rdf", ".xml")),
    )
    lowered = str(path).lower()
    for fmt, suffixes in by_format:
        if lowered.endswith(suffixes):
            return fmt
    raise ValueError(f"Cannot infer ontology format from extension: {path}")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class OntologyEntry:
    """What the registry stores for one ontology."""

    # Name the ontology is registered under.
    name: str
    # Location of the source file.
    path: str
    # Format of the source file, e.g. "obo" or "owl".
    format: str
    # Version string; defaults to the empty string.
    version: str = ""
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class OntologyRegistry:
    """Named ontologies that are loaded and mounted the first time they are requested."""

    def __init__(self) -> None:
        # name -> registered entry
        self._entries: dict[str, OntologyEntry] = {}
        # names in first-registration order
        self._order: list[str] = []
        # name -> mounted ontology, filled by get()
        self._mounted: dict[str, "MountedOntology"] = {}

    def register(
        self,
        name: str,
        path: str,
        fmt: str | None = None,
        version: str | None = None,
    ) -> OntologyEntry:
        """Add or replace the ontology stored under ``name``.

        When ``fmt`` is ``None`` the format is inferred from the extension of
        ``path``. Registering an existing name keeps its original position in
        the listing order, swaps in the new entry, and drops any mounted
        ontology cached for that name.
        """
        if fmt is None:
            fmt = _infer_format(path)
        entry = OntologyEntry(name=name, path=str(path), format=fmt, version=version or "")

        is_new = name not in self._entries
        self._entries[name] = entry
        if is_new:
            self._order.append(name)
        self._mounted.pop(name, None)
        return entry

    def get(self, name: str, config: "MatchConfig | None" = None) -> "MountedOntology":
        """Return the mounted ontology for ``name``, loading it on first use.

        The first call loads the file, mounts the graph with ``config`` and
        caches the result; later calls return that same cached object without
        reading the file again. If the entry has no version and the loaded
        graph reports one, the entry's version is filled in.

        Raises:
            KeyError: if ``name`` has not been registered.
        """
        mounted = self._mounted.get(name)
        if mounted is not None:
            return mounted

        entry = self._entries.get(name)
        if entry is None:
            raise KeyError(f"No ontology registered under {name!r}")

        # Deferred import: mount.py may bring in heavier match machinery, so
        # it is only imported once an ontology is actually requested.
        from .mount import mount

        graph = load_ontology(entry.path, name=entry.name)
        if graph.version and not entry.version:
            entry.version = graph.version
        mounted = mount(graph, config=config)
        self._mounted[name] = mounted
        return mounted

    def list(self) -> list[OntologyEntry]:
        """Return the registered entries in registration order."""
        entries = self._entries
        return [entries[name] for name in self._order]

    def __contains__(self, name: object) -> bool:
        return name in self._entries
|