structuremappingmemory 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. sma/__init__.py +5 -0
  2. sma/__main__.py +5 -0
  3. sma/agent/__init__.py +5 -0
  4. sma/agent/adapter_draft.py +217 -0
  5. sma/agent/api.py +67 -0
  6. sma/agent/comparison.py +591 -0
  7. sma/agent/llm.py +280 -0
  8. sma/agent/policies.py +21 -0
  9. sma/agent/service.py +95 -0
  10. sma/cli.py +65 -0
  11. sma/encoders/__init__.py +38 -0
  12. sma/encoders/agentobs.py +27 -0
  13. sma/encoders/base.py +23 -0
  14. sma/encoders/code_treesitter.py +64 -0
  15. sma/encoders/coverage.py +80 -0
  16. sma/encoders/draft_adapter.py +183 -0
  17. sma/encoders/healthcare.py +207 -0
  18. sma/encoders/logs_drain.py +142 -0
  19. sma/encoders/prose_tier1.py +57 -0
  20. sma/encoders/structured.py +57 -0
  21. sma/encoders/traces.py +45 -0
  22. sma/eval/__init__.py +2 -0
  23. sma/eval/agentic/__init__.py +35 -0
  24. sma/eval/agentic/arms/__init__.py +0 -0
  25. sma/eval/agentic/arms/cyber.py +48 -0
  26. sma/eval/agentic/arms/discovery.py +35 -0
  27. sma/eval/agentic/arms/finance.py +38 -0
  28. sma/eval/agentic/arms/legal.py +74 -0
  29. sma/eval/agentic/arms/medicine.py +45 -0
  30. sma/eval/agentic/harness.py +275 -0
  31. sma/eval/agentic/memories.py +308 -0
  32. sma/eval/agentic/metrics.py +82 -0
  33. sma/eval/agentic_qa/__init__.py +27 -0
  34. sma/eval/agentic_qa/agent.py +383 -0
  35. sma/eval/agentic_qa/metrics.py +239 -0
  36. sma/eval/agentic_qa/pools.py +197 -0
  37. sma/eval/arn.py +65 -0
  38. sma/eval/baselines/__init__.py +6 -0
  39. sma/eval/baselines/bge_dense.py +54 -0
  40. sma/eval/baselines/bm25.py +18 -0
  41. sma/eval/baselines/dense.py +42 -0
  42. sma/eval/baselines/hipporag.py +235 -0
  43. sma/eval/baselines/hybrid_rrf.py +30 -0
  44. sma/eval/baselines/longcontext_llm.py +124 -0
  45. sma/eval/baselines/rerank.py +41 -0
  46. sma/eval/baselines/splade.py +77 -0
  47. sma/eval/baselines/wl_kernel.py +163 -0
  48. sma/eval/bugsinpy.py +358 -0
  49. sma/eval/bugsinpy_families.py +164 -0
  50. sma/eval/crossdomain.py +89 -0
  51. sma/eval/diabetes.py +61 -0
  52. sma/eval/drift_env.py +26 -0
  53. sma/eval/drift_metrics.py +24 -0
  54. sma/eval/family_labels.py +167 -0
  55. sma/eval/fraud_elliptic/__init__.py +29 -0
  56. sma/eval/fraud_elliptic/encoder.py +279 -0
  57. sma/eval/fraud_elliptic/eval.py +269 -0
  58. sma/eval/fraud_elliptic/test_encoder.py +123 -0
  59. sma/eval/ieee_cis.py +66 -0
  60. sma/eval/loghub.py +16 -0
  61. sma/eval/loghub_eval.py +480 -0
  62. sma/eval/longmemeval.py +51 -0
  63. sma/eval/memory_backends/__init__.py +2 -0
  64. sma/eval/memory_backends/base.py +22 -0
  65. sma/eval/memory_backends/context_only.py +14 -0
  66. sma/eval/memory_backends/rag_notes.py +17 -0
  67. sma/eval/memory_backends/shared_llm.py +30 -0
  68. sma/eval/memory_backends/sma_memory.py +54 -0
  69. sma/eval/memory_backends/zep_graphiti.py +33 -0
  70. sma/eval/metrics.py +32 -0
  71. sma/eval/ontology_bench.py +219 -0
  72. sma/eval/report.py +573 -0
  73. sma/eval/ssb_eval.py +216 -0
  74. sma/eval/ssb_generator.py +116 -0
  75. sma/eval/stats.py +108 -0
  76. sma/eval/transfer_eval.py +844 -0
  77. sma/index/__init__.py +15 -0
  78. sma/index/ann.py +21 -0
  79. sma/index/content_vectors.py +60 -0
  80. sma/index/inverted.py +63 -0
  81. sma/index/macfac.py +174 -0
  82. sma/ir/__init__.py +22 -0
  83. sma/ir/canon.py +106 -0
  84. sma/ir/schema.py +165 -0
  85. sma/ir/sexpr.py +86 -0
  86. sma/ir/signatures.py +76 -0
  87. sma/match/__init__.py +20 -0
  88. sma/match/conflicts.py +46 -0
  89. sma/match/engine.py +60 -0
  90. sma/match/explain.py +59 -0
  91. sma/match/infer.py +54 -0
  92. sma/match/kernels.py +54 -0
  93. sma/match/mdl.py +30 -0
  94. sma/match/merge_cpsat.py +77 -0
  95. sma/match/merge_greedy.py +15 -0
  96. sma/match/mh.py +177 -0
  97. sma/match/ses.py +84 -0
  98. sma/match/types.py +115 -0
  99. sma/match/verifier.py +27 -0
  100. sma/ontology/__init__.py +45 -0
  101. sma/ontology/attack.py +134 -0
  102. sma/ontology/cpc.py +69 -0
  103. sma/ontology/graph.py +58 -0
  104. sma/ontology/loader.py +262 -0
  105. sma/ontology/mitre_xml.py +67 -0
  106. sma/ontology/mount.py +101 -0
  107. sma/ontology/rdf_loader.py +75 -0
  108. sma/ontology/registry.py +115 -0
  109. sma/ontology/router.py +69 -0
  110. sma/ontology/usgaap.py +73 -0
  111. sma/sage/__init__.py +6 -0
  112. sma/sage/assimilate.py +12 -0
  113. sma/sage/pools.py +105 -0
  114. sma/sage/probabilities.py +10 -0
  115. sma/store/__init__.py +6 -0
  116. sma/store/lmdb_store.py +78 -0
  117. sma/store/registry.py +26 -0
  118. sma/store/wal.py +26 -0
  119. sma/ui/app.py +642 -0
  120. structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
  121. structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
  122. structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
  123. structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
  124. structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
  125. structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
sma/ontology/graph.py ADDED
@@ -0,0 +1,58 @@
1
+ """Normalized ontology graph contract shared across the ontology package.
2
+
3
+ The graph is a small, serializable representation of an OBO/OWL ontology: a
4
+ flat map of term ids to :class:`Term` records, with helpers that yield the
5
+ is-a and typed-relation edges actually used to build the predicate lattice and
6
+ higher-order case statements. Obsolete terms (and any edge touching one) are
7
+ skipped by the edge iterators.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from dataclasses import dataclass, field
13
+ from typing import Iterator
14
+
15
+
16
@dataclass
class Term:
    """A single ontology term.

    ``id`` is the canonical prefixed id (e.g. ``"HP:0001250"``); ``parents``
    are is_a parent ids; ``relations`` are typed ``(rel_type, target_id)`` pairs
    (e.g. ``("part_of", "GO:0005634")``). ``name`` is the term's label and
    ``obsolete`` marks terms that :class:`OntologyGraph` leaves out of its
    active view and edge iterators. Every field except ``id`` has a default.
    """

    id: str  # canonical prefixed identifier; the only required field
    name: str = ""  # label; empty string when none was supplied
    parents: tuple[str, ...] = ()  # ids of is_a parents
    relations: tuple[tuple[str, str], ...] = ()  # (rel_type, target_id) pairs
    obsolete: bool = False  # True removes the term from active views
30
+
31
+
32
@dataclass
class OntologyGraph:
    """Flat term-id -> :class:`Term` view of one ontology.

    ``name`` and ``version`` identify the ontology; ``terms`` maps each term id
    to its record. The edge iterators only report edges whose two endpoints
    are both present in ``terms`` and not obsolete.
    """

    name: str
    version: str = ""
    terms: dict[str, Term] = field(default_factory=dict)

    def active_terms(self) -> dict[str, Term]:
        """Return a new dict with every term that is not flagged obsolete."""
        live: dict[str, Term] = {}
        for term_id, record in self.terms.items():
            if record.obsolete:
                continue
            live[term_id] = record
        return live

    def is_a_edges(self) -> Iterator[tuple[str, str]]:
        """Lazily yield ``(child_id, parent_id)`` pairs between active terms."""
        live = self.active_terms()
        for child_id in live:
            yield from (
                (child_id, parent_id)
                for parent_id in live[child_id].parents
                if parent_id in live
            )

    def typed_relations(self) -> Iterator[tuple[str, str, str]]:
        """Lazily yield ``(subj_id, rel_type, obj_id)`` triples between active terms."""
        live = self.active_terms()
        for subj_id in live:
            yield from (
                (subj_id, rel_type, obj_id)
                for rel_type, obj_id in live[subj_id].relations
                if obj_id in live
            )
sma/ontology/loader.py ADDED
@@ -0,0 +1,262 @@
1
+ """Universal OBO/OWL ontology loaders into the normalized :class:`OntologyGraph`.
2
+
3
+ ``load_obo`` parses the OBO flat-file ``[Term]`` blocks; ``load_owl`` parses the
4
+ common RDF/XML subset using the stdlib ``xml.etree`` only (rdflib is not a
5
+ dependency). ``load_ontology`` dispatches on the file extension.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import xml.etree.ElementTree as ET
11
+ from pathlib import Path
12
+
13
+ from .graph import OntologyGraph, Term
14
+
15
# Known prefixes whose underscore-encoded OWL ids should be restored to a colon
# form (e.g. ``HP_0001250`` -> ``HP:0001250``). Fragments whose leading token is
# not listed here are returned unchanged by ``_term_id_from_iri``.
_KNOWN_PREFIXES = (
    "HP", "GO", "MONDO", "MP", "CHEBI", "UBERON", "CL", "DOID", "SO", "PATO",
    "NCBITaxon", "EFO", "ORDO", "OMIM", "TST",
)
21
+
22
+
23
def fid(term_id: str) -> str:
    """Return ``term_id`` with every ``:`` swapped for ``_``.

    Gives a functor-safe identifier, e.g. ``HP:0001250`` -> ``HP_0001250``.
    """
    return "_".join(term_id.split(":"))
26
+
27
+
28
def load_ontology(path: str, name: str = "") -> OntologyGraph:
    """Load an ontology file, choosing the parser from its extension.

    The match is case-insensitive: ``.obo`` goes to :func:`load_obo`;
    ``.owl``, ``.owl.xml``, ``.rdf`` and ``.xml`` go to :func:`load_owl`.

    Raises:
        ValueError: if the path ends in none of the recognised extensions.
    """
    lowered = str(path).lower()
    if lowered.endswith(".obo"):
        loader = load_obo
    elif lowered.endswith((".owl", ".owl.xml", ".rdf", ".xml")):
        loader = load_owl
    else:
        raise ValueError(f"Unrecognized ontology extension: {path}")
    return loader(path, name=name)
36
+
37
+
38
+ # --------------------------------------------------------------------------- #
39
+ # OBO
40
+ # --------------------------------------------------------------------------- #
41
def load_obo(path: str, name: str = "") -> OntologyGraph:
    """Read the ``[Term]`` stanzas of an OBO flat file into an :class:`OntologyGraph`.

    Inside a ``[Term]`` stanza only the ``id``, ``name``, ``is_a``,
    ``relationship`` and ``is_obsolete`` tags are interpreted; a stanza is kept
    only when its id contains a ``:``. Lines outside ``[Term]`` stanzas are
    scanned for ``data-version`` (graph version) and the first ``ontology`` tag
    (fallback graph name). When ``name`` is empty the ``ontology`` tag, then
    the file stem, is used; a trailing ``.obo`` is removed from the final name.
    """
    version = ""
    header_ontology = ""
    terms: dict[str, Term] = {}

    def fresh() -> dict:
        # Accumulator for the stanza currently being read.
        return {"id": "", "name": "", "parents": [], "relations": [], "obsolete": False}

    def commit(pending: dict) -> None:
        # Store the accumulated stanza if it carries a prefixed id.
        term_id = pending["id"]
        if not term_id or ":" not in term_id:
            return
        terms[term_id] = Term(
            id=term_id,
            name=pending["name"],
            parents=tuple(pending["parents"]),
            relations=tuple(pending["relations"]),
            obsolete=pending["obsolete"],
        )

    pending = fresh()
    in_term = False

    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            text = raw.strip()

            if text.startswith("[") and text.endswith("]"):
                # Stanza boundary: close the open term, then note whether the
                # new stanza is a [Term] (anything else is ignored).
                if in_term:
                    commit(pending)
                    pending = fresh()
                in_term = text == "[Term]"
                continue

            tag, colon, remainder = text.partition(":")
            if not colon:
                continue
            value = remainder.strip()

            if not in_term:
                if tag == "data-version":
                    version = value
                elif tag == "ontology" and not header_ontology:
                    header_ontology = value
                continue

            if tag == "id":
                pending["id"] = value
            elif tag == "name":
                pending["name"] = value
            elif tag == "is_a":
                # Drop the trailing "! comment", then keep the first token.
                before_comment = value.split("!", 1)[0].strip()
                parent = before_comment.split()[0] if before_comment else ""
                if ":" in parent:
                    pending["parents"].append(parent)
            elif tag == "relationship":
                fields = value.split("!", 1)[0].strip().split()
                if len(fields) >= 2 and ":" in fields[1]:
                    pending["relations"].append((fields[0], fields[1]))
            elif tag == "is_obsolete" and value.lower() == "true":
                pending["obsolete"] = True

    if in_term:
        commit(pending)

    graph_name = name or header_ontology or Path(path).stem
    return OntologyGraph(
        name=graph_name.removesuffix(".obo"),
        version=version,
        terms=terms,
    )
115
+
116
+
117
+ # --------------------------------------------------------------------------- #
118
+ # OWL / RDF-XML
119
+ # --------------------------------------------------------------------------- #
120
+ def _local(tag: str) -> str:
121
+ """Strip the namespace from an etree tag: ``{uri}label`` -> ``label``."""
122
+ return tag.rsplit("}", 1)[-1] if "}" in tag else tag
123
+
124
+
125
+ def _attr(elem: ET.Element, local_name: str) -> str | None:
126
+ """Fetch an attribute by local name, ignoring namespace prefix."""
127
+ for key, value in elem.attrib.items():
128
+ if _local(key) == local_name:
129
+ return value
130
+ return None
131
+
132
+
133
+ def _term_id_from_iri(iri: str) -> str:
134
+ """Derive a compact prefixed term id from an IRI.
135
+
136
+ Takes the fragment after ``#`` or the final ``/`` and restores the colon for
137
+ known prefixes (``HP_0001250`` -> ``HP:0001250``).
138
+ """
139
+ frag = iri.rsplit("#", 1)[-1]
140
+ frag = frag.rsplit("/", 1)[-1]
141
+ if "_" in frag:
142
+ prefix, _, rest = frag.partition("_")
143
+ if prefix in _KNOWN_PREFIXES and rest:
144
+ return f"{prefix}:{rest}"
145
+ return frag
146
+
147
+
148
def load_owl(path: str, name: str = "") -> OntologyGraph:
    """Parse the common RDF/XML subset of an OWL ontology (stdlib etree only).

    Every element whose local name is ``Class`` and which carries an
    ``rdf:about`` / ``rdf:ID`` attribute becomes a :class:`Term`:

    * the first non-empty ``label`` child is the term name;
    * a ``deprecated`` child with text ``true`` marks the term obsolete;
    * ``subClassOf`` with an ``rdf:resource`` attribute, or with a directly
      nested named ``Class`` element, adds an is_a parent;
    * ``Restriction`` elements under an anonymous ``subClassOf`` that have both
      ``onProperty`` and ``someValuesFrom`` resources add a typed relation.

    The same class IRI may appear on several elements: a full definition, bare
    ``<owl:Class rdf:about=.../>`` references nested inside other classes, or
    repeated declarations. These are merged rather than overwritten. The first
    non-empty name wins, parents and relations are unioned in first-seen
    order, and the term is obsolete if any declaration says so.

    The graph version is the first ``versionIRI`` resource or ``versionInfo``
    text found under an ``Ontology`` element. When ``name`` is empty the file
    stem is used; a trailing ``.owl`` is removed from the final name.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
    """
    root = ET.parse(path).getroot()

    version = ""
    terms: dict[str, Term] = {}

    for elem in root.iter():
        local = _local(elem.tag)

        if local == "Ontology":
            for child in elem:
                clocal = _local(child.tag)
                if clocal == "versionIRI":
                    res = _attr(child, "resource")
                    if res:
                        version = version or res
                elif clocal == "versionInfo":
                    if child.text and child.text.strip():
                        version = version or child.text.strip()
            continue

        if local != "Class":
            continue

        about = _attr(elem, "about") or _attr(elem, "ID")
        if not about:
            continue  # anonymous class expression, not a named term
        term_id = _term_id_from_iri(about)

        term_name = ""
        parents: list[str] = []
        relations: list[tuple[str, str]] = []
        obsolete = False

        for child in elem:
            clocal = _local(child.tag)
            if clocal == "label":
                if child.text and not term_name:
                    term_name = child.text.strip()
            elif clocal == "deprecated":
                if child.text and child.text.strip().lower() == "true":
                    obsolete = True
            elif clocal == "subClassOf":
                resource = _attr(child, "resource")
                if resource:
                    parents.append(_term_id_from_iri(resource))
                    continue
                # Named superclass written as a nested node element instead of
                # an rdf:resource attribute.
                for node in child:
                    if _local(node.tag) != "Class":
                        continue
                    nested = _attr(node, "about") or _attr(node, "ID")
                    if nested:
                        parents.append(_term_id_from_iri(nested))
                # Anonymous superclass: collect owl:Restriction relations.
                for restr in child.iter():
                    if _local(restr.tag) != "Restriction":
                        continue
                    rel_type = ""
                    target = ""
                    for rchild in restr:
                        rlocal = _local(rchild.tag)
                        if rlocal == "onProperty":
                            res = _attr(rchild, "resource")
                            if res:
                                # _local only changes the id if it still
                                # contains a "}".
                                rel_type = _local(_term_id_from_iri(res))
                        elif rlocal == "someValuesFrom":
                            res = _attr(rchild, "resource")
                            if res:
                                target = _term_id_from_iri(res)
                    if rel_type and target:
                        relations.append((rel_type, target))

        # Merge with any earlier declaration of the same id so that a bare
        # reference stub cannot erase a full definition (or vice versa).
        prior = terms.get(term_id)
        if prior is not None:
            term_name = prior.name or term_name
            parents = [*prior.parents, *parents]
            relations = [*prior.relations, *relations]
            obsolete = prior.obsolete or obsolete

        terms[term_id] = Term(
            id=term_id,
            name=term_name,
            parents=tuple(dict.fromkeys(parents)),
            relations=tuple(dict.fromkeys(relations)),
            obsolete=obsolete,
        )

    if not name:
        name = Path(path).stem
    name = name.removesuffix(".owl")
    return OntologyGraph(name=name, version=version, terms=terms)
228
+
229
+
230
def load_owl_dir(root: str, name: str = "", pattern: str = "*.rdf") -> OntologyGraph:
    """Load a multi-file OWL/RDF ontology (e.g. LKIF, FIBO) and merge it into one graph.

    If ``root`` is a directory, every file under it (recursively) matching
    ``pattern`` is loaded with :func:`load_owl` in sorted path order; otherwise
    ``root`` itself is loaded as a single file. Files that raise
    ``ET.ParseError`` are skipped.

    Merge rules when a term id occurs in more than one file: the first
    non-empty name is kept, parents and relations are unioned in first-seen
    order, and the merged term is obsolete only if both copies are. The graph
    version is the first non-empty version among the loaded files. An empty
    ``name`` defaults to the stem of ``root``.
    """
    base = Path(root)
    if base.is_dir():
        candidates = sorted(base.rglob(pattern))
    else:
        candidates = [base]

    combined: dict[str, Term] = {}
    version = ""
    for module_path in candidates:
        try:
            module = load_owl(str(module_path), name=name)
        except ET.ParseError:
            # Not RDF/XML, or a malformed module file: ignore it.
            continue

        if not version:
            version = module.version

        for term_id, incoming in module.terms.items():
            existing = combined.get(term_id)
            if existing is None:
                combined[term_id] = incoming
                continue
            all_parents = (*existing.parents, *incoming.parents)
            all_relations = (*existing.relations, *incoming.relations)
            combined[term_id] = Term(
                id=term_id,
                name=existing.name or incoming.name,
                parents=tuple(dict.fromkeys(all_parents)),
                relations=tuple(dict.fromkeys(all_relations)),
                obsolete=existing.obsolete and incoming.obsolete,
            )

    return OntologyGraph(name=name or base.stem, version=version, terms=combined)
@@ -0,0 +1,67 @@
1
+ """Loaders for MITRE CAPEC (attack patterns) and CWE (software weaknesses) XML.
2
+
3
+ Both ship as a flat list of entries with explicit ``ChildOf`` relations that form
4
+ a deep is-a hierarchy, plus other typed relations (CanPrecede/CanFollow/PeerOf,
5
+ and CAPEC->CWE ``exploits`` links). These enrich the cyber domain beyond ATT&CK's
6
+ shallow tactic/technique tree with real subsumption depth and cross-links.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import xml.etree.ElementTree as ET
11
+
12
+ from .graph import OntologyGraph, Term
13
+
14
# Per-kind parsing parameters unpacked by ``load_mitre_xml``. ``id_prefix`` is
# joined to each entry id with a hyphen to form the term id.
_KINDS = {
    # kind: (entry_tag, id_prefix, related_tag, id_attr)
    "capec": ("Attack_Pattern", "CAPEC", "Related_Attack_Pattern", "CAPEC_ID"),
    "cwe": ("Weakness", "CWE", "Related_Weakness", "CWE_ID"),
}
19
+
20
+
21
+ def _local(tag: str) -> str:
22
+ return tag.rsplit("}", 1)[-1] if "}" in tag else tag
23
+
24
+
25
def load_mitre_xml(path: str, kind: str, name: str = "") -> OntologyGraph:
    """Load a MITRE CAPEC or CWE XML catalogue into an :class:`OntologyGraph`.

    ``kind`` selects a row of ``_KINDS`` (``"capec"`` or ``"cwe"``). Each
    entry element with an ``ID`` attribute becomes the term ``<PREFIX>-<ID>``,
    named by its ``Name`` attribute and obsolete when its ``Status`` is
    ``deprecated`` or ``obsolete`` (case-insensitive). Related-entry elements
    with ``Nature="ChildOf"`` become is_a parents; any other non-empty
    ``Nature`` becomes a typed relation. For CAPEC, ``Related_Weakness``
    elements add ``("exploits", "CWE-<id>")`` relations. Duplicates are
    removed, keeping first-seen order. The graph version is the root element's
    ``Version`` attribute, and an empty ``name`` defaults to ``kind``.

    Raises:
        KeyError: if ``kind`` is not a key of ``_KINDS``.
    """
    entry_tag, prefix, related_tag, id_attr = _KINDS[kind]
    graph_name = name or kind
    doc_root = ET.parse(path).getroot()
    version = doc_root.get("Version", "")

    terms: dict[str, Term] = {}
    entries = (node for node in doc_root.iter() if _local(node.tag) == entry_tag)
    for entry in entries:
        entry_id = entry.get("ID")
        if not entry_id:
            continue

        status = (entry.get("Status", "") or "").lower()
        parents: list[str] = []
        relations: list[tuple[str, str]] = []

        for node in entry.iter():
            tag = _local(node.tag)
            if tag == related_tag:
                nature = node.get("Nature", "")
                target = node.get(id_attr)
                if not target:
                    continue
                linked = f"{prefix}-{target}"
                if nature == "ChildOf":
                    parents.append(linked)
                elif nature:
                    relations.append((nature, linked))
            elif tag == "Related_Weakness" and kind == "capec":
                cwe_id = node.get("CWE_ID")
                if cwe_id:
                    relations.append(("exploits", f"CWE-{cwe_id}"))

        term_id = f"{prefix}-{entry_id}"
        terms[term_id] = Term(
            id=term_id,
            name=entry.get("Name", ""),
            parents=tuple(dict.fromkeys(parents)),
            relations=tuple(dict.fromkeys(relations)),
            obsolete=status in ("deprecated", "obsolete"),
        )

    return OntologyGraph(name=graph_name, version=version, terms=terms)
60
+
61
+
62
def load_capec(path: str, name: str = "capec") -> OntologyGraph:
    """Load a CAPEC XML file; shorthand for ``load_mitre_xml`` with kind ``"capec"``."""
    return load_mitre_xml(path, kind="capec", name=name)
64
+
65
+
66
def load_cwe(path: str, name: str = "cwe") -> OntologyGraph:
    """Load a CWE XML file; shorthand for ``load_mitre_xml`` with kind ``"cwe"``."""
    return load_mitre_xml(path, kind="cwe", name=name)
sma/ontology/mount.py ADDED
@@ -0,0 +1,101 @@
1
+ """Mount a normalized :class:`OntologyGraph` onto SMA's matching machinery.
2
+
3
+ Mounting lifts an ontology's is_a hierarchy into a :class:`Canonicalizer`
4
+ predicate lattice (so structurally-distinct-but-related terms can ascend to a
5
+ shared ancestor during matching) and provides the case/index builders that turn
6
+ a set of present terms into an SMA :class:`Case`.
7
+
8
+ A term ``T`` present on a subject becomes the statement ``fid(T)(subject)``;
9
+ each typed relation ``(s, rel, o)`` whose *both* endpoints are present becomes
10
+ the higher-order statement ``rel(fid(s)(subject), fid(o)(subject))``.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from dataclasses import dataclass
16
+ from typing import Any, Iterable, Mapping
17
+
18
+ from sma.index.macfac import MacFacIndex
19
+ from sma.ir.canon import Canonicalizer
20
+ from sma.ir.schema import Case, entity, make_case, stmt
21
+ from sma.match.types import MatchConfig
22
+
23
+ from .graph import OntologyGraph
24
+ from .loader import fid
25
+
26
+
27
def _default_config() -> MatchConfig:
    """Build the default ontology matching config: ``delta=2`` (up to two is_a hops), ``rho=0.95``."""
    defaults = {"delta": 2, "rho": 0.95}
    return MatchConfig(**defaults)
30
+
31
+
32
@dataclass
class MountedOntology:
    """An :class:`OntologyGraph` paired with its :class:`Canonicalizer` and match config."""

    graph: OntologyGraph
    canon: Canonicalizer
    config: MatchConfig

    def build_case(
        self,
        term_ids: Iterable[str],
        subject: str = "subject",
        metadata: Mapping[str, Any] | None = None,
    ) -> Case:
        """Turn a set of present ``term_ids`` into a :class:`Case`.

        Ids missing from ``graph.terms`` are ignored. Every remaining id adds
        the statement ``fid(term)(subject)``. Every typed relation yielded by
        ``graph.typed_relations()`` whose subject and object are both present
        adds the higher-order statement
        ``rel(fid(s)(subject), fid(o)(subject))``.
        """
        known = self.graph.terms
        present = [term_id for term_id in term_ids if term_id in known]
        present_lookup = set(present)
        subj = entity(subject, subject)

        statements = [stmt(fid(term_id), subj) for term_id in present]
        statements.extend(
            stmt(rel, stmt(fid(src), subj), stmt(fid(dst), subj))
            for src, rel, dst in self.graph.typed_relations()
            if src in present_lookup and dst in present_lookup
        )
        return make_case(statements, metadata=metadata)

    def build_index(
        self,
        records: Iterable[tuple[str, Iterable[str], Mapping[str, Any] | None]],
        config: MatchConfig | None = None,
    ) -> MacFacIndex:
        """Index ``(key, term_ids, metadata)`` records in a :class:`MacFacIndex`.

        Each record is converted with :meth:`build_case`, using a copy of its
        metadata with ``"key"`` set to the record key. The returned index gets
        a ``key_of`` attribute mapping ``case_id -> key``. ``config`` overrides
        the mounted config when given.
        """
        cases: list[Case] = []
        key_of: dict[str, str] = {}
        for key, term_ids, metadata in records:
            enriched = {**(metadata or {}), "key": key}
            built = self.build_case(term_ids, metadata=enriched)
            cases.append(built)
            key_of[built.case_id] = key

        index = MacFacIndex(config=config or self.config, canon=self.canon)
        index.build(cases)
        index.key_of = key_of
        return index
88
+
89
+
90
def mount(graph: OntologyGraph, config: MatchConfig | None = None) -> MountedOntology:
    """Bind ``graph`` to a fresh :class:`Canonicalizer` and a match config.

    Each ``(child, parent)`` pair from :meth:`OntologyGraph.is_a_edges` is
    added to the canonicalizer's lattice as ``fid(child) -> fid(parent)``.
    When ``config`` is omitted, ``_default_config()`` supplies
    ``MatchConfig(delta=2, rho=0.95)``.
    """
    cfg = config if config else _default_config()
    canon = Canonicalizer()
    for child_id, parent_id in graph.is_a_edges():
        canon.lattice.add(fid(child_id), fid(parent_id))
    return MountedOntology(graph=graph, canon=canon, config=cfg)
@@ -0,0 +1,75 @@
1
+ """Complete RDF loader using rdflib (for ontologies our xml.etree subset misses).
2
+
3
+ Unlike :func:`sma.ontology.loader.load_owl` (a stdlib RDF/XML subset), this uses
4
+ rdflib to fully parse OWL — including Turtle, restrictions, and the complete class
5
+ graph — for ontologies like FIBO whose power lives in typed relations expressed
6
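+ via ``owl:Restriction``. rdflib is imported lazily inside ``load_rdflib``, so this module imports without it; calling the loader without rdflib installed raises ``ImportError``.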
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from pathlib import Path
11
+
12
+ from .graph import OntologyGraph, Term
13
+ from .loader import _KNOWN_PREFIXES
14
+
15
+
16
+ def _compact(uri: str) -> str:
17
+ frag = str(uri).rsplit("#", 1)[-1].rsplit("/", 1)[-1]
18
+ if "_" in frag:
19
+ prefix, _, rest = frag.partition("_")
20
+ if prefix in _KNOWN_PREFIXES and rest:
21
+ return f"{prefix}:{rest}"
22
+ return frag
23
+
24
+
25
def load_rdflib(path: str, name: str = "", fmt: str | None = None,
                pattern: str = "*.rdf") -> OntologyGraph:
    """Load an OWL/RDF ontology (file, Turtle, or a directory of RDF files) via rdflib.

    If ``path`` is a directory, every file under it (recursively) matching
    ``pattern`` is parsed into one rdflib graph; otherwise ``path`` is parsed
    as a single file. ``fmt`` is passed to rdflib as the ``format`` argument
    when given. Files that fail to parse are skipped with a logged warning.

    Every subject typed ``owl:Class`` or ``rdfs:Class`` that is not a blank
    node becomes a :class:`Term`:

    * the name is the ``rdfs:label``, else ``dcterms:title``, else ``dc:title``;
    * non-blank ``rdfs:subClassOf`` objects become is_a parents;
    * blank-node superclasses with ``owl:onProperty`` and a non-blank
      ``owl:someValuesFrom`` / ``owl:allValuesFrom`` become typed relations;
    * the term is obsolete when ``owl:deprecated`` has the lexical value
      ``true`` or ``1``.

    The graph version is the first ``owl:versionIRI`` object found. An empty
    ``name`` defaults to the stem of ``path``.

    Raises:
        ImportError: if rdflib is not installed.
    """
    import logging

    try:
        import rdflib
        from rdflib import OWL, RDF, RDFS
        from rdflib.namespace import DC, DCTERMS
    except ImportError as exc:
        raise ImportError("load_rdflib requires the optional 'rdflib' package") from exc

    log = logging.getLogger(__name__)

    # FIBO metadata carries malformed xsd:dateTime literals; rdflib logs a
    # traceback per occurrence but parses fine. Silence that noise. This
    # changes the level of the shared "rdflib.term" logger and is not undone.
    logging.getLogger("rdflib.term").setLevel(logging.CRITICAL)

    g = rdflib.Graph()
    root = Path(path)
    files = sorted(root.rglob(pattern)) if root.is_dir() else [root]
    parse_kwargs = {"format": fmt} if fmt else {}
    for f in files:
        try:
            g.parse(str(f), **parse_kwargs)
        except Exception as exc:
            # Best-effort: one unreadable module must not abort the whole
            # load, but the skip is reported instead of being swallowed.
            log.warning("Skipping unparseable ontology file %s: %s", f, exc)

    # Collect named owl:Class (and rdfs:Class) subjects with a real IRI.
    terms: dict[str, Term] = {}
    classes = set(g.subjects(RDF.type, OWL.Class)) | set(g.subjects(RDF.type, RDFS.Class))
    for cls in classes:
        if isinstance(cls, rdflib.BNode):
            continue
        tid = _compact(cls)
        label = g.value(cls, RDFS.label) or g.value(cls, DCTERMS.title) or g.value(cls, DC.title)
        parents: list[str] = []
        relations: list[tuple[str, str]] = []
        for sup in g.objects(cls, RDFS.subClassOf):
            if isinstance(sup, rdflib.BNode):
                # owl:Restriction -> typed relation (onProperty + someValuesFrom).
                prop = g.value(sup, OWL.onProperty)
                tgt = g.value(sup, OWL.someValuesFrom) or g.value(sup, OWL.allValuesFrom)
                if prop is not None and tgt is not None and not isinstance(tgt, rdflib.BNode):
                    relations.append((_compact(prop), _compact(tgt)))
            else:
                parents.append(_compact(sup))
        # Compare the lexical form rather than relying on truthiness, so that
        # an untyped "false" literal is not read as deprecated.
        deprecated = g.value(cls, OWL.deprecated)
        obsolete = (
            deprecated is not None
            and str(deprecated).strip().lower() in ("true", "1")
        )
        terms[tid] = Term(
            id=tid,
            name=str(label) if label else "",
            parents=tuple(dict.fromkeys(parents)),
            relations=tuple(dict.fromkeys(relations)),
            obsolete=obsolete,
        )

    version = next((str(o) for o in g.objects(None, OWL.versionIRI)), "")
    if not name:
        name = root.stem
    return OntologyGraph(name=name, version=version, terms=terms)
@@ -0,0 +1,115 @@
1
+ """A small registry of named ontologies that loads and mounts them on demand.
2
+
3
+ Each :class:`OntologyEntry` records where an ontology lives on disk and how to
4
+ parse it; :meth:`OntologyRegistry.get` lazily loads the file into an
5
+ :class:`~sma.ontology.graph.OntologyGraph`, mounts it (building the predicate
6
+ lattice + match index machinery), and caches the result so repeated lookups are
7
+ cheap.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+ from typing import TYPE_CHECKING
15
+
16
+ from .loader import load_ontology
17
+
18
+ if TYPE_CHECKING: # pragma: no cover - typing only
19
+ from .mount import MountedOntology
20
+ from sma.match.types import MatchConfig
21
+
22
+
23
+ def _infer_format(path: str) -> str:
24
+ """Infer the on-disk format from a path's extension.
25
+
26
+ ``.obo`` -> ``"obo"``; ``.owl``/``.owl.xml``/``.rdf``/``.xml`` -> ``"owl"``.
27
+ """
28
+ lower = str(path).lower()
29
+ if lower.endswith(".obo"):
30
+ return "obo"
31
+ if (
32
+ lower.endswith(".owl")
33
+ or lower.endswith(".owl.xml")
34
+ or lower.endswith(".rdf")
35
+ or lower.endswith(".xml")
36
+ ):
37
+ return "owl"
38
+ raise ValueError(f"Cannot infer ontology format from extension: {path}")
39
+
40
+
41
@dataclass
class OntologyEntry:
    """A registered ontology: its name, source path, format, and version."""

    name: str  # registry key the ontology was registered under
    path: str  # location of the ontology file, stored as a string
    format: str  # format name, either passed explicitly or inferred from the extension
    version: str = ""  # empty until supplied at registration or filled in after loading
49
+
50
+
51
class OntologyRegistry:
    """A name-keyed collection of ontologies, loaded and mounted on demand."""

    def __init__(self) -> None:
        self._entries: dict[str, OntologyEntry] = {}
        self._order: list[str] = []  # names in first-registration order
        self._mounted: dict[str, "MountedOntology"] = {}  # cache filled by get()

    def register(
        self,
        name: str,
        path: str,
        fmt: str | None = None,
        version: str | None = None,
    ) -> OntologyEntry:
        """Register an ontology under ``name`` and return its entry.

        ``fmt`` is inferred from the file extension when omitted (raising
        ``ValueError`` if the extension is not recognised). Re-registering a
        name replaces its entry, keeps its original position in
        :meth:`list`, and drops any cached mounted ontology.
        """
        resolved_fmt = fmt if fmt is not None else _infer_format(path)
        entry = OntologyEntry(
            name=name,
            path=str(path),
            format=resolved_fmt,
            version=version or "",
        )
        if name not in self._entries:
            self._order.append(name)
        self._entries[name] = entry
        self._mounted.pop(name, None)
        return entry

    def get(self, name: str, config: "MatchConfig | None" = None) -> "MountedOntology":
        """Load + mount the named ontology, caching the result.

        The mounted ontology is cached on first access; subsequent calls return
        the same object without re-reading the file. ``config`` is therefore
        only applied by the call that performs the load.

        The parser is chosen from the entry's registered ``format`` (``"obo"``
        or ``"owl"``, case-insensitive), so an explicit ``fmt`` passed to
        :meth:`register` is honoured even when the path has a non-standard
        extension. Any other format value falls back to extension dispatch
        via ``load_ontology``. If the entry has no version, the version read
        from the file is stored on it.

        Raises:
            KeyError: if ``name`` was never registered.
        """
        cached = self._mounted.get(name)
        if cached is not None:
            return cached

        try:
            entry = self._entries[name]
        except KeyError:
            raise KeyError(f"No ontology registered under {name!r}") from None

        # Import lazily so importing the registry does not pull in the index
        # and match modules that mount.py imports.
        from .loader import load_obo, load_owl
        from .mount import mount

        loaders = {"obo": load_obo, "owl": load_owl}
        loader = loaders.get(str(entry.format).lower(), load_ontology)
        graph = loader(entry.path, name=entry.name)
        if not entry.version and graph.version:
            entry.version = graph.version
        mounted = mount(graph, config=config)
        self._mounted[name] = mounted
        return mounted

    def list(self) -> list[OntologyEntry]:
        """Return registered entries in registration order."""
        return [self._entries[name] for name in self._order]

    def __contains__(self, name: object) -> bool:
        """Return True when ``name`` has been registered."""
        return name in self._entries