structuremappingmemory 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. sma/__init__.py +5 -0
  2. sma/__main__.py +5 -0
  3. sma/agent/__init__.py +5 -0
  4. sma/agent/adapter_draft.py +217 -0
  5. sma/agent/api.py +67 -0
  6. sma/agent/comparison.py +591 -0
  7. sma/agent/llm.py +280 -0
  8. sma/agent/policies.py +21 -0
  9. sma/agent/service.py +95 -0
  10. sma/cli.py +65 -0
  11. sma/encoders/__init__.py +38 -0
  12. sma/encoders/agentobs.py +27 -0
  13. sma/encoders/base.py +23 -0
  14. sma/encoders/code_treesitter.py +64 -0
  15. sma/encoders/coverage.py +80 -0
  16. sma/encoders/draft_adapter.py +183 -0
  17. sma/encoders/healthcare.py +207 -0
  18. sma/encoders/logs_drain.py +142 -0
  19. sma/encoders/prose_tier1.py +57 -0
  20. sma/encoders/structured.py +57 -0
  21. sma/encoders/traces.py +45 -0
  22. sma/eval/__init__.py +2 -0
  23. sma/eval/agentic/__init__.py +35 -0
  24. sma/eval/agentic/arms/__init__.py +0 -0
  25. sma/eval/agentic/arms/cyber.py +48 -0
  26. sma/eval/agentic/arms/discovery.py +35 -0
  27. sma/eval/agentic/arms/finance.py +38 -0
  28. sma/eval/agentic/arms/legal.py +74 -0
  29. sma/eval/agentic/arms/medicine.py +45 -0
  30. sma/eval/agentic/harness.py +275 -0
  31. sma/eval/agentic/memories.py +308 -0
  32. sma/eval/agentic/metrics.py +82 -0
  33. sma/eval/agentic_qa/__init__.py +27 -0
  34. sma/eval/agentic_qa/agent.py +383 -0
  35. sma/eval/agentic_qa/metrics.py +239 -0
  36. sma/eval/agentic_qa/pools.py +197 -0
  37. sma/eval/arn.py +65 -0
  38. sma/eval/baselines/__init__.py +6 -0
  39. sma/eval/baselines/bge_dense.py +54 -0
  40. sma/eval/baselines/bm25.py +18 -0
  41. sma/eval/baselines/dense.py +42 -0
  42. sma/eval/baselines/hipporag.py +235 -0
  43. sma/eval/baselines/hybrid_rrf.py +30 -0
  44. sma/eval/baselines/longcontext_llm.py +124 -0
  45. sma/eval/baselines/rerank.py +41 -0
  46. sma/eval/baselines/splade.py +77 -0
  47. sma/eval/baselines/wl_kernel.py +163 -0
  48. sma/eval/bugsinpy.py +358 -0
  49. sma/eval/bugsinpy_families.py +164 -0
  50. sma/eval/crossdomain.py +89 -0
  51. sma/eval/diabetes.py +61 -0
  52. sma/eval/drift_env.py +26 -0
  53. sma/eval/drift_metrics.py +24 -0
  54. sma/eval/family_labels.py +167 -0
  55. sma/eval/fraud_elliptic/__init__.py +29 -0
  56. sma/eval/fraud_elliptic/encoder.py +279 -0
  57. sma/eval/fraud_elliptic/eval.py +269 -0
  58. sma/eval/fraud_elliptic/test_encoder.py +123 -0
  59. sma/eval/ieee_cis.py +66 -0
  60. sma/eval/loghub.py +16 -0
  61. sma/eval/loghub_eval.py +480 -0
  62. sma/eval/longmemeval.py +51 -0
  63. sma/eval/memory_backends/__init__.py +2 -0
  64. sma/eval/memory_backends/base.py +22 -0
  65. sma/eval/memory_backends/context_only.py +14 -0
  66. sma/eval/memory_backends/rag_notes.py +17 -0
  67. sma/eval/memory_backends/shared_llm.py +30 -0
  68. sma/eval/memory_backends/sma_memory.py +54 -0
  69. sma/eval/memory_backends/zep_graphiti.py +33 -0
  70. sma/eval/metrics.py +32 -0
  71. sma/eval/ontology_bench.py +219 -0
  72. sma/eval/report.py +573 -0
  73. sma/eval/ssb_eval.py +216 -0
  74. sma/eval/ssb_generator.py +116 -0
  75. sma/eval/stats.py +108 -0
  76. sma/eval/transfer_eval.py +844 -0
  77. sma/index/__init__.py +15 -0
  78. sma/index/ann.py +21 -0
  79. sma/index/content_vectors.py +60 -0
  80. sma/index/inverted.py +63 -0
  81. sma/index/macfac.py +174 -0
  82. sma/ir/__init__.py +22 -0
  83. sma/ir/canon.py +106 -0
  84. sma/ir/schema.py +165 -0
  85. sma/ir/sexpr.py +86 -0
  86. sma/ir/signatures.py +76 -0
  87. sma/match/__init__.py +20 -0
  88. sma/match/conflicts.py +46 -0
  89. sma/match/engine.py +60 -0
  90. sma/match/explain.py +59 -0
  91. sma/match/infer.py +54 -0
  92. sma/match/kernels.py +54 -0
  93. sma/match/mdl.py +30 -0
  94. sma/match/merge_cpsat.py +77 -0
  95. sma/match/merge_greedy.py +15 -0
  96. sma/match/mh.py +177 -0
  97. sma/match/ses.py +84 -0
  98. sma/match/types.py +115 -0
  99. sma/match/verifier.py +27 -0
  100. sma/ontology/__init__.py +45 -0
  101. sma/ontology/attack.py +134 -0
  102. sma/ontology/cpc.py +69 -0
  103. sma/ontology/graph.py +58 -0
  104. sma/ontology/loader.py +262 -0
  105. sma/ontology/mitre_xml.py +67 -0
  106. sma/ontology/mount.py +101 -0
  107. sma/ontology/rdf_loader.py +75 -0
  108. sma/ontology/registry.py +115 -0
  109. sma/ontology/router.py +69 -0
  110. sma/ontology/usgaap.py +73 -0
  111. sma/sage/__init__.py +6 -0
  112. sma/sage/assimilate.py +12 -0
  113. sma/sage/pools.py +105 -0
  114. sma/sage/probabilities.py +10 -0
  115. sma/store/__init__.py +6 -0
  116. sma/store/lmdb_store.py +78 -0
  117. sma/store/registry.py +26 -0
  118. sma/store/wal.py +26 -0
  119. sma/ui/app.py +642 -0
  120. structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
  121. structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
  122. structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
  123. structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
  124. structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
  125. structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
sma/ontology/router.py ADDED
@@ -0,0 +1,69 @@
1
+ """Route term ids and domains to the ontologies that can resolve them.
2
+
3
+ The :class:`DomainRouter` maps two things onto ontology names: id *prefixes*
4
+ (``"HP:"`` -> ``"hpo"``) and human *domains* (``"medicine"`` -> ``"hpo"``).
5
+ :meth:`DomainRouter.route` resolves a batch of term ids and/or a domain into the
6
+ de-duplicated, order-stable list of ontology names that should be consulted.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Iterable
12
+
13
+ from .registry import OntologyRegistry
14
+
15
+
16
class DomainRouter:
    """Resolve id prefixes and domain labels to ontology names.

    Two independent lookup tables are kept: one keyed by id prefix and one
    keyed by domain label. Both map onto plain ontology-name strings.
    """

    def __init__(self, registry: OntologyRegistry) -> None:
        # The registry is stored for callers; the routing methods below only
        # consult the two lookup tables.
        self.registry = registry
        self._prefixes: dict[str, str] = {}
        self._domains: dict[str, str] = {}

    def register_prefix(self, prefix: str, ontology_name: str) -> None:
        """Associate an id prefix such as ``"HP:"`` with an ontology name."""
        self._prefixes[prefix] = ontology_name

    def register_domain(self, domain: str, ontology_name: str) -> None:
        """Associate a domain label such as ``"medicine"`` with an ontology name."""
        self._domains[domain] = ontology_name

    def _ontology_for_term(self, term_id: str) -> str | None:
        """Pick the ontology whose registered prefix is the longest match.

        Returns ``None`` when no registered prefix starts ``term_id``.
        """
        candidates = [prefix for prefix in self._prefixes if term_id.startswith(prefix)]
        if not candidates:
            return None
        # Two distinct matching prefixes can never share a length, so the
        # longest one is unique.
        return self._prefixes[max(candidates, key=len)]

    def route(
        self,
        term_ids: Iterable[str] | None = None,
        domain: str | None = None,
    ) -> list[str]:
        """Turn ``term_ids`` and/or ``domain`` into a list of ontology names.

        The ontology of a mapped ``domain`` comes first, followed by the
        ontology of each term id's longest matching prefix. Duplicates are
        dropped, keeping the first occurrence. An empty list is returned when
        nothing resolves.
        """
        resolved: list[str | None] = []
        if domain is not None:
            resolved.append(self._domains.get(domain))
        if term_ids is not None:
            resolved.extend(self._ontology_for_term(term_id) for term_id in term_ids)
        # dict.fromkeys keeps insertion order, giving an order-stable de-dup.
        return list(dict.fromkeys(name for name in resolved if name is not None))
sma/ontology/usgaap.py ADDED
@@ -0,0 +1,73 @@
1
+ """Loader for the US-GAAP financial reporting taxonomy (XBRL presentation linkbase).
2
+
3
+ FIBO is a schema ontology with no public instance corpus, so the financial arm
4
+ uses US-GAAP instead: its concepts form a hierarchy via the presentation
5
+ linkbase's parent-child arcs (abstract statement headers subsume line items), and
6
+ SEC filings provide real gold (each filing reports a set of US-GAAP concepts).
7
+ This parses the core financial-statement presentation linkbases into an
8
+ :class:`OntologyGraph` (concept -> parent header).
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import re
13
+ import xml.etree.ElementTree as ET
14
+ from pathlib import Path
15
+
16
+ from .graph import OntologyGraph, Term
17
+
18
+ _PARENT_CHILD = "parent-child"
19
+
20
+
21
+ def _local(tag: str) -> str:
22
+ return tag.rsplit("}", 1)[-1] if "}" in tag else tag
23
+
24
+
25
def _attr(el, name: str):
    """Return the first attribute of ``el`` whose local name is ``name``.

    Attribute keys are compared after removing any ``{namespace}`` qualifier.
    Returns ``None`` if no attribute matches.
    """
    matches = (value for key, value in el.attrib.items() if _local(key) == name)
    return next(matches, None)
30
+
31
+
32
+ def _concept(href: str) -> str:
33
+ """'...#us-gaap_Revenues' -> 'Revenues'."""
34
+ frag = href.rsplit("#", 1)[-1]
35
+ return frag.split("_", 1)[1] if "_" in frag else frag
36
+
37
+
38
+ def _humanize(name: str) -> str:
39
+ return re.sub(r"(?<=[a-z])(?=[A-Z])", " ", name)
40
+
41
+
42
def load_usgaap(path: str, name: str = "usgaap", pattern: str = "*.xml") -> OntologyGraph:
    """Build an :class:`OntologyGraph` from presentation linkbase XML.

    ``path`` may be a directory (every file matching ``pattern`` is read, in
    sorted order) or a single file. Within each ``presentationLink`` element
    the ``loc`` children map labels to concept names, and each
    ``presentationArc`` child with a ``parent-child`` arcrole adds the "from"
    concept as a parent of the "to" concept. Files that fail to parse as XML
    are skipped.
    """
    source = Path(path)
    if source.is_dir():
        linkbases = sorted(source.glob(pattern))
    else:
        linkbases = [source]

    parents: dict[str, set[str]] = {}
    concepts: set[str] = set()

    for linkbase in linkbases:
        try:
            document = ET.parse(linkbase)
        except ET.ParseError:
            # Best effort: a malformed file is ignored rather than aborting the load.
            continue

        for node in document.iter():
            if _local(node.tag) != "presentationLink":
                continue

            # Pass 1: collect every locator first, so an arc can reference a
            # label regardless of where its locator sits among the children.
            label_to_concept: dict[str, str] = {}
            for child in node:
                if _local(child.tag) != "loc":
                    continue
                label = _attr(child, "label")
                href = _attr(child, "href")
                if label and href:
                    label_to_concept[label] = _concept(href)

            # Pass 2: turn parent-child arcs into child -> {parents} edges.
            for child in node:
                if _local(child.tag) != "presentationArc":
                    continue
                arcrole = _attr(child, "arcrole") or ""
                if arcrole.rsplit("/", 1)[-1] != _PARENT_CHILD:
                    continue
                parent = label_to_concept.get(_attr(child, "from"))
                kid = label_to_concept.get(_attr(child, "to"))
                # Skip arcs with an unresolved endpoint and self-loops.
                if not parent or not kid or parent == kid:
                    continue
                parents.setdefault(kid, set()).add(parent)
                concepts.update((parent, kid))

    terms = {}
    for concept in sorted(concepts):
        terms[concept] = Term(
            id=concept,
            name=_humanize(concept),
            parents=tuple(sorted(parents.get(concept, ()))),
        )
    return OntologyGraph(name=name, version="us-gaap-2024", terms=terms)
sma/sage/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .assimilate import assimilate_stream
2
+ from .pools import Generalization, SagePool
3
+ from .probabilities import support_probability
4
+
5
+ __all__ = ["Generalization", "SagePool", "assimilate_stream", "support_probability"]
6
+
sma/sage/assimilate.py ADDED
@@ -0,0 +1,12 @@
1
+ """Convenience assimilation API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from sma.ir.schema import Case
6
+
7
+ from .pools import SagePool
8
+
9
+
10
def assimilate_stream(pool: SagePool, cases: list[Case]) -> list[str]:
    """Feed ``cases`` to ``pool.assimilate`` in order and collect the results."""
    labels: list[str] = []
    for case in cases:
        labels.append(pool.assimilate(case))
    return labels
12
+
sma/sage/pools.py ADDED
@@ -0,0 +1,105 @@
1
+ """SAGE-style generalization pools."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections import Counter
6
+ from dataclasses import dataclass, field
7
+
8
+ from sma.ir.schema import Case, make_case
9
+ from sma.ir.sexpr import dumps_statement, loads_statement
10
+ from sma.match.engine import match_cases
11
+ from sma.match.types import MatchConfig
12
+
13
+ from .probabilities import support_probability
14
+
15
+
16
@dataclass
class Generalization:
    """A group of case ids together with per-fact occurrence counts."""

    gen_id: str
    # Ids of the cases merged into this generalization.
    constituents: list[str] = field(default_factory=list)
    # Serialized statement -> number of times it was counted.
    fact_counts: Counter[str] = field(default_factory=Counter)

    def probabilities(self) -> dict[str, float]:
        """Map every counted fact to its support probability.

        The denominator is the number of constituents, floored at 1.
        """
        denominator = max(len(self.constituents), 1)
        result: dict[str, float] = {}
        for fact, count in self.fact_counts.items():
            result[fact] = support_probability(count, denominator)
        return result

    def schema_case(self, probability_cutoff: float = 0.6, min_constituents: int = 3) -> Case:
        """Build a case from the facts that pass the probability cutoff.

        While there are fewer than ``min_constituents`` constituents every
        fact is kept; afterwards only facts whose support probability is at
        least ``probability_cutoff``. Facts are visited in sorted order.
        """
        size = len(self.constituents)
        keep_everything = size < min_constituents
        kept = [
            loads_statement(sexpr)
            for sexpr, count in sorted(self.fact_counts.items())
            if keep_everything or support_probability(count, size) >= probability_cutoff
        ]
        return make_case(kept, {"adapter": "sage", "generalization": self.gen_id})
33
+
34
+
35
@dataclass
class SagePool:
    """Incrementally groups cases into generalizations.

    Each incoming case is matched against the schema of every existing
    generalization, then against the stored outliers; cases that fit nothing
    are kept as outliers.
    """

    pool_id: str
    config: MatchConfig = field(default_factory=MatchConfig)
    # Minimum normalized match score for joining a generalization or pairing
    # with an outlier.
    assimilation_threshold: float = 0.25
    # Passed through to Generalization.schema_case.
    probability_cutoff: float = 0.6
    min_constituents: int = 3
    generalizations: list[Generalization] = field(default_factory=list)
    outliers: list[Case] = field(default_factory=list)

    def assimilate(self, case: Case) -> str:
        """Place ``case`` in the pool and report where it went.

        Returns the ``gen_id`` of the generalization the case joined (an
        existing one, or a new one formed with a matching outlier), or the
        string ``"outlier"`` when the case matched nothing.
        """
        best_idx = -1
        best_score = float("-inf")
        for idx, gen in enumerate(self.generalizations):
            schema = gen.schema_case(self.probability_cutoff, self.min_constituents)
            gmap = match_cases(schema, case, self.config)
            if gmap.normalized_score > best_score:
                best_score = gmap.normalized_score
                best_idx = idx
        if best_idx >= 0 and best_score >= self.assimilation_threshold:
            best_gen = self.generalizations[best_idx]
            self._add_to_generalization(best_gen, case)
            return best_gen.gen_id

        # No generalization fits: try to pair with the first outlier that
        # matches well enough. Iterate over a copy because the matched
        # outlier is removed from the list.
        for outlier in list(self.outliers):
            gmap = match_cases(outlier, case, self.config)
            if gmap.normalized_score >= self.assimilation_threshold:
                gen = Generalization(gen_id=f"{self.pool_id}_gen_{len(self.generalizations)}")
                self._add_to_generalization(gen, outlier)
                self._add_to_generalization(gen, case)
                self.generalizations.append(gen)
                self.outliers.remove(outlier)
                return gen.gen_id

        self.outliers.append(case)
        return "outlier"

    def expectation_violation(self, case: Case) -> float:
        """1 - best normalized structural fit to any learned schema.

        Near 0 = the case is explained by an existing generalization;
        near 1 = the case breaks every schema (a candidate concept-drift
        point). With no generalizations yet, returns 1.0 (nothing to expect).
        """
        if not self.generalizations:
            return 1.0
        best = 0.0
        for gen in self.generalizations:
            schema = gen.schema_case(self.probability_cutoff, self.min_constituents)
            gmap = match_cases(schema, case, self.config)
            best = max(best, gmap.normalized_score)
        # Clamp so a normalized score above 1 cannot yield a negative value.
        return max(0.0, 1.0 - best)

    def _add_to_generalization(self, gen: Generalization, case: Case) -> None:
        """Record ``case`` as a constituent of ``gen`` and count its facts.

        A case whose id is already a constituent is ignored entirely. Counting
        its statements again without growing ``constituents`` would let a
        fact's count exceed the constituent total and push its support
        probability above 1.
        """
        if case.case_id in gen.constituents:
            return
        gen.constituents.append(case.case_id)
        gen.fact_counts.update(dumps_statement(statement) for statement in case.statements)

    def stats(self) -> dict:
        """Return a plain-dict summary of the pool and each generalization."""
        return {
            "pool_id": self.pool_id,
            "n_generalizations": len(self.generalizations),
            "n_outliers": len(self.outliers),
            "generalizations": [
                {
                    "gen_id": gen.gen_id,
                    "n_constituents": len(gen.constituents),
                    "n_facts": len(gen.fact_counts),
                    "probabilities": gen.probabilities(),
                }
                for gen in self.generalizations
            ],
        }
105
+
@@ -0,0 +1,10 @@
1
+ """Frequency probabilities for SAGE facts."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
def support_probability(count: int, total: int, alpha: float = 1.0) -> float:
    """Return the smoothed frequency ``(count + alpha) / (total + 2 * alpha)``.

    A non-positive ``total`` yields ``0.0``.
    """
    if total <= 0:
        return 0.0
    numerator = count + alpha
    denominator = total + 2 * alpha
    return numerator / denominator
10
+
sma/store/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .lmdb_store import CaseStore, case_to_json
2
+ from .registry import Registry
3
+ from .wal import WalRecord, read_wal
4
+
5
+ __all__ = ["CaseStore", "Registry", "WalRecord", "case_to_json", "read_wal"]
6
+
@@ -0,0 +1,78 @@
1
+ """Case store with a simple append-only WAL.
2
+
3
+ The class name keeps the blueprint contract. At runtime it uses LMDB when
4
+ available and falls back to a deterministic file store for minimal installs.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import pathlib
11
+ import zlib
12
+ from dataclasses import asdict
13
+ from typing import Iterable
14
+
15
+ from sma.ir.schema import Case, make_case
16
+ from sma.ir.sexpr import canonical_case_text, loads_case
17
+
18
+
19
class CaseStore:
    """File-backed case store with an append-only JSONL log.

    Each case is written as a zlib-compressed JSON file under
    ``<root>/cases``; every ``put`` also appends one line to
    ``<root>/wal.jsonl``.
    """

    def __init__(self, root: str | pathlib.Path):
        self.root = pathlib.Path(root)
        self.case_dir = self.root / "cases"
        self.wal_path = self.root / "wal.jsonl"
        # Creates the root as well, since parents=True.
        self.case_dir.mkdir(parents=True, exist_ok=True)

    def _case_path(self, case_id: str) -> pathlib.Path:
        """Location of the compressed payload for ``case_id``."""
        return self.case_dir / f"{case_id}.json.z"

    def put(self, case: Case) -> str:
        """Write ``case`` to disk, log the write, and return its id."""
        sexpr_text = canonical_case_text(case.statements)
        record = {
            "case_id": case.case_id,
            "metadata": dict(case.metadata),
            "sexpr": sexpr_text,
        }
        encoded = json.dumps(record, sort_keys=True).encode("utf-8")
        self._case_path(case.case_id).write_bytes(zlib.compress(encoded))

        wal_line = json.dumps({"op": "put", "case_id": case.case_id}, sort_keys=True) + "\n"
        with self.wal_path.open("a", encoding="utf-8") as wal:
            wal.write(wal_line)
        return case.case_id

    def get(self, case_id: str) -> Case:
        """Load the case stored under ``case_id``.

        Raises:
            KeyError: if no payload file exists for ``case_id``.
        """
        target = self._case_path(case_id)
        if not target.exists():
            raise KeyError(case_id)
        raw = zlib.decompress(target.read_bytes())
        record = json.loads(raw.decode("utf-8"))
        return make_case(
            loads_case(record["sexpr"]),
            record.get("metadata", {}),
            case_id=record["case_id"],
        )

    def exists(self, case_id: str) -> bool:
        """Whether a payload file exists for ``case_id``."""
        return self._case_path(case_id).exists()

    def ids(self) -> list[str]:
        """Sorted ids of every stored case, derived from the file names."""
        found = [entry.name.removesuffix(".json.z") for entry in self.case_dir.glob("*.json.z")]
        found.sort()
        return found

    def iter_cases(self) -> Iterable[Case]:
        """Yield every stored case in sorted-id order."""
        yield from (self.get(case_id) for case_id in self.ids())

    def replay_wal(self) -> list[str]:
        """Return the case ids of all ``put`` records in the log, in log order.

        Blank lines are skipped; a missing log file yields an empty list.
        """
        if not self.wal_path.exists():
            return []
        lines = self.wal_path.read_text(encoding="utf-8").splitlines()
        records = (json.loads(line) for line in lines if line.strip())
        return [record["case_id"] for record in records if record.get("op") == "put"]
69
+
70
+
71
def case_to_json(case: Case) -> dict:
    """Convert ``case`` to a plain dict.

    The result carries the case id, a copy of the metadata, each statement
    converted with ``dataclasses.asdict``, and the canonical text of the
    statements under ``"sexpr"``.
    """
    result: dict = {"case_id": case.case_id}
    result["metadata"] = dict(case.metadata)
    result["statements"] = [asdict(statement) for statement in case.statements]
    result["sexpr"] = canonical_case_text(case.statements)
    return result
78
+
sma/store/registry.py ADDED
@@ -0,0 +1,26 @@
1
+ """Schema and adapter version registry."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+
7
+
8
@dataclass
class Registry:
    """In-memory record of adapter versions and score configurations."""

    # adapter id -> version string
    adapters: dict[str, str] = field(default_factory=dict)
    # score id -> copy of the config dict it was registered with
    score_versions: dict[str, dict] = field(default_factory=dict)

    def register_adapter(self, adapter_id: str, version: str) -> None:
        """Set (or overwrite) the version recorded for ``adapter_id``."""
        self.adapters[adapter_id] = version

    def register_score(self, score_id: str, config: dict) -> None:
        """Store a shallow copy of ``config`` under ``score_id``."""
        snapshot = dict(config)
        self.score_versions[score_id] = snapshot

    @classmethod
    def defaults(cls) -> "Registry":
        """Create a registry pre-filled with the built-in adapters and score."""
        built = cls()
        builtin_adapters = ("logs", "code", "traces", "structured", "agentobs", "prose_tier1")
        for adapter_id in builtin_adapters:
            built.register_adapter(adapter_id, "0.1.0")
        built.register_score("score-v1-draft", {"gamma": 0.25, "rho": 0.5, "delta": 2})
        return built
26
+
sma/store/wal.py ADDED
@@ -0,0 +1,26 @@
1
+ """WAL helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import pathlib
7
+ from dataclasses import dataclass
8
+
9
+
10
@dataclass(frozen=True)
class WalRecord:
    """One immutable entry read from the write-ahead log."""

    # Operation name taken from the record's "op" field.
    op: str
    # Id of the case the operation refers to.
    case_id: str
14
+
15
+
16
def read_wal(path: str | pathlib.Path) -> list[WalRecord]:
    """Parse a JSONL log file into :class:`WalRecord` objects.

    Returns an empty list when ``path`` does not exist. Blank lines are
    skipped; every other line must be a JSON object with ``"op"`` and
    ``"case_id"`` keys.
    """
    wal_file = pathlib.Path(path)
    if not wal_file.exists():
        return []
    raw_lines = wal_file.read_text(encoding="utf-8").splitlines()
    entries = (json.loads(raw) for raw in raw_lines if raw.strip())
    return [WalRecord(op=entry["op"], case_id=entry["case_id"]) for entry in entries]
26
+