structuremappingmemory 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sma/__init__.py +5 -0
- sma/__main__.py +5 -0
- sma/agent/__init__.py +5 -0
- sma/agent/adapter_draft.py +217 -0
- sma/agent/api.py +67 -0
- sma/agent/comparison.py +591 -0
- sma/agent/llm.py +280 -0
- sma/agent/policies.py +21 -0
- sma/agent/service.py +95 -0
- sma/cli.py +65 -0
- sma/encoders/__init__.py +38 -0
- sma/encoders/agentobs.py +27 -0
- sma/encoders/base.py +23 -0
- sma/encoders/code_treesitter.py +64 -0
- sma/encoders/coverage.py +80 -0
- sma/encoders/draft_adapter.py +183 -0
- sma/encoders/healthcare.py +207 -0
- sma/encoders/logs_drain.py +142 -0
- sma/encoders/prose_tier1.py +57 -0
- sma/encoders/structured.py +57 -0
- sma/encoders/traces.py +45 -0
- sma/eval/__init__.py +2 -0
- sma/eval/agentic/__init__.py +35 -0
- sma/eval/agentic/arms/__init__.py +0 -0
- sma/eval/agentic/arms/cyber.py +48 -0
- sma/eval/agentic/arms/discovery.py +35 -0
- sma/eval/agentic/arms/finance.py +38 -0
- sma/eval/agentic/arms/legal.py +74 -0
- sma/eval/agentic/arms/medicine.py +45 -0
- sma/eval/agentic/harness.py +275 -0
- sma/eval/agentic/memories.py +308 -0
- sma/eval/agentic/metrics.py +82 -0
- sma/eval/agentic_qa/__init__.py +27 -0
- sma/eval/agentic_qa/agent.py +383 -0
- sma/eval/agentic_qa/metrics.py +239 -0
- sma/eval/agentic_qa/pools.py +197 -0
- sma/eval/arn.py +65 -0
- sma/eval/baselines/__init__.py +6 -0
- sma/eval/baselines/bge_dense.py +54 -0
- sma/eval/baselines/bm25.py +18 -0
- sma/eval/baselines/dense.py +42 -0
- sma/eval/baselines/hipporag.py +235 -0
- sma/eval/baselines/hybrid_rrf.py +30 -0
- sma/eval/baselines/longcontext_llm.py +124 -0
- sma/eval/baselines/rerank.py +41 -0
- sma/eval/baselines/splade.py +77 -0
- sma/eval/baselines/wl_kernel.py +163 -0
- sma/eval/bugsinpy.py +358 -0
- sma/eval/bugsinpy_families.py +164 -0
- sma/eval/crossdomain.py +89 -0
- sma/eval/diabetes.py +61 -0
- sma/eval/drift_env.py +26 -0
- sma/eval/drift_metrics.py +24 -0
- sma/eval/family_labels.py +167 -0
- sma/eval/fraud_elliptic/__init__.py +29 -0
- sma/eval/fraud_elliptic/encoder.py +279 -0
- sma/eval/fraud_elliptic/eval.py +269 -0
- sma/eval/fraud_elliptic/test_encoder.py +123 -0
- sma/eval/ieee_cis.py +66 -0
- sma/eval/loghub.py +16 -0
- sma/eval/loghub_eval.py +480 -0
- sma/eval/longmemeval.py +51 -0
- sma/eval/memory_backends/__init__.py +2 -0
- sma/eval/memory_backends/base.py +22 -0
- sma/eval/memory_backends/context_only.py +14 -0
- sma/eval/memory_backends/rag_notes.py +17 -0
- sma/eval/memory_backends/shared_llm.py +30 -0
- sma/eval/memory_backends/sma_memory.py +54 -0
- sma/eval/memory_backends/zep_graphiti.py +33 -0
- sma/eval/metrics.py +32 -0
- sma/eval/ontology_bench.py +219 -0
- sma/eval/report.py +573 -0
- sma/eval/ssb_eval.py +216 -0
- sma/eval/ssb_generator.py +116 -0
- sma/eval/stats.py +108 -0
- sma/eval/transfer_eval.py +844 -0
- sma/index/__init__.py +15 -0
- sma/index/ann.py +21 -0
- sma/index/content_vectors.py +60 -0
- sma/index/inverted.py +63 -0
- sma/index/macfac.py +174 -0
- sma/ir/__init__.py +22 -0
- sma/ir/canon.py +106 -0
- sma/ir/schema.py +165 -0
- sma/ir/sexpr.py +86 -0
- sma/ir/signatures.py +76 -0
- sma/match/__init__.py +20 -0
- sma/match/conflicts.py +46 -0
- sma/match/engine.py +60 -0
- sma/match/explain.py +59 -0
- sma/match/infer.py +54 -0
- sma/match/kernels.py +54 -0
- sma/match/mdl.py +30 -0
- sma/match/merge_cpsat.py +77 -0
- sma/match/merge_greedy.py +15 -0
- sma/match/mh.py +177 -0
- sma/match/ses.py +84 -0
- sma/match/types.py +115 -0
- sma/match/verifier.py +27 -0
- sma/ontology/__init__.py +45 -0
- sma/ontology/attack.py +134 -0
- sma/ontology/cpc.py +69 -0
- sma/ontology/graph.py +58 -0
- sma/ontology/loader.py +262 -0
- sma/ontology/mitre_xml.py +67 -0
- sma/ontology/mount.py +101 -0
- sma/ontology/rdf_loader.py +75 -0
- sma/ontology/registry.py +115 -0
- sma/ontology/router.py +69 -0
- sma/ontology/usgaap.py +73 -0
- sma/sage/__init__.py +6 -0
- sma/sage/assimilate.py +12 -0
- sma/sage/pools.py +105 -0
- sma/sage/probabilities.py +10 -0
- sma/store/__init__.py +6 -0
- sma/store/lmdb_store.py +78 -0
- sma/store/registry.py +26 -0
- sma/store/wal.py +26 -0
- sma/ui/app.py +642 -0
- structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
- structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
- structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
- structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
- structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
- structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
sma/encoders/coverage.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Per-query structural coverage indicator (blueprint section 12-R3).
|
|
2
|
+
|
|
3
|
+
The cross-system event ontology in ``logs_drain.EVENT_CLASS_RULES`` is frozen
|
|
4
|
+
(tag ontology-v1). When a query's vocabulary falls outside those keyword
|
|
5
|
+
rules, MAC/FAC retrieval degrades silently: events still get template
|
|
6
|
+
functors, but none of the shared cross-system class statements fire, so
|
|
7
|
+
structural similarity is computed over near-disjoint functor sets. This is
|
|
8
|
+
the "lattice-miss" tripwire from blueprint section 4.3 / 12-R3, surfaced as a
|
|
9
|
+
measured per-query coverage number rather than hidden.
|
|
10
|
+
|
|
11
|
+
Read-only consumer of the frozen ontology: imports ``event_classes`` from
|
|
12
|
+
``logs_drain`` and never modifies the rules.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from sma.encoders.logs_drain import event_classes
|
|
18
|
+
|
|
19
|
+
# Below this fraction the SMA evidence panel shows an amber chip and the
|
|
20
|
+
# verbalizer prompt carries an explicit low-confidence caveat.
|
|
21
|
+
COVERAGE_WARN_THRESHOLD = 0.4
|
|
22
|
+
|
|
23
|
+
# Keyword attributes emitted by LogEncoder in addition to the class rules
|
|
24
|
+
# (timeout/retry/failure statements). They are subsets of the class keyword
|
|
25
|
+
# sets today, but checked explicitly so coverage stays correct even if the
|
|
26
|
+
# attribute rules and class rules ever diverge in a future ontology version.
|
|
27
|
+
_KEYWORD_ATTRIBUTES: tuple[tuple[str, ...], ...] = (
|
|
28
|
+
("timeout",),
|
|
29
|
+
("retry",),
|
|
30
|
+
("error", "exception", "fail"),
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _line_fired(line_lower: str, extra_classes=None) -> bool:
    """Report whether an already-lowercased line triggers any known rule.

    Sources are consulted in a fixed order and the first hit wins:

    1. the frozen class rules, via ``event_classes``;
    2. the keyword sets of ``extra_classes`` ((name, keywords) pairs), when
       that argument is not ``None``;
    3. the keyword groups in ``_KEYWORD_ATTRIBUTES``.

    Matching is plain substring containment against ``line_lower``.
    """

    def hits(keyword_group) -> bool:
        return any(keyword in line_lower for keyword in keyword_group)

    if event_classes(line_lower):
        return True
    if extra_classes is not None:
        for _name, keyword_group in extra_classes:
            if hits(keyword_group):
                return True
    for keyword_group in _KEYWORD_ATTRIBUTES:
        if hits(keyword_group):
            return True
    return False
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def rule_coverage(text: str, extra_classes=None) -> dict:
    """Measure how many non-blank lines of ``text`` fire at least one rule.

    ``extra_classes`` is an optional sequence of (name, keywords) pairs from
    an active draft adapter; it is forwarded to ``_line_fired`` so that the
    draft's rules count toward coverage while the draft is applied.

    The returned dict has these keys, in this order:

    - ``fraction``: covered / total non-blank lines (0.0 when there are none)
    - ``covered_lines``: number of non-blank lines that fired
    - ``total_lines``: number of non-blank lines
    - ``low``: True when ``fraction`` is below ``COVERAGE_WARN_THRESHOLD``
    - ``percent``: ``fraction`` as a rounded integer percentage, for display
    """
    total = 0
    covered = 0
    for raw_line in text.splitlines():
        if not raw_line.strip():
            continue  # blank lines are not counted in either total
        total += 1
        if _line_fired(raw_line.lower(), extra_classes):
            covered += 1

    fraction = 0.0 if total == 0 else covered / total
    return dict(
        fraction=fraction,
        covered_lines=covered,
        total_lines=total,
        low=fraction < COVERAGE_WARN_THRESHOLD,
        percent=round(fraction * 100),
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def coverage_warning(coverage: dict) -> str | None:
    """Build the low-confidence caveat for a coverage dict, if one applies.

    Returns ``None`` unless ``coverage["low"]`` is present and truthy. When it
    is, the message embeds ``coverage["percent"]``, so that key must exist.
    """
    if coverage.get("low"):
        percent = coverage["percent"]
        return (
            f"structural coverage of this query is LOW ({percent}%) - "
            "structural retrieval is low-confidence for this vocabulary"
        )
    return None
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
__all__ = ["COVERAGE_WARN_THRESHOLD", "rule_coverage", "coverage_warning"]
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""LLM-drafted adapter: rules as data, never facts.
|
|
2
|
+
|
|
3
|
+
The frozen ontology (logs_drain.EVENT_CLASS_RULES, tag ontology-v1) cannot be
|
|
4
|
+
edited, but the blueprint's section 4.1 discipline allows EXTRA deterministic
|
|
5
|
+
class rules supplied as data at the adapter boundary. An LLM may *propose*
|
|
6
|
+
those rules (see sma.agent.adapter_draft); it never encodes anything. Once
|
|
7
|
+
the rules exist as data, encoding is pure deterministic keyword matching:
|
|
8
|
+
identical input + identical rules => identical case bytes.
|
|
9
|
+
|
|
10
|
+
DraftAdapter composes the standard LogEncoder output with additional class
|
|
11
|
+
statements derived from the supplied DraftRules, mirroring the
|
|
12
|
+
``event_classes`` first-match-per-class semantics of the frozen rules.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import json
|
|
18
|
+
import re
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
|
|
21
|
+
import blake3
|
|
22
|
+
|
|
23
|
+
from sma.ir.schema import entity, make_case, stmt
|
|
24
|
+
|
|
25
|
+
from .base import EncodeResult
|
|
26
|
+
from .logs_drain import EVENT_CLASS_RULES, LogEncoder, infer_session
|
|
27
|
+
|
|
28
|
+
FROZEN_CLASS_NAMES = frozenset(name for name, _ in EVENT_CLASS_RULES)
|
|
29
|
+
|
|
30
|
+
_CLASS_NAME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9]*Event$")
|
|
31
|
+
|
|
32
|
+
MAX_CLASSES = 8
|
|
33
|
+
MAX_KEYWORDS = 5
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass(frozen=True)
class DraftRules:
    """Immutable bundle of extra deterministic class rules, held as data.

    The two fields are:

    * ``classes`` - ordered (class_name, keywords) pairs with the same shape
      as the frozen ``EVENT_CLASS_RULES``.
    * ``maskings`` - optional regex source strings for variable tokens (ids,
      timestamps, counters). They are validated and travel with the
      content-addressed artifact for future template masking, but they are
      NOT applied before keyword matching: class matching works on the raw
      lowered line exactly like ``event_classes``, because LLM-drafted masks
      routinely cover the very substrings the keywords need (for example
      ``code=[a-z0-9-]+`` versus ``grain-drift``).
    """

    # Both fields default to an empty tuple.
    classes: tuple[tuple[str, tuple[str, ...]], ...] = field(default_factory=tuple)
    maskings: tuple[str, ...] = ()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def validate_rules(rules: DraftRules) -> list[str]:
    """Return a list of validation errors (empty list means valid).

    Checks performed per (name, keywords) pair in ``rules.classes``:

    - the name is a string matching ``_CLASS_NAME_RE``;
    - the name does not collide with ``FROZEN_CLASS_NAMES``;
    - the name has not already appeared in ``rules``;
    - there is at least one keyword, and every keyword is a non-empty
      lowercase string.

    Every entry of ``rules.maskings`` must compile as a regular expression.

    Malformed input (for example rules parsed from LLM-produced JSON) is
    reported through the returned list instead of escaping as ``TypeError``:
    a non-string class name, a bare string or ``None`` in place of the
    keyword sequence, and a non-string masking are all turned into errors.
    """
    errors: list[str] = []
    seen: set[str] = set()
    for name, keywords in rules.classes:
        if not isinstance(name, str) or not _CLASS_NAME_RE.match(name):
            errors.append(
                f"class name {name!r} must be alphanumeric with an 'Event' suffix"
            )
        if isinstance(name, str):
            # Set membership needs a hashable name; a list/dict name coming
            # out of JSON would raise TypeError here, so only strings take
            # part in the collision and duplicate checks.
            if name in FROZEN_CLASS_NAMES:
                errors.append(
                    f"class name {name!r} collides with the frozen ontology-v1 EVENT_CLASS_RULES"
                )
            if name in seen:
                errors.append(f"duplicate class name {name!r}")
            seen.add(name)
        if isinstance(keywords, str):
            # Iterating a bare string would validate it one character at a
            # time, and single characters match nearly every log line.
            errors.append(
                f"class {name!r} keywords must be a sequence of strings, not a bare string"
            )
            continue
        if not keywords:
            errors.append(f"class {name!r} has no keywords")
            continue  # nothing to iterate (also covers keywords=None)
        for keyword in keywords:
            if not isinstance(keyword, str) or not keyword or keyword != keyword.lower():
                errors.append(
                    f"class {name!r} keyword {keyword!r} must be a non-empty lowercase string"
                )
    for pattern in rules.maskings:
        try:
            re.compile(pattern)
        except re.error as exc:
            errors.append(f"masking regex {pattern!r} does not compile: {exc}")
        except TypeError:
            # re.compile rejects non-string, non-pattern arguments with
            # TypeError rather than re.error.
            errors.append(f"masking regex {pattern!r} must be a string")
    return errors
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def rules_hash(rules: DraftRules) -> str:
    """Return the blake3 hex digest of the rules' canonical JSON text.

    The digest is taken over the UTF-8 bytes of ``rules_to_json(rules)``.
    """
    canonical = rules_to_json(rules)
    hasher = blake3.blake3(canonical.encode("utf-8"))
    return hasher.hexdigest()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def rules_to_json(rules: DraftRules) -> str:
    """Serialize ``rules`` to its canonical JSON text.

    The document has two top-level keys: ``classes`` (a list of
    ``{"name": ..., "keywords": [...]}`` objects in rule order) and
    ``maskings`` (a list of strings). Keys are sorted and the text is
    indented by two spaces; ``rules_hash`` hashes exactly this text.
    """
    class_rows = []
    for class_name, class_keywords in rules.classes:
        class_rows.append({"name": class_name, "keywords": list(class_keywords)})
    document = {"classes": class_rows, "maskings": list(rules.maskings)}
    return json.dumps(document, indent=2, sort_keys=True)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def rules_from_json(text: str) -> DraftRules:
    """Parse JSON text (the shape written by ``rules_to_json``) into DraftRules.

    ``classes`` and ``maskings`` are both optional and default to empty.
    Each class row must carry ``name`` and ``keywords`` (``KeyError``
    otherwise). No validation happens here; see ``validate_rules``.

    A bare JSON string supplied where a list is expected (``"keywords":
    "timeout"`` or ``"maskings": "\\d+"``) is treated as a one-element list.
    """
    payload = json.loads(text)

    def as_tuple(raw) -> tuple:
        # tuple("abc") would explode the string into single characters, which
        # pass validation and then match almost every line.
        return (raw,) if isinstance(raw, str) else tuple(raw)

    classes = tuple(
        (row["name"], as_tuple(row["keywords"])) for row in payload.get("classes", [])
    )
    maskings = as_tuple(payload.get("maskings", []))
    return DraftRules(classes=classes, maskings=maskings)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class DraftAdapter:
    """Compose LogEncoder output with class statements from DraftRules.

    The base ``LogEncoder`` case is produced unchanged; for every non-blank
    line, one extra statement is added per draft class whose keywords match.
    """

    adapter_id = "logs+draft"
    version = "0.1.0"

    def __init__(self, rules: DraftRules):
        """Validate ``rules`` and remember them together with their hash.

        Raises ``ValueError`` listing every problem reported by
        ``validate_rules`` when the rules are not valid.
        """
        problems = validate_rules(rules)
        if problems:
            raise ValueError("invalid draft rules: " + "; ".join(problems))
        self.rules = rules
        self.draft_hash = rules_hash(rules)
        self._base = LogEncoder()

    def draft_classes(self, line: str) -> list[str]:
        """Return the draft class names whose keywords occur in ``line``.

        The line is lowercased and tested by substring containment, one
        result per class at most, in rule order. Maskings are deliberately
        not applied, so the matching input is the raw lowered line just as
        for the frozen rules.
        """
        lowered = line.lower()
        matched: list[str] = []
        for class_name, class_keywords in self.rules.classes:
            if any(keyword in lowered for keyword in class_keywords):
                matched.append(class_name)
        return matched

    def encode(self, artifact: str, **kwargs) -> EncodeResult:
        """Encode ``artifact`` with the base encoder plus the draft classes.

        ``session_id`` in ``kwargs`` overrides the inferred session. The
        returned case holds the base statements followed by the extra class
        statements; its metadata is the base metadata with ``adapter``,
        ``base_adapter``, ``draft_hash`` and ``version`` set. Warnings are
        those of the base encoder.
        """
        base = self._base.encode(artifact, **kwargs)
        session = kwargs.get("session_id") or infer_session(artifact)

        # Number non-blank lines e0, e1, ... exactly as the base encoder
        # does, so the extra statements hang off the same event entities.
        nonblank = [line for line in artifact.splitlines() if line.strip()]
        extra = [
            stmt(cls, entity(f"e{position}", "event"), entity(session, "session"))
            for position, line in enumerate(nonblank)
            for cls in self.draft_classes(line)
        ]

        metadata = dict(base.case.metadata)
        metadata["adapter"] = "draft"
        metadata["base_adapter"] = self._base.adapter_id
        metadata["draft_hash"] = self.draft_hash
        metadata["version"] = self.version

        combined = tuple(base.case.statements) + tuple(extra)
        return EncodeResult(make_case(combined, metadata), base.warnings)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def check_determinism(adapter: DraftAdapter, text: str) -> bool:
    """Encode ``text`` twice and require byte-identical canonical results.

    Returns ``True`` when both runs give the same canonical case text and the
    same ``case_id``; raises ``AssertionError`` (naming the first eight
    characters of the adapter's draft hash) otherwise.
    """
    from sma.ir.sexpr import canonical_case_text

    run_a = adapter.encode(text).case
    run_b = adapter.encode(text).case
    texts_match = canonical_case_text(run_a.statements) == canonical_case_text(run_b.statements)
    if texts_match and run_a.case_id == run_b.case_id:
        return True
    raise AssertionError(
        f"draft adapter is non-deterministic for hash={adapter.draft_hash[:8]}"
    )
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
__all__ = [
|
|
174
|
+
"DraftAdapter",
|
|
175
|
+
"DraftRules",
|
|
176
|
+
"MAX_CLASSES",
|
|
177
|
+
"MAX_KEYWORDS",
|
|
178
|
+
"check_determinism",
|
|
179
|
+
"rules_from_json",
|
|
180
|
+
"rules_hash",
|
|
181
|
+
"rules_to_json",
|
|
182
|
+
"validate_rules",
|
|
183
|
+
]
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""Expert clinical encoder for diabetes inpatient encounters (Tier-0).
|
|
2
|
+
|
|
3
|
+
Designed as a clinical-informatics specialist would: it does not emit flat
|
|
4
|
+
attribute=value triples. It encodes the *clinical relationships* an
|
|
5
|
+
endocrinologist reasons over — diagnosis comorbidity over the ICD-9 hierarchy,
|
|
6
|
+
the specific diabetes complication, antidiabetic therapy by drug CLASS with
|
|
7
|
+
titration, glycemic control state, care acuity/chronicity — and the higher-order
|
|
8
|
+
causal/temporal chains that link them (poor control -> therapy escalation;
|
|
9
|
+
diabetes -> end-organ complication; chronic utilization -> acute admission).
|
|
10
|
+
|
|
11
|
+
All mappings are real medical knowledge (ICD-9 category grouping per Strack et
|
|
12
|
+
al. 2014; antidiabetic drug classes per pharmacology; ADA glycemic thresholds).
|
|
13
|
+
NOTHING is fitted to the readmission label, which is never encoded.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from sma.ir.schema import Case, Statement, entity, make_case, stmt
|
|
18
|
+
|
|
19
|
+
from .base import EncodeResult
|
|
20
|
+
|
|
21
|
+
# --- ICD-9 diagnosis grouping (Strack et al. 2014, the canonical Diabetes-130
|
|
22
|
+
# categorization). Maps a code to a body-system category. ---------------
|
|
23
|
+
_DIABETES_COMPLICATION = {
|
|
24
|
+
"0": "uncomplicated", "1": "ketoacidosis", "2": "hyperosmolar",
|
|
25
|
+
"3": "other_coma", "4": "renal", "5": "ophthalmic", "6": "neurological",
|
|
26
|
+
"7": "peripheral_circulatory", "8": "other_specified", "9": "unspecified",
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def icd9_category(code: str) -> str | None:
    """Map an ICD-9 diagnosis code to its body-system category.

    Returns ``None`` for a missing/blank code. Codes starting with ``E`` or
    ``V`` and anything that cannot be read as a finite number fall into
    ``"other"``; codes starting with ``250`` are ``"diabetes"``; the remaining
    numeric ranges follow the grouping table below.

    ``code`` is normally a string, but numeric values (as produced by JSON
    decoding, e.g. ``428`` or ``250.83``) are accepted and stringified first.
    """
    code = str(code or "").strip()
    if not code:
        return None
    if code[0] in "EV":  # external-cause / supplementary -> Other
        return "other"
    if code.startswith("250"):
        return "diabetes"
    try:
        num = int(float(code))
    except (ValueError, OverflowError):
        # ValueError: not a number (or NaN); OverflowError: "inf" / "1e999"
        # parse as infinity, which int() refuses to convert.
        return "other"

    # (category, low, high, extra single code); the ranges do not overlap.
    groups = (
        ("circulatory", 390, 459, 785),
        ("respiratory", 460, 519, 786),
        ("digestive", 520, 579, 787),
        ("injury", 800, 999, None),
        ("musculoskeletal", 710, 739, None),
        ("genitourinary", 580, 629, 788),
        ("neoplasm", 140, 239, None),
    )
    for category, low, high, extra in groups:
        if low <= num <= high or num == extra:
            return category
    return "other"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def diabetes_complication(code: str) -> str | None:
    """Name the diabetes complication encoded in a ``250.x`` ICD-9 code.

    Returns ``None`` when the code does not start with ``250``. Otherwise the
    first digit after the decimal point is looked up in
    ``_DIABETES_COMPLICATION``; a code with no decimal part, or with a first
    decimal character that is not in the table, yields ``"uncomplicated"``.

    ``code`` is normally a string, but numeric values (as produced by JSON
    decoding) are accepted and stringified first.
    """
    code = str(code or "").strip()
    if not code.startswith("250"):
        return None
    _, _, dec = code.partition(".")
    if not dec:
        return "uncomplicated"
    return _DIABETES_COMPLICATION.get(dec[:1], "uncomplicated")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# --- Antidiabetic pharmacology: drug -> therapeutic class -------------------
|
|
68
|
+
_DRUG_CLASS = {
|
|
69
|
+
"metformin": "biguanide",
|
|
70
|
+
"glimepiride": "sulfonylurea", "glipizide": "sulfonylurea",
|
|
71
|
+
"glyburide": "sulfonylurea", "chlorpropamide": "sulfonylurea",
|
|
72
|
+
"tolbutamide": "sulfonylurea", "acetohexamide": "sulfonylurea",
|
|
73
|
+
"tolazamide": "sulfonylurea",
|
|
74
|
+
"repaglinide": "meglitinide", "nateglinide": "meglitinide",
|
|
75
|
+
"pioglitazone": "tzd", "rosiglitazone": "tzd", "troglitazone": "tzd",
|
|
76
|
+
"acarbose": "agi", "miglitol": "agi",
|
|
77
|
+
"insulin": "insulin",
|
|
78
|
+
"examide": "other", "citoglipton": "other",
|
|
79
|
+
"glyburide-metformin": "combination", "glipizide-metformin": "combination",
|
|
80
|
+
"glimepiride-pioglitazone": "combination",
|
|
81
|
+
"metformin-rosiglitazone": "combination", "metformin-pioglitazone": "combination",
|
|
82
|
+
}
|
|
83
|
+
_DRUGS = tuple(_DRUG_CLASS)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _glycemic_state(a1c: str, glu: str) -> str:
    """Classify glycemic control from the A1C and serum-glucose result codes.

    Either argument may be ``None`` or padded with whitespace. An elevated
    reading on either test (``>7``/``>8`` for A1C, ``>200``/``>300`` for
    glucose) gives ``"uncontrolled"``; failing that, a ``Norm`` reading on
    either test gives ``"controlled"``; anything else is ``"unmeasured"``.
    """
    a1c_result = (a1c or "").strip()
    glucose_result = (glu or "").strip()

    elevated = a1c_result in (">7", ">8") or glucose_result in (">200", ">300")
    if elevated:
        return "uncontrolled"
    if "Norm" in (a1c_result, glucose_result):
        return "controlled"
    return "unmeasured"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _bin_count(n: int, lo: int = 1, hi: int = 3) -> str:
    """Bucket a count into ``"none"``, ``"low"`` or ``"high"``.

    Non-positive counts are ``"none"``; positive counts up to and including
    ``hi`` are ``"low"``; larger counts are ``"high"``. ``lo`` is accepted
    for interface compatibility but is not consulted.
    """
    if n > 0:
        return "high" if n > hi else "low"
    return "none"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _int(fields: dict, key: str) -> int:
    """Read ``fields[key]`` as an integer, falling back to 0.

    The value is parsed through ``float`` first, so ``"2.9"`` becomes ``2``.
    A missing key, ``None``, unparseable text, NaN and infinite values all
    yield 0.
    """
    try:
        return int(float(fields.get(key, "0")))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int() refuses float infinity ("inf", "1e999").
        return 0
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class HealthcareEncoder:
    """Encoder that turns one diabetes-encounter record into a Tier-0 case.

    ``encode`` accepts the record as JSON text (or as a mapping) and hands the
    resulting field dict to ``encode_record``, which builds the statements.
    """

    adapter_id = "healthcare"
    version = "1.0.0"

    def encode(self, artifact: str, **kwargs) -> EncodeResult:
        """Parse ``artifact`` into a field dict and encode it.

        A ``str`` artifact is parsed with ``json.loads``; any other value is
        copied with ``dict(...)``. ``kwargs`` is accepted but not used. The
        second ``EncodeResult`` argument is always an empty tuple.
        """
        import json
        fields = json.loads(artifact) if isinstance(artifact, str) else dict(artifact)
        return EncodeResult(self.encode_record(fields), ())

    def encode_record(self, f: dict) -> Case:
        """Build the case for one record ``f`` (column name -> value).

        Statements are appended in five numbered stages (raw columns,
        diagnoses, therapy, glycemic control, escalation, utilization) and
        passed to ``make_case`` with ``adapter`` and ``tier`` metadata.
        """
        # Clinical features are lifted into FUNCTORS (a structure-mapping memory
        # discriminates on functor identity, not entity arguments), each over a
        # constant patient node; the higher-order clinical relations connect
        # them. So two patients with the same comorbidity/therapy/control profile
        # share functors (MAC discriminates) AND share relational structure (SME
        # systematicity).
        p = entity("patient", "patient")
        S: list[Statement] = []

        # Unary feature statement over the shared patient entity.
        def feat(name: str) -> Statement:
            return stmt(name, p)

        # 0. Raw discriminative features (high-cardinality functors = retrieval
        # discrimination). A structure-mapping memory needs BOTH discriminative
        # detail AND curated structure; abstraction alone collapses patients.
        # NOTE(review): every key of ``f`` is emitted here, so an outcome/label
        # column would be encoded too if the caller leaves it in the record -
        # presumably callers strip it first; confirm against the call sites.
        for col, val in f.items():
            S.append(stmt(col, p, entity(str(val), "value")))

        # 1. Diagnoses -> dx<Category> functors; diabetes complication -> cx<Type>.
        # NOTE(review): ``cats`` is filled below but not read again in this method.
        cats: list[str] = []
        dia_stmt: Statement | None = None
        for col in ("diag_1", "diag_2", "diag_3"):
            cat = icd9_category(f.get(col, ""))
            if not cat:
                continue
            d = feat("dx" + cat.capitalize())
            S.append(d)
            cats.append(cat)
            # Remember the first diabetes diagnosis; therapy statements link to it.
            if cat == "diabetes" and dia_stmt is None:
                dia_stmt = d
            comp = diabetes_complication(f.get(col, ""))
            if comp and comp not in ("uncomplicated", "unspecified"):
                # snake_case complication name -> CamelCase functor suffix.
                c = feat("cx" + comp.replace("_", " ").title().replace(" ", ""))
                S.append(c)
                S.append(stmt("manifests", d, c))  # higher-order
        # comorbidity: relate the first diagnosis to each distinct other system
        # Keyed by functor, so a category seen in several columns appears once.
        # NOTE(review): this scans all of S, so a raw column whose name starts
        # with "dx" (stage 0) would be picked up as well - verify column names.
        seen_dx = {s.functor: s for s in S if s.functor.startswith("dx")}
        dx_list = list(seen_dx.values())
        for other in dx_list[1:]:
            S.append(stmt("comorbidWith", dx_list[0], other))  # higher-order

        # 2. Therapy -> rx<Class> functors, titration -> up/down<Class>; treats.
        therapy_stmts: dict[str, Statement] = {}
        for drug in _DRUGS:
            status = (f.get(drug, "No") or "No").strip()
            if status == "No":
                continue
            cls = _DRUG_CLASS[drug]
            # One rx<Class> statement per drug class, however many drugs share it.
            if cls not in therapy_stmts:
                t = feat("rx" + cls.capitalize())
                therapy_stmts[cls] = t
                S.append(t)
                if dia_stmt is not None:
                    S.append(stmt("treats", t, dia_stmt))  # higher-order
            if status == "Up":
                S.append(feat("titrUp" + cls.capitalize()))
            elif status == "Down":
                S.append(feat("titrDown" + cls.capitalize()))
        if len(therapy_stmts) >= 2:
            S.append(feat("polytherapy"))

        # 3. Glycemic control -> gly<State> functor; clinical picture links.
        gly = _glycemic_state(f.get("A1Cresult", ""), f.get("max_glu_serum", ""))
        gly_stmt = feat("gly" + gly.capitalize())
        S.append(gly_stmt)
        if dx_list:
            S.append(stmt("presentsWith", dx_list[0], gly_stmt))  # higher-order
        for t in therapy_stmts.values():
            S.append(stmt("addresses", t, gly_stmt))  # higher-order

        # 4. Poor control -> therapy escalation (the causal chain).
        change = (f.get("change", "No") or "No").strip()
        insulin = (f.get("insulin", "No") or "No").strip()
        if gly == "uncontrolled" and (change == "Ch" or insulin in ("Up", "Steady")):
            esc = feat("escalation")
            S.append(esc)
            S.append(stmt("cause", gly_stmt, esc))  # higher-order

        # 5. Care acuity / chronicity -> functors.
        # Prior utilization is the sum of the three visit counters.
        prior = _int(f, "number_inpatient") + _int(f, "number_emergency") + _int(f, "number_outpatient")
        util = feat("priorUtil" + _bin_count(prior).capitalize())
        S.append(util)
        adm = feat("admit" + str(f.get("admission_type_id", "0")))
        S.append(adm)
        if _bin_count(prior) == "high":
            S.append(stmt("chronicity", util, adm))  # higher-order
        # Length of stay uses a wider "low" bucket (hi=4) than the default.
        S.append(feat("los" + _bin_count(_int(f, "time_in_hospital"), hi=4).capitalize()))

        # S always holds at least the gly/util/admit/los statements by this
        # point, so the encounterUnknown fallback is a defensive default.
        return make_case(S or [feat("encounterUnknown")],
                         {"adapter": self.adapter_id, "tier": 0})
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Deterministic Tier-0 log encoder.
|
|
2
|
+
|
|
3
|
+
This is a small Drain-like template masker for the MVP. When Drain3 is
|
|
4
|
+
installed, it can be substituted behind the same output contract.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from collections import Counter
|
|
11
|
+
|
|
12
|
+
from sma.ir.schema import Entity, Statement, entity, make_case, stmt
|
|
13
|
+
|
|
14
|
+
from .base import EncodeResult
|
|
15
|
+
|
|
16
|
+
IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
|
|
17
|
+
HEX_RE = re.compile(r"\b0x[0-9a-fA-F]+\b")
|
|
18
|
+
NUM_RE = re.compile(r"\b\d+(?:\.\d+)?\b")
|
|
19
|
+
BLOCK_RE = re.compile(r"\bblk[_-]?[A-Za-z0-9_-]+\b")
|
|
20
|
+
|
|
21
|
+
# Cross-system event ontology (blueprint section 4.3 mini-ontology, tripwire
|
|
22
|
+
# response to the measured ~100% cross-system lattice miss): template hashes
|
|
23
|
+
# are system-specific by construction, so every event also gets coarse
|
|
24
|
+
# deterministic class statements whose functors ARE shared across systems.
|
|
25
|
+
# First-match-per-class keyword rules over the lowercased line; a line may
|
|
26
|
+
# carry several classes. Order is fixed; rules are data.
|
|
27
|
+
EVENT_CLASS_RULES: tuple[tuple[str, tuple[str, ...]], ...] = (
|
|
28
|
+
("timeoutEvent", ("timeout", "timed out")),
|
|
29
|
+
("retryEvent", ("retry", "retrying", "retransmit", "re-send", "resend")),
|
|
30
|
+
("ioEvent", ("eofexception", "ioexception", "io error", "input/output", "end of file")),
|
|
31
|
+
("memoryEvent", ("out of memory", "oom", "memory error", "ecc", "dimm", "cache error")),
|
|
32
|
+
("kernelEvent", ("kernel", "panic", "machine check", "mce", "interrupt")),
|
|
33
|
+
("networkEvent", ("connect", "socket", "network", "unreachable", "reset by peer",
|
|
34
|
+
"dhcp", "http", "link", "heartbeat", "packet")),
|
|
35
|
+
("authEvent", ("auth", "permission", "denied", "credential", "token", "login")),
|
|
36
|
+
("storageEvent", ("block", "replica", "disk", "volume", "snapshot", "image", "file system",
|
|
37
|
+
"filesystem", "storage")),
|
|
38
|
+
("lifecycleEvent", ("start", "stop", "restart", "boot", "shutdown", "terminat", "spawn",
|
|
39
|
+
"delet", "creat", "launch", "instance")),
|
|
40
|
+
("failureEvent", ("error", "fail", "exception", "fatal", "abort", "corrupt", "invalid")),
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def event_classes(line_lower: str) -> list[str]:
    """List the class names whose keywords occur in ``line_lower``.

    Rules are taken from ``EVENT_CLASS_RULES`` in their fixed order; a class
    is reported once as soon as any of its keywords is a substring of the
    (already lowercased) line, and a line may yield several classes.
    """
    fired: list[str] = []
    for class_name, class_keywords in EVENT_CLASS_RULES:
        for keyword in class_keywords:
            if keyword in line_lower:
                fired.append(class_name)
                break  # first match is enough for this class
    return fired
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class LogEncoder:
    """Tier-0 encoder that turns raw log text into a statement case."""

    adapter_id = "logs"
    version = "0.2.0"

    def encode(self, artifact: str, **kwargs) -> EncodeResult:
        """Encode log text, one event per non-blank line.

        ``session_id`` in ``kwargs`` overrides the session inferred from the
        text. Per line, in order, the case receives: the template-functor
        event statement, an optional ``component`` statement, the
        ``timeout`` / ``retry`` / ``failureEvent`` keyword attributes, and
        the class statements from ``event_classes``. After all lines come
        the ``before`` chain between consecutive events, the per-functor
        ``count`` statements, and the output of ``derive_higher_order``.
        """
        session = kwargs.get("session_id") or infer_session(artifact)

        def in_session(functor: str, event_entity) -> Statement:
            # Builds a fresh session entity per statement.
            return stmt(functor, event_entity, entity(session, "session"))

        # Keyword attributes, in emission order: (functor, substrings).
        keyword_attributes = (
            ("timeout", ("timeout",)),
            ("retry", ("retry",)),
            ("failureEvent", ("error", "exception", "fail")),
        )

        statements: list[Statement] = [stmt("logSession", entity(session, "session"))]
        events: list[Statement] = []
        counts: Counter[str] = Counter()

        nonblank = (raw for raw in artifact.splitlines() if raw.strip())
        for position, raw in enumerate(nonblank):
            functor = template_functor(mask_template(raw))
            event_id = entity(f"e{position}", "event")
            event_stmt = in_session(functor, event_id)
            events.append(event_stmt)
            statements.append(event_stmt)
            counts[functor] += 1

            component = infer_component(raw)
            if component:
                statements.append(stmt("component", event_id, entity(component, "component")))

            lowered = raw.lower()
            for attribute, needles in keyword_attributes:
                if any(needle in lowered for needle in needles):
                    statements.append(in_session(attribute, event_id))
            statements.extend(in_session(cls, event_id) for cls in event_classes(lowered))

        statements.extend(
            stmt("before", earlier, later)
            for earlier, later in zip(events, events[1:], strict=False)
        )
        statements.extend(
            stmt("count", entity(name, "event_type"), entity(str(total), "integer"))
            for name, total in counts.items()
        )
        statements.extend(derive_higher_order(events, statements))

        metadata = {"adapter": self.adapter_id, "version": self.version, "tier": 0}
        return EncodeResult(make_case(statements, metadata), ())
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def mask_template(line: str) -> str:
    """Replace variable tokens in a log line with placeholders.

    Substitutions run in a fixed order - IP addresses, hex literals, block
    ids, then remaining numbers - and the result has its whitespace collapsed
    to single spaces with none at either end.
    """
    substitutions = (
        (IP_RE, "<IP>"),
        (HEX_RE, "<HEX>"),
        (BLOCK_RE, "<BLOCK>"),
        (NUM_RE, "<NUM>"),
    )
    masked = line
    for pattern, placeholder in substitutions:
        masked = pattern.sub(placeholder, masked)
    # split() with no separator already discards leading/trailing whitespace.
    return " ".join(masked.split())
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def template_functor(template: str) -> str:
    """Derive a functor name ``<alias>_<digest>`` for a masked template.

    ``digest`` is the 4-byte blake2b hex digest of the UTF-8 template.
    ``alias`` is the first of a fixed priority list of keywords that appears
    as a whole alphabetic word (case-insensitively) in the template, or
    ``"evt"`` when none does.
    """
    import hashlib

    priority = ("timeout", "retry", "error", "exception", "restart", "fail", "block")
    present = {word.lower() for word in re.findall(r"[A-Za-z]+", template)}
    alias = next((key for key in priority if key in present), "evt")
    digest = hashlib.blake2b(template.encode("utf-8"), digest_size=4).hexdigest()
    return f"{alias}_{digest}"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def infer_session(text: str) -> str:
    """Pick a session id for ``text``.

    The first substring matching ``BLOCK_RE`` is used verbatim; when nothing
    matches, the fixed id ``"session_0"`` is returned.
    """
    found = BLOCK_RE.search(text)
    return found.group(0) if found else "session_0"
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def infer_component(line: str) -> str | None:
    """Extract the component token that follows a log-level word.

    Looks for ``INFO``, ``WARN``, ``ERROR`` or ``DEBUG`` followed by
    whitespace and returns the run of letters, digits, ``_``, ``.`` and ``-``
    that comes next; returns ``None`` when the line has no such pattern.
    """
    found = re.search(r"\b(?:INFO|WARN|ERROR|DEBUG)\s+([A-Za-z0-9_.-]+)", line)
    return None if found is None else found.group(1)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def derive_higher_order(events: list[Statement], statements: list[Statement]) -> list[Statement]:
    """Derive ``cause`` and ``enables`` relations between attribute statements.

    For every ``timeout`` statement that belongs to an earlier event than a
    ``retry`` statement, a ``cause(timeout, retry)`` is produced; likewise
    ``enables(failureEvent, retry)`` for failures preceding retries. All
    ``cause`` statements come before all ``enables`` statements. ``events``
    is accepted but not consulted; only ``statements`` is scanned.
    """

    # rules/logs.yaml requires the antecedent event to precede the consequent
    # ("timeout before retry within session").
    def event_index(attribute: Statement) -> int:
        first = attribute.args[0]
        name = first.name if isinstance(first, Entity) else ""
        if name.startswith("e") and name[1:].isdigit():
            return int(name[1:])
        return -1

    def with_functor(functor: str) -> list[Statement]:
        return [s for s in statements if s.functor == functor]

    timeouts = with_functor("timeout")
    retries = with_functor("retry")
    failures = with_functor("failureEvent")

    derived: list[Statement] = [
        stmt("cause", timeout, retry)
        for timeout in timeouts
        for retry in retries
        if event_index(timeout) < event_index(retry)
    ]
    derived.extend(
        stmt("enables", failure, retry)
        for failure in failures
        for retry in retries
        if event_index(failure) < event_index(retry)
    )
    return derived
|
|
142
|
+
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Flagged Tier-1 prose encoder.
|
|
2
|
+
|
|
3
|
+
This is a deterministic connective/clause fallback. It is marked Tier-1 and
|
|
4
|
+
does not support headline Tier-0 claims.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from hashlib import blake2s
|
|
11
|
+
|
|
12
|
+
from sma.ir.schema import Statement, entity, make_case, stmt
|
|
13
|
+
|
|
14
|
+
from .base import EncodeResult
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
CONNECTIVES = {
|
|
18
|
+
"because": "cause",
|
|
19
|
+
"therefore": "cause",
|
|
20
|
+
"so": "cause",
|
|
21
|
+
"if": "implies",
|
|
22
|
+
"although": "contrast",
|
|
23
|
+
"but": "contrast",
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ProseTier1Encoder:
    """Deterministic Tier-1 fallback encoder for free prose.

    Each clause becomes one statement; a connective word found anywhere in
    the text adds a relation between the first two clauses. Output is always
    flagged with a Tier-1 warning.
    """

    adapter_id = "prose_tier1"
    version = "0.1.0"

    def encode(self, artifact: str, **kwargs) -> EncodeResult:
        """Encode ``artifact`` into clause statements plus connective links.

        The text is split on ``.``, ``;``, ``!`` and ``?``. Every non-empty
        clause yields ``pred(clause_<i>, <digest>)`` where ``pred`` comes from
        ``first_verbish`` (or ``"mentions"``) and ``digest`` is the 8-byte
        blake2s hex digest of the clause. When there are at least two
        clauses, each entry of ``CONNECTIVES`` that occurs as a whole word in
        the text adds its relation between the first and second clause.
        Text with no clauses yields a single ``emptyProse(doc)`` statement.
        ``kwargs`` is accepted but not used.
        """
        clauses = [c.strip() for c in re.split(r"[.;!?]\s*", artifact) if c.strip()]
        statements: list[Statement] = []
        clause_stmts: list[Statement] = []
        for i, clause in enumerate(clauses):
            pred = first_verbish(clause) or "mentions"
            digest = blake2s(clause.encode("utf-8"), digest_size=8).hexdigest()
            clause_stmt = stmt(pred, entity(f"clause_{i}", "clause"), entity(digest, "text_digest"))
            clause_stmts.append(clause_stmt)
            statements.append(clause_stmt)

        if len(clause_stmts) >= 2:
            # Match connectives as whole words. A substring test would fire
            # "so" on "also"/"reason", "if" on "different" and "but" on
            # "attribute", linking clauses in almost any English text.
            words = set(re.findall(r"[a-z]+", artifact.lower()))
            for token, rel in CONNECTIVES.items():
                if token in words:
                    statements.append(stmt(rel, clause_stmts[0], clause_stmts[1]))

        return EncodeResult(
            make_case(statements or [stmt("emptyProse", entity("doc"))], {"adapter": self.adapter_id, "tier": 1}),
            ("Tier-1 prose extraction is flagged and excluded from headline claims.",),
        )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def first_verbish(text: str) -> str | None:
    """Guess a verb-like word in ``text`` to use as a predicate name.

    Scans the alphabetic words in order and returns, lowercased, the first
    one that ends in ``ed`` or ``ing`` or is one of a few auxiliaries
    (is/are/was/were/has/have). If none qualifies, the first word is
    returned lowercased; ``None`` when the text has no alphabetic word.
    """
    auxiliaries = {"is", "are", "was", "were", "has", "have"}
    lowered = [word.lower() for word in re.findall(r"[A-Za-z]+", text)]
    for candidate in lowered:
        if candidate in auxiliaries or candidate.endswith(("ed", "ing")):
            return candidate
    return lowered[0] if lowered else None
|