structuremappingmemory 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. sma/__init__.py +5 -0
  2. sma/__main__.py +5 -0
  3. sma/agent/__init__.py +5 -0
  4. sma/agent/adapter_draft.py +217 -0
  5. sma/agent/api.py +67 -0
  6. sma/agent/comparison.py +591 -0
  7. sma/agent/llm.py +280 -0
  8. sma/agent/policies.py +21 -0
  9. sma/agent/service.py +95 -0
  10. sma/cli.py +65 -0
  11. sma/encoders/__init__.py +38 -0
  12. sma/encoders/agentobs.py +27 -0
  13. sma/encoders/base.py +23 -0
  14. sma/encoders/code_treesitter.py +64 -0
  15. sma/encoders/coverage.py +80 -0
  16. sma/encoders/draft_adapter.py +183 -0
  17. sma/encoders/healthcare.py +207 -0
  18. sma/encoders/logs_drain.py +142 -0
  19. sma/encoders/prose_tier1.py +57 -0
  20. sma/encoders/structured.py +57 -0
  21. sma/encoders/traces.py +45 -0
  22. sma/eval/__init__.py +2 -0
  23. sma/eval/agentic/__init__.py +35 -0
  24. sma/eval/agentic/arms/__init__.py +0 -0
  25. sma/eval/agentic/arms/cyber.py +48 -0
  26. sma/eval/agentic/arms/discovery.py +35 -0
  27. sma/eval/agentic/arms/finance.py +38 -0
  28. sma/eval/agentic/arms/legal.py +74 -0
  29. sma/eval/agentic/arms/medicine.py +45 -0
  30. sma/eval/agentic/harness.py +275 -0
  31. sma/eval/agentic/memories.py +308 -0
  32. sma/eval/agentic/metrics.py +82 -0
  33. sma/eval/agentic_qa/__init__.py +27 -0
  34. sma/eval/agentic_qa/agent.py +383 -0
  35. sma/eval/agentic_qa/metrics.py +239 -0
  36. sma/eval/agentic_qa/pools.py +197 -0
  37. sma/eval/arn.py +65 -0
  38. sma/eval/baselines/__init__.py +6 -0
  39. sma/eval/baselines/bge_dense.py +54 -0
  40. sma/eval/baselines/bm25.py +18 -0
  41. sma/eval/baselines/dense.py +42 -0
  42. sma/eval/baselines/hipporag.py +235 -0
  43. sma/eval/baselines/hybrid_rrf.py +30 -0
  44. sma/eval/baselines/longcontext_llm.py +124 -0
  45. sma/eval/baselines/rerank.py +41 -0
  46. sma/eval/baselines/splade.py +77 -0
  47. sma/eval/baselines/wl_kernel.py +163 -0
  48. sma/eval/bugsinpy.py +358 -0
  49. sma/eval/bugsinpy_families.py +164 -0
  50. sma/eval/crossdomain.py +89 -0
  51. sma/eval/diabetes.py +61 -0
  52. sma/eval/drift_env.py +26 -0
  53. sma/eval/drift_metrics.py +24 -0
  54. sma/eval/family_labels.py +167 -0
  55. sma/eval/fraud_elliptic/__init__.py +29 -0
  56. sma/eval/fraud_elliptic/encoder.py +279 -0
  57. sma/eval/fraud_elliptic/eval.py +269 -0
  58. sma/eval/fraud_elliptic/test_encoder.py +123 -0
  59. sma/eval/ieee_cis.py +66 -0
  60. sma/eval/loghub.py +16 -0
  61. sma/eval/loghub_eval.py +480 -0
  62. sma/eval/longmemeval.py +51 -0
  63. sma/eval/memory_backends/__init__.py +2 -0
  64. sma/eval/memory_backends/base.py +22 -0
  65. sma/eval/memory_backends/context_only.py +14 -0
  66. sma/eval/memory_backends/rag_notes.py +17 -0
  67. sma/eval/memory_backends/shared_llm.py +30 -0
  68. sma/eval/memory_backends/sma_memory.py +54 -0
  69. sma/eval/memory_backends/zep_graphiti.py +33 -0
  70. sma/eval/metrics.py +32 -0
  71. sma/eval/ontology_bench.py +219 -0
  72. sma/eval/report.py +573 -0
  73. sma/eval/ssb_eval.py +216 -0
  74. sma/eval/ssb_generator.py +116 -0
  75. sma/eval/stats.py +108 -0
  76. sma/eval/transfer_eval.py +844 -0
  77. sma/index/__init__.py +15 -0
  78. sma/index/ann.py +21 -0
  79. sma/index/content_vectors.py +60 -0
  80. sma/index/inverted.py +63 -0
  81. sma/index/macfac.py +174 -0
  82. sma/ir/__init__.py +22 -0
  83. sma/ir/canon.py +106 -0
  84. sma/ir/schema.py +165 -0
  85. sma/ir/sexpr.py +86 -0
  86. sma/ir/signatures.py +76 -0
  87. sma/match/__init__.py +20 -0
  88. sma/match/conflicts.py +46 -0
  89. sma/match/engine.py +60 -0
  90. sma/match/explain.py +59 -0
  91. sma/match/infer.py +54 -0
  92. sma/match/kernels.py +54 -0
  93. sma/match/mdl.py +30 -0
  94. sma/match/merge_cpsat.py +77 -0
  95. sma/match/merge_greedy.py +15 -0
  96. sma/match/mh.py +177 -0
  97. sma/match/ses.py +84 -0
  98. sma/match/types.py +115 -0
  99. sma/match/verifier.py +27 -0
  100. sma/ontology/__init__.py +45 -0
  101. sma/ontology/attack.py +134 -0
  102. sma/ontology/cpc.py +69 -0
  103. sma/ontology/graph.py +58 -0
  104. sma/ontology/loader.py +262 -0
  105. sma/ontology/mitre_xml.py +67 -0
  106. sma/ontology/mount.py +101 -0
  107. sma/ontology/rdf_loader.py +75 -0
  108. sma/ontology/registry.py +115 -0
  109. sma/ontology/router.py +69 -0
  110. sma/ontology/usgaap.py +73 -0
  111. sma/sage/__init__.py +6 -0
  112. sma/sage/assimilate.py +12 -0
  113. sma/sage/pools.py +105 -0
  114. sma/sage/probabilities.py +10 -0
  115. sma/store/__init__.py +6 -0
  116. sma/store/lmdb_store.py +78 -0
  117. sma/store/registry.py +26 -0
  118. sma/store/wal.py +26 -0
  119. sma/ui/app.py +642 -0
  120. structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
  121. structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
  122. structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
  123. structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
  124. structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
  125. structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,80 @@
1
+ """Per-query structural coverage indicator (blueprint section 12-R3).
2
+
3
+ The cross-system event ontology in ``logs_drain.EVENT_CLASS_RULES`` is frozen
4
+ (tag ontology-v1). When a query's vocabulary falls outside those keyword
5
+ rules, MAC/FAC retrieval degrades silently: events still get template
6
+ functors, but none of the shared cross-system class statements fire, so
7
+ structural similarity is computed over near-disjoint functor sets. This is
8
+ the "lattice-miss" tripwire from blueprint section 4.3 / 12-R3, surfaced as a
9
+ measured per-query coverage number rather than hidden.
10
+
11
+ Read-only consumer of the frozen ontology: imports ``event_classes`` from
12
+ ``logs_drain`` and never modifies the rules.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from sma.encoders.logs_drain import event_classes
18
+
19
+ # Below this fraction the SMA evidence panel shows an amber chip and the
20
+ # verbalizer prompt carries an explicit low-confidence caveat.
21
+ COVERAGE_WARN_THRESHOLD = 0.4
22
+
23
+ # Keyword attributes emitted by LogEncoder in addition to the class rules
24
+ # (timeout/retry/failure statements). They are subsets of the class keyword
25
+ # sets today, but checked explicitly so coverage stays correct even if the
26
+ # attribute rules and class rules ever diverge in a future ontology version.
27
+ _KEYWORD_ATTRIBUTES: tuple[tuple[str, ...], ...] = (
28
+ ("timeout",),
29
+ ("retry",),
30
+ ("error", "exception", "fail"),
31
+ )
32
+
33
+
34
def _line_fired(line_lower: str, extra_classes=None) -> bool:
    """Tell whether one already-lowercased line triggers any coverage rule.

    Sources are consulted in this order, stopping at the first hit:
    ``event_classes`` on the line, the keyword tuples of ``extra_classes``
    (when not None), and finally the ``_KEYWORD_ATTRIBUTES`` keyword groups.
    All keyword checks are plain substring tests against ``line_lower``.
    """

    def contains_any(keywords) -> bool:
        for keyword in keywords:
            if keyword in line_lower:
                return True
        return False

    if event_classes(line_lower):
        return True
    if extra_classes is not None:
        for _name, keywords in extra_classes:
            if contains_any(keywords):
                return True
    for keywords in _KEYWORD_ATTRIBUTES:
        if contains_any(keywords):
            return True
    return False
42
+
43
+
44
def rule_coverage(text: str, extra_classes=None) -> dict:
    """Measure how many non-blank lines of ``text`` fire at least one rule.

    ``extra_classes`` is an optional iterable of (name, keywords) pairs from
    an active draft adapter; it is forwarded to ``_line_fired`` so draft
    rules count toward coverage while a draft is applied.

    The returned dict holds, in this order:
    - ``fraction``: covered / total non-blank lines (0.0 when there are none)
    - ``covered_lines``: number of non-blank lines that fired a rule
    - ``total_lines``: number of non-blank lines
    - ``low``: True when fraction < COVERAGE_WARN_THRESHOLD (so text with no
      non-blank lines is reported as low)
    - ``percent``: ``round(fraction * 100)``, for display
    """
    total = 0
    covered = 0
    for raw_line in text.splitlines():
        if not raw_line.strip():
            continue  # blank lines do not count toward either tally
        total += 1
        if _line_fired(raw_line.lower(), extra_classes):
            covered += 1

    if total:
        fraction = covered / total
    else:
        fraction = 0.0

    report = {}
    report["fraction"] = fraction
    report["covered_lines"] = covered
    report["total_lines"] = total
    report["low"] = fraction < COVERAGE_WARN_THRESHOLD
    report["percent"] = round(fraction * 100)
    return report
68
+
69
+
70
def coverage_warning(coverage: dict) -> str | None:
    """Build the low-confidence caveat for a coverage report.

    Returns None unless ``coverage["low"]`` is present and truthy; otherwise
    returns the fixed caveat sentence with ``coverage["percent"]`` filled in.
    """
    if coverage.get("low"):
        percent = coverage["percent"]
        return (
            f"structural coverage of this query is LOW ({percent}%) - "
            "structural retrieval is low-confidence for this vocabulary"
        )
    return None
78
+
79
+
80
+ __all__ = ["COVERAGE_WARN_THRESHOLD", "rule_coverage", "coverage_warning"]
@@ -0,0 +1,183 @@
1
+ """LLM-drafted adapter: rules as data, never facts.
2
+
3
+ The frozen ontology (logs_drain.EVENT_CLASS_RULES, tag ontology-v1) cannot be
4
+ edited, but the blueprint's section 4.1 discipline allows EXTRA deterministic
5
+ class rules supplied as data at the adapter boundary. An LLM may *propose*
6
+ those rules (see sma.agent.adapter_draft); it never encodes anything. Once
7
+ the rules exist as data, encoding is pure deterministic keyword matching:
8
+ identical input + identical rules => identical case bytes.
9
+
10
+ DraftAdapter composes the standard LogEncoder output with additional class
11
+ statements derived from the supplied DraftRules, mirroring the
12
+ ``event_classes`` first-match-per-class semantics of the frozen rules.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import re
19
+ from dataclasses import dataclass, field
20
+
21
+ import blake3
22
+
23
+ from sma.ir.schema import entity, make_case, stmt
24
+
25
+ from .base import EncodeResult
26
+ from .logs_drain import EVENT_CLASS_RULES, LogEncoder, infer_session
27
+
28
+ FROZEN_CLASS_NAMES = frozenset(name for name, _ in EVENT_CLASS_RULES)
29
+
30
+ _CLASS_NAME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9]*Event$")
31
+
32
+ MAX_CLASSES = 8
33
+ MAX_KEYWORDS = 5
34
+
35
+
36
@dataclass(frozen=True)
class DraftRules:
    """Immutable bundle of additional class rules, passed around as data.

    Fields:
        classes: ordered (class_name, keywords) pairs with the same shape as
            the frozen EVENT_CLASS_RULES; keywords is a tuple of strings.
        maskings: optional regex sources for variable tokens (ids,
            timestamps, counters). They are validated and stored with the
            rules for future template masking, but are NOT applied before
            keyword matching: class matching works on the raw lowered line,
            and an LLM-drafted mask can easily cover the very substring a
            keyword needs (e.g. ``code=[a-z0-9-]+`` vs ``grain-drift``).

    Both fields default to an empty tuple.
    """

    classes: tuple[tuple[str, tuple[str, ...]], ...] = ()
    maskings: tuple[str, ...] = field(default_factory=tuple)
51
+
52
+
53
def validate_rules(rules: DraftRules) -> list[str]:
    """Check draft rules and return every problem found.

    Malformed entries that can arrive from untrusted JSON (a non-string class
    name, missing keywords, a non-string masking pattern) are reported as
    errors in the returned list instead of escaping as ``TypeError``.

    Args:
        rules: The draft rules to check.

    Returns:
        Human-readable error messages; an empty list means the rules are
        valid.
    """
    errors: list[str] = []
    seen: set[str] = set()
    for name, keywords in rules.classes:
        name_is_str = isinstance(name, str)
        if not name_is_str or not _CLASS_NAME_RE.match(name):
            errors.append(
                f"class name {name!r} must be alphanumeric with an 'Event' suffix"
            )
        if name_is_str:
            # The membership tests hash the name; a non-string name (for
            # example a list decoded from JSON) may be unhashable, so these
            # checks only run for real strings.
            if name in FROZEN_CLASS_NAMES:
                errors.append(
                    f"class name {name!r} collides with the frozen ontology-v1 EVENT_CLASS_RULES"
                )
            if name in seen:
                errors.append(f"duplicate class name {name!r}")
            seen.add(name)
        if not keywords:
            errors.append(f"class {name!r} has no keywords")
            # Nothing to iterate; this also keeps a None value from being
            # looped over below.
            continue
        for keyword in keywords:
            if not isinstance(keyword, str) or not keyword or keyword != keyword.lower():
                errors.append(
                    f"class {name!r} keyword {keyword!r} must be a non-empty lowercase string"
                )
    for pattern in rules.maskings:
        try:
            re.compile(pattern)
        except (re.error, TypeError) as exc:
            # re.error: invalid pattern syntax; TypeError: the pattern is not
            # a string at all.
            errors.append(f"masking regex {pattern!r} does not compile: {exc}")
    return errors
82
+
83
+
84
def rules_hash(rules: DraftRules) -> str:
    """Return the blake3 hex digest of the rules' canonical JSON text.

    The JSON text comes from ``rules_to_json`` and is hashed as UTF-8 bytes.
    """
    canonical_bytes = rules_to_json(rules).encode("utf-8")
    hasher = blake3.blake3(canonical_bytes)
    return hasher.hexdigest()
87
+
88
+
89
def rules_to_json(rules: DraftRules) -> str:
    """Serialize rules to JSON text (2-space indent, sorted object keys).

    The document has two keys: ``classes``, a list of
    ``{"name": ..., "keywords": [...]}`` objects in rule order, and
    ``maskings``, the list of masking patterns.
    """
    class_rows = []
    for class_name, class_keywords in rules.classes:
        class_rows.append({"name": class_name, "keywords": list(class_keywords)})
    document = {"classes": class_rows, "maskings": list(rules.maskings)}
    return json.dumps(document, sort_keys=True, indent=2)
97
+
98
+
99
def rules_from_json(text: str) -> DraftRules:
    """Parse JSON text into DraftRules.

    Expects the document shape written by ``rules_to_json``. A missing
    ``classes`` or ``maskings`` key is treated as an empty list; each class
    row must provide ``name`` and ``keywords``. No validation is done here.
    """
    document = json.loads(text)
    parsed_classes = []
    for row in document.get("classes", []):
        parsed_classes.append((row["name"], tuple(row["keywords"])))
    parsed_maskings = tuple(document.get("maskings", []))
    return DraftRules(classes=tuple(parsed_classes), maskings=parsed_maskings)
106
+
107
+
108
class DraftAdapter:
    """Wraps LogEncoder and appends class statements derived from DraftRules."""

    adapter_id = "logs+draft"
    version = "0.1.0"

    def __init__(self, rules: DraftRules):
        """Validate ``rules`` and prepare the base encoder.

        Raises:
            ValueError: if ``validate_rules`` reports any error; the message
                joins all errors with "; ".
        """
        problems = validate_rules(rules)
        if problems:
            raise ValueError("invalid draft rules: " + "; ".join(problems))
        self.rules = rules
        self.draft_hash = rules_hash(rules)
        self._base = LogEncoder()

    def draft_classes(self, line: str) -> list[str]:
        """Return the draft class names whose keywords occur in ``line``.

        The line is lowercased and each class contributes at most once, in
        rule order, when any of its keywords is a substring of the line.
        Maskings are deliberately not applied (see the DraftRules docstring).
        """
        lowered = line.lower()
        matched = []
        for class_name, class_keywords in self.rules.classes:
            for keyword in class_keywords:
                if keyword in lowered:
                    matched.append(class_name)
                    break
        return matched

    def encode(self, artifact: str, **kwargs) -> EncodeResult:
        """Encode with the base LogEncoder, then add the draft class statements.

        ``kwargs`` are forwarded to the base encoder; ``session_id`` (if
        given and truthy) names the session, otherwise it is inferred from
        the artifact. The returned case carries the base metadata overlaid
        with the draft adapter's own keys, and the base encoder's warnings.
        """
        base_result = self._base.encode(artifact, **kwargs)
        session_name = kwargs.get("session_id") or infer_session(artifact)

        # Number events exactly like the base encoder does: e{i} over the
        # non-empty lines only, so the draft statements refer to the same
        # event entities as the base statements.
        non_empty_lines = [raw for raw in artifact.splitlines() if raw.strip()]
        draft_statements = []
        for index, raw in enumerate(non_empty_lines):
            for class_name in self.draft_classes(raw):
                draft_statements.append(
                    stmt(
                        class_name,
                        entity(f"e{index}", "event"),
                        entity(session_name, "session"),
                    )
                )

        metadata = dict(base_result.case.metadata)
        metadata["adapter"] = "draft"
        metadata["base_adapter"] = self._base.adapter_id
        metadata["draft_hash"] = self.draft_hash
        metadata["version"] = self.version

        combined = tuple(base_result.case.statements) + tuple(draft_statements)
        return EncodeResult(make_case(combined, metadata), base_result.warnings)
156
+
157
+
158
def check_determinism(adapter: DraftAdapter, text: str) -> bool:
    """Encode ``text`` twice and verify both runs yield the same case.

    The two cases are compared by the canonical text of their statements and
    by ``case_id``.

    Returns:
        True when both comparisons match.

    Raises:
        AssertionError: if the canonical texts or the case ids differ.
    """
    from sma.ir.sexpr import canonical_case_text

    case_a = adapter.encode(text).case
    case_b = adapter.encode(text).case
    text_a = canonical_case_text(case_a.statements)
    text_b = canonical_case_text(case_b.statements)
    if text_a == text_b and case_a.case_id == case_b.case_id:
        return True
    raise AssertionError(
        f"draft adapter is non-deterministic for hash={adapter.draft_hash[:8]}"
    )
171
+
172
+
173
+ __all__ = [
174
+ "DraftAdapter",
175
+ "DraftRules",
176
+ "MAX_CLASSES",
177
+ "MAX_KEYWORDS",
178
+ "check_determinism",
179
+ "rules_from_json",
180
+ "rules_hash",
181
+ "rules_to_json",
182
+ "validate_rules",
183
+ ]
@@ -0,0 +1,207 @@
1
+ """Expert clinical encoder for diabetes inpatient encounters (Tier-0).
2
+
3
+ Designed as a clinical-informatics specialist would: it does not emit flat
4
+ attribute=value triples. It encodes the *clinical relationships* an
5
+ endocrinologist reasons over — diagnosis comorbidity over the ICD-9 hierarchy,
6
+ the specific diabetes complication, antidiabetic therapy by drug CLASS with
7
+ titration, glycemic control state, care acuity/chronicity — and the higher-order
8
+ causal/temporal chains that link them (poor control -> therapy escalation;
9
+ diabetes -> end-organ complication; chronic utilization -> acute admission).
10
+
11
+ All mappings are real medical knowledge (ICD-9 category grouping per Strack et
12
+ al. 2014; antidiabetic drug classes per pharmacology; ADA glycemic thresholds).
13
+ NOTHING is fitted to the readmission label, which is never encoded.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ from sma.ir.schema import Case, Statement, entity, make_case, stmt
18
+
19
+ from .base import EncodeResult
20
+
21
+ # --- ICD-9 diagnosis grouping (Strack et al. 2014, the canonical Diabetes-130
22
+ # categorization). Maps a code to a body-system category. ---------------
23
+ _DIABETES_COMPLICATION = {
24
+ "0": "uncomplicated", "1": "ketoacidosis", "2": "hyperosmolar",
25
+ "3": "other_coma", "4": "renal", "5": "ophthalmic", "6": "neurological",
26
+ "7": "peripheral_circulatory", "8": "other_specified", "9": "unspecified",
27
+ }
28
+
29
+
30
def icd9_category(code: str) -> str | None:
    """Map an ICD-9 diagnosis code to its body-system category.

    Args:
        code: The diagnosis code, normally a string such as ``"428"`` or
            ``"250.83"``. Numeric values (e.g. decoded from JSON) are
            accepted and converted to text first.

    Returns:
        None for a missing/blank code; otherwise one of ``"diabetes"``,
        ``"circulatory"``, ``"respiratory"``, ``"digestive"``, ``"injury"``,
        ``"musculoskeletal"``, ``"genitourinary"``, ``"neoplasm"`` or
        ``"other"``. Codes that cannot be read as a finite number fall into
        ``"other"``.
    """
    # str() lets numeric codes through; falsy values still collapse to "".
    code = str(code or "").strip()
    if not code:
        return None
    if code[0] in "EV":  # external-cause / supplementary -> Other
        return "other"
    if code.startswith("250"):
        return "diabetes"
    try:
        num = int(float(code))
    except (ValueError, OverflowError):
        # ValueError: not numeric (or NaN); OverflowError: "inf"/"Infinity"
        # parses as a float but cannot become an int.
        return "other"
    if 390 <= num <= 459 or num == 785:
        return "circulatory"
    if 460 <= num <= 519 or num == 786:
        return "respiratory"
    if 520 <= num <= 579 or num == 787:
        return "digestive"
    if 800 <= num <= 999:
        return "injury"
    if 710 <= num <= 739:
        return "musculoskeletal"
    if 580 <= num <= 629 or num == 788:
        return "genitourinary"
    if 140 <= num <= 239:
        return "neoplasm"
    return "other"
57
+
58
+
59
def diabetes_complication(code: str) -> str | None:
    """Name the diabetes complication encoded in a 250.x ICD-9 code.

    Returns None when the (stripped) code does not start with ``"250"``.
    Otherwise the first digit after the decimal point is looked up in
    ``_DIABETES_COMPLICATION``; a code with no decimal part, or a digit
    missing from the table, yields ``"uncomplicated"``.
    """
    cleaned = (code or "").strip()
    if not cleaned.startswith("250"):
        return None
    decimals = cleaned.partition(".")[2]
    if not decimals:
        return "uncomplicated"
    return _DIABETES_COMPLICATION.get(decimals[:1], "uncomplicated")
65
+
66
+
67
+ # --- Antidiabetic pharmacology: drug -> therapeutic class -------------------
68
+ _DRUG_CLASS = {
69
+ "metformin": "biguanide",
70
+ "glimepiride": "sulfonylurea", "glipizide": "sulfonylurea",
71
+ "glyburide": "sulfonylurea", "chlorpropamide": "sulfonylurea",
72
+ "tolbutamide": "sulfonylurea", "acetohexamide": "sulfonylurea",
73
+ "tolazamide": "sulfonylurea",
74
+ "repaglinide": "meglitinide", "nateglinide": "meglitinide",
75
+ "pioglitazone": "tzd", "rosiglitazone": "tzd", "troglitazone": "tzd",
76
+ "acarbose": "agi", "miglitol": "agi",
77
+ "insulin": "insulin",
78
+ "examide": "other", "citoglipton": "other",
79
+ "glyburide-metformin": "combination", "glipizide-metformin": "combination",
80
+ "glimepiride-pioglitazone": "combination",
81
+ "metformin-rosiglitazone": "combination", "metformin-pioglitazone": "combination",
82
+ }
83
+ _DRUGS = tuple(_DRUG_CLASS)
84
+
85
+
86
+ def _glycemic_state(a1c: str, glu: str) -> str:
87
+ a1c, glu = (a1c or "").strip(), (glu or "").strip()
88
+ if a1c in (">7", ">8") or glu in (">200", ">300"):
89
+ return "uncontrolled"
90
+ if a1c == "Norm" or glu == "Norm":
91
+ return "controlled"
92
+ return "unmeasured"
93
+
94
+
95
+ def _bin_count(n: int, lo: int = 1, hi: int = 3) -> str:
96
+ if n <= 0:
97
+ return "none"
98
+ return "low" if n <= hi else "high"
99
+
100
+
101
+ def _int(fields: dict, key: str) -> int:
102
+ try:
103
+ return int(float(fields.get(key, "0")))
104
+ except (ValueError, TypeError):
105
+ return 0
106
+
107
+
108
class HealthcareEncoder:
    """Tier-0 encoder that turns one diabetes inpatient encounter into a Case."""

    adapter_id = "healthcare"
    version = "1.0.0"

    def encode(self, artifact: str, **kwargs) -> EncodeResult:
        """Encode an encounter given as a JSON object string or as a mapping.

        A string is parsed with ``json.loads``; anything else is copied with
        ``dict(...)``. ``kwargs`` are accepted and not used. The result
        carries an empty warnings tuple.
        """
        import json

        fields = json.loads(artifact) if isinstance(artifact, str) else dict(artifact)
        return EncodeResult(self.encode_record(fields), ())

    def encode_record(self, f: dict) -> Case:
        """Build the Case for one encounter record (column name -> value).

        Emits, in order: one raw statement per column, diagnosis and
        complication functors, therapy and titration functors, the glycemic
        state, the escalation chain, and utilization / admission /
        length-of-stay functors, together with the higher-order relations
        that connect them.
        """
        # Clinical features are lifted into FUNCTORS (a structure-mapping memory
        # discriminates on functor identity, not entity arguments), each over a
        # constant patient node; the higher-order clinical relations connect
        # them. So two patients with the same comorbidity/therapy/control profile
        # share functors (MAC discriminates) AND share relational structure (SME
        # systematicity).
        p = entity("patient", "patient")
        S: list[Statement] = []

        def feat(name: str) -> Statement:
            return stmt(name, p)

        # 0. Raw discriminative features (high-cardinality functors = retrieval
        #    discrimination). A structure-mapping memory needs BOTH discriminative
        #    detail AND curated structure; abstraction alone collapses patients.
        # NOTE(review): every key of ``f`` is encoded here, so an outcome/label
        # column would be encoded too if the caller leaves it in the record -
        # confirm callers strip it before encoding.
        for col, val in f.items():
            S.append(stmt(col, p, entity(str(val), "value")))

        # 1. Diagnoses -> dx<Category> functors; diabetes complication -> cx<Type>.
        # Diagnosis statements are tracked per functor as they are created
        # (first-seen order) rather than recovered later by scanning S for a
        # "dx" prefix, which would also pick up a raw column named "dx...".
        dx_by_functor: dict[str, Statement] = {}
        dia_stmt: Statement | None = None
        for col in ("diag_1", "diag_2", "diag_3"):
            cat = icd9_category(f.get(col, ""))
            if not cat:
                continue
            d = feat("dx" + cat.capitalize())
            S.append(d)
            dx_by_functor[d.functor] = d
            if cat == "diabetes" and dia_stmt is None:
                dia_stmt = d
            # diabetes_complication returns None for non-250 codes, so this
            # only fires for diabetes diagnoses.
            comp = diabetes_complication(f.get(col, ""))
            if comp and comp not in ("uncomplicated", "unspecified"):
                c = feat("cx" + comp.replace("_", " ").title().replace(" ", ""))
                S.append(c)
                S.append(stmt("manifests", d, c))  # higher-order
        # comorbidity: relate the first diagnosis to each distinct other system
        dx_list = list(dx_by_functor.values())
        for other in dx_list[1:]:
            S.append(stmt("comorbidWith", dx_list[0], other))  # higher-order

        # 2. Therapy -> rx<Class> functors, titration -> up/down<Class>; treats.
        therapy_stmts: dict[str, Statement] = {}
        for drug in _DRUGS:
            status = (f.get(drug, "No") or "No").strip()
            if status == "No":
                continue
            cls = _DRUG_CLASS[drug]
            if cls not in therapy_stmts:
                # One rx statement per drug class, however many drugs of that
                # class are active.
                t = feat("rx" + cls.capitalize())
                therapy_stmts[cls] = t
                S.append(t)
                if dia_stmt is not None:
                    S.append(stmt("treats", t, dia_stmt))  # higher-order
            if status == "Up":
                S.append(feat("titrUp" + cls.capitalize()))
            elif status == "Down":
                S.append(feat("titrDown" + cls.capitalize()))
        if len(therapy_stmts) >= 2:
            S.append(feat("polytherapy"))

        # 3. Glycemic control -> gly<State> functor; clinical picture links.
        gly = _glycemic_state(f.get("A1Cresult", ""), f.get("max_glu_serum", ""))
        gly_stmt = feat("gly" + gly.capitalize())
        S.append(gly_stmt)
        if dx_list:
            S.append(stmt("presentsWith", dx_list[0], gly_stmt))  # higher-order
        for t in therapy_stmts.values():
            S.append(stmt("addresses", t, gly_stmt))  # higher-order

        # 4. Poor control -> therapy escalation (the causal chain).
        change = (f.get("change", "No") or "No").strip()
        insulin = (f.get("insulin", "No") or "No").strip()
        if gly == "uncontrolled" and (change == "Ch" or insulin in ("Up", "Steady")):
            esc = feat("escalation")
            S.append(esc)
            S.append(stmt("cause", gly_stmt, esc))  # higher-order

        # 5. Care acuity / chronicity -> functors.
        prior = (
            _int(f, "number_inpatient")
            + _int(f, "number_emergency")
            + _int(f, "number_outpatient")
        )
        prior_bin = _bin_count(prior)
        util = feat("priorUtil" + prior_bin.capitalize())
        S.append(util)
        adm = feat("admit" + str(f.get("admission_type_id", "0")))
        S.append(adm)
        if prior_bin == "high":
            S.append(stmt("chronicity", util, adm))  # higher-order
        S.append(feat("los" + _bin_count(_int(f, "time_in_hospital"), hi=4).capitalize()))

        # S always holds at least the glycemic, utilization, admission and
        # length-of-stay statements, so the fallback is purely defensive.
        return make_case(S or [feat("encounterUnknown")],
                         {"adapter": self.adapter_id, "tier": 0})
@@ -0,0 +1,142 @@
1
+ """Deterministic Tier-0 log encoder.
2
+
3
+ This is a small Drain-like template masker for the MVP. When Drain3 is
4
+ installed, it can be substituted behind the same output contract.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+ from collections import Counter
11
+
12
+ from sma.ir.schema import Entity, Statement, entity, make_case, stmt
13
+
14
+ from .base import EncodeResult
15
+
16
+ IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
17
+ HEX_RE = re.compile(r"\b0x[0-9a-fA-F]+\b")
18
+ NUM_RE = re.compile(r"\b\d+(?:\.\d+)?\b")
19
+ BLOCK_RE = re.compile(r"\bblk[_-]?[A-Za-z0-9_-]+\b")
20
+
21
+ # Cross-system event ontology (blueprint section 4.3 mini-ontology, tripwire
22
+ # response to the measured ~100% cross-system lattice miss): template hashes
23
+ # are system-specific by construction, so every event also gets coarse
24
+ # deterministic class statements whose functors ARE shared across systems.
25
+ # First-match-per-class keyword rules over the lowercased line; a line may
26
+ # carry several classes. Order is fixed; rules are data.
27
+ EVENT_CLASS_RULES: tuple[tuple[str, tuple[str, ...]], ...] = (
28
+ ("timeoutEvent", ("timeout", "timed out")),
29
+ ("retryEvent", ("retry", "retrying", "retransmit", "re-send", "resend")),
30
+ ("ioEvent", ("eofexception", "ioexception", "io error", "input/output", "end of file")),
31
+ ("memoryEvent", ("out of memory", "oom", "memory error", "ecc", "dimm", "cache error")),
32
+ ("kernelEvent", ("kernel", "panic", "machine check", "mce", "interrupt")),
33
+ ("networkEvent", ("connect", "socket", "network", "unreachable", "reset by peer",
34
+ "dhcp", "http", "link", "heartbeat", "packet")),
35
+ ("authEvent", ("auth", "permission", "denied", "credential", "token", "login")),
36
+ ("storageEvent", ("block", "replica", "disk", "volume", "snapshot", "image", "file system",
37
+ "filesystem", "storage")),
38
+ ("lifecycleEvent", ("start", "stop", "restart", "boot", "shutdown", "terminat", "spawn",
39
+ "delet", "creat", "launch", "instance")),
40
+ ("failureEvent", ("error", "fail", "exception", "fatal", "abort", "corrupt", "invalid")),
41
+ )
42
+
43
+
44
def event_classes(line_lower: str) -> list[str]:
    """List the event classes whose keywords occur in a lowercased line.

    Classes are returned in EVENT_CLASS_RULES order; each class appears at
    most once, as soon as any one of its keywords is a substring of the line.
    """
    matched = []
    for class_name, class_keywords in EVENT_CLASS_RULES:
        for keyword in class_keywords:
            if keyword in line_lower:
                matched.append(class_name)
                break
    return matched
46
+
47
+
48
class LogEncoder:
    """Deterministic Tier-0 encoder from raw log text to a Case."""

    adapter_id = "logs"
    version = "0.2.0"

    def encode(self, artifact: str, **kwargs) -> EncodeResult:
        """Encode every non-empty log line as an event of one session.

        The session name is ``kwargs["session_id"]`` when given and truthy,
        otherwise it is inferred from the text. Per event (``e{i}`` over the
        non-empty lines) the case receives the template statement, an
        optional ``component`` statement, the timeout / retry / failureEvent
        keyword attributes, and one statement per matching event class.
        Afterwards come ``before`` links between consecutive events, one
        ``count`` statement per template functor, and the derived
        higher-order statements. Warnings are always empty.
        """
        # NOTE(review): a line containing "error", "exception" or "fail" gets
        # failureEvent from the attribute check below and again from the
        # class rules, since EVENT_CLASS_RULES lists those keywords under
        # failureEvent as well - confirm the duplicate is intended.
        session = kwargs.get("session_id") or infer_session(artifact)
        statements: list[Statement] = [stmt("logSession", entity(session, "session"))]
        events: list[Statement] = []
        counts: Counter[str] = Counter()

        non_empty_lines = (raw for raw in artifact.splitlines() if raw.strip())
        for index, line in enumerate(non_empty_lines):
            functor = template_functor(mask_template(line))
            event_id = entity(f"e{index}", "event")

            def attach(name: str) -> Statement:
                """Append and return ``name(event, session)`` for this line."""
                made = stmt(name, event_id, entity(session, "session"))
                statements.append(made)
                return made

            events.append(attach(functor))
            counts[functor] += 1

            component = infer_component(line)
            if component:
                statements.append(stmt("component", event_id, entity(component, "component")))

            lowered = line.lower()
            if "timeout" in lowered:
                attach("timeout")
            if "retry" in lowered:
                attach("retry")
            if any(marker in lowered for marker in ("error", "exception", "fail")):
                attach("failureEvent")
            for class_name in event_classes(lowered):
                attach(class_name)

        # Chain each event to its immediate successor.
        for position in range(1, len(events)):
            statements.append(stmt("before", events[position - 1], events[position]))

        for functor, occurrences in counts.items():
            statements.append(
                stmt("count", entity(functor, "event_type"), entity(str(occurrences), "integer"))
            )

        statements.extend(derive_higher_order(events, statements))
        metadata = {"adapter": self.adapter_id, "version": self.version, "tier": 0}
        return EncodeResult(make_case(statements, metadata), ())
84
+
85
+
86
def mask_template(line: str) -> str:
    """Replace variable tokens with placeholders and normalize whitespace.

    Substitutions run in a fixed order - IP addresses, hex literals, block
    ids, then remaining numbers - after which runs of whitespace collapse to
    single spaces and leading/trailing whitespace is dropped.
    """
    substitutions = (
        (IP_RE, "<IP>"),
        (HEX_RE, "<HEX>"),
        (BLOCK_RE, "<BLOCK>"),
        (NUM_RE, "<NUM>"),
    )
    masked = line
    for pattern, placeholder in substitutions:
        masked = pattern.sub(placeholder, masked)
    return " ".join(masked.split())
92
+
93
+
94
def template_functor(template: str) -> str:
    """Derive a functor name ``<alias>_<digest>`` for a masked template.

    ``digest`` is the 4-byte blake2b hex digest of the UTF-8 template.
    ``alias`` is the first of timeout, retry, error, exception, restart,
    fail, block that appears as a whole alphabetic word (case-insensitive)
    in the template, or ``evt`` when none does.
    """
    import hashlib

    vocabulary = {word.lower() for word in re.findall(r"[A-Za-z]+", template)}
    priority = ("timeout", "retry", "error", "exception", "restart", "fail", "block")
    alias = next((key for key in priority if key in vocabulary), "evt")
    digest = hashlib.blake2b(template.encode("utf-8"), digest_size=4).hexdigest()
    return f"{alias}_{digest}"
105
+
106
+
107
def infer_session(text: str) -> str:
    """Use the first block id found in ``text`` as the session name.

    Falls back to ``"session_0"`` when BLOCK_RE matches nothing.
    """
    found = BLOCK_RE.search(text)
    return found.group(0) if found else "session_0"
112
+
113
+
114
def infer_component(line: str) -> str | None:
    """Extract the component name that follows a log-level keyword.

    Looks for INFO, WARN, ERROR or DEBUG followed by whitespace and returns
    the run of letters, digits, ``_``, ``.`` and ``-`` after it, or None when
    the line has no such pattern.
    """
    found = re.search(r"\b(?:INFO|WARN|ERROR|DEBUG)\s+([A-Za-z0-9_.-]+)", line)
    return None if found is None else found.group(1)
119
+
120
+
121
def derive_higher_order(events: list[Statement], statements: list[Statement]) -> list[Statement]:
    """Derive ``cause`` and ``enables`` relations from the keyword attributes.

    For every (timeout, retry) pair where the timeout's event precedes the
    retry's event a ``cause`` statement is produced; likewise ``enables`` for
    every (failureEvent, retry) pair. All ``cause`` statements come before
    all ``enables`` statements. ``events`` is accepted but not consulted.
    """

    # rules/logs.yaml requires the antecedent event to precede the consequent
    # ("timeout before retry within session").
    def event_index(attribute: Statement) -> int:
        """Numeric suffix of an ``e<N>`` first argument, or -1 if absent."""
        head = attribute.args[0]
        if not isinstance(head, Entity):
            return -1
        suffix = head.name[1:]
        if head.name.startswith("e") and suffix.isdigit():
            return int(suffix)
        return -1

    def with_functor(functor: str) -> list[Statement]:
        return [candidate for candidate in statements if candidate.functor == functor]

    timeouts = with_functor("timeout")
    retries = with_functor("retry")
    failures = with_functor("failureEvent")

    derived: list[Statement] = []
    derived.extend(
        stmt("cause", timeout, retry)
        for timeout in timeouts
        for retry in retries
        if event_index(timeout) < event_index(retry)
    )
    derived.extend(
        stmt("enables", failure, retry)
        for failure in failures
        for retry in retries
        if event_index(failure) < event_index(retry)
    )
    return derived
142
+
@@ -0,0 +1,57 @@
1
+ """Flagged Tier-1 prose encoder.
2
+
3
+ This is a deterministic connective/clause fallback. It is marked Tier-1 and
4
+ does not support headline Tier-0 claims.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+ from hashlib import blake2s
11
+
12
+ from sma.ir.schema import Statement, entity, make_case, stmt
13
+
14
+ from .base import EncodeResult
15
+
16
+
17
+ CONNECTIVES = {
18
+ "because": "cause",
19
+ "therefore": "cause",
20
+ "so": "cause",
21
+ "if": "implies",
22
+ "although": "contrast",
23
+ "but": "contrast",
24
+ }
25
+
26
+
27
class ProseTier1Encoder:
    """Deterministic Tier-1 fallback that encodes prose clause by clause."""

    adapter_id = "prose_tier1"
    version = "0.1.0"

    def encode(self, artifact: str, **kwargs) -> EncodeResult:
        """Encode prose into clause statements plus connective relations.

        The text is split into clauses on ``.``, ``;``, ``!`` and ``?``. Each
        clause becomes ``<pred>(clause_<i>, <digest>)`` where ``pred`` comes
        from ``first_verbish`` (or ``"mentions"``) and ``digest`` is the
        8-byte blake2s hex digest of the clause text. When at least two
        clauses exist, every connective of CONNECTIVES that occurs in the
        text as a whole word adds its relation between the first two clauses.
        ``kwargs`` are accepted and not used.

        Returns:
            An EncodeResult whose case is tagged tier 1 and which always
            carries the Tier-1 warning. Text without clauses yields a single
            ``emptyProse`` statement.
        """
        clauses = [c.strip() for c in re.split(r"[.;!?]\s*", artifact) if c.strip()]
        statements: list[Statement] = []
        clause_stmts: list[Statement] = []
        for i, clause in enumerate(clauses):
            pred = first_verbish(clause) or "mentions"
            digest = blake2s(clause.encode("utf-8"), digest_size=8).hexdigest()
            clause_stmt = stmt(pred, entity(f"clause_{i}", "clause"), entity(digest, "text_digest"))
            clause_stmts.append(clause_stmt)
            statements.append(clause_stmt)
        if len(clause_stmts) >= 2:
            # Match connectives as whole words. A plain substring test fires
            # "so" on "also" / "reason" and "if" on "different".
            # NOTE(review): texts that only matched a connective as a
            # substring now encode differently - consider a version bump.
            words = set(re.findall(r"[a-z]+", artifact.lower()))
            for token, rel in CONNECTIVES.items():
                if token in words:
                    statements.append(stmt(rel, clause_stmts[0], clause_stmts[1]))
        return EncodeResult(
            make_case(statements or [stmt("emptyProse", entity("doc"))], {"adapter": self.adapter_id, "tier": 1}),
            ("Tier-1 prose extraction is flagged and excluded from headline claims.",),
        )
49
+
50
+
51
def first_verbish(text: str) -> str | None:
    """Pick a verb-like word from ``text`` to use as a predicate name.

    Returns the first alphabetic word (lowercased) that ends in ``ed`` or
    ``ing`` or is one of is/are/was/were/has/have. If no word qualifies the
    first word is returned lowercased; None when the text has no alphabetic
    word at all.
    """
    auxiliaries = frozenset({"is", "are", "was", "were", "has", "have"})
    tokens = [word.lower() for word in re.findall(r"[A-Za-z]+", text)]
    for token in tokens:
        if token in auxiliaries or token.endswith(("ed", "ing")):
            return token
    return tokens[0] if tokens else None