@simbimbo/brainstem 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,63 +1,158 @@
1
1
  from __future__ import annotations
2
2
 
3
- from dataclasses import asdict
3
+ import json
4
+ from dataclasses import asdict, dataclass, field
4
5
  from datetime import datetime
5
6
  from pathlib import Path
6
- from typing import Iterable, List
7
+ from typing import Callable, Iterable, List, Optional
7
8
 
8
9
  from .fingerprint import fingerprint_event, normalize_message
9
- from .models import CanonicalEvent, Event, RawInputEnvelope, Signature
10
+ from .models import Candidate, CanonicalEvent, Event, RawInputEnvelope, Signature
11
+ from .recurrence import build_recurrence_candidates
12
+ from .source_drivers import parse_source_payloads
13
+ from .storage import (
14
+ get_raw_envelopes_by_ids,
15
+ init_db,
16
+ RAW_ENVELOPE_STATUSES,
17
+ set_raw_envelope_status,
18
+ store_candidates,
19
+ store_events,
20
+ store_raw_envelopes,
21
+ store_signatures,
22
+ )
10
23
 
11
24
 
12
- def parse_syslog_line(line: str, *, tenant_id: str, source_path: str = "") -> CanonicalEvent:
13
- return canonicalize_raw_input_envelope(
14
- parse_syslog_envelope(line, tenant_id=tenant_id, source_path=source_path)
25
# Callback used to report a failed item: it receives the raised exception and
# the offending raw text (the envelope's "raw_line" metadata or its message).
ErrorHandler = Callable[[Exception, str], None]
26
+
27
+
28
@dataclass
class IngestionItemResult:
    """Outcome of processing a single raw envelope in the ingest pipeline."""

    index: int  # position of the envelope in the pipeline's input list
    status: str  # "canonicalized" or "parse_failed" as set by run_ingest_pipeline
    tenant_id: str  # copied from the raw envelope
    source_type: str  # copied from the raw envelope
    source_id: str  # copied from the raw envelope
    source_name: str  # copied from the raw envelope
    raw_envelope_id: int | None  # stored envelope id, when one is known
    failure_reason: str | None = None  # str(exc) for items that failed
38
+
39
+
40
@dataclass
class IngestionResult:
    """Aggregate output of one run of the ingest pipeline."""

    raw_envelopes: List[RawInputEnvelope]  # the input envelopes, materialized as a list
    raw_envelope_ids: List[int]  # ids returned when raw envelopes were stored; else empty
    events: List[CanonicalEvent]  # envelopes that canonicalized successfully
    signatures: List[Signature]  # signatures computed for ``events``
    candidates: List[Candidate]  # recurrence candidates built from events and signatures
    parse_failed: int  # number of envelopes that raised during canonicalization
    item_results: List[IngestionItemResult] = field(default_factory=list)  # one entry per input envelope
49
+
50
+
51
@dataclass
class ReplayAttempt:
    """Record of a raw envelope id that a replay request skipped."""

    raw_envelope_id: int
    reason: str  # "not_found" or "not_replayable" as set by replay_raw_envelopes_by_ids
    status: str | None = None  # "missing", or the row's canonicalization_status
56
+
57
+
58
@dataclass
class ReplayResult:
    """Summary of a replay request over stored raw envelopes."""

    requested_raw_envelope_ids: List[int]  # coerced, de-duplicated ids from the caller
    attempted_raw_envelope_ids: List[int]  # ids whose rows were handed to the pipeline
    skipped: List[ReplayAttempt]  # ids that were missing or not in a replayable status
    events: List[CanonicalEvent]  # taken from the pipeline result
    signatures: List[Signature]  # taken from the pipeline result
    candidates: List[Candidate]  # taken from the pipeline result
    parse_failed: int  # taken from the pipeline result
67
+
68
+
69
def _ingestion_item_result_from_event(
    index: int,
    raw_event: RawInputEnvelope,
    raw_envelope_id: int | None,
    status: str,
    *,
    failure_reason: str | None = None,
) -> IngestionItemResult:
    """Build the per-item result record for one processed raw envelope.

    The tenant and source identity fields are copied from ``raw_event``; every
    other field is taken directly from the arguments.
    """
    identity = {
        attr: getattr(raw_event, attr)
        for attr in ("tenant_id", "source_type", "source_id", "source_name")
    }
    return IngestionItemResult(
        index=index,
        status=status,
        raw_envelope_id=raw_envelope_id,
        failure_reason=failure_reason,
        **identity,
    )
16
87
 
17
88
 
89
def parse_syslog_line(line: str, *, tenant_id: str, source_path: str = "") -> CanonicalEvent:
    """Parse one syslog line into an envelope and canonicalize it."""
    envelope = parse_syslog_envelope(line, tenant_id=tenant_id, source_path=source_path)
    return canonicalize_raw_input_envelope(envelope)
91
+
92
+
18
93
def parse_syslog_envelope(line: str, *, tenant_id: str, source_path: str = "") -> RawInputEnvelope:
    """Wrap a single syslog line in a raw envelope using the "syslog" source driver."""
    envelopes = parse_source_payloads(
        "syslog",
        [line],
        tenant_id=tenant_id,
        source_path=source_path,
    )
    # NOTE(review): assumes the driver yields at least one envelope for a
    # one-item payload; an empty result would raise IndexError here — confirm.
    return envelopes[0]
35
95
 
36
- return RawInputEnvelope(
96
+
97
def parse_file_line(line: str, *, tenant_id: str, source_path: str = "") -> RawInputEnvelope:
    """Wrap a single file line in a raw envelope using the "file" source driver."""
    envelopes = parse_source_payloads(
        "file",
        [line],
        tenant_id=tenant_id,
        source_path=source_path,
    )
    # NOTE(review): assumes the driver yields at least one envelope for a
    # one-item payload; an empty result would raise IndexError here — confirm.
    return envelopes[0]
99
+
100
+
101
def parse_syslog_envelopes(lines: Iterable[str], *, tenant_id: str, source_path: str = "") -> List[RawInputEnvelope]:
    """Parse every non-blank syslog line into a raw envelope, preserving order.

    A line is skipped when ``str(line).strip()`` is empty.
    """
    envelopes = []
    for raw_line in lines:
        if not str(raw_line).strip():
            continue
        envelopes.append(
            parse_syslog_envelope(raw_line, tenant_id=tenant_id, source_path=source_path)
        )
    return envelopes
104
+
105
+
106
def parse_file_envelopes(lines: Iterable[str], *, tenant_id: str, source_path: str = "") -> List[RawInputEnvelope]:
    """Parse the non-blank lines as one batch through the "file" source driver.

    A line is dropped when ``str(line).strip()`` is empty; the remaining lines
    are handed to the driver in a single call.
    """
    payloads = [entry for entry in lines if str(entry).strip()]
    return parse_source_payloads(
        "file",
        payloads,
        tenant_id=tenant_id,
        source_path=source_path,
    )
46
113
 
47
114
 
48
- def parse_syslog_envelopes(lines: Iterable[str], *, tenant_id: str, source_path: str = "") -> List[RawInputEnvelope]:
49
- return [parse_syslog_envelope(line, tenant_id=tenant_id, source_path=source_path) for line in lines if str(line).strip()]
115
+ def _coerce_raw_envelope_id(value: object) -> int | None:
116
+ if isinstance(value, bool):
117
+ return None
118
+ if isinstance(value, int):
119
+ return value
120
+ if isinstance(value, str):
121
+ value = value.strip()
122
+ if not value.isdigit():
123
+ return None
124
+ return int(value)
125
+ return None
50
126
 
51
127
 
52
- def canonicalize_raw_input_envelope(raw: RawInputEnvelope) -> CanonicalEvent:
128
+ def canonicalize_raw_input_envelope(
129
+ raw: RawInputEnvelope,
130
+ *,
131
+ raw_envelope_id: int | None = None,
132
+ ) -> CanonicalEvent:
133
+ parse_error = (raw.metadata or {}).get("parse_error")
134
+ if parse_error:
135
+ raise ValueError(f"parse_error: {parse_error}")
136
+
137
+ if not (raw.message_raw or "").strip():
138
+ raise ValueError("message_raw is empty and cannot be canonicalized")
139
+
140
+ resolved_raw_envelope_id = _coerce_raw_envelope_id(raw_envelope_id)
141
+ if resolved_raw_envelope_id is None:
142
+ resolved_raw_envelope_id = _coerce_raw_envelope_id(raw.metadata.get("raw_envelope_id"))
143
+
53
144
  message_normalized = normalize_message(raw.message_raw)
54
145
  metadata = dict(raw.metadata or {})
55
146
  metadata.setdefault("canonicalization_source", raw.source_type)
56
147
  metadata["raw_input_seen"] = True
148
+ if resolved_raw_envelope_id is not None:
149
+ metadata["raw_envelope_id"] = resolved_raw_envelope_id
150
+
57
151
  return CanonicalEvent(
58
152
  tenant_id=raw.tenant_id,
59
153
  source_type=raw.source_type,
60
154
  timestamp=raw.timestamp,
155
+ raw_envelope_id=resolved_raw_envelope_id,
61
156
  host=raw.host,
62
157
  service=raw.service,
63
158
  severity=raw.severity,
@@ -81,20 +176,310 @@ def canonicalize_raw_input_envelopes(events: Iterable[RawInputEnvelope]) -> List
81
176
  return [canonicalize_raw_input_envelope(raw_event) for raw_event in events]
82
177
 
83
178
 
179
+ def _parse_json_map(value: str | None) -> dict:
180
+ if not value:
181
+ return {}
182
+ try:
183
+ parsed = json.loads(value)
184
+ except json.JSONDecodeError:
185
+ return {}
186
+ if isinstance(parsed, dict):
187
+ return parsed
188
+ return {}
189
+
190
+
191
def _raw_envelope_from_row(row) -> RawInputEnvelope:
    """Rebuild a ``RawInputEnvelope`` from a stored raw-envelope row.

    Falsy text columns fall back to ``""`` (``"info"`` for severity), JSON
    columns are decoded with ``_parse_json_map``, and the row id is recorded
    in the envelope metadata under ``"raw_envelope_id"``.
    """

    def text(column: str, fallback: str = "") -> str:
        return row[column] or fallback

    envelope_metadata = _parse_json_map(row["metadata_json"])
    envelope_metadata["raw_envelope_id"] = int(row["id"])

    return RawInputEnvelope(
        tenant_id=row["tenant_id"],
        source_type=row["source_type"],
        source_id=text("source_id"),
        source_name=text("source_name"),
        timestamp=row["timestamp"],
        host=text("host"),
        service=text("service"),
        severity=text("severity", "info"),
        asset_id=text("asset_id"),
        source_path=text("source_path"),
        message_raw=text("message_raw"),
        facility=text("facility"),
        structured_fields=_parse_json_map(row["structured_fields_json"]),
        correlation_keys=_parse_json_map(row["correlation_keys_json"]),
        metadata=envelope_metadata,
    )
211
+
212
+
213
def replay_raw_envelopes_by_ids(
    raw_envelope_ids: Iterable[int | str | object],
    *,
    db_path: str,
    threshold: int = 2,
    on_event: Optional[Callable[[CanonicalEvent], None]] = None,
    on_parse_error: Optional[ErrorHandler] = None,
    force: bool = False,
    allowed_statuses: Iterable[str] = ("received", "parse_failed"),
) -> ReplayResult:
    """Re-run stored raw envelopes through the ingest pipeline.

    Args:
        raw_envelope_ids: Ids to replay. Each is coerced with
            ``_coerce_raw_envelope_id``; unusable values are dropped and
            duplicates collapse to their first occurrence.
        db_path: Database the envelopes are loaded from; also passed to the
            pipeline.
        threshold: Forwarded to ``run_ingest_pipeline``.
        on_event: Forwarded to ``run_ingest_pipeline``.
        on_parse_error: Forwarded to ``run_ingest_pipeline``.
        force: When true, rows are replayed regardless of their
            ``canonicalization_status``.
        allowed_statuses: Statuses eligible for replay when ``force`` is false.

    Returns:
        A ``ReplayResult`` listing requested, attempted and skipped ids along
        with the pipeline's events, signatures, candidates and failure count.

    Raises:
        ValueError: If ``allowed_statuses`` contains a value that is not in
            ``RAW_ENVELOPE_STATUSES`` (checked only when at least one usable
            id was requested).
    """
    # Coerce, drop unusable ids, and de-duplicate while keeping first-seen order.
    requested_raw_envelope_ids = [
        item
        for item in dict.fromkeys(_coerce_raw_envelope_id(raw) for raw in raw_envelope_ids)
        if item is not None
    ]

    if not requested_raw_envelope_ids:
        return ReplayResult(
            requested_raw_envelope_ids=[],
            attempted_raw_envelope_ids=[],
            skipped=[],
            events=[],
            signatures=[],
            candidates=[],
            parse_failed=0,
        )

    allowed_status_set = set(allowed_statuses)
    if any(status not in RAW_ENVELOPE_STATUSES for status in allowed_status_set):
        raise ValueError(
            "allowed_statuses must only include one of: "
            + ", ".join(RAW_ENVELOPE_STATUSES)
        )

    raw_rows_by_id = {
        int(row["id"]): row
        for row in get_raw_envelopes_by_ids(requested_raw_envelope_ids, db_path=db_path)
    }

    replay_rows = []
    # Tracked from the int lookup keys so the reported ids are always ints,
    # matching requested_raw_envelope_ids and the declared List[int].
    attempted_raw_envelope_ids: List[int] = []
    skipped: List[ReplayAttempt] = []
    for raw_envelope_id in requested_raw_envelope_ids:
        row = raw_rows_by_id.get(raw_envelope_id)
        if row is None:
            skipped.append(
                ReplayAttempt(
                    raw_envelope_id=raw_envelope_id,
                    reason="not_found",
                    status="missing",
                )
            )
            continue
        if not force and row["canonicalization_status"] not in allowed_status_set:
            skipped.append(
                ReplayAttempt(
                    raw_envelope_id=raw_envelope_id,
                    reason="not_replayable",
                    status=row["canonicalization_status"],
                )
            )
            continue
        replay_rows.append(row)
        attempted_raw_envelope_ids.append(raw_envelope_id)

    replay_envelopes = [_raw_envelope_from_row(row) for row in replay_rows]
    # store_raw=False: the envelopes already exist in the database; their ids
    # travel in each envelope's metadata (set by _raw_envelope_from_row).
    raw_pipeline_result = run_ingest_pipeline(
        replay_envelopes,
        threshold=threshold,
        db_path=db_path,
        on_event=on_event,
        on_parse_error=on_parse_error,
        store_raw=False,
    )

    return ReplayResult(
        requested_raw_envelope_ids=requested_raw_envelope_ids,
        attempted_raw_envelope_ids=attempted_raw_envelope_ids,
        skipped=skipped,
        events=raw_pipeline_result.events,
        signatures=raw_pipeline_result.signatures,
        candidates=raw_pipeline_result.candidates,
        parse_failed=raw_pipeline_result.parse_failed,
    )
292
+
293
+
294
def run_ingest_pipeline(
    raw_envelopes: Iterable[RawInputEnvelope],
    *,
    threshold: int = 2,
    db_path: str | None = None,
    store_raw: bool = True,
    on_event: Optional[Callable[[CanonicalEvent], None]] = None,
    on_parse_error: Optional[ErrorHandler] = None,
) -> IngestionResult:
    """Canonicalize raw envelopes, derive signatures and candidates, and persist.

    Args:
        raw_envelopes: Envelopes to process; consumed once into a list.
        threshold: Forwarded to ``build_recurrence_candidates``.
        db_path: When truthy, the database is initialized and events,
            signatures and candidates are stored there.
        store_raw: When true (and ``db_path`` is set), the raw envelopes are
            stored first and their returned ids are used for each item.
        on_event: Called with each successfully canonicalized event.
        on_parse_error: Called with the exception and the envelope's
            ``"raw_line"`` metadata (or its raw message) for each failure.

    Returns:
        An ``IngestionResult`` with one ``item_results`` entry per input
        envelope, in input order.
    """
    raw_envelopes_list = list(raw_envelopes)
    raw_envelope_ids: List[int] = []
    if db_path:
        init_db(db_path)
        if store_raw:
            raw_envelope_ids = store_raw_envelopes(raw_envelopes_list, db_path)

    canonical_events: List[CanonicalEvent] = []
    parse_failed = 0
    item_results: List[IngestionItemResult] = []
    for idx, raw_event in enumerate(raw_envelopes_list):
        # Guard against envelopes whose metadata is None so that one such
        # envelope cannot abort the whole batch before the try block.
        event_metadata = raw_event.metadata or {}
        raw_envelope_id = raw_envelope_ids[idx] if idx < len(raw_envelope_ids) else None
        if raw_envelope_id is None:
            # Nothing stored in this run: fall back to an id carried in metadata.
            raw_envelope_id = _coerce_raw_envelope_id(event_metadata.get("raw_envelope_id"))
        try:
            canonical_event = canonicalize_raw_input_envelope(raw_event, raw_envelope_id=raw_envelope_id)
        except Exception as exc:
            # Deliberately broad: any failure on one envelope is recorded as
            # parse_failed and the remaining envelopes are still processed.
            reason = str(exc)
            parse_failed += 1
            item_results.append(
                _ingestion_item_result_from_event(
                    idx,
                    raw_event,
                    raw_envelope_id=raw_envelope_id,
                    status="parse_failed",
                    failure_reason=reason,
                )
            )
            if raw_envelope_id is not None:
                # NOTE(review): this is reached with db_path=None when an id
                # arrives via metadata and no database was given — confirm
                # set_raw_envelope_status handles that.
                set_raw_envelope_status(
                    raw_envelope_id,
                    "parse_failed",
                    db_path=db_path,
                    failure_reason=reason,
                )
            if on_parse_error is not None:
                on_parse_error(exc, event_metadata.get("raw_line", raw_event.message_raw))
            continue

        canonical_events.append(canonical_event)
        item_results.append(
            _ingestion_item_result_from_event(
                idx,
                raw_event,
                raw_envelope_id=raw_envelope_id,
                status="canonicalized",
            )
        )
        if raw_envelope_id is not None:
            set_raw_envelope_status(raw_envelope_id, "canonicalized", db_path=db_path)
        if on_event is not None:
            on_event(canonical_event)

    signatures: List[Signature] = []
    candidates: List[Candidate] = []
    # With no canonical events there is nothing to fingerprint or store.
    if canonical_events:
        signatures = signatures_for_events(canonical_events)
        candidates = build_recurrence_candidates(canonical_events, signatures, threshold=threshold)
        if db_path:
            store_events(canonical_events, db_path)
            store_signatures(signatures, db_path)
            store_candidates(candidates, db_path)

    return IngestionResult(
        raw_envelopes=raw_envelopes_list,
        raw_envelope_ids=raw_envelope_ids,
        events=canonical_events,
        signatures=signatures,
        candidates=candidates,
        parse_failed=parse_failed,
        item_results=item_results,
    )
382
+
383
+
384
def run_ingest_source_payload(
    source_type: str,
    payload: object,
    *,
    tenant_id: str,
    source_path: str,
    threshold: int = 2,
    db_path: Optional[str] = None,
    on_event: Optional[Callable[[CanonicalEvent], None]] = None,
    on_parse_error: Optional[ErrorHandler] = None,
) -> IngestionResult:
    """Parse ``payload`` with the named source driver and run the ingest pipeline.

    ``on_parse_error`` is passed to both the driver parse step and the
    pipeline; the remaining options are forwarded to the pipeline unchanged.
    """
    envelopes = parse_source_payloads(
        source_type,
        payload,
        tenant_id=tenant_id,
        source_path=source_path,
        on_parse_error=on_parse_error,
    )
    return run_ingest_pipeline(
        envelopes,
        threshold=threshold,
        db_path=db_path,
        on_event=on_event,
        on_parse_error=on_parse_error,
    )
408
+
409
+
410
def run_ingest_file_lines(
    lines: Iterable[str],
    *,
    tenant_id: str,
    source_path: str,
    threshold: int = 2,
    db_path: str | None = None,
    on_event: Optional[Callable[[CanonicalEvent], None]] = None,
    on_parse_error: Optional[ErrorHandler] = None,
) -> IngestionResult:
    """Run the ingest pipeline over the non-blank lines using the "file" driver.

    A line is dropped when ``str(line).strip()`` is empty.
    """
    non_blank = list(filter(lambda entry: str(entry).strip(), lines))
    return run_ingest_source_payload(
        "file",
        non_blank,
        tenant_id=tenant_id,
        source_path=source_path,
        threshold=threshold,
        db_path=db_path,
        on_event=on_event,
        on_parse_error=on_parse_error,
    )
430
+
431
+
84
432
def ingest_syslog_lines(lines: Iterable[str], *, tenant_id: str, source_path: str = "") -> List[CanonicalEvent]:
    """Parse syslog lines into envelopes and canonicalize all of them."""
    envelopes = parse_syslog_envelopes(lines, tenant_id=tenant_id, source_path=source_path)
    return canonicalize_raw_input_envelopes(envelopes)
88
436
 
89
437
 
438
def ingest_file_lines(lines: Iterable[str], *, tenant_id: str, source_path: str = "") -> List[CanonicalEvent]:
    """Parse file lines into envelopes and canonicalize all of them."""
    envelopes = parse_file_envelopes(lines, tenant_id=tenant_id, source_path=source_path)
    return canonicalize_raw_input_envelopes(envelopes)
442
+
443
+
90
444
def ingest_syslog_file(path: str, *, tenant_id: str) -> List[Event]:
    """Read a syslog file and ingest its lines.

    The file is decoded as UTF-8 with undecodable bytes ignored, and the path
    is passed along as ``source_path``.
    """
    source = Path(path)
    content = source.read_text(encoding="utf-8", errors="ignore")
    return ingest_syslog_lines(
        content.splitlines(),
        tenant_id=tenant_id,
        source_path=str(source),
    )
94
448
 
95
449
 
450
def run_ingest_file(
    path: str,
    *,
    tenant_id: str,
    threshold: int = 2,
    db_path: Optional[str] = None,
    on_event: Optional[Callable[[CanonicalEvent], None]] = None,
    on_parse_error: Optional[ErrorHandler] = None,
) -> IngestionResult:
    """Read a file and run the ingest pipeline over its lines.

    The file is decoded as UTF-8 with undecodable bytes ignored, and the path
    is passed along as ``source_path``; all other options are forwarded to
    ``run_ingest_file_lines``.
    """
    source = Path(path)
    content = source.read_text(encoding="utf-8", errors="ignore")
    return run_ingest_file_lines(
        content.splitlines(),
        tenant_id=tenant_id,
        source_path=str(source),
        threshold=threshold,
        db_path=db_path,
        on_event=on_event,
        on_parse_error=on_parse_error,
    )
470
+
471
+
96
472
def signatures_for_events(events: Iterable[Event]) -> List[Signature]:
    """Fingerprint each event and tag the signature with its raw envelope id.

    When an event has a non-None ``raw_envelope_id`` attribute, the signature
    gets a fresh copy of its metadata with ``"source_raw_envelope_id"`` and a
    one-element ``"source_raw_envelope_ids"`` list added.
    """
    collected = []
    for event in events:
        sig = fingerprint_event(event)
        envelope_id = getattr(event, "raw_envelope_id", None)
        if envelope_id is not None:
            # Copy first so the metadata mapping on the signature is not shared.
            tagged = dict(sig.metadata)
            sig.metadata = tagged
            tagged["source_raw_envelope_id"] = int(envelope_id)
            tagged["source_raw_envelope_ids"] = [int(envelope_id)]
        collected.append(sig)
    return collected
98
483
 
99
484
 
100
485
  def events_as_dicts(events: Iterable[Event]) -> List[dict]:
@@ -1,9 +1,29 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Iterable, List, Dict, Any
3
+ from typing import Any, Dict, Iterable, List
4
4
 
5
5
  from .models import Candidate
6
6
 
7
# Short display label for each attention score component, keyed by the
# component name as it appears in a candidate's score breakdown.
ATTN_SIGNAL_LABELS = {
    "recurrence": "recurrence",
    "recovery": "recovery",
    "spread": "spread",
    "novelty": "novelty",
    "impact": "human-impact",
    "precursor": "precursor",
    "memory_weight": "memory",
}

# One-line rationale for each attention score component, keyed the same way
# as ATTN_SIGNAL_LABELS.
ATTN_SIGNAL_RATIONALES = {
    "recurrence": "recurrence indicates repeated observation",
    "recovery": "recovery suggests a pattern that often resets",
    "spread": "spread shows similar behavior across context",
    "novelty": "novelty indicates non-routine pattern shape",
    "impact": "impact shows likely operator visibility value",
    "precursor": "precursor score indicates early warning behavior",
    "memory_weight": "memory_weight reflects previous recurrence context",
}
26
+
7
27
 
8
28
  def _attention_band(decision_band: str) -> str:
9
29
  mapping = {
@@ -16,10 +36,42 @@ def _attention_band(decision_band: str) -> str:
16
36
  return mapping.get(decision_band, "watch")
17
37
 
18
38
 
39
+ def _dominant_attention_signals(score_breakdown: Dict[str, float], *, limit: int = 3) -> List[Dict[str, Any]]:
40
+ ordered = sorted(score_breakdown.items(), key=lambda item: (float(item[1]), item[0]), reverse=True)
41
+ dominant = ordered[:limit]
42
+ return [
43
+ {
44
+ "signal": name,
45
+ "value": round(float(value), 3),
46
+ "label": ATTN_SIGNAL_LABELS.get(name, name.replace("_", "-")),
47
+ "rationale": ATTN_SIGNAL_RATIONALES.get(name, "prototype attention component"),
48
+ }
49
+ for name, value in dominant
50
+ if float(value) > 0
51
+ ]
52
+
53
+
54
def _attention_explanation(candidate: Candidate) -> Dict[str, Any]:
    """Describe why a candidate sits in its attention band.

    Returns a dict with the mapped ``attention_band``, the list of
    ``dominant_signals``, and a one-sentence ``summary`` naming those signals
    (or stating that none are available).
    """
    band = _attention_band(candidate.decision_band)
    signals = _dominant_attention_signals(candidate.score_breakdown)
    if signals:
        joined = ", ".join(f"{item['label']}:{item['value']}" for item in signals)
        summary = f"{band} attention is driven by {joined}."
    else:
        summary = f"{band} attention is currently low; no dominant attention signals are available."
    return {
        "attention_band": band,
        "dominant_signals": signals,
        "summary": summary,
    }
67
+
68
+
19
69
  def _why_it_matters(candidate: Candidate) -> str:
20
70
  count = int((candidate.metadata or {}).get("count") or 0)
21
71
  service = str((candidate.metadata or {}).get("service") or "").strip()
22
72
  family = candidate.candidate_type.replace("_", " ")
73
+ attention_explanation = _attention_explanation(candidate)
74
+ top_signals = ", ".join(item["label"] for item in attention_explanation["dominant_signals"])
23
75
  pieces = []
24
76
  if count:
25
77
  pieces.append(f"observed {count} times")
@@ -37,6 +89,8 @@ def _why_it_matters(candidate: Candidate) -> str:
37
89
  else:
38
90
  level = "is low-attention noise"
39
91
  detail = ", ".join(pieces) if pieces else family
92
+ if top_signals:
93
+ detail = f"{detail} ({top_signals})" if detail else top_signals
40
94
  return f"{detail}; {level}."
41
95
 
42
96
 
@@ -55,6 +109,7 @@ def interesting_items(candidates: Iterable[Candidate], *, limit: int = 5) -> Lis
55
109
  "score_total": candidate.score_total,
56
110
  "confidence": candidate.confidence,
57
111
  "why_it_matters": _why_it_matters(candidate),
112
+ "attention_explanation": _attention_explanation(candidate),
58
113
  "signals": dict(candidate.score_breakdown),
59
114
  "metadata": dict(candidate.metadata),
60
115
  }