@simbimbo/brainstem 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,29 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.0.3 — 2026-03-22
4
+
5
+ Intake Foundation follow-up release for **brAInstem**.
6
+
7
+ ### Highlights
8
+ - persists `RawInputEnvelope` intake records to SQLite before canonicalization
9
+ - records canonicalization outcomes explicitly (`received`, `canonicalized`, `parse_failed`, `unsupported`)
10
+ - adds ingest accounting for:
11
+ - received
12
+ - canonicalized
13
+ - parse_failed
14
+ - candidates_generated
15
+ - adds runtime inspection endpoints for intake trust and observability:
16
+ - `GET /stats`
17
+ - `GET /failures`
18
+ - `GET /failures/{id}`
19
+ - `GET /ingest/recent`
20
+ - `GET /sources`
21
+ - adds storage/query helpers for recent raw envelopes, recent failures, and per-source summaries
22
+ - expands tests around raw-envelope persistence, failure inspection, source summaries, and stats
23
+
24
+ ### Validation
25
+ - local test suite passed (`26 passed`)
26
+
3
27
  ## 0.0.2 — 2026-03-22
4
28
 
5
29
  First fully aligned public foundation release of **brAInstem**.
@@ -1,3 +1,3 @@
1
1
  """brAInstem — operational memory for weak signals."""
2
2
 
3
- __version__ = "0.0.2"
3
+ __version__ = "0.0.3"
package/brainstem/api.py CHANGED
@@ -8,11 +8,25 @@ from fastapi import FastAPI, HTTPException, Query
8
8
  from fastapi.responses import JSONResponse
9
9
  from pydantic import BaseModel, Field
10
10
 
11
- from .ingest import canonicalize_raw_input_envelopes
11
+ from .ingest import canonicalize_raw_input_envelope
12
12
  from .interesting import interesting_items
13
13
  from .models import Candidate, RawInputEnvelope
14
14
  from .recurrence import build_recurrence_candidates
15
- from .storage import init_db, list_candidates, store_candidates, store_events, store_signatures
15
+ from .storage import (
16
+ RAW_ENVELOPE_STATUSES,
17
+ get_ingest_stats,
18
+ init_db,
19
+ list_candidates,
20
+ get_raw_envelope_by_id,
21
+ get_source_dimension_summaries,
22
+ list_recent_failed_raw_envelopes,
23
+ list_recent_raw_envelopes,
24
+ set_raw_envelope_status,
25
+ store_candidates,
26
+ store_events,
27
+ store_raw_envelopes,
28
+ store_signatures,
29
+ )
16
30
  from .ingest import signatures_for_events
17
31
 
18
32
 
@@ -22,6 +36,8 @@ app = FastAPI(title="brAInstem Runtime")
22
36
  class RawEnvelopeRequest(BaseModel):
23
37
  tenant_id: str
24
38
  source_type: str
39
+ source_id: str = ""
40
+ source_name: str = ""
25
41
  message_raw: str
26
42
  timestamp: Optional[str] = None
27
43
  host: str = ""
@@ -49,6 +65,8 @@ def _raw_envelope_from_request(payload: RawEnvelopeRequest) -> RawInputEnvelope:
49
65
  return RawInputEnvelope(
50
66
  tenant_id=payload.tenant_id,
51
67
  source_type=payload.source_type,
68
+ source_id=payload.source_id,
69
+ source_name=payload.source_name,
52
70
  timestamp=payload.timestamp or datetime.utcnow().isoformat() + "Z",
53
71
  message_raw=payload.message_raw,
54
72
  host=payload.host,
@@ -78,15 +96,68 @@ def _candidate_from_row(row) -> Candidate:
78
96
  )
79
97
 
80
98
 
99
+ def _raw_envelope_from_row(row) -> Dict[str, Any]:
100
+ return {
101
+ "id": row["id"],
102
+ "tenant_id": row["tenant_id"],
103
+ "source_type": row["source_type"],
104
+ "source_id": row["source_id"],
105
+ "source_name": row["source_name"],
106
+ "timestamp": row["timestamp"],
107
+ "host": row["host"],
108
+ "service": row["service"],
109
+ "severity": row["severity"],
110
+ "asset_id": row["asset_id"],
111
+ "source_path": row["source_path"],
112
+ "facility": row["facility"],
113
+ "message_raw": row["message_raw"],
114
+ "structured_fields": json.loads(row["structured_fields_json"] or "{}"),
115
+ "correlation_keys": json.loads(row["correlation_keys_json"] or "{}"),
116
+ "metadata": json.loads(row["metadata_json"] or "{}"),
117
+ "canonicalization_status": row["canonicalization_status"],
118
+ "failure_reason": row["failure_reason"],
119
+ }
120
+
121
+
81
122
  def _run_ingest_batch(raw_events: List[RawInputEnvelope], *, threshold: int, db_path: Optional[str]) -> Dict[str, Any]:
82
- events = canonicalize_raw_input_envelopes(raw_events)
123
+ raw_envelope_ids: List[int] = []
124
+ if db_path:
125
+ init_db(db_path)
126
+ raw_envelope_ids = store_raw_envelopes(raw_events, db_path)
127
+
128
+ events = []
129
+ parse_failed = 0
130
+ for idx, raw_event in enumerate(raw_events):
131
+ raw_envelope_id = raw_envelope_ids[idx] if idx < len(raw_envelope_ids) else None
132
+ try:
133
+ canonical_event = canonicalize_raw_input_envelope(raw_event)
134
+ except Exception as exc:
135
+ parse_failed += 1
136
+ if raw_envelope_id is not None:
137
+ set_raw_envelope_status(
138
+ raw_envelope_id,
139
+ "parse_failed",
140
+ db_path=db_path,
141
+ failure_reason=str(exc),
142
+ )
143
+ continue
144
+ events.append(canonical_event)
145
+ if raw_envelope_id is not None:
146
+ set_raw_envelope_status(raw_envelope_id, "canonicalized", db_path=db_path)
147
+
83
148
  if not events:
84
- return {"ok": True, "event_count": 0, "signature_count": 0, "candidate_count": 0, "interesting_items": []}
149
+ return {
150
+ "ok": True,
151
+ "event_count": 0,
152
+ "signature_count": 0,
153
+ "candidate_count": 0,
154
+ "parse_failed": parse_failed,
155
+ "interesting_items": [],
156
+ }
85
157
 
86
158
  signatures = signatures_for_events(events)
87
159
  candidates = build_recurrence_candidates(events, signatures, threshold=threshold)
88
160
  if db_path:
89
- init_db(db_path)
90
161
  store_events(events, db_path)
91
162
  store_signatures(signatures, db_path)
92
163
  store_candidates(candidates, db_path)
@@ -97,6 +168,7 @@ def _run_ingest_batch(raw_events: List[RawInputEnvelope], *, threshold: int, db_
97
168
  "event_count": len(events),
98
169
  "signature_count": len({sig.signature_key for sig in signatures}),
99
170
  "candidate_count": len(candidates),
171
+ "parse_failed": parse_failed,
100
172
  "interesting_items": interesting_items(candidates, limit=max(1, 5)),
101
173
  }
102
174
 
@@ -126,6 +198,60 @@ def get_interesting(
126
198
  return {"ok": True, "items": interesting_items(candidates, limit=limit)}
127
199
 
128
200
 
201
@app.get("/stats")
def get_stats(db_path: Optional[str] = None) -> Dict[str, Any]:
    """Return the ingest counters from ``get_ingest_stats`` with an ``ok`` flag."""
    payload: Dict[str, Any] = {"ok": True}
    payload.update(get_ingest_stats(db_path))
    return payload
204
+
205
+
206
@app.get("/failures")
def get_failures(
    limit: int = Query(default=20, ge=1),
    status: Optional[str] = None,
    db_path: Optional[str] = None,
) -> Dict[str, Any]:
    """List the most recent raw envelopes that failed canonicalization.

    Args:
        limit: Maximum number of rows to return (at least 1).
        status: Optional failure status to filter on. When omitted, every
            failure status is included.
        db_path: SQLite database to read; the storage default when omitted.

    Raises:
        HTTPException: 422 when ``status`` is not a failure status.

    NOTE(review): ``db_path`` comes straight from the query string and is
    handed to the storage layer unmodified - confirm this endpoint is only
    exposed to trusted callers.
    """
    # Imported locally so this endpoint does not depend on the module-level
    # import list being extended.
    from .storage import RAW_ENVELOPE_FAILURE_STATUSES

    # Only failure statuses are meaningful here. Accepting "received" or
    # "canonicalized" would make /failures return envelopes that never failed.
    if status is not None and status not in RAW_ENVELOPE_FAILURE_STATUSES:
        raise HTTPException(
            status_code=422,
            detail=f"invalid status '{status}'; expected one of: {', '.join(RAW_ENVELOPE_FAILURE_STATUSES)}",
        )

    rows = list_recent_failed_raw_envelopes(db_path=db_path, status=status, limit=limit)
    items = [_raw_envelope_from_row(row) for row in rows]
    return {"ok": True, "items": items, "count": len(items), "status": status}
221
+
222
+
223
@app.get("/ingest/recent")
def get_ingest_recent(
    limit: int = Query(default=20, ge=1),
    status: Optional[str] = None,
    db_path: Optional[str] = None,
) -> Dict[str, Any]:
    """List the most recent raw envelopes, optionally filtered by status.

    Any value in ``RAW_ENVELOPE_STATUSES`` is accepted as ``status``; anything
    else is rejected with HTTP 422.
    """
    status_is_acceptable = status is None or status in RAW_ENVELOPE_STATUSES
    if not status_is_acceptable:
        raise HTTPException(
            status_code=422,
            detail=f"invalid status '{status}'; expected one of: {', '.join(RAW_ENVELOPE_STATUSES)}",
        )

    items = [
        _raw_envelope_from_row(record)
        for record in list_recent_raw_envelopes(
            db_path=db_path,
            status=status,
            limit=limit,
            failures_only=False,
        )
    ]
    return {"ok": True, "items": items, "count": len(items), "status": status}
237
+
238
+
239
@app.get("/sources")
def get_sources(
    limit: int = Query(default=10, ge=1),
    db_path: Optional[str] = None,
) -> Dict[str, Any]:
    """Return per-dimension source summaries, at most ``limit`` values each."""
    summaries = get_source_dimension_summaries(db_path=db_path, limit=limit)
    return {"ok": True, "items": summaries}
245
+
246
+
247
@app.get("/failures/{raw_envelope_id}")
def get_failure(raw_envelope_id: int, db_path: Optional[str] = None) -> Dict[str, Any]:
    """Fetch one raw envelope by id, or respond with HTTP 404 if it is absent.

    The lookup is by id only; the envelope's canonicalization status is not
    checked here.
    """
    record = get_raw_envelope_by_id(raw_envelope_id, db_path=db_path)
    if record is not None:
        return {"ok": True, "item": _raw_envelope_from_row(record)}
    raise HTTPException(status_code=404, detail="raw envelope not found")
253
+
254
+
129
255
@app.get("/healthz")
def healthz() -> Dict[str, str]:
    """Health probe that always answers with a fixed ``ok`` payload.

    NOTE(review): the annotation says ``Dict[str, str]`` but a ``JSONResponse``
    carrying a boolean is returned - confirm whether the annotation is intended.
    """
    body = {"ok": True, "status": "ok"}
    return JSONResponse(content=body)
@@ -50,6 +50,13 @@ def parse_syslog_envelopes(lines: Iterable[str], *, tenant_id: str, source_path:
50
50
 
51
51
 
52
52
  def canonicalize_raw_input_envelope(raw: RawInputEnvelope) -> CanonicalEvent:
53
+ parse_error = (raw.metadata or {}).get("parse_error")
54
+ if parse_error:
55
+ raise ValueError(f"parse_error: {parse_error}")
56
+
57
+ if not (raw.message_raw or "").strip():
58
+ raise ValueError("message_raw is empty and cannot be canonicalized")
59
+
53
60
  message_normalized = normalize_message(raw.message_raw)
54
61
  metadata = dict(raw.metadata or {})
55
62
  metadata.setdefault("canonicalization_source", raw.source_type)
@@ -10,6 +10,8 @@ class RawInputEnvelope:
10
10
  source_type: str
11
11
  timestamp: str
12
12
  message_raw: str
13
+ source_id: str = ""
14
+ source_name: str = ""
13
15
  host: str = ""
14
16
  service: str = ""
15
17
  severity: str = "info"
@@ -2,17 +2,25 @@ from __future__ import annotations
2
2
 
3
3
  import json
4
4
  import sqlite3
5
- from dataclasses import asdict
6
5
  from pathlib import Path
7
- from typing import Iterable, List
6
+ from typing import Any, Iterable, List
8
7
 
9
- from .models import Candidate, Event, Signature
8
+ from .models import Candidate, Event, RawInputEnvelope, Signature
10
9
 
11
10
 
12
11
def default_db_path() -> Path:
    """Return the relative path used when no database path is supplied."""
    return Path('.brainstem-state', 'brainstem.sqlite3')
14
13
 
15
14
 
15
# Statuses meaning a raw envelope could not be turned into a canonical event.
RAW_ENVELOPE_FAILURE_STATUSES = ("parse_failed", "unsupported")
# Every canonicalization status a raw envelope may carry, failures last.
RAW_ENVELOPE_STATUSES = ("received", "canonicalized") + RAW_ENVELOPE_FAILURE_STATUSES
17
+
18
+
19
def _validate_canonicalization_status(status: str) -> None:
    """Raise ``ValueError`` unless *status* is in ``RAW_ENVELOPE_STATUSES``."""
    if status in RAW_ENVELOPE_STATUSES:
        return
    raise ValueError(f"unsupported canonicalization_status: {status}")
22
+
23
+
16
24
  def connect(db_path: str | None = None) -> sqlite3.Connection:
17
25
  path = Path(db_path) if db_path else default_db_path()
18
26
  path.parent.mkdir(parents=True, exist_ok=True)
@@ -42,6 +50,27 @@ def init_db(db_path: str | None = None) -> None:
42
50
  correlation_keys_json TEXT NOT NULL
43
51
  );
44
52
 
53
+ CREATE TABLE IF NOT EXISTS raw_envelopes (
54
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
55
+ tenant_id TEXT NOT NULL,
56
+ source_type TEXT NOT NULL,
57
+ source_id TEXT,
58
+ source_name TEXT,
59
+ timestamp TEXT NOT NULL,
60
+ host TEXT,
61
+ service TEXT,
62
+ severity TEXT,
63
+ asset_id TEXT,
64
+ source_path TEXT,
65
+ facility TEXT,
66
+ message_raw TEXT NOT NULL,
67
+ structured_fields_json TEXT NOT NULL,
68
+ correlation_keys_json TEXT NOT NULL,
69
+ metadata_json TEXT NOT NULL,
70
+ canonicalization_status TEXT NOT NULL DEFAULT 'received',
71
+ failure_reason TEXT
72
+ );
73
+
45
74
  CREATE TABLE IF NOT EXISTS signatures (
46
75
  id INTEGER PRIMARY KEY AUTOINCREMENT,
47
76
  signature_key TEXT NOT NULL UNIQUE,
@@ -72,6 +101,133 @@ def init_db(db_path: str | None = None) -> None:
72
101
  conn.close()
73
102
 
74
103
 
104
def store_raw_envelopes(raw_envelopes: Iterable[RawInputEnvelope], db_path: str | None = None) -> List[int]:
    """Persist raw intake envelopes and return their row ids in input order.

    Every envelope is inserted with status ``received`` and no failure reason.
    The whole batch is committed once; if any insert raises, the connection is
    closed without committing.

    Args:
        raw_envelopes: Envelopes to persist.
        db_path: SQLite database to write; the default path when omitted.

    Returns:
        The ``raw_envelopes.id`` assigned to each envelope, in input order.
    """
    insert_sql = '''
        INSERT INTO raw_envelopes (
            tenant_id, source_type, source_id, source_name, timestamp, host, service, severity,
            asset_id, source_path, facility, message_raw,
            structured_fields_json, correlation_keys_json, metadata_json,
            canonicalization_status, failure_reason
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''

    def _encode_mapping(value: Any) -> str:
        # A None mapping would serialize to the JSON text "null". Readers that
        # fall back to "{}" only for empty columns would then decode it to
        # None, so store an empty object instead.
        return json.dumps(value or {}, ensure_ascii=False)

    conn = connect(db_path)
    raw_ids: List[int] = []
    try:
        for raw in raw_envelopes:
            cursor = conn.execute(
                insert_sql,
                (
                    raw.tenant_id,
                    raw.source_type,
                    raw.source_id,
                    raw.source_name,
                    raw.timestamp,
                    raw.host,
                    raw.service,
                    raw.severity,
                    raw.asset_id,
                    raw.source_path,
                    raw.facility,
                    raw.message_raw,
                    _encode_mapping(raw.structured_fields),
                    _encode_mapping(raw.correlation_keys),
                    _encode_mapping(raw.metadata),
                    "received",
                    None,
                ),
            )
            raw_ids.append(int(cursor.lastrowid))
        conn.commit()
        return raw_ids
    finally:
        conn.close()
143
+
144
+
145
def set_raw_envelope_status(
    raw_envelope_id: int,
    status: str,
    db_path: str | None = None,
    *,
    failure_reason: str | None = None,
) -> None:
    """Overwrite the canonicalization status and failure reason of one envelope.

    ``failure_reason`` is always written, so omitting it stores NULL.

    Raises:
        ValueError: If ``status`` is not a recognized canonicalization status.
    """
    _validate_canonicalization_status(status)
    update_sql = '''
        UPDATE raw_envelopes
        SET canonicalization_status = ?, failure_reason = ?
        WHERE id = ?
    '''
    connection = connect(db_path)
    try:
        # The connection context manager commits when the block succeeds.
        with connection:
            connection.execute(update_sql, (status, failure_reason, raw_envelope_id))
    finally:
        connection.close()
166
+
167
+
168
def get_raw_envelope_by_id(raw_envelope_id: int, db_path: str | None = None) -> sqlite3.Row | None:
    """Return the raw envelope row with the given id, or ``None`` if absent.

    Args:
        raw_envelope_id: Primary key of the ``raw_envelopes`` row.
        db_path: SQLite database to read; the default path when omitted.
    """
    # Ensure the schema exists, as the stats and summary readers do, so a
    # database that has never been ingested into yields None instead of
    # raising "no such table".
    init_db(db_path)
    conn = connect(db_path)
    try:
        return conn.execute(
            "SELECT * FROM raw_envelopes WHERE id = ?",
            (raw_envelope_id,),
        ).fetchone()
    finally:
        conn.close()
177
+
178
+
179
+ def _recent_raw_envelopes_query(
180
+ canonicalization_status: str | None,
181
+ *,
182
+ failures_only: bool,
183
+ ) -> tuple[str, tuple[str, ...], bool]:
184
+ if canonicalization_status is None and failures_only:
185
+ return "WHERE canonicalization_status IN (?, ?)", RAW_ENVELOPE_FAILURE_STATUSES, True
186
+ if canonicalization_status is None and not failures_only:
187
+ return "", (), False
188
+ _validate_canonicalization_status(canonicalization_status)
189
+ return "WHERE canonicalization_status = ?", (canonicalization_status,), False
190
+
191
+
192
def list_recent_raw_envelopes(
    db_path: str | None = None,
    status: str | None = None,
    limit: int = 20,
    *,
    failures_only: bool = False,
) -> List[sqlite3.Row]:
    """Return the newest raw envelopes, highest id first.

    Args:
        db_path: SQLite database to read; the default path when omitted.
        status: Exact canonicalization status to match, if any.
        limit: Maximum rows to return; values below 1 are treated as 1.
        failures_only: With no ``status``, restrict to failure statuses.

    Raises:
        ValueError: If ``status`` is not a recognized canonicalization status.
    """
    # Validate and build the filter first so bad input never touches the database.
    where_clause, status_args, _ = _recent_raw_envelopes_query(status, failures_only=failures_only)

    # Ensure the schema exists, as the stats and summary readers do, so a
    # fresh database yields an empty list instead of raising "no such table".
    init_db(db_path)
    conn = connect(db_path)
    try:
        return conn.execute(
            f"""
            SELECT * FROM raw_envelopes
            {where_clause}
            ORDER BY id DESC
            LIMIT ?
            """,
            (*status_args, max(1, limit)),
        ).fetchall()
    finally:
        conn.close()
215
+
216
+
217
def list_recent_failed_raw_envelopes(
    db_path: str | None = None,
    *,
    status: str | None = None,
    limit: int = 20,
) -> List[sqlite3.Row]:
    """List recent failed raw envelopes.

    Without ``status`` every failure status is included. With ``status`` the
    listing is filtered on exactly that value.
    """
    include_every_failure = status is None
    return list_recent_raw_envelopes(
        db_path,
        status,
        limit,
        failures_only=include_every_failure,
    )
229
+
230
+
75
231
  def store_events(events: Iterable[Event], db_path: str | None = None) -> int:
76
232
  conn = connect(db_path)
77
233
  count = 0
@@ -107,6 +263,96 @@ def store_events(events: Iterable[Event], db_path: str | None = None) -> int:
107
263
  conn.close()
108
264
 
109
265
 
266
# raw_envelopes columns that per-source summaries may be grouped by.
SOURCE_SUMMARY_DIMENSIONS = (
    "source_type", "source_path", "source_id",
    "source_name", "host", "service",
)
274
+
275
+
276
def _summarize_raw_envelopes_by_dimension(
    conn: sqlite3.Connection,
    dimension: str,
    limit: int = 20,
) -> List[dict[str, int | str]]:
    """Count raw envelopes per distinct non-blank value of one column.

    Results are ordered by descending count, then ascending value, and capped
    at ``limit`` entries (values below 1 are treated as 1).

    Raises:
        ValueError: If ``dimension`` is not in ``SOURCE_SUMMARY_DIMENSIONS``.
    """
    if dimension not in SOURCE_SUMMARY_DIMENSIONS:
        raise ValueError(f"unsupported dimension: {dimension}")

    # The column name was checked against the allow-list above, so it can be
    # interpolated; the limit is still bound as a parameter.
    query = f"""
        SELECT {dimension} AS value, COUNT(*) AS count
        FROM raw_envelopes
        WHERE COALESCE(TRIM({dimension}), '') <> ''
        GROUP BY {dimension}
        ORDER BY COUNT(*) DESC, value ASC
        LIMIT ?
        """
    summary: List[dict[str, int | str]] = []
    for record in conn.execute(query, (max(1, limit),)).fetchall():
        summary.append({"value": record["value"], "count": int(record["count"])})
    return summary
297
+
298
+
299
def get_source_dimension_summaries(
    db_path: str | None = None,
    *,
    limit: int = 20,
) -> dict[str, List[dict[str, int | str]]]:
    """Initialize the database if needed and return its per-dimension summaries."""
    init_db(db_path)
    connection = connect(db_path)
    try:
        summaries = _get_source_dimension_summaries_from_conn(connection, limit=limit)
    finally:
        connection.close()
    return summaries
310
+
311
+
312
def _get_source_dimension_summaries_from_conn(
    conn: sqlite3.Connection,
    *,
    limit: int = 20,
) -> dict[str, List[dict[str, int | str]]]:
    """Summarize every source dimension over an already-open connection.

    The result is keyed by dimension name in ``SOURCE_SUMMARY_DIMENSIONS`` order.
    """
    return {
        dimension: _summarize_raw_envelopes_by_dimension(conn, dimension, limit=limit)
        for dimension in SOURCE_SUMMARY_DIMENSIONS
    }
325
+
326
+
327
+ def _query_count(conn: sqlite3.Connection, query: str) -> int:
328
+ return int(conn.execute(query).fetchone()[0])
329
+
330
+
331
def get_ingest_stats(db_path: str | None = None) -> dict[str, Any]:
    """Return ingest counters and source summaries for one database.

    ``received`` counts every raw envelope regardless of status. The
    per-status counters count rows currently in that status, and
    ``candidates_generated`` counts rows in the ``candidates`` table.
    """
    counter_queries = (
        ("received", "SELECT COUNT(*) FROM raw_envelopes"),
        (
            "canonicalized",
            "SELECT COUNT(*) FROM raw_envelopes WHERE canonicalization_status = 'canonicalized'",
        ),
        (
            "parse_failed",
            "SELECT COUNT(*) FROM raw_envelopes WHERE canonicalization_status = 'parse_failed'",
        ),
        (
            "unsupported",
            "SELECT COUNT(*) FROM raw_envelopes WHERE canonicalization_status = 'unsupported'",
        ),
        ("candidates_generated", "SELECT COUNT(*) FROM candidates"),
    )

    init_db(db_path)
    connection = connect(db_path)
    try:
        stats: dict[str, Any] = {name: _query_count(connection, sql) for name, sql in counter_queries}
        stats["source_summaries"] = _get_source_dimension_summaries_from_conn(connection)
        return stats
    finally:
        connection.close()
354
+
355
+
110
356
  def store_signatures(signatures: Iterable[Signature], db_path: str | None = None) -> int:
111
357
  conn = connect(db_path)
112
358
  count = 0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@simbimbo/brainstem",
3
- "version": "0.0.2",
3
+ "version": "0.0.3",
4
4
  "description": "brAInstem — operational memory for weak signals.",
5
5
  "license": "MIT",
6
6
  "type": "module",
package/pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "brainstem"
7
- version = "0.0.2"
7
+ version = "0.0.3"
8
8
  description = "brAInstem — operational memory for weak signals."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
package/tests/test_api.py CHANGED
@@ -3,6 +3,12 @@ from pathlib import Path
3
3
  from fastapi.testclient import TestClient
4
4
 
5
5
  from brainstem.api import app
6
+ from brainstem.models import RawInputEnvelope
7
+ from brainstem.storage import (
8
+ init_db,
9
+ set_raw_envelope_status,
10
+ store_raw_envelopes,
11
+ )
6
12
 
7
13
 
8
14
  def test_ingest_event_endpoint_round_trip(tmp_path: Path) -> None:
@@ -65,8 +71,249 @@ def test_ingest_batch_and_interesting(tmp_path: Path) -> None:
65
71
  assert interesting_payload["items"]
66
72
 
67
73
 
74
def test_stats_after_successful_and_failed_ingest(tmp_path: Path) -> None:
    """A mixed batch is reflected in both the batch response and ``/stats``."""
    client = TestClient(app)
    db_path = str(tmp_path / "brainstem.sqlite3")

    base_event = {"tenant_id": "client-a", "source_type": "syslog", "host": "fw-01", "service": "sshd"}
    failed_login = {**base_event, "message_raw": "Failed password for admin from 10.1.2.3"}
    # An empty message cannot be canonicalized and must count as parse_failed.
    empty_message = {**base_event, "message_raw": ""}

    batch_response = client.post(
        "/ingest/batch",
        json={
            "threshold": 2,
            "db_path": db_path,
            "events": [failed_login, failed_login, empty_message],
        },
    )
    assert batch_response.status_code == 200
    batch_payload = batch_response.json()
    assert batch_payload["ok"] is True
    assert batch_payload["event_count"] == 2
    assert batch_payload["parse_failed"] == 1

    # Pass the path through ``params`` so it is URL-encoded; temp directories
    # may contain characters that are not safe in a raw query string.
    stats = client.get("/stats", params={"db_path": db_path})
    assert stats.status_code == 200
    stats_payload = stats.json()
    assert stats_payload["ok"] is True
    assert stats_payload["received"] == 3
    assert stats_payload["canonicalized"] == 2
    assert stats_payload["parse_failed"] == 1
    assert stats_payload["candidates_generated"] >= 1
121
+
122
+
68
123
def test_healthz_is_ready() -> None:
    """``/healthz`` answers 200 with a truthy ``ok`` flag."""
    health = TestClient(app).get("/healthz")
    assert health.status_code == 200
    body = health.json()
    assert body["ok"] is True
128
+
129
+
130
def test_failures_endpoint_lists_recent_parse_failures(tmp_path: Path) -> None:
    """``/failures`` lists only the envelope that failed canonicalization."""
    client = TestClient(app)
    db_path = str(tmp_path / "brainstem.sqlite3")

    base_event = {"tenant_id": "client-a", "source_type": "syslog", "host": "fw-01"}
    ingest = client.post(
        "/ingest/batch",
        json={
            "threshold": 2,
            "db_path": db_path,
            "events": [
                {**base_event, "message_raw": "", "service": "sshd"},
                {**base_event, "message_raw": "VPN tunnel dropped and recovered", "service": "charon"},
            ],
        },
    )
    # Fail here if ingest itself broke, rather than on a confusing count below.
    assert ingest.status_code == 200

    # ``params`` URL-encodes the filesystem path instead of splicing it raw.
    response = client.get("/failures", params={"db_path": db_path, "limit": 10})
    assert response.status_code == 200
    payload = response.json()
    assert payload["ok"] is True
    assert payload["count"] == 1
    assert payload["items"][0]["canonicalization_status"] == "parse_failed"
163
+
164
+
165
def test_failures_endpoint_filters_by_status_and_fetches_single_record(tmp_path: Path) -> None:
    """``/failures`` filters by status, serves single records, and rejects bad statuses."""
    client = TestClient(app)
    db_path = str(tmp_path / "brainstem.sqlite3")
    init_db(db_path)

    raw_ids = store_raw_envelopes(
        [
            RawInputEnvelope(
                tenant_id="client-a",
                source_type="syslog",
                timestamp=timestamp,
                message_raw=message,
                host="fw-01",
                service="sshd",
            )
            for timestamp, message in (
                ("2026-03-22T00:00:01Z", "first"),
                ("2026-03-22T00:00:02Z", "second"),
            )
        ],
        db_path=db_path,
    )
    set_raw_envelope_status(raw_ids[0], "parse_failed", db_path=db_path, failure_reason="bad parse")
    set_raw_envelope_status(raw_ids[1], "unsupported", db_path=db_path, failure_reason="unsupported source")

    # Query values go through ``params`` so the filesystem path is URL-encoded.
    failed_only = client.get(
        "/failures",
        params={"db_path": db_path, "status": "parse_failed", "limit": 10},
    )
    assert failed_only.status_code == 200
    failed_payload = failed_only.json()
    assert failed_payload["count"] == 1
    assert failed_payload["items"][0]["id"] == raw_ids[0]

    unsupported = client.get(
        "/failures",
        params={"db_path": db_path, "status": "unsupported", "limit": 10},
    )
    assert unsupported.status_code == 200
    unsupported_payload = unsupported.json()
    assert unsupported_payload["count"] == 1
    assert unsupported_payload["items"][0]["id"] == raw_ids[1]

    single = client.get(f"/failures/{raw_ids[1]}", params={"db_path": db_path})
    assert single.status_code == 200
    single_payload = single.json()
    assert single_payload["ok"] is True
    assert single_payload["item"]["id"] == raw_ids[1]
    assert single_payload["item"]["failure_reason"] == "unsupported source"

    invalid = client.get("/failures", params={"db_path": db_path, "status": "bogus"})
    assert invalid.status_code == 422
214
+
215
+
216
def test_sources_endpoint_summarizes_ingest_dimensions(tmp_path: Path) -> None:
    """``/sources`` reports per-dimension counts ordered by frequency."""
    client = TestClient(app)
    db_path = str(tmp_path / "brainstem.sqlite3")

    syslog_event = {
        "tenant_id": "client-a",
        "source_type": "syslog",
        "source_id": "fw-01",
        "source_name": "edge-fw-01",
        "source_path": "/var/log/syslog",
        "message_raw": "Failed password for admin from 10.1.2.3",
        "host": "fw-01",
        "service": "sshd",
        "severity": "info",
    }
    logicmonitor_event = {
        "tenant_id": "client-a",
        "source_type": "logicmonitor",
        "source_id": "lm-01",
        "source_name": "edge-lm-01",
        "source_path": "/alerts",
        "message_raw": "Disk space low",
        "host": "lm-01",
        "service": "logicmonitor",
        "severity": "warning",
    }
    batch_response = client.post(
        "/ingest/batch",
        json={
            "threshold": 1,
            "db_path": db_path,
            "events": [syslog_event, dict(syslog_event), logicmonitor_event],
        },
    )
    assert batch_response.status_code == 200

    # ``params`` URL-encodes the filesystem path instead of splicing it raw.
    response = client.get("/sources", params={"db_path": db_path, "limit": 10})
    assert response.status_code == 200
    payload = response.json()
    assert payload["ok"] is True
    assert payload["items"]["source_type"] == [
        {"value": "syslog", "count": 2},
        {"value": "logicmonitor", "count": 1},
    ]
    source_name_counts = {entry["value"]: entry["count"] for entry in payload["items"]["source_name"]}
    assert source_name_counts == {
        "edge-fw-01": 2,
        "edge-lm-01": 1,
    }
275
+
276
+
277
def test_ingest_recent_endpoint_returns_recent_intake_and_allows_status_filter(tmp_path: Path) -> None:
    """``/ingest/recent`` lists all intake and honors the ``status`` filter."""
    client = TestClient(app)
    db_path = str(tmp_path / "brainstem.sqlite3")

    base_event = {
        "tenant_id": "client-a",
        "source_type": "syslog",
        "source_id": "fw-01",
        "source_name": "edge-fw-01",
        "host": "fw-01",
        "service": "systemd",
    }
    ingest = client.post(
        "/ingest/batch",
        json={
            "threshold": 1,
            "db_path": db_path,
            "events": [
                {**base_event, "message_raw": "service restarted"},
                {**base_event, "message_raw": ""},
            ],
        },
    )
    # Fail here if ingest itself broke, rather than on a confusing count below.
    assert ingest.status_code == 200

    # ``params`` URL-encodes the filesystem path instead of splicing it raw.
    response = client.get("/ingest/recent", params={"db_path": db_path, "limit": 10})
    assert response.status_code == 200
    payload = response.json()
    assert payload["ok"] is True
    assert payload["count"] == 2
    assert len({item["canonicalization_status"] for item in payload["items"]}) == 2

    failed = client.get(
        "/ingest/recent",
        params={"db_path": db_path, "status": "parse_failed", "limit": 10},
    )
    assert failed.status_code == 200
    failed_payload = failed.json()
    assert failed_payload["count"] == 1
    assert failed_payload["items"][0]["canonicalization_status"] == "parse_failed"
@@ -1,8 +1,23 @@
1
+ import sqlite3
1
2
  from pathlib import Path
2
3
 
3
4
  from brainstem.ingest import ingest_syslog_lines, signatures_for_events
5
+ from brainstem.models import RawInputEnvelope
4
6
  from brainstem.recurrence import build_recurrence_candidates
5
- from brainstem.storage import init_db, list_candidates, store_candidates, store_events, store_signatures
7
+ from brainstem.storage import (
8
+ get_raw_envelope_by_id,
9
+ get_ingest_stats,
10
+ init_db,
11
+ list_candidates,
12
+ get_source_dimension_summaries,
13
+ store_candidates,
14
+ store_events,
15
+ list_recent_failed_raw_envelopes,
16
+ list_recent_raw_envelopes,
17
+ store_raw_envelopes,
18
+ set_raw_envelope_status,
19
+ store_signatures,
20
+ )
6
21
 
7
22
 
8
23
  def test_storage_round_trip(tmp_path: Path) -> None:
@@ -24,3 +39,256 @@ def test_storage_round_trip(tmp_path: Path) -> None:
24
39
  rows = list_candidates(str(db_path), limit=10)
25
40
  assert rows
26
41
  assert rows[0]['title']
42
+
43
+
44
def test_raw_envelope_records_are_persisted(tmp_path: Path) -> None:
    """Store two raw envelopes and verify the rows written to ``raw_envelopes``.

    Checks the ids returned by ``store_raw_envelopes`` and that both rows
    carry the ``received`` canonicalization status.
    """
    db_path = tmp_path / 'brainstem.sqlite3'
    db_file = str(db_path)
    init_db(db_file)

    # Two syslog envelopes that differ only in timestamp and message text.
    samples = (
        ("2026-03-22T00:00:01Z", "VPN tunnel dropped and recovered"),
        ("2026-03-22T00:00:02Z", "IPsec SA rekey succeeded"),
    )
    raw_events = [
        RawInputEnvelope(
            tenant_id="client-a",
            source_type="syslog",
            timestamp=stamp,
            message_raw=text,
            host="fw-01",
            service="charon",
        )
        for stamp, text in samples
    ]
    stored_ids = store_raw_envelopes(raw_events, db_file)
    assert stored_ids == [1, 2]

    # Read the table directly with sqlite3 rather than through the storage helpers.
    query = "SELECT tenant_id, source_type, message_raw, canonicalization_status FROM raw_envelopes ORDER BY id ASC"
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(query).fetchall()
    finally:
        conn.close()

    assert len(rows) == 2
    assert tuple(rows[0]) == (
        "client-a",
        "syslog",
        "VPN tunnel dropped and recovered",
        "received",
    )
    assert rows[1][3] == "received"
81
+
82
+
83
def test_ingest_stats_from_raw_envelopes(tmp_path: Path) -> None:
    """Seed rows with raw SQL and verify the counters from ``get_ingest_stats``.

    Inserts one ``canonicalized`` envelope, one ``parse_failed`` envelope and
    one candidate, then checks the four counters in the returned mapping.
    """
    db_path = tmp_path / 'brainstem.sqlite3'
    init_db(str(db_path))

    canonicalized_envelope_sql = """
        INSERT INTO raw_envelopes (
            tenant_id, source_type, timestamp, host, service, severity,
            asset_id, source_path, facility, message_raw,
            structured_fields_json, correlation_keys_json, metadata_json,
            canonicalization_status, failure_reason
        ) VALUES (
            'client-a', 'syslog', '2026-03-22T00:00:00Z',
            'fw-01', 'charon', 'info', '', '', '', 'ok', '{}', '{}', '{}',
            'canonicalized', NULL
        )
        """
    parse_failed_envelope_sql = """
        INSERT INTO raw_envelopes (
            tenant_id, source_type, timestamp, host, service, severity,
            asset_id, source_path, facility, message_raw,
            structured_fields_json, correlation_keys_json, metadata_json,
            canonicalization_status, failure_reason
        ) VALUES (
            'client-a', 'syslog', '2026-03-22T00:00:00Z',
            'fw-01', 'charon', 'info', '', '', '', 'bad', '{}', '{}', '{}',
            'parse_failed', 'message empty'
        )
        """
    candidate_sql = "INSERT INTO candidates (candidate_type, title, summary, score_total, score_breakdown_json, decision_band, source_signature_ids_json, source_event_ids_json, confidence, metadata_json) VALUES ('recurrence', 'x', 'y', 1.0, '{}', 'medium', '[]', '[]', 0.1, '{}')"

    conn = sqlite3.connect(db_path)
    try:
        # Same insertion order as before: both envelopes, then the candidate.
        for statement in (canonicalized_envelope_sql, parse_failed_envelope_sql, candidate_sql):
            conn.execute(statement)
        conn.commit()
    finally:
        conn.close()

    stats = get_ingest_stats(str(db_path))
    expected_counts = {
        "received": 2,
        "canonicalized": 1,
        "parse_failed": 1,
        "candidates_generated": 1,
    }
    for counter, expected in expected_counts.items():
        assert stats[counter] == expected
128
+
129
+
130
def test_source_dimension_summaries(tmp_path: Path) -> None:
    """Verify the value/count entries from ``get_source_dimension_summaries``.

    Stores two syslog envelopes and one logicmonitor envelope, then checks the
    ``source_type``, ``source_path`` and ``source_id`` dimensions of the
    returned summary.
    """
    db_path = tmp_path / 'brainstem.sqlite3'
    init_db(str(db_path))
    store_raw_envelopes(
        [
            RawInputEnvelope(
                tenant_id='client-a',
                source_type='syslog',
                source_id='fw-01',
                source_name='edge-fw-01',
                timestamp='2026-03-22T00:00:01Z',
                message_raw='VPN tunnel dropped and recovered',
                source_path='/var/log/syslog',
                host='fw-01',
                service='charon',
            ),
            RawInputEnvelope(
                tenant_id='client-a',
                source_type='syslog',
                source_id='fw-01',
                source_name='edge-fw-01',
                timestamp='2026-03-22T00:00:02Z',
                message_raw='IPsec SA rekey succeeded',
                source_path='/var/log/syslog',
                host='fw-01',
                service='charon',
            ),
            RawInputEnvelope(
                tenant_id='client-a',
                source_type='logicmonitor',
                source_id='lm-1',
                source_name='edge-lm-01',
                timestamp='2026-03-22T00:00:03Z',
                message_raw='CPU usage high',
                source_path='/alerts',
                host='lm-01',
                service='logicmonitor',
            ),
        ],
        db_path=str(db_path),
    )

    summary = get_source_dimension_summaries(str(db_path), limit=10)

    # These checks are positional: the syslog entry (count 2) is expected
    # ahead of the logicmonitor entry (count 1).
    by_source_type = summary['source_type']
    assert by_source_type[0]['value'] == "syslog"
    assert by_source_type[0]['count'] == 2
    assert by_source_type[1]['value'] == "logicmonitor"
    assert by_source_type[1]['count'] == 1

    # The remaining dimensions are compared as mappings, so entry order is
    # not part of what is asserted here.
    assert {entry['value']: entry['count'] for entry in summary['source_path']} == {
        '/alerts': 1,
        '/var/log/syslog': 2,
    }
    assert {entry['value']: entry['count'] for entry in summary['source_id']} == {
        'fw-01': 2,
        'lm-1': 1,
    }
185
+
186
+
187
def test_list_recent_raw_envelopes_supports_status_filtering(tmp_path: Path) -> None:
    """Check ordering and ``status`` filtering of ``list_recent_raw_envelopes``.

    Three envelopes are stored and moved to three different statuses. The
    unfiltered listing is expected in reverse insertion order, and the
    ``parse_failed`` filter is expected to return only the first envelope.
    """
    db_file = str(tmp_path / 'brainstem.sqlite3')
    init_db(db_file)

    # Envelopes differ only in timestamp and message text.
    samples = (
        ('2026-03-22T00:00:01Z', 'first'),
        ('2026-03-22T00:00:02Z', 'second'),
        ('2026-03-22T00:00:03Z', 'third'),
    )
    envelopes = [
        RawInputEnvelope(
            tenant_id='client-a',
            source_type='syslog',
            timestamp=stamp,
            message_raw=text,
            host='fw-01',
            service='sshd',
        )
        for stamp, text in samples
    ]
    raw_ids = store_raw_envelopes(envelopes, db_path=db_file)

    set_raw_envelope_status(raw_ids[0], 'parse_failed', db_path=db_file, failure_reason='empty message')
    set_raw_envelope_status(raw_ids[1], 'canonicalized', db_path=db_file)
    set_raw_envelope_status(raw_ids[2], 'unsupported', db_path=db_file, failure_reason='unsupported source')

    listed_ids = [row['id'] for row in list_recent_raw_envelopes(db_file, limit=10)]
    assert listed_ids == [raw_ids[2], raw_ids[1], raw_ids[0]]

    parse_failed_ids = [
        row['id']
        for row in list_recent_raw_envelopes(db_file, status='parse_failed', limit=10)
    ]
    assert parse_failed_ids == [raw_ids[0]]
228
+
229
+
230
def test_query_recent_failed_raw_envelopes_with_status_filter(tmp_path: Path) -> None:
    """Check ``list_recent_failed_raw_envelopes`` with and without a status filter.

    Of three stored envelopes, one is marked ``parse_failed``, one
    ``canonicalized`` and one ``unsupported``. The unfiltered failure listing
    is expected to contain the ``unsupported`` row followed by the
    ``parse_failed`` row; filtering on ``parse_failed`` leaves a single row.
    """
    db_file = str(tmp_path / 'brainstem.sqlite3')
    init_db(db_file)

    # Envelopes differ only in timestamp and message text.
    samples = (
        ("2026-03-22T00:00:01Z", "first"),
        ("2026-03-22T00:00:02Z", "second"),
        ("2026-03-22T00:00:03Z", "third"),
    )
    raw_events = [
        RawInputEnvelope(
            tenant_id="client-a",
            source_type="syslog",
            timestamp=stamp,
            message_raw=text,
            host="fw-01",
            service="sshd",
        )
        for stamp, text in samples
    ]
    raw_ids = store_raw_envelopes(raw_events, db_file)

    set_raw_envelope_status(raw_ids[0], "parse_failed", db_path=db_file, failure_reason="empty message")
    set_raw_envelope_status(raw_ids[1], "canonicalized", db_path=db_file)
    set_raw_envelope_status(raw_ids[2], "unsupported", db_path=db_file, failure_reason="unsupported source")

    failures = list_recent_failed_raw_envelopes(db_file, limit=10)
    assert [row["id"] for row in failures] == [raw_ids[2], raw_ids[0]]
    newest_failure, oldest_failure = failures[0], failures[1]
    assert newest_failure["canonicalization_status"] == "unsupported"
    assert oldest_failure["canonicalization_status"] == "parse_failed"

    parse_failed_rows = list_recent_failed_raw_envelopes(db_file, status="parse_failed", limit=10)
    assert len(parse_failed_rows) == 1
    assert parse_failed_rows[0]["id"] == raw_ids[0]
272
+
273
+
274
def test_get_raw_envelope_by_id(tmp_path: Path) -> None:
    """Fetch one stored envelope by id and check its status and failure reason."""
    db_file = str(tmp_path / 'brainstem.sqlite3')
    init_db(db_file)

    envelope = RawInputEnvelope(
        tenant_id="client-a",
        source_type="syslog",
        timestamp="2026-03-22T00:00:01Z",
        message_raw="single",
        host="fw-01",
        service="charon",
    )
    # Exactly one id is expected back; the unpacking fails otherwise.
    (raw_id,) = store_raw_envelopes([envelope], db_file)
    set_raw_envelope_status(raw_id, "parse_failed", db_path=db_file, failure_reason="empty message")

    fetched = get_raw_envelope_by_id(raw_id, db_path=db_file)
    assert fetched is not None
    expected_columns = {
        "id": raw_id,
        "canonicalization_status": "parse_failed",
        "failure_reason": "empty message",
    }
    for column, expected in expected_columns.items():
        assert fetched[column] == expected