@simbimbo/brainstem 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,25 @@
1
+ import sqlite3
1
2
  from pathlib import Path
2
3
 
3
- from brainstem.ingest import ingest_syslog_lines, signatures_for_events
4
+ from brainstem.ingest import ingest_syslog_lines, parse_syslog_envelopes, run_ingest_pipeline, signatures_for_events
5
+ from brainstem.models import RawInputEnvelope
4
6
  from brainstem.recurrence import build_recurrence_candidates
5
- from brainstem.storage import init_db, list_candidates, store_candidates, store_events, store_signatures
7
+ from brainstem.storage import (
8
+ extract_source_raw_envelope_ids,
9
+ get_raw_envelope_by_id,
10
+ get_ingest_stats,
11
+ init_db,
12
+ list_candidates,
13
+ get_source_dimension_summaries,
14
+ get_source_status_summaries,
15
+ store_candidates,
16
+ store_events,
17
+ list_recent_failed_raw_envelopes,
18
+ list_recent_raw_envelopes,
19
+ store_raw_envelopes,
20
+ set_raw_envelope_status,
21
+ store_signatures,
22
+ )
6
23
 
7
24
 
8
25
  def test_storage_round_trip(tmp_path: Path) -> None:
@@ -24,3 +41,354 @@ def test_storage_round_trip(tmp_path: Path) -> None:
24
41
  rows = list_candidates(str(db_path), limit=10)
25
42
  assert rows
26
43
  assert rows[0]['title']
44
+
45
+
46
def test_raw_envelope_lineage_stored_in_signature_and_candidate_metadata(tmp_path: Path) -> None:
    """Verify raw envelope ids are recoverable from signature and candidate metadata.

    Three identical syslog lines are parsed and run through the ingest
    pipeline; the ids the pipeline reports must match the ids extracted from
    the first stored signature row and the first listed candidate.
    """
    database = str(tmp_path / 'brainstem.sqlite3')
    syslog_lines = [
        "Mar 22 00:00:01 fw-01 charon: VPN tunnel dropped and recovered",
        "Mar 22 00:00:03 fw-01 charon: VPN tunnel dropped and recovered",
        "Mar 22 00:00:05 fw-01 charon: VPN tunnel dropped and recovered",
    ]
    envelopes = parse_syslog_envelopes(
        syslog_lines,
        tenant_id="client-a",
        source_path="/var/log/syslog",
    )
    outcome = run_ingest_pipeline(envelopes, threshold=2, db_path=database)

    assert outcome.raw_envelope_ids == [1, 2, 3]
    assert all(event.raw_envelope_id is not None for event in outcome.events)

    # Read the signature metadata straight from SQLite.
    connection = sqlite3.connect(database)
    try:
        cursor = connection.execute("SELECT metadata_json FROM signatures ORDER BY id ASC")
        signature_rows = cursor.fetchall()
    finally:
        connection.close()
    assert signature_rows
    ids_from_signature = extract_source_raw_envelope_ids(signature_rows[0][0])
    assert set(ids_from_signature) == set(outcome.raw_envelope_ids)

    stored_candidates = list_candidates(database, limit=10)
    assert stored_candidates
    ids_from_candidate = extract_source_raw_envelope_ids(stored_candidates[0]["metadata_json"])
    assert set(ids_from_candidate) == set(outcome.raw_envelope_ids)
77
+
78
+
79
def test_raw_envelope_records_are_persisted(tmp_path: Path) -> None:
    """Verify stored raw envelopes appear in ``raw_envelopes`` with status ``received``."""
    database = str(tmp_path / 'brainstem.sqlite3')
    init_db(database)
    # (timestamp, message_raw) pairs; every other field is shared.
    samples = (
        ("2026-03-22T00:00:01Z", "VPN tunnel dropped and recovered"),
        ("2026-03-22T00:00:02Z", "IPsec SA rekey succeeded"),
    )
    envelopes = [
        RawInputEnvelope(
            tenant_id="client-a",
            source_type="syslog",
            timestamp=stamp,
            message_raw=text,
            host="fw-01",
            service="charon",
        )
        for stamp, text in samples
    ]
    assert store_raw_envelopes(envelopes, database) == [1, 2]

    connection = sqlite3.connect(database)
    try:
        cursor = connection.execute(
            "SELECT tenant_id, source_type, message_raw, canonicalization_status FROM raw_envelopes ORDER BY id ASC"
        )
        rows = cursor.fetchall()
    finally:
        connection.close()

    assert len(rows) == 2
    first_row, second_row = rows
    assert first_row[0] == "client-a"
    assert first_row[1] == "syslog"
    assert first_row[2] == "VPN tunnel dropped and recovered"
    assert first_row[3] == "received"
    assert second_row[3] == "received"
116
+
117
+
118
def test_ingest_stats_from_raw_envelopes(tmp_path: Path) -> None:
    """Verify ``get_ingest_stats`` counts raw envelopes by status plus candidates.

    Two ``raw_envelopes`` rows (one ``canonicalized``, one ``parse_failed``)
    and one ``candidates`` row are inserted directly through SQLite, then the
    reported stats are compared with those counts.
    """
    db_path = tmp_path / 'brainstem.sqlite3'
    init_db(str(db_path))

    # One INSERT shared by both rows; only message_raw, canonicalization_status
    # and failure_reason differ, so those are bound parameters.
    insert_raw_envelope = """
        INSERT INTO raw_envelopes (
            tenant_id, source_type, timestamp, host, service, severity,
            asset_id, source_path, facility, message_raw,
            structured_fields_json, correlation_keys_json, metadata_json,
            canonicalization_status, failure_reason
        ) VALUES (
            'client-a', 'syslog', '2026-03-22T00:00:00Z',
            'fw-01', 'charon', 'info', '', '', '', ?, '{}', '{}', '{}',
            ?, ?
        )
    """
    raw_rows = [
        ("ok", "canonicalized", None),  # None is stored as SQL NULL
        ("bad", "parse_failed", "message empty"),
    ]

    conn = sqlite3.connect(db_path)
    try:
        conn.executemany(insert_raw_envelope, raw_rows)
        conn.execute(
            "INSERT INTO candidates (candidate_type, title, summary, score_total, score_breakdown_json, decision_band, source_signature_ids_json, source_event_ids_json, confidence, metadata_json) VALUES ('recurrence', 'x', 'y', 1.0, '{}', 'medium', '[]', '[]', 0.1, '{}')"
        )
        conn.commit()
    finally:
        conn.close()

    stats = get_ingest_stats(str(db_path))
    assert stats["received"] == 2
    assert stats["canonicalized"] == 1
    assert stats["parse_failed"] == 1
    assert stats["candidates_generated"] == 1
163
+
164
+
165
def test_source_dimension_summaries(tmp_path: Path) -> None:
    """Verify per-dimension value counts reported for stored raw envelopes.

    Three envelopes are stored: two ``syslog`` entries from ``fw-01`` and one
    ``logicmonitor`` entry from ``lm-1``. The ``source_type`` summary is
    checked positionally (``syslog`` first), while ``source_path`` and
    ``source_id`` are compared as value-to-count mappings.
    """
    db_path = tmp_path / 'brainstem.sqlite3'
    init_db(str(db_path))
    store_raw_envelopes(
        [
            RawInputEnvelope(
                tenant_id='client-a',
                source_type='syslog',
                source_id='fw-01',
                source_name='edge-fw-01',
                timestamp='2026-03-22T00:00:01Z',
                message_raw='VPN tunnel dropped and recovered',
                source_path='/var/log/syslog',
                host='fw-01',
                service='charon',
            ),
            RawInputEnvelope(
                tenant_id='client-a',
                source_type='syslog',
                source_id='fw-01',
                source_name='edge-fw-01',
                timestamp='2026-03-22T00:00:02Z',
                message_raw='IPsec SA rekey succeeded',
                source_path='/var/log/syslog',
                host='fw-01',
                service='charon',
            ),
            RawInputEnvelope(
                tenant_id='client-a',
                source_type='logicmonitor',
                source_id='lm-1',
                source_name='edge-lm-01',
                timestamp='2026-03-22T00:00:03Z',
                message_raw='CPU usage high',
                source_path='/alerts',
                host='lm-01',
                service='logicmonitor',
            ),
        ],
        db_path=str(db_path),
    )

    summary = get_source_dimension_summaries(str(db_path), limit=10)
    assert summary['source_type'][0]['value'] == "syslog"
    assert summary['source_type'][0]['count'] == 2
    assert summary['source_type'][1]['value'] == "logicmonitor"
    assert summary['source_type'][1]['count'] == 1

    # Compared as mappings, so the order of these entries is not asserted.
    path_counts = {entry['value']: entry['count'] for entry in summary['source_path']}
    assert path_counts == {
        '/alerts': 1,
        '/var/log/syslog': 2,
    }
    id_counts = {entry['value']: entry['count'] for entry in summary['source_id']}
    assert id_counts == {
        'fw-01': 2,
        'lm-1': 1,
    }
220
+
221
+
222
def test_source_status_summaries_from_raw_envelope_history(tmp_path: Path) -> None:
    """Verify per-source status counts and the filtered summary lookup.

    Two envelopes from ``fw-01`` and one from ``fw-02`` are stored and then
    given different canonicalization statuses; the summaries are expected to
    report one entry per source with matching counts.
    """
    database = str(tmp_path / 'brainstem.sqlite3')
    init_db(database)
    # (source_id, source_name, timestamp, message_raw, source_path, host, service)
    specs = [
        ('fw-01', 'edge-fw-01', '2026-03-22T00:00:01Z', 'event 1', '/var/log/syslog', 'fw-01', 'charon'),
        ('fw-01', 'edge-fw-01', '2026-03-22T00:00:02Z', 'event 2', '/var/log/syslog', 'fw-01', 'charon'),
        ('fw-02', 'edge-fw-02', '2026-03-22T00:00:03Z', 'event 3', '/var/log/auth.log', 'fw-02', 'sshd'),
    ]
    envelopes = [
        RawInputEnvelope(
            tenant_id='client-a',
            source_type='syslog',
            source_id=source_id,
            source_name=source_name,
            timestamp=timestamp,
            message_raw=message_raw,
            source_path=source_path,
            host=host,
            service=service,
        )
        for source_id, source_name, timestamp, message_raw, source_path, host, service in specs
    ]
    raw_ids = store_raw_envelopes(envelopes, db_path=database)
    set_raw_envelope_status(raw_ids[0], 'parse_failed', db_path=database, failure_reason='empty event')
    set_raw_envelope_status(raw_ids[1], 'canonicalized', db_path=database)
    set_raw_envelope_status(raw_ids[2], 'unsupported', db_path=database, failure_reason='bad source payload')

    summaries = get_source_status_summaries(database, limit=10)
    assert len(summaries) == 2
    fw01 = next(
        entry
        for entry in summaries
        if entry['source_id'] == 'fw-01' and entry['source_path'] == '/var/log/syslog'
    )
    fw02 = next(entry for entry in summaries if entry['source_id'] == 'fw-02')

    # Dicts keep insertion order, so fields are checked in the listed order.
    expected_fw01 = {
        'raw_count': 2,
        'canonicalized_count': 1,
        'parse_failed_count': 1,
        'unsupported_count': 0,
        'first_seen_at': '2026-03-22T00:00:01Z',
        'last_seen_at': '2026-03-22T00:00:02Z',
    }
    for field, expected in expected_fw01.items():
        assert fw01[field] == expected

    expected_fw02 = {
        'raw_count': 1,
        'canonicalized_count': 0,
        'parse_failed_count': 0,
        'unsupported_count': 1,
    }
    for field, expected in expected_fw02.items():
        assert fw02[field] == expected

    filtered = get_source_status_summaries(
        database,
        source_type='syslog',
        source_id='fw-01',
        source_path='/var/log/syslog',
    )
    assert len(filtered) == 1
    assert filtered[0]['source_id'] == 'fw-01'
285
+
286
+
287
def test_list_recent_raw_envelopes_supports_status_filtering(tmp_path: Path) -> None:
    """Verify the recent-envelope listing order and its ``status`` filter.

    Three envelopes are stored and each is given a different status. The
    unfiltered listing is expected in reverse insertion order, and the
    ``parse_failed`` filter is expected to return only the first envelope.
    """
    database = str(tmp_path / 'brainstem.sqlite3')
    init_db(database)
    envelopes = [
        RawInputEnvelope(
            tenant_id='client-a',
            source_type='syslog',
            timestamp=f'2026-03-22T00:00:0{second}Z',
            message_raw=text,
            host='fw-01',
            service='sshd',
        )
        for second, text in enumerate(('first', 'second', 'third'), start=1)
    ]
    raw_ids = store_raw_envelopes(envelopes, db_path=database)

    # (index into raw_ids, status, extra keyword arguments)
    status_updates = (
        (0, 'parse_failed', {'failure_reason': 'empty message'}),
        (1, 'canonicalized', {}),
        (2, 'unsupported', {'failure_reason': 'unsupported source'}),
    )
    for position, status, extra in status_updates:
        set_raw_envelope_status(raw_ids[position], status, db_path=database, **extra)

    listed_ids = [row['id'] for row in list_recent_raw_envelopes(database, limit=10)]
    assert listed_ids == [raw_ids[2], raw_ids[1], raw_ids[0]]

    parse_failed_rows = list_recent_raw_envelopes(database, status='parse_failed', limit=10)
    assert [row['id'] for row in parse_failed_rows] == [raw_ids[0]]
328
+
329
+
330
def test_query_recent_failed_raw_envelopes_with_status_filter(tmp_path: Path) -> None:
    """Verify the failed-envelope listing and its optional ``status`` filter.

    Of three stored envelopes, one is marked ``parse_failed``, one
    ``canonicalized`` and one ``unsupported``. The failure listing is expected
    to contain only the two non-canonicalized rows, newest first.
    """
    database = str(tmp_path / 'brainstem.sqlite3')
    init_db(database)
    envelopes = []
    for second, text in enumerate(("first", "second", "third"), start=1):
        envelopes.append(
            RawInputEnvelope(
                tenant_id="client-a",
                source_type="syslog",
                timestamp=f"2026-03-22T00:00:0{second}Z",
                message_raw=text,
                host="fw-01",
                service="sshd",
            )
        )
    raw_ids = store_raw_envelopes(envelopes, database)
    parse_failed_id = raw_ids[0]
    canonicalized_id = raw_ids[1]
    unsupported_id = raw_ids[2]
    set_raw_envelope_status(parse_failed_id, "parse_failed", db_path=database, failure_reason="empty message")
    set_raw_envelope_status(canonicalized_id, "canonicalized", db_path=database)
    set_raw_envelope_status(unsupported_id, "unsupported", db_path=database, failure_reason="unsupported source")

    failures = list_recent_failed_raw_envelopes(database, limit=10)
    assert [row["id"] for row in failures] == [unsupported_id, parse_failed_id]
    newest_failure, oldest_failure = failures[0], failures[1]
    assert newest_failure["canonicalization_status"] == "unsupported"
    assert oldest_failure["canonicalization_status"] == "parse_failed"

    parse_failed_rows = list_recent_failed_raw_envelopes(database, status="parse_failed", limit=10)
    assert len(parse_failed_rows) == 1
    assert parse_failed_rows[0]["id"] == parse_failed_id
372
+
373
+
374
def test_get_raw_envelope_by_id(tmp_path: Path) -> None:
    """Verify a stored envelope can be fetched by id with its updated status."""
    database = str(tmp_path / 'brainstem.sqlite3')
    init_db(database)
    envelope = RawInputEnvelope(
        tenant_id="client-a",
        source_type="syslog",
        timestamp="2026-03-22T00:00:01Z",
        message_raw="single",
        host="fw-01",
        service="charon",
    )
    # Exactly one id is expected back; the unpacking fails otherwise.
    [raw_id] = store_raw_envelopes([envelope], database)
    set_raw_envelope_status(raw_id, "parse_failed", db_path=database, failure_reason="empty message")

    fetched = get_raw_envelope_by_id(raw_id, db_path=database)
    assert fetched is not None
    assert fetched["id"] == raw_id
    assert fetched["canonicalization_status"] == "parse_failed"
    assert fetched["failure_reason"] == "empty message"