alpha-engine-lib 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. alpha_engine_lib/__init__.py +3 -0
  2. alpha_engine_lib/agent_schemas.py +663 -0
  3. alpha_engine_lib/alerts.py +576 -0
  4. alpha_engine_lib/arcticdb.py +340 -0
  5. alpha_engine_lib/collector_results.py +69 -0
  6. alpha_engine_lib/cost.py +665 -0
  7. alpha_engine_lib/dates.py +273 -0
  8. alpha_engine_lib/decision_capture.py +462 -0
  9. alpha_engine_lib/ec2_spot.py +363 -0
  10. alpha_engine_lib/email_sender.py +206 -0
  11. alpha_engine_lib/eval_artifacts.py +361 -0
  12. alpha_engine_lib/logging.py +303 -0
  13. alpha_engine_lib/model_pricing.yaml +73 -0
  14. alpha_engine_lib/pillars.py +756 -0
  15. alpha_engine_lib/pipeline_status/__init__.py +70 -0
  16. alpha_engine_lib/pipeline_status/read.py +541 -0
  17. alpha_engine_lib/pipeline_status/registry.py +368 -0
  18. alpha_engine_lib/pipeline_status/templates.py +120 -0
  19. alpha_engine_lib/preflight.py +444 -0
  20. alpha_engine_lib/rag/__init__.py +39 -0
  21. alpha_engine_lib/rag/db.py +96 -0
  22. alpha_engine_lib/rag/embeddings.py +63 -0
  23. alpha_engine_lib/rag/migrations/0001_content_tsv.sql +39 -0
  24. alpha_engine_lib/rag/rerank.py +377 -0
  25. alpha_engine_lib/rag/retrieval.py +465 -0
  26. alpha_engine_lib/rag/schema.sql +65 -0
  27. alpha_engine_lib/reconcile.py +203 -0
  28. alpha_engine_lib/secrets.py +186 -0
  29. alpha_engine_lib/sources/__init__.py +35 -0
  30. alpha_engine_lib/sources/protocols.py +227 -0
  31. alpha_engine_lib/ssm_log_capture.py +274 -0
  32. alpha_engine_lib/telegram.py +165 -0
  33. alpha_engine_lib/trading_calendar.py +236 -0
  34. alpha_engine_lib/transparency.py +746 -0
  35. alpha_engine_lib/transparency_inventory.yaml +260 -0
  36. alpha_engine_lib/universe.py +83 -0
  37. alpha_engine_lib-0.32.0.dist-info/METADATA +217 -0
  38. alpha_engine_lib-0.32.0.dist-info/RECORD +40 -0
  39. alpha_engine_lib-0.32.0.dist-info/WHEEL +5 -0
  40. alpha_engine_lib-0.32.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,746 @@
1
+ """
2
+ Transparency inventory substrate health checker.
3
+
4
+ Reads ``transparency_inventory.yaml``, validates that each row's
5
+ expected artifact exists with the expected cadence and content, and
6
+ returns per-row results. The Saturday and weekday Step Functions both
7
+ invoke this checker; the cadence flag determines which subset of rows
8
+ runs.
9
+
10
+ Phase 2 → 3 gate: ≥ 99% of inventory rows pass for 8 consecutive
11
+ weeks. The check fires per-row CloudWatch metrics so individual rows
12
+ have their own alarms — a failed row pages immediately, the gate
13
+ denominator is decremented for that row, and the 8-week clock resets.
14
+
15
+ Source kinds supported in v1:
16
+
17
+ s3_json HEAD + GET an S3 JSON object; assert_keys_present,
18
+ assert (path / op / value).
19
+ s3_csv HEAD + GET an S3 CSV; assert_columns_present,
20
+ assert_columns_non_null_for_rows_after,
21
+ assert_value_on_latest_row.
22
+ s3_parquet HEAD + GET an S3 parquet; assert_columns_present,
23
+ assert_column_non_null.
24
+ sqlite_via_s3 Download SQLite DB from S3, run PRAGMA table_info
25
+ against ``table``, assert_columns_present.
26
+ cloudwatch GetMetricStatistics over ``window_days``, assert
27
+ success_rate_pct_gte | datapoints_gte.
28
+
29
+ Source kinds not in v1 (deferred): cloudwatch_search,
30
+ custom_python_callable.
31
+
32
+ The checker is read-only — it does not write artifacts of its own.
33
+ The caller (CLI ``main()``) emits CloudWatch metrics from the result
34
+ list and optionally publishes SNS.
35
+ """
36
+
37
+ from __future__ import annotations
38
+
39
+ import argparse
40
+ import io
41
+ import json
42
+ import logging
43
+ import os
44
+ import sqlite3
45
+ import sys
46
+ import tempfile
47
+ from dataclasses import dataclass, field
48
+ from datetime import date, datetime, timedelta, timezone
49
+ from pathlib import Path
50
+ from typing import Any, Callable, Iterable
51
+
52
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Default inventory file: the YAML that sits next to this module.
INVENTORY_PATH = Path(__file__).parent / "transparency_inventory.yaml"

# S3 bucket used when an inventory source does not name its own ``bucket``.
DEFAULT_BUCKET = "alpha-engine-research"
# CloudWatch namespace that ``emit_cloudwatch_metrics`` publishes into.
DEFAULT_NAMESPACE_OUT = "AlphaEngine/Substrate"
# SNS topic used by ``main`` unless the SNS_TOPIC_ARN environment variable
# overrides it.
DEFAULT_SNS_TOPIC = "arn:aws:sns:us-east-1:711398986525:alpha-engine-alerts"
59
+
60
+
61
@dataclass
class CheckResult:
    """Outcome of validating one inventory row."""

    # The inventory row's ``id`` value.
    row_id: str
    # The inventory row's own ``cadence`` value (not the cadence of the run).
    cadence: str
    status: str  # "ok" | "fail" | "not_yet_effective" | "error"
    # Human-readable explanation; for failures, the per-source details
    # joined with "; ".
    detail: str
    # The row's effective date rendered as an ISO string.
    effective_date: str
    # Location of the checked artifact (S3 key or ``cw://`` label), if known.
    artifact: str | None = None
    # One entry per source that did not pass; empty unless the row failed.
    sub_failures: list[str] = field(default_factory=list)
72
+
73
+
74
def load_inventory(path: Path | None = None) -> dict:
    """Load and parse the inventory YAML.

    ``yaml`` is imported lazily so the rest of the lib stays import-light
    for consumers that never touch this module.

    Args:
        path: Inventory file to read. Falls back to ``INVENTORY_PATH``
            when omitted.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file.
    """
    import yaml

    p = path or INVENTORY_PATH
    # Read as UTF-8 explicitly: the platform default encoding is
    # locale-dependent and would mis-decode non-ASCII text on some hosts.
    with p.open(encoding="utf-8") as fh:
        return yaml.safe_load(fh)
85
+
86
+
87
def check_inventory(
    cadence: str,
    *,
    today: date | None = None,
    inventory: dict | None = None,
    s3_client: Any = None,
    cloudwatch_client: Any = None,
) -> list[CheckResult]:
    """Run the checker over every inventory row selected by ``cadence``.

    ``cadence="weekly"`` selects weekly and daily rows, ``"daily"`` selects
    daily rows only, and ``"per_event"`` selects per-event rows only (see
    ``_filter_rows``).

    A row whose ``effective_date`` is later than ``today`` comes back with
    ``status="not_yet_effective"`` and is not checked against its sources.

    ``today`` defaults to the current UTC date and ``inventory`` to the
    packaged YAML file.
    """
    if not today:
        today = _today_utc()
    if not inventory:
        inventory = load_inventory()

    # Materialise the selection first so a bad cadence or a malformed row
    # surfaces before any source is contacted.
    selected = [*_filter_rows(inventory["inventory"], cadence)]
    return [
        _check_row(entry, today, s3_client, cloudwatch_client)
        for entry in selected
    ]
116
+
117
+
118
+ def _today_utc() -> date:
119
+ return datetime.now(timezone.utc).date()
120
+
121
+
122
+ def _filter_rows(rows: Iterable[dict], cadence: str) -> Iterable[dict]:
123
+ """Pick rows that the given run should validate.
124
+
125
+ Saturday (cadence='weekly') validates everything; weekday
126
+ (cadence='daily') validates only daily rows; per-event cadence
127
+ is validated only when explicitly requested.
128
+ """
129
+ if cadence == "weekly":
130
+ wanted = {"weekly", "daily"}
131
+ elif cadence == "daily":
132
+ wanted = {"daily"}
133
+ elif cadence == "per_event":
134
+ wanted = {"per_event"}
135
+ else:
136
+ raise ValueError(f"Unknown cadence: {cadence}")
137
+ for row in rows:
138
+ if row["cadence"] in wanted:
139
+ yield row
140
+
141
+
142
def _check_row(
    row: dict,
    today: date,
    s3_client: Any,
    cloudwatch_client: Any,
) -> CheckResult:
    """Validate one inventory row against its sources.

    Sources are tried in order and the row passes as soon as one source
    passes. If none pass, the row fails and every source's detail is kept
    in ``sub_failures``.

    Args:
        row: Inventory row; reads ``id``, ``cadence``, ``effective_date``
            and ``sources``.
        today: Date the check is evaluated for.
        s3_client: Passed through to the source handlers.
        cloudwatch_client: Passed through to the source handlers.

    Returns:
        A ``CheckResult`` with status ``"not_yet_effective"``, ``"ok"`` or
        ``"fail"``.
    """
    eff = date.fromisoformat(str(row["effective_date"]))
    if today < eff:
        return CheckResult(
            row_id=row["id"],
            cadence=row["cadence"],
            status="not_yet_effective",
            detail=f"effective_date={eff} > today={today}",
            effective_date=str(eff),
        )

    sub: list[str] = []
    artifact_hint: str | None = None
    # A missing or empty ``sources`` entry is reported as a failed row
    # rather than aborting the whole run.
    for src in row.get("sources") or ():
        try:
            ok, detail, artifact = _check_source(
                src, today, s3_client, cloudwatch_client
            )
        except Exception as exc:  # pragma: no cover — defensive
            ok, detail, artifact = False, f"checker error: {exc!r}", None
        if ok:
            return CheckResult(
                row_id=row["id"],
                cadence=row["cadence"],
                status="ok",
                detail=detail,
                effective_date=str(eff),
                # Report the artifact that actually passed; only fall back
                # to an earlier source's artifact when this one has none.
                artifact=artifact or artifact_hint,
            )
        if artifact and artifact_hint is None:
            artifact_hint = artifact
        sub.append(detail)

    if not sub:
        # Without this the failure would carry an empty, unactionable detail.
        sub.append("no sources configured")

    return CheckResult(
        row_id=row["id"],
        cadence=row["cadence"],
        status="fail",
        detail="; ".join(sub),
        effective_date=str(eff),
        artifact=artifact_hint,
        sub_failures=sub,
    )
189
+
190
+
191
+ # ---------------------------------------------------------------------------
192
+ # Source-kind dispatchers
193
+ # ---------------------------------------------------------------------------
194
+
195
+
196
def _check_source(
    src: dict,
    today: date,
    s3_client: Any,
    cloudwatch_client: Any,
) -> tuple[bool, str, str | None]:
    """Dispatch one source to the handler registered for its ``kind``.

    Returns ``(ok, detail, artifact)``. An unregistered kind is reported
    as a failure rather than raised.
    """
    kind = src["kind"]
    try:
        handler = _SOURCE_HANDLERS[kind]
    except KeyError:
        return False, f"unsupported source kind: {kind}", None
    return handler(src, today, s3_client, cloudwatch_client)
207
+
208
+
209
+ def _resolve_key(src: dict, today: date) -> tuple[str, str]:
210
+ """Return (key, age_window_label).
211
+
212
+ Two patterns:
213
+ key — fixed S3 key, no date templating
214
+ key_pattern — contains {date}; checker walks back N days to
215
+ find the most recent matching object
216
+ """
217
+ if "key" in src:
218
+ return src["key"], "fixed"
219
+ if "key_pattern" not in src:
220
+ raise ValueError(f"source missing key/key_pattern: {src}")
221
+ return src["key_pattern"], "templated"
222
+
223
+
224
+ def _walk_back(
225
+ pattern: str,
226
+ today: date,
227
+ max_age_days: int,
228
+ exists: Callable[[str], bool],
229
+ ) -> tuple[str | None, int]:
230
+ """Walk back day-by-day, return first key whose object exists.
231
+
232
+ Returns (key, age_in_days) or (None, age_at_limit).
233
+ """
234
+ for i in range(max_age_days + 1):
235
+ d = today - timedelta(days=i)
236
+ key = pattern.format(date=d.isoformat())
237
+ if exists(key):
238
+ return key, i
239
+ return None, max_age_days + 1
240
+
241
+
242
+ def _s3_head(s3_client: Any, bucket: str, key: str) -> bool:
243
+ try:
244
+ s3_client.head_object(Bucket=bucket, Key=key)
245
+ return True
246
+ except Exception:
247
+ return False
248
+
249
+
250
+ def _s3_get_bytes(s3_client: Any, bucket: str, key: str) -> bytes:
251
+ obj = s3_client.get_object(Bucket=bucket, Key=key)
252
+ return obj["Body"].read()
253
+
254
+
255
+ def _s3_age_days(s3_client: Any, bucket: str, key: str) -> int | None:
256
+ try:
257
+ resp = s3_client.head_object(Bucket=bucket, Key=key)
258
+ modified = resp["LastModified"]
259
+ return (datetime.now(timezone.utc) - modified).days
260
+ except Exception:
261
+ return None
262
+
263
+
264
def _resolve_and_age(
    src: dict, today: date, s3_client: Any
) -> tuple[str | None, int | None, str]:
    """Find the artifact for an S3 source and report its age.

    Shared by all S3-backed source kinds. Returns ``(key, age, status)``:

    * ``(key, age, "ok")`` when the object exists within ``max_age_days``
      (default 8);
    * ``(key, age, "stale ...")`` when a fixed key exists but is too old;
    * ``(None, None, "missing ...")`` or ``(None, None, "no object
      matching ...")`` when nothing was found.

    For templated keys ``age`` is the number of days walked back from
    ``today``; for fixed keys it comes from the object's ``LastModified``.
    """
    bucket = src.get("bucket", DEFAULT_BUCKET)
    target, mode = _resolve_key(src, today)
    limit = src.get("max_age_days", 8)

    if mode != "fixed":
        # Templated key: probe one dated key per day, newest first.
        found, days_back = _walk_back(
            target,
            today,
            limit,
            lambda k: _s3_head(s3_client, bucket, k),
        )
        if found is not None:
            return found, days_back, "ok"
        return None, None, (
            f"no object matching s3://{bucket}/{target} within {limit}d"
        )

    age = _s3_age_days(s3_client, bucket, target)
    if age is None:
        return None, None, f"missing s3://{bucket}/{target}"
    if age > limit:
        return target, age, (
            f"stale s3://{bucket}/{target} (age={age}d > {limit}d)"
        )
    return target, age, "ok"
292
+
293
+
294
def _check_s3_json(
    src: dict, today: date, s3_client: Any, _cw: Any
) -> tuple[bool, str, str | None]:
    """Validate an S3 JSON artifact.

    Locates the object, parses it, then applies ``assert_keys_present``
    (top-level keys) and ``assert`` (path / op / value) from the source.

    When the primary object is absent and the source sets
    ``treat_absent_as: ok_if_companion_present``, the check passes if an
    object matching ``companion_key_pattern`` exists within
    ``max_age_days`` (default 8). A stale primary does not trigger the
    companion fallback.

    Returns:
        ``(ok, detail, artifact_key)``.
    """
    bucket = src.get("bucket", DEFAULT_BUCKET)
    key, age, status = _resolve_and_age(src, today, s3_client)
    if key is None:
        # Companion fallback for "ok_if_companion_present"
        if src.get("treat_absent_as") == "ok_if_companion_present":
            comp_pattern = src.get("companion_key_pattern")
            if comp_pattern:
                comp_key, _ = _walk_back(
                    comp_pattern,
                    today,
                    src.get("max_age_days", 8),
                    lambda k: _s3_head(s3_client, bucket, k),
                )
                if comp_key:
                    return True, (
                        f"primary absent, companion present: "
                        f"s3://{bucket}/{comp_key}"
                    ), comp_key
        return False, status, None
    if status != "ok":
        return False, status, key

    body = _s3_get_bytes(s3_client, bucket, key)
    try:
        payload = json.loads(body)
    except Exception as exc:
        return False, f"json parse error on s3://{bucket}/{key}: {exc!r}", key

    failures: list[str] = []
    required_keys = src.get("assert_keys_present", [])
    if required_keys and not isinstance(payload, dict):
        # Key assertions only make sense on a JSON object. ``in`` on a
        # list would test element membership, and on null or a number it
        # would raise TypeError.
        failures.append(
            f"top-level JSON is {type(payload).__name__}, "
            f"expected an object for assert_keys_present"
        )
    else:
        failures.extend(
            f"missing key '{required}'"
            for required in required_keys
            if required not in payload
        )
    for assertion in src.get("assert", []):
        ok, detail = _eval_path_assertion(payload, assertion)
        if not ok:
            failures.append(detail)
    if failures:
        return False, "; ".join(failures), key
    return True, f"ok (age={age}d)", key
336
+
337
+
338
def _check_s3_csv(
    src: dict, today: date, s3_client: Any, _cw: Any
) -> tuple[bool, str, str | None]:
    """Validate an S3 CSV artifact.

    Locates the object, loads it with pandas, then applies up to three
    assertions from the source, in this order:

    * ``assert_columns_present`` — every listed column must exist;
    * ``assert_columns_non_null_for_rows_after`` — for rows whose
      ``date_column`` is later than ``rows_after`` (optionally narrowed by
      ``action_filter``), the listed ``columns`` must have no nulls;
    * ``assert_value_on_latest_row`` — ``column`` on the last row of the
      file must satisfy ``op`` / ``value``.

    Each later assertion is skipped once an earlier one has recorded a
    failure.

    Returns:
        ``(ok, detail, artifact_key)``.
    """
    bucket = src.get("bucket", DEFAULT_BUCKET)
    key, age, status = _resolve_and_age(src, today, s3_client)
    if key is None or status != "ok":
        return False, status, key

    body = _s3_get_bytes(s3_client, bucket, key)
    try:
        # pandas is imported lazily; an import failure is reported through
        # the same "csv parse error" message as a malformed file.
        import pandas as pd

        df = pd.read_csv(io.BytesIO(body))
    except Exception as exc:
        return False, f"csv parse error on s3://{bucket}/{key}: {exc!r}", key

    failures: list[str] = []
    for col in src.get("assert_columns_present", []):
        if col not in df.columns:
            failures.append(f"missing column '{col}'")

    rule = src.get("assert_columns_non_null_for_rows_after")
    if rule and not failures:
        date_col = rule["date_column"]
        threshold = date.fromisoformat(str(rule["rows_after"]))
        cols = rule["columns"]
        action_filter = rule.get("action_filter")
        if date_col not in df.columns:
            failures.append(f"missing date_column '{date_col}'")
        else:
            try:
                # Coerce date_column to date for comparison; tolerate
                # both 'YYYY-MM-DD' and ISO timestamps.
                # NOTE(review): unparseable values become NaT under
                # errors="coerce"; presumably they compare as "not after"
                # the threshold — confirm against the pandas version in use.
                d_col = pd.to_datetime(df[date_col], errors="coerce").dt.date
                mask = d_col > threshold
                sub = df[mask]
                if action_filter:
                    a_col = action_filter["column"]
                    a_val = action_filter["equals"]
                    # A filter column that is not in the CSV is silently
                    # ignored: the non-null check then covers every row
                    # after the threshold.
                    if a_col in sub.columns:
                        sub = sub[sub[a_col] == a_val]
                # No rows after the threshold means nothing to assert.
                if not sub.empty:
                    for col in cols:
                        if col not in sub.columns:
                            failures.append(f"missing column '{col}' for non-null assertion")
                            continue
                        nulls = sub[col].isna().sum()
                        if nulls > 0:
                            failures.append(
                                f"column '{col}' has {int(nulls)} null rows after {threshold}"
                            )
            except Exception as exc:
                # Any pandas error in the block above becomes a single
                # failure entry instead of aborting the check.
                failures.append(f"non-null check error: {exc!r}")

    latest = src.get("assert_value_on_latest_row")
    if latest and not failures:
        col = latest["column"]
        if col not in df.columns:
            failures.append(f"missing column '{col}' for latest-row assertion")
        elif df.empty:
            failures.append(f"csv empty — cannot evaluate '{col}' on latest row")
        else:
            # "Latest" is simply the last row in file order; the frame is
            # not sorted here.
            val = df[col].iloc[-1]
            ok, detail = _eval_op(val, latest["op"], latest["value"])
            if not ok:
                failures.append(detail)

    if failures:
        return False, "; ".join(failures), key
    return True, f"ok (age={age}d, rows={len(df)})", key
408
+
409
+
410
def _check_s3_parquet(
    src: dict, today: date, s3_client: Any, _cw: Any
) -> tuple[bool, str, str | None]:
    """Validate an S3 parquet artifact.

    Locates the object, loads it with pandas, then checks that every
    ``assert_columns_present`` column exists and that every
    ``assert_column_non_null`` column exists and contains no nulls.

    Returns ``(ok, detail, artifact_key)``.
    """
    bucket = src.get("bucket", DEFAULT_BUCKET)
    key, age, status = _resolve_and_age(src, today, s3_client)
    if not (key is not None and status == "ok"):
        return False, status, key

    raw = _s3_get_bytes(s3_client, bucket, key)
    try:
        import pandas as pd

        frame = pd.read_parquet(io.BytesIO(raw))
    except Exception as exc:
        return False, f"parquet parse error on s3://{bucket}/{key}: {exc!r}", key

    # Presence failures are listed first, then the non-null findings.
    problems: list[str] = [
        f"missing column '{name}'"
        for name in src.get("assert_columns_present", [])
        if name not in frame.columns
    ]
    for name in src.get("assert_column_non_null", []):
        if name in frame.columns:
            null_count = frame[name].isna().sum()
            if null_count > 0:
                problems.append(f"column '{name}' has {int(null_count)} null rows")
        else:
            problems.append(f"missing column '{name}' for non-null check")

    if not problems:
        return True, f"ok (age={age}d, rows={len(frame)})", key
    return False, "; ".join(problems), key
441
+
442
+
443
def _check_sqlite_via_s3(
    src: dict, today: date, s3_client: Any, _cw: Any
) -> tuple[bool, str, str | None]:
    """Validate the schema of a SQLite database stored in S3.

    Downloads the database to a temporary file, reads the column list of
    ``src["table"]`` via ``PRAGMA table_info`` and checks that every
    ``assert_columns_present`` column exists. The temporary file is
    removed afterwards.

    Returns:
        ``(ok, detail, artifact_key)``. A file SQLite cannot read yields a
        failure detail instead of an exception.
    """
    bucket = src.get("bucket", DEFAULT_BUCKET)
    key, age, status = _resolve_and_age(src, today, s3_client)
    if key is None or status != "ok":
        return False, status, key

    table = src["table"]
    body = _s3_get_bytes(s3_client, bucket, key)

    # Quote the table name as an identifier so names containing spaces,
    # hyphens or quotes neither break the statement nor alter it.
    quoted_table = '"' + str(table).replace('"', '""') + '"'

    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as fh:
        fh.write(body)
        db_path = fh.name
    try:
        conn = sqlite3.connect(db_path)
        try:
            cur = conn.execute(f"PRAGMA table_info({quoted_table})")
            # Column name is the second field of each table_info row.
            cols = {row[1] for row in cur.fetchall()}
        finally:
            conn.close()
    except sqlite3.Error as exc:
        # Corrupt or non-SQLite content: report it the way the other
        # handlers report unparseable artifacts.
        return False, f"sqlite error on s3://{bucket}/{key}: {exc!r}", key
    finally:
        try:
            os.unlink(db_path)
        except OSError:
            pass

    # table_info yields no rows for a table that does not exist.
    if not cols:
        return False, f"table '{table}' missing in s3://{bucket}/{key}", key

    failures = [
        f"missing column '{c}' in table '{table}'"
        for c in src.get("assert_columns_present", [])
        if c not in cols
    ]
    if failures:
        return False, "; ".join(failures), key
    return True, f"ok (age={age}d, table='{table}')", key
481
+
482
+
483
+ def _check_cloudwatch(
484
+ src: dict, today: date, _s3: Any, cloudwatch_client: Any
485
+ ) -> tuple[bool, str, str | None]:
486
+ if cloudwatch_client is None:
487
+ import boto3
488
+
489
+ cloudwatch_client = boto3.client("cloudwatch", region_name="us-east-1")
490
+
491
+ namespace = src["namespace"]
492
+ metric = src["metric"]
493
+ window_days = src.get("window_days", 7)
494
+ end = datetime.now(timezone.utc)
495
+ start = end - timedelta(days=window_days)
496
+ # AWS GetMetricStatistics requires Period to be a multiple of 60. Aim
497
+ # for ~100 datapoints across the window, then round down to a multiple
498
+ # of 60 (with a 60s floor for the smallest windows).
499
+ raw_period = max(60, window_days * 86400 // 100)
500
+ period = max(60, (raw_period // 60) * 60)
501
+
502
+ artifact = f"cw://{namespace}/{metric}"
503
+ assertion = src.get("assert", {})
504
+ op = assertion.get("op")
505
+
506
+ if op == "success_rate_pct_gte":
507
+ return _check_cw_success_rate(
508
+ cloudwatch_client, src, start, end, period, assertion["value"]
509
+ )
510
+ if op == "datapoints_gte":
511
+ return _check_cw_datapoints(
512
+ cloudwatch_client, namespace, metric, start, end, period,
513
+ assertion["value"], artifact,
514
+ )
515
+ return False, f"unsupported cloudwatch assert op: {op}", artifact
516
+
517
+
518
+ def _check_cw_success_rate(
519
+ cw: Any, src: dict, start: datetime, end: datetime, period: int, threshold: float
520
+ ) -> tuple[bool, str, str | None]:
521
+ namespace = src["namespace"]
522
+ dim_field = src.get("dimensions", {})
523
+ arns = list(dim_field.get("StateMachineArn", [])) or [None]
524
+
525
+ failures: list[str] = []
526
+ for arn in arns:
527
+ kw = {
528
+ "Namespace": namespace,
529
+ "Period": period,
530
+ "Statistics": ["Sum"],
531
+ "StartTime": start,
532
+ "EndTime": end,
533
+ }
534
+ if arn:
535
+ kw["Dimensions"] = [{"Name": "StateMachineArn", "Value": arn}]
536
+ succ = cw.get_metric_statistics(MetricName="ExecutionsSucceeded", **kw)
537
+ fail = cw.get_metric_statistics(MetricName="ExecutionsFailed", **kw)
538
+ s = sum(p["Sum"] for p in succ.get("Datapoints", []))
539
+ f = sum(p["Sum"] for p in fail.get("Datapoints", []))
540
+ denom = s + f
541
+ if denom == 0:
542
+ failures.append(f"{arn or 'aggregate'}: no datapoints in window")
543
+ continue
544
+ pct = 100.0 * s / denom
545
+ if pct < threshold:
546
+ failures.append(
547
+ f"{arn or 'aggregate'}: success_rate={pct:.2f}% < {threshold}%"
548
+ )
549
+ if failures:
550
+ return False, "; ".join(failures), f"cw://{namespace}/ExecutionsSucceeded"
551
+ return True, "ok", f"cw://{namespace}/ExecutionsSucceeded"
552
+
553
+
554
+ def _check_cw_datapoints(
555
+ cw: Any,
556
+ namespace: str,
557
+ metric: str,
558
+ start: datetime,
559
+ end: datetime,
560
+ period: int,
561
+ threshold: int,
562
+ artifact: str,
563
+ ) -> tuple[bool, str, str | None]:
564
+ resp = cw.get_metric_statistics(
565
+ Namespace=namespace,
566
+ MetricName=metric,
567
+ Period=period,
568
+ Statistics=["SampleCount"],
569
+ StartTime=start,
570
+ EndTime=end,
571
+ )
572
+ n = sum(p["SampleCount"] for p in resp.get("Datapoints", []))
573
+ if n < threshold:
574
+ return False, (
575
+ f"only {int(n)} datapoints in {namespace}/{metric} (need ≥ {threshold})"
576
+ ), artifact
577
+ return True, f"ok (n={int(n)})", artifact
578
+
579
+
580
# Maps an inventory source ``kind`` to its handler. Every handler takes
# (src, today, s3_client, cloudwatch_client) and returns
# (ok, detail, artifact). ``_check_source`` reports kinds missing from this
# table as unsupported.
_SOURCE_HANDLERS: dict[str, Callable] = {
    "s3_json": _check_s3_json,
    "s3_csv": _check_s3_csv,
    "s3_parquet": _check_s3_parquet,
    "sqlite_via_s3": _check_sqlite_via_s3,
    "cloudwatch": _check_cloudwatch,
}
587
+
588
+
589
+ # ---------------------------------------------------------------------------
590
+ # Assertion primitives
591
+ # ---------------------------------------------------------------------------
592
+
593
+
594
+ def _eval_path_assertion(payload: Any, assertion: dict) -> tuple[bool, str]:
595
+ path = assertion["path"]
596
+ cur: Any = payload
597
+ for part in path.split("."):
598
+ if isinstance(cur, dict) and part in cur:
599
+ cur = cur[part]
600
+ else:
601
+ return False, f"path '{path}' not found"
602
+ return _eval_op(cur, assertion["op"], assertion["value"], path=path)
603
+
604
+
605
+ def _eval_op(value: Any, op: str, target: Any, path: str | None = None) -> tuple[bool, str]:
606
+ label = path or "value"
607
+ try:
608
+ v = float(value) if not isinstance(value, bool) else value
609
+ t = float(target) if not isinstance(target, bool) else target
610
+ except (TypeError, ValueError):
611
+ return False, f"{label}={value!r} not comparable to {target!r}"
612
+ if op == "gte":
613
+ return (v >= t, f"{label}={v} {'>=' if v >= t else '<'} {t}")
614
+ if op == "gt":
615
+ return (v > t, f"{label}={v} {'>' if v > t else '<='} {t}")
616
+ if op == "lte":
617
+ return (v <= t, f"{label}={v} {'<=' if v <= t else '>'} {t}")
618
+ if op == "lt":
619
+ return (v < t, f"{label}={v} {'<' if v < t else '>='} {t}")
620
+ if op == "eq":
621
+ return (v == t, f"{label}={v} {'==' if v == t else '!='} {t}")
622
+ return False, f"unsupported op '{op}'"
623
+
624
+
625
+ # ---------------------------------------------------------------------------
626
+ # CLI + side effects
627
+ # ---------------------------------------------------------------------------
628
+
629
+
630
def emit_cloudwatch_metrics(results: list[CheckResult], cloudwatch_client: Any = None) -> None:
    """Publish per-row and aggregate metrics to ``AlphaEngine/Substrate``.

    Emits one ``SubstrateRowOK`` datapoint per result (1 for ``ok`` or
    ``not_yet_effective``, 0 for anything else) plus the totals
    ``SubstrateChecksOK``, ``SubstrateChecksFailed`` and
    ``SubstrateChecksPending``. Datapoints are sent in batches of 20.
    A CloudWatch client is created in ``us-east-1`` when none is supplied.
    """
    if cloudwatch_client is None:
        import boto3

        cloudwatch_client = boto3.client("cloudwatch", region_name="us-east-1")

    healthy = ("ok", "not_yet_effective")
    metric_data = [
        {
            "MetricName": "SubstrateRowOK",
            "Dimensions": [{"Name": "RowID", "Value": res.row_id}],
            "Value": 1.0 if res.status in healthy else 0.0,
            "Unit": "Count",
        }
        for res in results
    ]

    # Tally only the three statuses that have an aggregate metric.
    tally = {"ok": 0, "fail": 0, "not_yet_effective": 0}
    for res in results:
        if res.status in tally:
            tally[res.status] += 1

    aggregates = (
        ("SubstrateChecksOK", tally["ok"]),
        ("SubstrateChecksFailed", tally["fail"]),
        ("SubstrateChecksPending", tally["not_yet_effective"]),
    )
    for metric_name, count in aggregates:
        metric_data.append(
            {"MetricName": metric_name, "Value": float(count), "Unit": "Count"}
        )

    batch_size = 20
    offset = 0
    while offset < len(metric_data):
        cloudwatch_client.put_metric_data(
            Namespace=DEFAULT_NAMESPACE_OUT,
            MetricData=metric_data[offset : offset + batch_size],
        )
        offset += batch_size
661
+
662
+
663
def format_report(results: list[CheckResult]) -> str:
    """Render the results as a plain-text report.

    The report has a title, a summary line with counts and the pass
    percentage over effective (non-pending) rows, one line per row, and —
    when any row failed — an "ACTIONS NEEDED" section listing the failures.
    """
    statuses = [r.status for r in results]
    n_ok = statuses.count("ok")
    n_fail = statuses.count("fail")
    n_pending = statuses.count("not_yet_effective")

    # The percentage is over effective rows only; 0.0 when there are none.
    effective = len(results) - n_pending
    pct = 100.0 * n_ok / effective if effective > 0 else 0.0

    icon = {"ok": "OK ", "fail": "FAIL", "not_yet_effective": "PEND", "error": "ERR "}
    lines = [
        "Substrate Health Report",
        "=" * 50,
        f"OK: {n_ok} Failed: {n_fail} Pending: {n_pending} "
        f"({pct:.1f}% of effective rows passing)",
        "",
    ]
    lines.extend(
        f" [{icon.get(r.status, '?')}] {r.row_id:30s} {r.detail}" for r in results
    )

    failed = [r for r in results if r.status == "fail"]
    if failed:
        lines += ["", "ACTIONS NEEDED:"]
        lines += [f" - {r.row_id}: {r.detail}" for r in failed]
    return "\n".join(lines)
685
+
686
+
687
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: run the checks, report, emit metrics, alert.

    Prints either a JSON dump or the text report, publishes CloudWatch
    metrics unless ``--no-emit`` is given, and publishes to SNS when
    ``--alert`` is given and at least one row failed. Metric and SNS
    errors are logged and do not change the exit code.

    Returns 1 when any row has status ``"fail"``, otherwise 0.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--cadence",
        choices=["daily", "weekly", "per_event"],
        required=True,
        help="Run weekly (Saturday SF) or daily (weekday SF) check.",
    )
    parser.add_argument("--json", action="store_true")
    parser.add_argument("--alert", action="store_true", help="Publish SNS on failure.")
    parser.add_argument("--no-emit", action="store_true", help="Skip CloudWatch emission.")
    parser.add_argument(
        "--inventory", type=Path, default=None, help="Override inventory path."
    )
    args = parser.parse_args(argv)

    logging.basicConfig(level=logging.WARNING)
    # None makes check_inventory fall back to the packaged inventory.
    inv = None
    if args.inventory:
        inv = load_inventory(args.inventory)

    import boto3

    s3 = boto3.client("s3")
    cw = boto3.client("cloudwatch", region_name="us-east-1")

    results = check_inventory(
        args.cadence, inventory=inv, s3_client=s3, cloudwatch_client=cw
    )

    if not args.json:
        print(format_report(results))
    else:
        print(json.dumps([r.__dict__ for r in results], indent=2, default=str))

    if not args.no_emit:
        try:
            emit_cloudwatch_metrics(results, cw)
        except Exception as exc:  # pragma: no cover — non-fatal
            log.warning("CloudWatch emission failed: %s", exc)

    failures = [r for r in results if r.status == "fail"]
    if args.alert and failures:
        try:
            sns = boto3.client("sns", region_name="us-east-1")
            topic = os.environ.get("SNS_TOPIC_ARN", DEFAULT_SNS_TOPIC)
            subject = (
                f"Alpha Engine — Substrate Health "
                f"({args.cadence}): {len(failures)} row(s) failed"
            )
            sns.publish(
                TopicArn=topic,
                Subject=subject,
                Message=format_report(results),
            )
        except Exception as exc:  # pragma: no cover — non-fatal
            log.warning("SNS publish failed: %s", exc)

    if failures:
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())