alpha-engine-lib 0.32.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alpha_engine_lib/__init__.py +3 -0
- alpha_engine_lib/agent_schemas.py +663 -0
- alpha_engine_lib/alerts.py +576 -0
- alpha_engine_lib/arcticdb.py +340 -0
- alpha_engine_lib/collector_results.py +69 -0
- alpha_engine_lib/cost.py +665 -0
- alpha_engine_lib/dates.py +273 -0
- alpha_engine_lib/decision_capture.py +462 -0
- alpha_engine_lib/ec2_spot.py +363 -0
- alpha_engine_lib/email_sender.py +206 -0
- alpha_engine_lib/eval_artifacts.py +361 -0
- alpha_engine_lib/logging.py +303 -0
- alpha_engine_lib/model_pricing.yaml +73 -0
- alpha_engine_lib/pillars.py +756 -0
- alpha_engine_lib/pipeline_status/__init__.py +70 -0
- alpha_engine_lib/pipeline_status/read.py +541 -0
- alpha_engine_lib/pipeline_status/registry.py +368 -0
- alpha_engine_lib/pipeline_status/templates.py +120 -0
- alpha_engine_lib/preflight.py +444 -0
- alpha_engine_lib/rag/__init__.py +39 -0
- alpha_engine_lib/rag/db.py +96 -0
- alpha_engine_lib/rag/embeddings.py +63 -0
- alpha_engine_lib/rag/migrations/0001_content_tsv.sql +39 -0
- alpha_engine_lib/rag/rerank.py +377 -0
- alpha_engine_lib/rag/retrieval.py +465 -0
- alpha_engine_lib/rag/schema.sql +65 -0
- alpha_engine_lib/reconcile.py +203 -0
- alpha_engine_lib/secrets.py +186 -0
- alpha_engine_lib/sources/__init__.py +35 -0
- alpha_engine_lib/sources/protocols.py +227 -0
- alpha_engine_lib/ssm_log_capture.py +274 -0
- alpha_engine_lib/telegram.py +165 -0
- alpha_engine_lib/trading_calendar.py +236 -0
- alpha_engine_lib/transparency.py +746 -0
- alpha_engine_lib/transparency_inventory.yaml +260 -0
- alpha_engine_lib/universe.py +83 -0
- alpha_engine_lib-0.32.0.dist-info/METADATA +217 -0
- alpha_engine_lib-0.32.0.dist-info/RECORD +40 -0
- alpha_engine_lib-0.32.0.dist-info/WHEEL +5 -0
- alpha_engine_lib-0.32.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,746 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Transparency inventory substrate health checker.
|
|
3
|
+
|
|
4
|
+
Reads ``transparency_inventory.yaml``, validates that each row's
|
|
5
|
+
expected artifact exists with the expected cadence and content, and
|
|
6
|
+
returns per-row results. The Saturday and weekday Step Functions both
|
|
7
|
+
invoke this checker; the cadence flag determines which subset of rows
|
|
8
|
+
runs.
|
|
9
|
+
|
|
10
|
+
Phase 2 → 3 gate: ≥ 99% of inventory rows pass for 8 consecutive
|
|
11
|
+
weeks. The check fires per-row CloudWatch metrics so individual rows
|
|
12
|
+
have their own alarms — a failed row pages immediately, the gate
|
|
13
|
+
denominator is decremented for that row, and the 8-week clock resets.
|
|
14
|
+
|
|
15
|
+
Source kinds supported in v1:
|
|
16
|
+
|
|
17
|
+
s3_json HEAD + GET an S3 JSON object; assert_keys_present,
|
|
18
|
+
assert (path / op / value).
|
|
19
|
+
s3_csv HEAD + GET an S3 CSV; assert_columns_present,
|
|
20
|
+
assert_columns_non_null_for_rows_after,
|
|
21
|
+
assert_value_on_latest_row.
|
|
22
|
+
s3_parquet HEAD + GET an S3 parquet; assert_columns_present,
|
|
23
|
+
assert_column_non_null.
|
|
24
|
+
sqlite_via_s3 Download SQLite DB from S3, run PRAGMA table_info
|
|
25
|
+
against ``table``, assert_columns_present.
|
|
26
|
+
cloudwatch GetMetricStatistics over ``window_days``, assert
|
|
27
|
+
success_rate_pct_gte | datapoints_gte.
|
|
28
|
+
|
|
29
|
+
Source kinds not in v1 (deferred): cloudwatch_search,
|
|
30
|
+
custom_python_callable.
|
|
31
|
+
|
|
32
|
+
The checker is read-only — it does not write artifacts of its own.
|
|
33
|
+
The caller (CLI ``main()``) emits CloudWatch metrics from the result
|
|
34
|
+
list and optionally publishes SNS.
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
from __future__ import annotations
|
|
38
|
+
|
|
39
|
+
import argparse
|
|
40
|
+
import io
|
|
41
|
+
import json
|
|
42
|
+
import logging
|
|
43
|
+
import os
|
|
44
|
+
import sqlite3
|
|
45
|
+
import sys
|
|
46
|
+
import tempfile
|
|
47
|
+
from dataclasses import dataclass, field
|
|
48
|
+
from datetime import date, datetime, timedelta, timezone
|
|
49
|
+
from pathlib import Path
|
|
50
|
+
from typing import Any, Callable, Iterable
|
|
51
|
+
|
|
52
|
+
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Inventory file that sits next to this module; used when no path is given.
INVENTORY_PATH = Path(__file__).with_name("transparency_inventory.yaml")

# Bucket used for S3 sources that do not name a ``bucket`` themselves.
DEFAULT_BUCKET = "alpha-engine-research"
# CloudWatch namespace the checker publishes its own metrics under.
DEFAULT_NAMESPACE_OUT = "AlphaEngine/Substrate"
# SNS topic for alerts when ``SNS_TOPIC_ARN`` is not set in the environment.
DEFAULT_SNS_TOPIC = "arn:aws:sns:us-east-1:711398986525:alpha-engine-alerts"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class CheckResult:
    """Result record for the validation of a single inventory row."""

    # Value of the row's ``id`` field.
    row_id: str
    # Value of the row's ``cadence`` field.
    cadence: str
    # One of: "ok" | "fail" | "not_yet_effective" | "error"
    status: str
    # Human-readable explanation of the status.
    detail: str
    # ISO-formatted effective date of the row.
    effective_date: str
    # Identifier of an artifact that was examined, when one is known.
    artifact: str | None = None
    # Per-source failure messages collected for a failing row.
    sub_failures: list[str] = field(default_factory=list)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def load_inventory(path: Path | None = None) -> dict:
    """Load and parse the inventory YAML.

    ``yaml`` is imported inside the function so that importing this module
    does not require PyYAML for consumers that never load an inventory.

    Args:
        path: Inventory file to read. ``INVENTORY_PATH`` is used when this
            is omitted (or otherwise falsy).

    Returns:
        Whatever ``yaml.safe_load`` produces for the file contents.
    """
    import yaml

    p = path or INVENTORY_PATH
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale default, so non-ASCII text in the inventory reads the same
    # on every host.
    with p.open(encoding="utf-8") as fh:
        return yaml.safe_load(fh)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def check_inventory(
    cadence: str,
    *,
    today: date | None = None,
    inventory: dict | None = None,
    s3_client: Any = None,
    cloudwatch_client: Any = None,
) -> list[CheckResult]:
    """Run the checker over every inventory row selected by ``cadence``.

    ``cadence="weekly"`` selects weekly and daily rows, ``"daily"`` selects
    daily rows only, and ``"per_event"`` selects per-event rows only.

    Rows whose ``effective_date`` lies after ``today`` come back with
    ``status="not_yet_effective"``.

    Args:
        cadence: Which run this is; decides the subset of rows validated.
        today: Evaluation date; the current UTC date when omitted.
        inventory: Parsed inventory mapping (must contain ``"inventory"``);
            loaded from the default file when omitted or empty.
        s3_client: Client handed unchanged to the S3 source checks.
        cloudwatch_client: Client handed unchanged to the CloudWatch checks.

    Returns:
        One ``CheckResult`` per selected row, in inventory order.
    """
    run_date = today or _today_utc()
    loaded = inventory or load_inventory()

    # Materialise the selection first so that selection errors surface
    # before any row is checked.
    selected = list(_filter_rows(loaded["inventory"], cadence))
    return [
        _check_row(entry, run_date, s3_client, cloudwatch_client)
        for entry in selected
    ]
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _today_utc() -> date:
    """Return the current calendar date in UTC."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.date()
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _filter_rows(rows: Iterable[dict], cadence: str) -> Iterable[dict]:
    """Yield the rows a run of the given cadence should validate.

    A ``'weekly'`` run covers weekly and daily rows, a ``'daily'`` run
    covers daily rows only, and ``'per_event'`` rows are yielded only when
    that cadence is requested explicitly.

    This is a generator: the ``ValueError`` for an unknown cadence is
    raised when iteration starts, not when the function is called.
    """
    selection = (
        ("weekly", {"weekly", "daily"}),
        ("daily", {"daily"}),
        ("per_event", {"per_event"}),
    )
    accepted = None
    for name, members in selection:
        if cadence == name:
            accepted = members
            break
    if accepted is None:
        raise ValueError(f"Unknown cadence: {cadence}")

    yield from (entry for entry in rows if entry["cadence"] in accepted)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _check_row(
    row: dict,
    today: date,
    s3_client: Any,
    cloudwatch_client: Any,
) -> CheckResult:
    """Validate one inventory row and fold its sources into one result.

    Sources are tried in order and act as alternatives: the first source
    that passes makes the row ``ok``; the row is ``fail`` only when no
    source passes. An exception raised while checking a source is recorded
    as that source's failure instead of aborting the run.

    Args:
        row: Inventory entry; reads ``id``, ``cadence``, ``effective_date``
            and ``sources``.
        today: Date the run is evaluated for.
        s3_client: Passed through to the source check.
        cloudwatch_client: Passed through to the source check.

    Returns:
        A ``CheckResult`` whose status is ``not_yet_effective``, ``ok`` or
        ``fail``.
    """
    eff = date.fromisoformat(str(row["effective_date"]))
    if today < eff:
        return CheckResult(
            row_id=row["id"],
            cadence=row["cadence"],
            status="not_yet_effective",
            detail=f"effective_date={eff} > today={today}",
            effective_date=str(eff),
        )

    sources = row["sources"]
    if not sources:
        # Without this guard the row would fail with an empty detail
        # string, which gives the reader of the report nothing to act on.
        reason = "row has no sources to check"
        return CheckResult(
            row_id=row["id"],
            cadence=row["cadence"],
            status="fail",
            detail=reason,
            effective_date=str(eff),
            sub_failures=[reason],
        )

    sub: list[str] = []
    artifact_hint: str | None = None
    for src in sources:
        try:
            ok, detail, artifact = _check_source(
                src, today, s3_client, cloudwatch_client
            )
        except Exception as exc:  # pragma: no cover — defensive
            ok, detail, artifact = False, f"checker error: {exc!r}", None
        # Remember the first artifact seen so a failing row can point at it.
        if artifact and artifact_hint is None:
            artifact_hint = artifact
        if ok:
            return CheckResult(
                row_id=row["id"],
                cadence=row["cadence"],
                status="ok",
                detail=detail,
                effective_date=str(eff),
                # Prefer the artifact of the source that passed; an earlier
                # failed source's artifact would not match ``detail``.
                artifact=artifact or artifact_hint,
            )
        sub.append(detail)

    return CheckResult(
        row_id=row["id"],
        cadence=row["cadence"],
        status="fail",
        detail="; ".join(sub),
        effective_date=str(eff),
        artifact=artifact_hint,
        sub_failures=sub,
    )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# ---------------------------------------------------------------------------
|
|
192
|
+
# Source-kind dispatchers
|
|
193
|
+
# ---------------------------------------------------------------------------
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _check_source(
    src: dict,
    today: date,
    s3_client: Any,
    cloudwatch_client: Any,
) -> tuple[bool, str, str | None]:
    """Dispatch a source to the handler registered for its ``kind``.

    Returns the handler's ``(ok, detail, artifact)`` tuple, or a failure
    tuple with no artifact when the kind has no registered handler.
    """
    source_kind = src["kind"]
    check = _SOURCE_HANDLERS.get(source_kind)
    if check is not None:
        return check(src, today, s3_client, cloudwatch_client)
    return False, f"unsupported source kind: {source_kind}", None
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _resolve_key(src: dict, today: date) -> tuple[str, str]:
    """Return ``(key_or_pattern, mode)`` for an S3 source.

    ``mode`` is ``"fixed"`` when the source has a literal ``key`` and
    ``"templated"`` when it has a ``key_pattern``. ``key`` wins when both
    are present. ``today`` is accepted but not used here.

    Raises:
        ValueError: The source has neither ``key`` nor ``key_pattern``.
    """
    for field_name, mode in (("key", "fixed"), ("key_pattern", "templated")):
        if field_name in src:
            return src[field_name], mode
    raise ValueError(f"source missing key/key_pattern: {src}")
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _walk_back(
    pattern: str,
    today: date,
    max_age_days: int,
    exists: Callable[[str], bool],
) -> tuple[str | None, int]:
    """Probe dated keys from ``today`` backwards until one exists.

    ``pattern`` is formatted with ``date=<ISO date>`` for ``today``, then
    for each earlier day, up to and including ``max_age_days`` days back.

    Returns:
        ``(key, days_back)`` for the first key ``exists`` accepts, or
        ``(None, max_age_days + 1)`` when none is accepted.
    """
    candidates = (
        (pattern.format(date=(today - timedelta(days=age)).isoformat()), age)
        for age in range(max_age_days + 1)
    )
    not_found = (None, max_age_days + 1)
    # The generator is lazy, so probing stops at the first hit.
    return next((hit for hit in candidates if exists(hit[0])), not_found)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _s3_head(s3_client: Any, bucket: str, key: str) -> bool:
    """Report whether ``s3://bucket/key`` answers a ``head_object`` call.

    The check is best-effort and never raises: any failure yields False.
    Failures that do not look like a plain "object not found" are logged
    at warning level, so that permission, throttling or client problems
    are not silently reported as a missing object.

    Returns:
        True when ``head_object`` succeeds, False otherwise.
    """
    try:
        s3_client.head_object(Bucket=bucket, Key=key)
    except Exception as exc:
        # botocore's ClientError exposes the service error code under
        # ``response["Error"]["Code"]``; other exceptions have no such
        # attribute and are treated as unexpected.
        response = getattr(exc, "response", None)
        error = response.get("Error") if isinstance(response, dict) else None
        code = str(error.get("Code", "")) if isinstance(error, dict) else ""
        if code not in ("404", "NoSuchKey", "NotFound"):
            logging.getLogger(__name__).warning(
                "head_object failed for s3://%s/%s: %r", bucket, key, exc
            )
        return False
    return True
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def _s3_get_bytes(s3_client: Any, bucket: str, key: str) -> bytes:
    """Fetch ``s3://bucket/key`` and return the whole body as bytes."""
    response = s3_client.get_object(Bucket=bucket, Key=key)
    stream = response["Body"]
    return stream.read()
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def _s3_age_days(s3_client: Any, bucket: str, key: str) -> int | None:
    """Return the age of ``s3://bucket/key`` in whole days, or None.

    The age is the difference between the current UTC time and the
    ``LastModified`` value returned by ``head_object``. The lookup is
    best-effort and never raises; failures that do not look like a plain
    "object not found" are logged at warning level so they are not
    silently reported as a missing object.

    Returns:
        Non-negative whole days since last modification, or None when the
        age could not be determined.
    """
    try:
        resp = s3_client.head_object(Bucket=bucket, Key=key)
        modified = resp["LastModified"]
        age = (datetime.now(timezone.utc) - modified).days
    except Exception as exc:
        # botocore's ClientError exposes the service error code under
        # ``response["Error"]["Code"]``; anything else is unexpected.
        response = getattr(exc, "response", None)
        error = response.get("Error") if isinstance(response, dict) else None
        code = str(error.get("Code", "")) if isinstance(error, dict) else ""
        if code not in ("404", "NoSuchKey", "NotFound"):
            logging.getLogger(__name__).warning(
                "could not determine age of s3://%s/%s: %r", bucket, key, exc
            )
        return None
    # ``timedelta.days`` floors, so a LastModified even slightly ahead of
    # the local clock would otherwise be reported as -1 days.
    return max(0, age)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _resolve_and_age(
    src: dict, today: date, s3_client: Any
) -> tuple[str | None, int | None, str]:
    """Find the artifact key for an S3 source and report its age.

    Shared by all S3 source kinds. Reads ``bucket`` (default
    ``DEFAULT_BUCKET``) and ``max_age_days`` (default 8) from ``src``.

    Returns:
        ``(key, age_days, status)``. ``status`` is ``"ok"`` or a message
        describing a missing or stale object; ``key`` and ``age_days`` are
        None when no object was found.
    """
    bucket = src.get("bucket", DEFAULT_BUCKET)
    key, mode = _resolve_key(src, today)
    max_age = src.get("max_age_days", 8)

    if mode != "fixed":
        # Templated key: the age is the number of days walked back.
        found, walked = _walk_back(
            key,
            today,
            max_age,
            lambda candidate: _s3_head(s3_client, bucket, candidate),
        )
        if found is not None:
            return found, walked, "ok"
        return None, None, (
            f"no object matching s3://{bucket}/{key} within {max_age}d"
        )

    # Fixed key: the age comes from the object's last-modified time.
    age = _s3_age_days(s3_client, bucket, key)
    if age is None:
        return None, None, f"missing s3://{bucket}/{key}"
    if age > max_age:
        verdict = f"stale s3://{bucket}/{key} (age={age}d > {max_age}d)"
    else:
        verdict = "ok"
    return key, age, verdict
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _check_s3_json(
    src: dict, today: date, s3_client: Any, _cw: Any
) -> tuple[bool, str, str | None]:
    """Check an ``s3_json`` source: presence, freshness and content.

    Reads from ``src``: ``bucket``, the key settings used to locate the
    object, ``treat_absent_as`` / ``companion_key_pattern`` (a companion
    object may stand in for an absent primary), ``assert_keys_present``
    (top-level keys that must exist) and ``assert`` (a list of
    path / op / value assertions).

    Returns:
        ``(ok, detail, artifact_key)``.
    """
    bucket = src.get("bucket", DEFAULT_BUCKET)
    key, age, status = _resolve_and_age(src, today, s3_client)
    if key is None:
        # Companion fallback for "ok_if_companion_present"
        if src.get("treat_absent_as") == "ok_if_companion_present":
            comp_pattern = src.get("companion_key_pattern")
            if comp_pattern:
                comp_key, _ = _walk_back(
                    comp_pattern,
                    today,
                    src.get("max_age_days", 8),
                    lambda k: _s3_head(s3_client, bucket, k),
                )
                if comp_key:
                    return True, (
                        f"primary absent, companion present: "
                        f"s3://{bucket}/{comp_key}"
                    ), comp_key
        return False, status, None
    if status != "ok":
        return False, status, key

    body = _s3_get_bytes(s3_client, bucket, key)
    try:
        payload = json.loads(body)
    except Exception as exc:
        return False, f"json parse error on s3://{bucket}/{key}: {exc!r}", key

    failures: list[str] = []
    required_keys = src.get("assert_keys_present", [])
    if required_keys and not isinstance(payload, dict):
        # ``in`` on a list or string would test membership or substring,
        # and on null or a number it would raise; neither is a key
        # lookup, so report the shape mismatch explicitly.
        failures.append(
            f"payload is {type(payload).__name__}, not a JSON object; "
            f"cannot check assert_keys_present"
        )
    else:
        failures.extend(
            f"missing key '{required}'"
            for required in required_keys
            if required not in payload
        )
    for assertion in src.get("assert", []):
        ok, detail = _eval_path_assertion(payload, assertion)
        if not ok:
            failures.append(detail)
    if failures:
        return False, "; ".join(failures), key
    return True, f"ok (age={age}d)", key
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _check_s3_csv(
    src: dict, today: date, s3_client: Any, _cw: Any
) -> tuple[bool, str, str | None]:
    """Check an ``s3_csv`` source: presence, freshness and content.

    Content rules read from ``src``:

    * ``assert_columns_present`` - columns that must exist.
    * ``assert_columns_non_null_for_rows_after`` - for rows whose
      ``date_column`` is later than ``rows_after`` (optionally narrowed by
      ``action_filter``), the listed ``columns`` must have no nulls.
    * ``assert_value_on_latest_row`` - an op / value comparison applied to
      ``column`` on the last row of the file.

    Each later rule is skipped once an earlier rule has failed.

    Returns:
        ``(ok, detail, artifact_key)``.
    """
    bucket = src.get("bucket", DEFAULT_BUCKET)
    key, age, status = _resolve_and_age(src, today, s3_client)
    if key is None or status != "ok":
        return False, status, key

    body = _s3_get_bytes(s3_client, bucket, key)
    try:
        import pandas as pd

        df = pd.read_csv(io.BytesIO(body))
    except Exception as exc:
        return False, f"csv parse error on s3://{bucket}/{key}: {exc!r}", key

    failures: list[str] = [
        f"missing column '{name}'"
        for name in src.get("assert_columns_present", [])
        if name not in df.columns
    ]

    rule = src.get("assert_columns_non_null_for_rows_after")
    if rule and not failures:
        date_col = rule["date_column"]
        cutoff = date.fromisoformat(str(rule["rows_after"]))
        wanted_cols = rule["columns"]
        row_filter = rule.get("action_filter")
        if date_col in df.columns:
            try:
                # Coerce date_column to date for comparison; tolerate
                # both 'YYYY-MM-DD' and ISO timestamps.
                row_dates = pd.to_datetime(df[date_col], errors="coerce").dt.date
                recent = df[row_dates > cutoff]
                if row_filter:
                    filter_col = row_filter["column"]
                    filter_val = row_filter["equals"]
                    if filter_col in recent.columns:
                        recent = recent[recent[filter_col] == filter_val]
                if not recent.empty:
                    for name in wanted_cols:
                        if name in recent.columns:
                            nulls = recent[name].isna().sum()
                            if nulls > 0:
                                failures.append(
                                    f"column '{name}' has {int(nulls)} null rows after {cutoff}"
                                )
                        else:
                            failures.append(
                                f"missing column '{name}' for non-null assertion"
                            )
            except Exception as exc:
                failures.append(f"non-null check error: {exc!r}")
        else:
            failures.append(f"missing date_column '{date_col}'")

    latest = src.get("assert_value_on_latest_row")
    if latest and not failures:
        target_col = latest["column"]
        if target_col not in df.columns:
            failures.append(f"missing column '{target_col}' for latest-row assertion")
        elif df.empty:
            failures.append(f"csv empty — cannot evaluate '{target_col}' on latest row")
        else:
            last_value = df[target_col].iloc[-1]
            passed, detail = _eval_op(last_value, latest["op"], latest["value"])
            if not passed:
                failures.append(detail)

    if not failures:
        return True, f"ok (age={age}d, rows={len(df)})", key
    return False, "; ".join(failures), key
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def _check_s3_parquet(
    src: dict, today: date, s3_client: Any, _cw: Any
) -> tuple[bool, str, str | None]:
    """Check an ``s3_parquet`` source: presence, freshness and content.

    Content rules read from ``src``: ``assert_columns_present`` (columns
    that must exist) and ``assert_column_non_null`` (columns that must
    exist and contain no nulls).

    Returns:
        ``(ok, detail, artifact_key)``.
    """
    bucket = src.get("bucket", DEFAULT_BUCKET)
    key, age, status = _resolve_and_age(src, today, s3_client)
    if key is None or status != "ok":
        return False, status, key

    body = _s3_get_bytes(s3_client, bucket, key)
    try:
        import pandas as pd

        df = pd.read_parquet(io.BytesIO(body))
    except Exception as exc:
        return False, f"parquet parse error on s3://{bucket}/{key}: {exc!r}", key

    failures: list[str] = [
        f"missing column '{name}'"
        for name in src.get("assert_columns_present", [])
        if name not in df.columns
    ]
    for name in src.get("assert_column_non_null", []):
        if name in df.columns:
            null_count = df[name].isna().sum()
            if null_count > 0:
                failures.append(f"column '{name}' has {int(null_count)} null rows")
        else:
            failures.append(f"missing column '{name}' for non-null check")

    if not failures:
        return True, f"ok (age={age}d, rows={len(df)})", key
    return False, "; ".join(failures), key
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def _check_sqlite_via_s3(
    src: dict, today: date, s3_client: Any, _cw: Any
) -> tuple[bool, str, str | None]:
    """Check a ``sqlite_via_s3`` source: presence, freshness and schema.

    Downloads the database to a temporary file, lists the columns of
    ``src["table"]`` with ``PRAGMA table_info`` and verifies that every
    column in ``assert_columns_present`` exists. The temporary file is
    removed afterwards.

    Returns:
        ``(ok, detail, artifact_key)``. An unreadable database is
        reported as a failure with the underlying ``sqlite3`` error in the
        detail.
    """
    bucket = src.get("bucket", DEFAULT_BUCKET)
    key, age, status = _resolve_and_age(src, today, s3_client)
    if key is None or status != "ok":
        return False, status, key

    table = src["table"]
    body = _s3_get_bytes(s3_client, bucket, key)

    # sqlite3 opens databases by path, so the bytes go to a temp file.
    fh = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
    db_path = fh.name
    try:
        # Writing inside the try ensures the file is removed even when the
        # write itself fails.
        with fh:
            fh.write(body)
        conn = sqlite3.connect(db_path)
        try:
            # Quote the identifier so names with spaces or punctuation
            # cannot break, or alter, the statement.
            quoted = '"' + str(table).replace('"', '""') + '"'
            cur = conn.execute(f"PRAGMA table_info({quoted})")
            cols = {row[1] for row in cur.fetchall()}
        finally:
            conn.close()
    except sqlite3.Error as exc:
        # Mirrors the parse-error reporting of the other S3 source kinds.
        return False, f"sqlite read error on s3://{bucket}/{key}: {exc!r}", key
    finally:
        try:
            os.unlink(db_path)
        except OSError:
            pass

    # table_info returns no rows for a table that does not exist.
    if not cols:
        return False, f"table '{table}' missing in s3://{bucket}/{key}", key

    failures = [
        f"missing column '{c}' in table '{table}'"
        for c in src.get("assert_columns_present", [])
        if c not in cols
    ]
    if failures:
        return False, "; ".join(failures), key
    return True, f"ok (age={age}d, table='{table}')", key
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def _check_cloudwatch(
    src: dict, today: date, _s3: Any, cloudwatch_client: Any
) -> tuple[bool, str, str | None]:
    """Check a ``cloudwatch`` source over its trailing window.

    Reads ``namespace``, ``metric``, ``window_days`` (default 7) and the
    ``assert`` mapping (``op`` / ``value``) from ``src``. The window ends
    at the current UTC time; ``today`` is accepted but not used. A
    CloudWatch client is created when none is supplied.

    Supported ops: ``success_rate_pct_gte`` and ``datapoints_gte``.

    Returns:
        ``(ok, detail, artifact)``.
    """
    if cloudwatch_client is None:
        import boto3

        cloudwatch_client = boto3.client("cloudwatch", region_name="us-east-1")

    namespace = src["namespace"]
    metric = src["metric"]
    window_days = src.get("window_days", 7)
    end = datetime.now(timezone.utc)
    start = end - timedelta(days=window_days)
    # GetMetricStatistics only returns data when Period suits the age of
    # StartTime: a multiple of 60s up to 15 days back, of 300s between 15
    # and 63 days, and of 3600s beyond that. Aim for ~100 datapoints
    # across the window, then round down to the required multiple.
    if window_days > 63:
        granularity = 3600
    elif window_days > 15:
        granularity = 300
    else:
        granularity = 60
    raw_period = max(granularity, int(window_days * 86400 // 100))
    period = max(granularity, (raw_period // granularity) * granularity)

    artifact = f"cw://{namespace}/{metric}"
    # ``or {}`` also covers an ``assert:`` key that is present but empty.
    assertion = src.get("assert") or {}
    op = assertion.get("op")

    if op == "success_rate_pct_gte":
        return _check_cw_success_rate(
            cloudwatch_client, src, start, end, period, assertion["value"]
        )
    if op == "datapoints_gte":
        return _check_cw_datapoints(
            cloudwatch_client, namespace, metric, start, end, period,
            assertion["value"], artifact,
        )
    return False, f"unsupported cloudwatch assert op: {op}", artifact
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
def _check_cw_success_rate(
    cw: Any, src: dict, start: datetime, end: datetime, period: int, threshold: float
) -> tuple[bool, str, str | None]:
    """Assert a minimum success rate from execution-count metrics.

    For each ARN under ``src["dimensions"]["StateMachineArn"]`` (or once
    with no dimension when none is configured), sums
    ``ExecutionsSucceeded`` and ``ExecutionsFailed`` over the window and
    requires ``succeeded / (succeeded + failed)`` to be at least
    ``threshold`` percent. A window with no executions counts as a failure.

    Args:
        cw: CloudWatch client providing ``get_metric_statistics``.
        src: Source mapping; reads ``namespace`` and ``dimensions``.
        start: Window start.
        end: Window end.
        period: Aggregation period in seconds.
        threshold: Minimum acceptable success rate, in percent.

    Returns:
        ``(ok, detail, artifact)``.
    """
    namespace = src["namespace"]
    dim_field = src.get("dimensions") or {}
    raw_arns = dim_field.get("StateMachineArn") or []
    if isinstance(raw_arns, str):
        # A single ARN written as a scalar must not be iterated
        # character by character.
        raw_arns = [raw_arns]
    arns = list(raw_arns) or [None]

    failures: list[str] = []
    for arn in arns:
        kw = {
            "Namespace": namespace,
            "Period": period,
            "Statistics": ["Sum"],
            "StartTime": start,
            "EndTime": end,
        }
        if arn:
            kw["Dimensions"] = [{"Name": "StateMachineArn", "Value": arn}]
        succ = cw.get_metric_statistics(MetricName="ExecutionsSucceeded", **kw)
        fail = cw.get_metric_statistics(MetricName="ExecutionsFailed", **kw)
        s = sum(p["Sum"] for p in succ.get("Datapoints", []))
        f = sum(p["Sum"] for p in fail.get("Datapoints", []))
        denom = s + f
        if denom == 0:
            failures.append(f"{arn or 'aggregate'}: no datapoints in window")
            continue
        pct = 100.0 * s / denom
        if pct < threshold:
            failures.append(
                f"{arn or 'aggregate'}: success_rate={pct:.2f}% < {threshold}%"
            )
    if failures:
        return False, "; ".join(failures), f"cw://{namespace}/ExecutionsSucceeded"
    return True, "ok", f"cw://{namespace}/ExecutionsSucceeded"
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def _check_cw_datapoints(
    cw: Any,
    namespace: str,
    metric: str,
    start: datetime,
    end: datetime,
    period: int,
    threshold: int,
    artifact: str,
) -> tuple[bool, str, str | None]:
    """Assert that a metric received at least ``threshold`` samples.

    Requests the ``SampleCount`` statistic for ``namespace``/``metric``
    over ``[start, end]`` and adds up the counts of all returned
    datapoints.

    Returns:
        ``(ok, detail, artifact)``.
    """
    query = {
        "Namespace": namespace,
        "MetricName": metric,
        "Period": period,
        "Statistics": ["SampleCount"],
        "StartTime": start,
        "EndTime": end,
    }
    response = cw.get_metric_statistics(**query)

    total = 0
    for point in response.get("Datapoints", []):
        total += point["SampleCount"]

    if total < threshold:
        shortfall = (
            f"only {int(total)} datapoints in {namespace}/{metric} (need ≥ {threshold})"
        )
        return False, shortfall, artifact
    return True, f"ok (n={int(total)})", artifact
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
# Source ``kind`` -> handler. Every handler is called with
# (src, today, s3_client, cloudwatch_client) and returns
# (ok, detail, artifact).
_SOURCE_HANDLERS: dict[str, Callable] = dict(
    s3_json=_check_s3_json,
    s3_csv=_check_s3_csv,
    s3_parquet=_check_s3_parquet,
    sqlite_via_s3=_check_sqlite_via_s3,
    cloudwatch=_check_cloudwatch,
)
|
|
587
|
+
|
|
588
|
+
|
|
589
|
+
# ---------------------------------------------------------------------------
|
|
590
|
+
# Assertion primitives
|
|
591
|
+
# ---------------------------------------------------------------------------
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
def _eval_path_assertion(payload: Any, assertion: dict) -> tuple[bool, str]:
    """Resolve a dotted ``path`` in ``payload`` and apply the assertion.

    Each segment of ``assertion["path"]`` must be a key of a nested dict;
    otherwise the assertion fails with a "not found" detail. The value
    found is compared using ``assertion["op"]`` and ``assertion["value"]``.

    Returns:
        ``(ok, detail)``.
    """
    path = assertion["path"]
    node: Any = payload
    for segment in path.split("."):
        if not (isinstance(node, dict) and segment in node):
            return False, f"path '{path}' not found"
        node = node[segment]
    return _eval_op(node, assertion["op"], assertion["value"], path=path)
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
def _eval_op(value: Any, op: str, target: Any, path: str | None = None) -> tuple[bool, str]:
    """Compare ``value`` with ``target`` using the named operator.

    Both operands are converted to ``float`` unless they are booleans,
    which are kept as they are. Supported operators: ``gte``, ``gt``,
    ``lte``, ``lt`` and ``eq``.

    Args:
        value: Observed value.
        op: Operator name.
        target: Value to compare against.
        path: Label used in the detail message; ``"value"`` when omitted.

    Returns:
        ``(ok, detail)``. Operands that cannot be converted, and unknown
        operators, produce a failure.
    """
    label = path or "value"
    try:
        v = value if isinstance(value, bool) else float(value)
        t = target if isinstance(target, bool) else float(target)
    except (TypeError, ValueError):
        return False, f"{label}={value!r} not comparable to {target!r}"

    # (operator name, comparison, symbol when it holds, symbol when not)
    comparisons = (
        ("gte", lambda a, b: a >= b, ">=", "<"),
        ("gt", lambda a, b: a > b, ">", "<="),
        ("lte", lambda a, b: a <= b, "<=", ">"),
        ("lt", lambda a, b: a < b, "<", ">="),
        ("eq", lambda a, b: a == b, "==", "!="),
    )
    for name, compare, holds_symbol, fails_symbol in comparisons:
        if op == name:
            holds = compare(v, t)
            symbol = holds_symbol if holds else fails_symbol
            return holds, f"{label}={v} {symbol} {t}"
    return False, f"unsupported op '{op}'"
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
# ---------------------------------------------------------------------------
|
|
626
|
+
# CLI + side effects
|
|
627
|
+
# ---------------------------------------------------------------------------
|
|
628
|
+
|
|
629
|
+
|
|
630
|
+
def emit_cloudwatch_metrics(results: list[CheckResult], cloudwatch_client: Any = None) -> None:
    """Publish per-row and aggregate metrics to ``AlphaEngine/Substrate``.

    Emits one ``SubstrateRowOK`` datapoint per result (dimension
    ``RowID``) plus the totals ``SubstrateChecksOK``,
    ``SubstrateChecksFailed`` and ``SubstrateChecksPending``. Datapoints
    are sent in batches of 20. A CloudWatch client is created when none is
    supplied.
    """
    if cloudwatch_client is None:
        import boto3

        cloudwatch_client = boto3.client("cloudwatch", region_name="us-east-1")

    # 1 = ok or not_yet_effective (counts as healthy), 0 = anything else
    healthy = ("ok", "not_yet_effective")
    metric_data = [
        {
            "MetricName": "SubstrateRowOK",
            "Dimensions": [{"Name": "RowID", "Value": r.row_id}],
            "Value": 1.0 if r.status in healthy else 0.0,
            "Unit": "Count",
        }
        for r in results
    ]

    totals = (
        ("SubstrateChecksOK", "ok"),
        ("SubstrateChecksFailed", "fail"),
        ("SubstrateChecksPending", "not_yet_effective"),
    )
    for metric_name, status in totals:
        count = sum(1 for r in results if r.status == status)
        metric_data.append(
            {"MetricName": metric_name, "Value": float(count), "Unit": "Count"}
        )

    batch_size = 20
    for offset in range(0, len(metric_data), batch_size):
        cloudwatch_client.put_metric_data(
            Namespace=DEFAULT_NAMESPACE_OUT,
            MetricData=metric_data[offset : offset + batch_size],
        )
|
|
661
|
+
|
|
662
|
+
|
|
663
|
+
def format_report(results: list[CheckResult]) -> str:
    """Render the results as a plain-text report.

    The report has a title, a summary line (counts of ok, failed and
    pending rows and the pass percentage among rows that are not
    pending), one line per result, and an ``ACTIONS NEEDED`` section
    listing the failed rows when there are any.
    """
    def tally(status: str) -> int:
        return sum(1 for r in results if r.status == status)

    n_ok = tally("ok")
    n_fail = tally("fail")
    n_pending = tally("not_yet_effective")
    effective = len(results) - n_pending
    if effective > 0:
        pct = 100.0 * n_ok / max(1, effective)
    else:
        pct = 0.0

    icon = {"ok": "OK ", "fail": "FAIL", "not_yet_effective": "PEND", "error": "ERR "}
    lines = [
        "Substrate Health Report",
        "=" * 50,
        f"OK: {n_ok} Failed: {n_fail} Pending: {n_pending} "
        f"({pct:.1f}% of effective rows passing)",
        "",
    ]
    lines += [
        f" [{icon.get(r.status, '?')}] {r.row_id:30s} {r.detail}" for r in results
    ]

    failed = [r for r in results if r.status == "fail"]
    if failed:
        lines += ["", "ACTIONS NEEDED:"]
        lines += [f" - {r.row_id}: {r.detail}" for r in failed]
    return "\n".join(lines)
|
|
685
|
+
|
|
686
|
+
|
|
687
|
+
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for the substrate health check.

    Runs the check for ``--cadence``, prints the results (a text report,
    or JSON with ``--json``), publishes CloudWatch metrics unless
    ``--no-emit`` is given, and with ``--alert`` publishes the report to
    SNS when rows failed. Metric and SNS errors are logged as warnings and
    do not change the exit code.

    Returns:
        1 when at least one row has status ``fail``, otherwise 0.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--cadence",
        choices=["daily", "weekly", "per_event"],
        required=True,
        help="Run weekly (Saturday SF) or daily (weekday SF) check.",
    )
    parser.add_argument("--json", action="store_true")
    parser.add_argument("--alert", action="store_true", help="Publish SNS on failure.")
    parser.add_argument("--no-emit", action="store_true", help="Skip CloudWatch emission.")
    parser.add_argument(
        "--inventory", type=Path, default=None, help="Override inventory path."
    )
    args = parser.parse_args(argv)

    logging.basicConfig(level=logging.WARNING)
    inv = None
    if args.inventory:
        inv = load_inventory(args.inventory)

    import boto3

    s3 = boto3.client("s3")
    cw = boto3.client("cloudwatch", region_name="us-east-1")

    results = check_inventory(
        args.cadence, inventory=inv, s3_client=s3, cloudwatch_client=cw
    )

    if args.json:
        rendered = json.dumps([vars(r) for r in results], indent=2, default=str)
    else:
        rendered = format_report(results)
    print(rendered)

    if not args.no_emit:
        try:
            emit_cloudwatch_metrics(results, cw)
        except Exception as exc:  # pragma: no cover — non-fatal
            log.warning("CloudWatch emission failed: %s", exc)

    failures = [r for r in results if r.status == "fail"]
    if failures and args.alert:
        try:
            sns = boto3.client("sns", region_name="us-east-1")
            topic = os.environ.get("SNS_TOPIC_ARN", DEFAULT_SNS_TOPIC)
            subject = (
                f"Alpha Engine — Substrate Health "
                f"({args.cadence}): {len(failures)} row(s) failed"
            )
            sns.publish(
                TopicArn=topic,
                Subject=subject,
                Message=format_report(results),
            )
        except Exception as exc:  # pragma: no cover — non-fatal
            log.warning("SNS publish failed: %s", exc)

    if failures:
        return 1
    return 0
|
|
743
|
+
|
|
744
|
+
|
|
745
|
+
# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|