alpha-engine-lib 0.42.0__tar.gz → 0.44.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/PKG-INFO +1 -1
  2. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/pyproject.toml +1 -1
  3. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/__init__.py +1 -1
  4. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/artifact_freshness.py +157 -0
  5. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/transparency.py +84 -13
  6. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/transparency_inventory.yaml +27 -2
  7. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib.egg-info/PKG-INFO +1 -1
  8. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_artifact_freshness.py +113 -0
  9. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_transparency.py +109 -0
  10. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/README.md +0 -0
  11. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/setup.cfg +0 -0
  12. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/agent_schemas.py +0 -0
  13. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/alerts.py +0 -0
  14. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/anthropic_payload.py +0 -0
  15. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/arcticdb.py +0 -0
  16. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/collector_results.py +0 -0
  17. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/cost.py +0 -0
  18. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/dates.py +0 -0
  19. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/decision_capture.py +0 -0
  20. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/ec2_spot.py +0 -0
  21. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/email_sender.py +0 -0
  22. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/eval_artifacts.py +0 -0
  23. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/locks.py +0 -0
  24. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/logging.py +0 -0
  25. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/model_pricing.yaml +0 -0
  26. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/pillars.py +0 -0
  27. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/pipeline_status/__init__.py +0 -0
  28. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/pipeline_status/read.py +0 -0
  29. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/pipeline_status/registry.py +0 -0
  30. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/pipeline_status/templates.py +0 -0
  31. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/preflight.py +0 -0
  32. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/rag/__init__.py +0 -0
  33. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/rag/db.py +0 -0
  34. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/rag/embeddings.py +0 -0
  35. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/rag/migrations/0001_content_tsv.sql +0 -0
  36. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/rag/rerank.py +0 -0
  37. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/rag/retrieval.py +0 -0
  38. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/rag/schema.sql +0 -0
  39. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/reconcile.py +0 -0
  40. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/secrets.py +0 -0
  41. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/sources/__init__.py +0 -0
  42. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/sources/protocols.py +0 -0
  43. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/ssm_dispatcher.py +0 -0
  44. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/ssm_log_capture.py +0 -0
  45. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/telegram.py +0 -0
  46. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/trading_calendar.py +0 -0
  47. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib/universe.py +0 -0
  48. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib.egg-info/SOURCES.txt +0 -0
  49. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib.egg-info/dependency_links.txt +0 -0
  50. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib.egg-info/requires.txt +0 -0
  51. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/src/alpha_engine_lib.egg-info/top_level.txt +0 -0
  52. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_agent_schemas.py +0 -0
  53. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_alerts.py +0 -0
  54. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_anthropic_payload.py +0 -0
  55. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_arcticdb.py +0 -0
  56. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_collector_results.py +0 -0
  57. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_cost.py +0 -0
  58. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_dates.py +0 -0
  59. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_decision_capture.py +0 -0
  60. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_ec2_spot.py +0 -0
  61. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_email_sender.py +0 -0
  62. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_eval_artifacts.py +0 -0
  63. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_locks.py +0 -0
  64. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_logging.py +0 -0
  65. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_pillars.py +0 -0
  66. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_pipeline_status_read.py +0 -0
  67. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_pipeline_status_registry.py +0 -0
  68. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_pipeline_status_templates.py +0 -0
  69. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_preflight.py +0 -0
  70. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_rag.py +0 -0
  71. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_rag_rerank.py +0 -0
  72. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_rag_retrieval_hybrid.py +0 -0
  73. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_reconcile.py +0 -0
  74. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_secrets.py +0 -0
  75. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_sources_protocols.py +0 -0
  76. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_ssm_dispatcher.py +0 -0
  77. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_ssm_log_capture.py +0 -0
  78. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_telegram.py +0 -0
  79. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_trading_calendar.py +0 -0
  80. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_universe.py +0 -0
  81. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_version_bump_workflow.py +0 -0
  82. {alpha_engine_lib-0.42.0 → alpha_engine_lib-0.44.0}/tests/test_version_pin.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alpha-engine-lib
3
- Version: 0.42.0
3
+ Version: 0.44.0
4
4
  Summary: Shared utilities for the Alpha Engine modules: preflight, logging, ArcticDB, dates, decision capture, cost telemetry, Anthropic payload chokepoint, artifact freshness, RAG, agent schemas, SSM secrets, Telegram + SNS alerts, EC2 spot resilience, SSM log-capture, SSM dispatcher, Step-Functions execution-state projection, and S3-conditional-PUT writer locks. Full surface documented in README.
5
5
  Author: Brian McMahon
6
6
  License: Proprietary
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "alpha-engine-lib"
7
- version = "0.42.0"
7
+ version = "0.44.0"
8
8
  description = "Shared utilities for the Alpha Engine modules: preflight, logging, ArcticDB, dates, decision capture, cost telemetry, Anthropic payload chokepoint, artifact freshness, RAG, agent schemas, SSM secrets, Telegram + SNS alerts, EC2 spot resilience, SSM log-capture, SSM dispatcher, Step-Functions execution-state projection, and S3-conditional-PUT writer locks. Full surface documented in README."
9
9
  readme = "README.md"
10
10
  # EC2 still runs Python 3.9 on the always-on micro instance (boto3 drops
@@ -1,3 +1,3 @@
1
1
  """alpha-engine-lib — shared utilities for Alpha Engine modules."""
2
2
 
3
- __version__ = "0.42.0"
3
+ __version__ = "0.44.0"
@@ -71,6 +71,7 @@ ships the freshness-monitor Lambda that wires the two together.
71
71
 
72
72
  from __future__ import annotations
73
73
 
74
+ from collections.abc import Iterable
74
75
  from dataclasses import dataclass, field
75
76
  from datetime import date, datetime, timedelta, timezone
76
77
  from typing import Any, Final, Literal
@@ -596,3 +597,159 @@ def _cycle_length_seconds(spec: ArtifactSpec) -> float:
596
597
  assert spec.interval_minutes is not None
597
598
  return spec.interval_minutes * 60
598
599
  raise ValueError(f"unknown cadence {spec.cadence!r}")
600
+
601
+
602
+ # ── Per-cycle completion rollup ───────────────────────────────────────────────
603
+
604
+
605
CycleState = Literal["complete", "incomplete", "indeterminate"]


@dataclass
class CycleCompletion:
    """Per-cycle completion verdict — the artifact-union judgment.

    Aggregates the per-artifact :class:`CheckResult` rows for one
    execution cycle into a single verdict over the *required* set
    (the ``severity="critical"`` rows). Answers the question the
    raw orchestrator status cannot on a recovery-stitched run: *did
    this cycle actually deliver every load-bearing artifact?*

    Recovery substitution is already folded in upstream — a
    canonical-missing artifact rescued by its ``recovery_key_template``
    arrives here as ``state="fresh"``. So this rollup judges the
    execution UNION without re-HEADing anything.

    Attributes:
        state: ``"complete"`` ⇒ every required artifact is present +
            valid (``fresh``, or suppressed by ``grace_period``).
            ``"incomplete"`` ⇒ at least one required artifact is
            ``missing`` / ``stale`` (a real delivery gap).
            ``"indeterminate"`` ⇒ no real gap, but at least one required
            artifact could not be confirmed: its probe ``probe_failed``
            or its result carried a state this rollup does not
            recognize. A real gap outranks an indeterminate row.
        complete: ``True`` iff ``state == "complete"``.
        cycle_label: The cycle's window label (e.g. ``"2026-W22"``),
            for reporting. Informational — the caller passes it.
        n_required: Count of ``severity="critical"`` artifacts judged.
        n_satisfied: Count present + valid (``fresh`` + ``grace_period``).
        missing / stale / probe_failed / grace_period: ``artifact_id``
            localization lists — which artifacts landed in each state.
        reason: Human-readable summary; routed to the report surface.
        unrecognized: ``artifact_id`` list of required rows whose state
            is none of ``fresh`` / ``grace_period`` / ``stale`` /
            ``missing`` / ``probe_failed``. Declared last so existing
            positional construction keeps working.
    """

    state: CycleState
    complete: bool
    cycle_label: str | None = None
    n_required: int = 0
    n_satisfied: int = 0
    missing: list[str] = field(default_factory=list)
    stale: list[str] = field(default_factory=list)
    probe_failed: list[str] = field(default_factory=list)
    grace_period: list[str] = field(default_factory=list)
    reason: str = ""
    unrecognized: list[str] = field(default_factory=list)


def cycle_completion(
    spec_results: Iterable[tuple[ArtifactSpec, CheckResult]],
    *,
    cycle_label: str | None = None,
) -> CycleCompletion:
    """Roll per-artifact freshness results up into one cycle verdict.

    ``cycle_completion(C) = ∀ required artifact a: present(a@C) ∧ valid(a@C)``
    over the execution UNION, where the required set is the
    ``severity="critical"`` rows. Non-critical (``warning``) artifacts
    are excluded — they inform per-artifact alerting but never gate the
    cycle verdict.

    Pure: consumes already-computed :class:`CheckResult` rows (as
    ``(spec, result)`` pairs so there's no positional-pairing hazard)
    and performs no I/O. Only ``spec.severity``, ``spec.artifact_id`` and
    ``result.state`` are read.

    State precedence: a real delivery gap (``missing`` / ``stale``)
    outranks an unconfirmable row (``probe_failed`` or an unrecognized
    state) — a confirmed miss is more actionable than an unconfirmable
    one. ``grace_period`` counts as satisfied but is surfaced in its own
    list so the caller can see it.

    A required row whose state is not one of the five handled values is
    never counted as satisfied: it is listed in ``unrecognized`` and, absent
    a real gap, makes the verdict ``"indeterminate"`` instead of letting
    the cycle be reported complete with ``n_satisfied < n_required``.

    An empty required set returns ``state="complete"`` (vacuous truth) —
    a cycle with no critical artifacts cannot be incomplete.

    Args:
        spec_results: ``(spec, result)`` pairs for one cycle; consumed once.
        cycle_label: Optional label copied onto the verdict for reporting.

    Returns:
        The :class:`CycleCompletion` verdict with per-state localization
        lists and a human-readable ``reason``.
    """
    required = [(s, r) for s, r in spec_results if s.severity == "critical"]
    n_required = len(required)

    missing: list[str] = []
    stale: list[str] = []
    probe_failed: list[str] = []
    grace_period: list[str] = []
    unrecognized: list[str] = []
    satisfied = 0

    for spec, res in required:
        if res.state == "fresh":
            satisfied += 1
        elif res.state == "grace_period":
            satisfied += 1
            grace_period.append(spec.artifact_id)
        elif res.state == "stale":
            stale.append(spec.artifact_id)
        elif res.state == "missing":
            missing.append(spec.artifact_id)
        elif res.state == "probe_failed":
            probe_failed.append(spec.artifact_id)
        else:
            # Unknown state: cannot be called satisfied, and is not a
            # confirmed gap either — localize it so the verdict below
            # cannot silently come out "complete".
            unrecognized.append(spec.artifact_id)

    if missing or stale:
        gaps = []
        if missing:
            gaps.append(f"missing={missing}")
        if stale:
            gaps.append(f"stale={stale}")
        return CycleCompletion(
            state="incomplete",
            complete=False,
            cycle_label=cycle_label,
            n_required=n_required,
            n_satisfied=satisfied,
            missing=missing,
            stale=stale,
            probe_failed=probe_failed,
            grace_period=grace_period,
            reason=(
                f"cycle incomplete: {satisfied}/{n_required} critical artifacts "
                f"present+valid; " + "; ".join(gaps)
            ),
            unrecognized=unrecognized,
        )

    if probe_failed or unrecognized:
        causes = []
        if probe_failed:
            causes.append(f"monitor probe failed for {probe_failed}")
        if unrecognized:
            causes.append(f"unrecognized result state for {unrecognized}")
        cause_text = "; ".join(causes)
        return CycleCompletion(
            state="indeterminate",
            complete=False,
            cycle_label=cycle_label,
            n_required=n_required,
            n_satisfied=satisfied,
            probe_failed=probe_failed,
            grace_period=grace_period,
            reason=(
                f"cycle indeterminate: {cause_text} — "
                f"cannot confirm cycle ({satisfied}/{n_required} confirmed fresh)"
            ),
            unrecognized=unrecognized,
        )

    grace_note = f" ({len(grace_period)} in grace period)" if grace_period else ""
    return CycleCompletion(
        state="complete",
        complete=True,
        cycle_label=cycle_label,
        n_required=n_required,
        n_satisfied=satisfied,
        grace_period=grace_period,
        reason=(
            f"cycle complete: all {n_required} critical artifacts present+valid"
            + grace_note
        ),
    )
@@ -64,7 +64,13 @@ class CheckResult:
64
64
 
65
65
  row_id: str
66
66
  cadence: str
67
- status: str # "ok" | "fail" | "not_yet_effective" | "error"
67
+ # "ok" | "fail" | "degraded" | "not_yet_effective" | "error"
68
+ # "degraded" = non-fatal: either a diagnostic row (non_fatal: true, e.g.
69
+ # pipeline_execution success_rate — observability, not a gate) or a present
70
+ # artifact carrying a benign producer status (non_fatal_statuses, e.g.
71
+ # no_recent_sf_run = no upstream data this cycle, not a missing diagnostic).
72
+ # Degraded does NOT count as a failure: no SNS alert, exit 0, CW value 1.0.
73
+ status: str
68
74
  detail: str
69
75
  effective_date: str
70
76
  artifact: str | None = None
@@ -157,13 +163,16 @@ def _check_row(
157
163
 
158
164
  sub: list[str] = []
159
165
  artifact_hint: str | None = None
166
+ degraded_detail: str | None = None
160
167
  for src in row["sources"]:
161
168
  try:
162
- ok, detail, artifact = _check_source(
169
+ ok, detail, artifact, status_hint = _check_source(
163
170
  src, today, s3_client, cloudwatch_client
164
171
  )
165
172
  except Exception as exc: # pragma: no cover — defensive
166
- ok, detail, artifact = False, f"checker error: {exc!r}", None
173
+ ok, detail, artifact, status_hint = (
174
+ False, f"checker error: {exc!r}", None, None
175
+ )
167
176
  if artifact and artifact_hint is None:
168
177
  artifact_hint = artifact
169
178
  if ok:
@@ -175,8 +184,28 @@ def _check_row(
175
184
  effective_date=str(eff),
176
185
  artifact=artifact_hint,
177
186
  )
187
+ if status_hint == "degraded" and degraded_detail is None:
188
+ degraded_detail = detail
178
189
  sub.append(detail)
179
190
 
191
+ # All sources failed. Classify non-fatal degradation vs hard fail:
192
+ # - row-level ``non_fatal: true`` → diagnostic/observability row demoted
193
+ # from a gate (Phase 1c: pipeline_execution success_rate).
194
+ # - any source signalled "degraded" → present artifact carrying a benign
195
+ # producer status (Phase 1a: e.g. no_recent_sf_run = no upstream data
196
+ # this cycle, not a missing diagnostic).
197
+ # Either way the cycle isn't "broken" — surface it without failing the gate.
198
+ if row.get("non_fatal") or degraded_detail is not None:
199
+ return CheckResult(
200
+ row_id=row["id"],
201
+ cadence=row["cadence"],
202
+ status="degraded",
203
+ detail=degraded_detail or "; ".join(sub),
204
+ effective_date=str(eff),
205
+ artifact=artifact_hint,
206
+ sub_failures=sub,
207
+ )
208
+
180
209
  return CheckResult(
181
210
  row_id=row["id"],
182
211
  cadence=row["cadence"],
@@ -198,12 +227,22 @@ def _check_source(
198
227
  today: date,
199
228
  s3_client: Any,
200
229
  cloudwatch_client: Any,
201
- ) -> tuple[bool, str, str | None]:
230
+ ) -> tuple[bool, str, str | None, str | None]:
231
+ """Run a source handler, normalized to ``(ok, detail, artifact, status_hint)``.
232
+
233
+ Handlers may return a 3-tuple (the common case) or a 4-tuple whose 4th
234
+ element is a ``status_hint`` ("degraded") used to mark a non-fatal
235
+ non-pass. Normalizing here keeps handlers that don't care unchanged.
236
+ """
202
237
  kind = src["kind"]
203
238
  handler = _SOURCE_HANDLERS.get(kind)
204
239
  if handler is None:
205
- return False, f"unsupported source kind: {kind}", None
206
- return handler(src, today, s3_client, cloudwatch_client)
240
+ return False, f"unsupported source kind: {kind}", None, None
241
+ result = handler(src, today, s3_client, cloudwatch_client)
242
+ if len(result) == 4:
243
+ return result
244
+ ok, detail, artifact = result
245
+ return ok, detail, artifact, None
207
246
 
208
247
 
209
248
  def _resolve_key(src: dict, today: date) -> tuple[str, str]:
@@ -293,7 +332,7 @@ def _resolve_and_age(
293
332
 
294
333
  def _check_s3_json(
295
334
  src: dict, today: date, s3_client: Any, _cw: Any
296
- ) -> tuple[bool, str, str | None]:
335
+ ) -> tuple[bool, str, str | None] | tuple[bool, str, str | None, str | None]:
297
336
  bucket = src.get("bucket", DEFAULT_BUCKET)
298
337
  key, age, status = _resolve_and_age(src, today, s3_client)
299
338
  if key is None:
@@ -322,6 +361,22 @@ def _check_s3_json(
322
361
  except Exception as exc:
323
362
  return False, f"json parse error on s3://{bucket}/{key}: {exc!r}", key
324
363
 
364
+ # Phase 1a: a present artifact carrying a benign producer status is a
365
+ # legitimate cycle state (no upstream data), NOT a missing diagnostic and
366
+ # NOT a hard failure. Short-circuit BEFORE evaluating asserts so we don't
367
+ # report a misleading "coverage 0% < 99". Always-emit (producer side) is
368
+ # what makes this distinguishable from absence.
369
+ non_fatal_statuses = src.get("non_fatal_statuses", [])
370
+ prod_status = payload.get("status") if isinstance(payload, dict) else None
371
+ if non_fatal_statuses and prod_status in non_fatal_statuses:
372
+ return (
373
+ False,
374
+ f"degraded: producer status='{prod_status}' — no upstream data "
375
+ f"this cycle (s3://{bucket}/{key})",
376
+ key,
377
+ "degraded",
378
+ )
379
+
325
380
  failures: list[str] = []
326
381
  for required in src.get("assert_keys_present", []):
327
382
  if required not in payload:
@@ -636,8 +691,9 @@ def emit_cloudwatch_metrics(results: list[CheckResult], cloudwatch_client: Any =
636
691
 
637
692
  metric_data = []
638
693
  for r in results:
639
- # 1 = ok or not_yet_effective (counts as healthy), 0 = fail
640
- value = 1.0 if r.status in ("ok", "not_yet_effective") else 0.0
694
+ # 1 = ok / not_yet_effective / degraded (all non-failing), 0 = fail.
695
+ # Degraded is non-fatal so it must not trip the SubstrateRowOK alarm.
696
+ value = 1.0 if r.status in ("ok", "not_yet_effective", "degraded") else 0.0
641
697
  metric_data.append({
642
698
  "MetricName": "SubstrateRowOK",
643
699
  "Dimensions": [{"Name": "RowID", "Value": r.row_id}],
@@ -646,10 +702,12 @@ def emit_cloudwatch_metrics(results: list[CheckResult], cloudwatch_client: Any =
646
702
  })
647
703
  n_ok = sum(1 for r in results if r.status == "ok")
648
704
  n_fail = sum(1 for r in results if r.status == "fail")
705
+ n_degraded = sum(1 for r in results if r.status == "degraded")
649
706
  n_pending = sum(1 for r in results if r.status == "not_yet_effective")
650
707
  metric_data.extend([
651
708
  {"MetricName": "SubstrateChecksOK", "Value": float(n_ok), "Unit": "Count"},
652
709
  {"MetricName": "SubstrateChecksFailed", "Value": float(n_fail), "Unit": "Count"},
710
+ {"MetricName": "SubstrateChecksDegraded", "Value": float(n_degraded), "Unit": "Count"},
653
711
  {"MetricName": "SubstrateChecksPending", "Value": float(n_pending), "Unit": "Count"},
654
712
  ])
655
713
 
@@ -664,15 +722,22 @@ def format_report(results: list[CheckResult]) -> str:
664
722
  lines = ["Substrate Health Report", "=" * 50]
665
723
  n_ok = sum(1 for r in results if r.status == "ok")
666
724
  n_fail = sum(1 for r in results if r.status == "fail")
725
+ n_degraded = sum(1 for r in results if r.status == "degraded")
667
726
  n_pending = sum(1 for r in results if r.status == "not_yet_effective")
668
727
  n_total = len(results)
669
- pct = (100.0 * n_ok / max(1, n_total - n_pending)) if n_total > n_pending else 0.0
728
+ # Gating denominator excludes pending (not yet effective) AND degraded
729
+ # (non-fatal, can't be scored pass/fail this cycle).
730
+ n_gating = n_total - n_pending - n_degraded
731
+ pct = (100.0 * n_ok / n_gating) if n_gating > 0 else 100.0
670
732
  lines.append(
671
- f"OK: {n_ok} Failed: {n_fail} Pending: {n_pending} "
672
- f"({pct:.1f}% of effective rows passing)"
733
+ f"OK: {n_ok} Failed: {n_fail} Degraded: {n_degraded} "
734
+ f"Pending: {n_pending} ({pct:.1f}% of gating rows passing)"
673
735
  )
674
736
  lines.append("")
675
- icon = {"ok": "OK ", "fail": "FAIL", "not_yet_effective": "PEND", "error": "ERR "}
737
+ icon = {
738
+ "ok": "OK ", "fail": "FAIL", "degraded": "DEGR",
739
+ "not_yet_effective": "PEND", "error": "ERR ",
740
+ }
676
741
  for r in results:
677
742
  lines.append(f" [{icon.get(r.status, '?')}] {r.row_id:30s} {r.detail}")
678
743
  failures = [r for r in results if r.status == "fail"]
@@ -681,6 +746,12 @@ def format_report(results: list[CheckResult]) -> str:
681
746
  lines.append("ACTIONS NEEDED:")
682
747
  for r in failures:
683
748
  lines.append(f" - {r.row_id}: {r.detail}")
749
+ degraded = [r for r in results if r.status == "degraded"]
750
+ if degraded:
751
+ lines.append("")
752
+ lines.append("DEGRADED (non-fatal — observability, no action gate):")
753
+ for r in degraded:
754
+ lines.append(f" - {r.row_id}: {r.detail}")
684
755
  return "\n".join(lines)
685
756
 
686
757
 
@@ -24,9 +24,23 @@ inventory:
24
24
  - id: pipeline_execution
25
25
  cadence: weekly
26
26
  effective_date: 2026-04-01
27
+ # non_fatal (Phase 1c, 2026-05-29): raw CloudWatch SF success_rate is a
28
+ # FIRST-PASS / OPERATOR-TOIL diagnostic, NOT a health gate. On a young,
29
+ # recovery-stitched system a cycle that needs N executions (1 scheduled
30
+ # fail + recovery reruns) to produce a complete, valid artifact set is a
31
+ # healthy cycle with an efficiency cost — counting every manual recovery /
32
+ # smoke / postfix run against a 99% SLA produces false alarms (Sat 40% /
33
+ # weekday 71% on 2026-05-29, both already-recovered incidents). Cycle
34
+ # health is judged by the per-artifact deliverable rows below (the
35
+ # asset-based / data-contract gate); this row stays for visibility but a
36
+ # low success_rate degrades, it does not fail. See
37
+ # private-docs/artifact-completion-monitoring-design-260529.md (Move 1) +
38
+ # feedback_judge_recovery_sf_by_artifact_union_not_execution_status.
39
+ non_fatal: true
27
40
  description: >-
28
- SF success rate ≥ 99% across Saturday + weekday + EOD pipelines,
29
- with per-stage durations recorded.
41
+ First-pass SF success rate across Saturday + weekday + EOD pipelines
42
+ (operator-toil diagnostic, NOT a gate — recovery-stitched cycles are
43
+ healthy; deliverable rows below are authoritative).
30
44
  sources:
31
45
  - kind: cloudwatch
32
46
  namespace: AWS/States
@@ -64,6 +78,17 @@ inventory:
64
78
  # The most recent date's artifact is the authoritative one.
65
79
  key_pattern: backtest/{date}/decision_capture_coverage.json
66
80
  max_age_days: 8
81
+ # non_fatal_statuses (Phase 1a, 2026-05-29): the backtester always
82
+ # emits this artifact now (alpha-engine-backtester #265), so absence
83
+ # means the diagnostic never ran (a real failure), while a present
84
+ # artifact with status=no_recent_sf_run means the upstream Saturday SF
85
+ # produced no agent captures this cycle (research failed / recovery
86
+ # was predictor-only) — a legitimate recovery-stitched state, not a
87
+ # coverage regression. Degrade rather than fail; a genuinely dead
88
+ # research pipeline is caught independently by the research_signals
89
+ # freshness row. See artifact-completion-monitoring-design-260529.md.
90
+ non_fatal_statuses:
91
+ - no_recent_sf_run
67
92
  assert:
68
93
  - path: coverage_pct
69
94
  op: gte
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alpha-engine-lib
3
- Version: 0.42.0
3
+ Version: 0.44.0
4
4
  Summary: Shared utilities for the Alpha Engine modules: preflight, logging, ArcticDB, dates, decision capture, cost telemetry, Anthropic payload chokepoint, artifact freshness, RAG, agent schemas, SSM secrets, Telegram + SNS alerts, EC2 spot resilience, SSM log-capture, SSM dispatcher, Step-Functions execution-state projection, and S3-conditional-PUT writer locks. Full surface documented in README.
5
5
  Author: Brian McMahon
6
6
  License: Proprietary
@@ -41,7 +41,9 @@ from alpha_engine_lib.artifact_freshness import (
41
41
  ArtifactSpec,
42
42
  CADENCE_SYMBOLS,
43
43
  CheckResult,
44
+ CycleCompletion,
44
45
  check_freshness,
46
+ cycle_completion,
45
47
  resolve_current_cycle,
46
48
  resolve_dedup_key,
47
49
  )
@@ -510,3 +512,114 @@ def test_cadence_symbols_match_documented_set():
510
512
  assert CADENCE_SYMBOLS == frozenset(
511
513
  {"saturday_sf", "weekday_sf", "eod_sf", "continuous"}
512
514
  )
515
+
516
+
517
+ # ── Per-cycle completion rollup (Phase 1b) ──────────────────────────────────
518
+
519
+
520
def _critical(artifact_id: str) -> ArtifactSpec:
    """Return a test spec for ``artifact_id`` with ``severity="critical"``."""
    overrides = {"artifact_id": artifact_id, "severity": "critical"}
    return _spec(**overrides)
522
+
523
+
524
def _warning(artifact_id: str) -> ArtifactSpec:
    """Return a test spec for ``artifact_id`` with ``severity="warning"``."""
    overrides = {"artifact_id": artifact_id, "severity": "warning"}
    return _spec(**overrides)
526
+
527
+
528
def _res(state: str) -> CheckResult:
    """Return a ``CheckResult`` in ``state`` with a placeholder reason."""
    placeholder = f"test {state}"
    return CheckResult(state=state, reason=placeholder)
530
+
531
+
532
class TestCycleCompletion:
    """Verdict rollup of ``(spec, result)`` pairs through ``cycle_completion``."""

    def test_all_critical_fresh_is_complete(self):
        pairs = [(_critical(name), _res("fresh")) for name in ("a", "b", "c")]
        verdict = cycle_completion(pairs, cycle_label="2026-W22")
        assert isinstance(verdict, CycleCompletion)
        assert verdict.state == "complete"
        assert verdict.complete is True
        assert verdict.n_required == 3
        assert verdict.n_satisfied == 3
        assert verdict.cycle_label == "2026-W22"

    def test_one_missing_is_incomplete(self):
        pairs = [
            (_critical("a"), _res("fresh")),
            (_critical("b"), _res("missing")),
        ]
        verdict = cycle_completion(pairs)
        assert verdict.state == "incomplete"
        assert verdict.complete is False
        assert verdict.missing == ["b"]
        assert verdict.n_satisfied == 1

    def test_one_stale_is_incomplete(self):
        pairs = [
            (_critical("a"), _res("fresh")),
            (_critical("b"), _res("stale")),
        ]
        verdict = cycle_completion(pairs)
        assert verdict.state == "incomplete"
        assert verdict.stale == ["b"]

    def test_probe_failed_only_is_indeterminate(self):
        pairs = [
            (_critical("a"), _res("fresh")),
            (_critical("b"), _res("probe_failed")),
        ]
        verdict = cycle_completion(pairs)
        assert verdict.state == "indeterminate"
        assert verdict.complete is False
        assert verdict.probe_failed == ["b"]

    def test_real_gap_outranks_probe_failure(self):
        """A confirmed miss is more actionable than an unconfirmable probe."""
        pairs = [
            (_critical("a"), _res("missing")),
            (_critical("b"), _res("probe_failed")),
        ]
        verdict = cycle_completion(pairs)
        assert verdict.state == "incomplete"
        assert verdict.missing == ["a"]
        # The failed probe is still localized; it just doesn't set the verdict.
        assert verdict.probe_failed == ["b"]

    def test_grace_period_counts_as_satisfied(self):
        pairs = [
            (_critical("a"), _res("fresh")),
            (_critical("b"), _res("grace_period")),
        ]
        verdict = cycle_completion(pairs)
        assert verdict.state == "complete"
        assert verdict.complete is True
        assert verdict.n_satisfied == 2
        assert verdict.grace_period == ["b"]

    def test_warning_severity_excluded_from_required_set(self):
        """A missing WARNING artifact must not fail the cycle — only
        critical rows gate the completion verdict."""
        pairs = [
            (_critical("a"), _res("fresh")),
            (_warning("b"), _res("missing")),
        ]
        verdict = cycle_completion(pairs)
        assert verdict.state == "complete"
        assert verdict.n_required == 1
        assert verdict.missing == []

    def test_empty_required_set_is_vacuously_complete(self):
        only_warning = [(_warning("a"), _res("missing"))]
        verdict = cycle_completion(only_warning)
        assert verdict.state == "complete"
        assert verdict.complete is True
        assert verdict.n_required == 0

    def test_mixed_states_incomplete_localizes_all_gaps(self):
        states_by_id = [
            ("a", "fresh"),
            ("b", "grace_period"),
            ("c", "missing"),
            ("d", "stale"),
            ("e", "probe_failed"),
        ]
        pairs = [(_critical(aid), _res(state)) for aid, state in states_by_id]
        verdict = cycle_completion(pairs)
        assert verdict.state == "incomplete"
        assert verdict.n_required == 5
        assert verdict.n_satisfied == 2  # fresh + grace_period
        assert verdict.missing == ["c"]
        assert verdict.stale == ["d"]
        assert verdict.probe_failed == ["e"]
        assert verdict.grace_period == ["b"]
@@ -955,3 +955,112 @@ def test_format_report_lists_actions_for_failed_rows():
955
955
  out = format_report(results)
956
956
  assert "ACTIONS NEEDED" in out
957
957
  assert "b: missing column" in out
958
+
959
+
960
+ # ---------------------------------------------------------------------------
961
+ # Phase 1 — degraded (non-fatal) status: non_fatal_statuses + non_fatal row
962
+ # ---------------------------------------------------------------------------
963
+
964
+
965
+ def _s3_json_row(extra_src=None):
966
+ src = {
967
+ "kind": "s3_json",
968
+ "bucket": "b",
969
+ "key": "k.json",
970
+ "max_age_days": 4,
971
+ "assert": [{"path": "coverage_pct", "op": "gte", "value": 99}],
972
+ }
973
+ if extra_src:
974
+ src.update(extra_src)
975
+ return {
976
+ "version": 1,
977
+ "inventory": [{
978
+ "id": "agent_decisions", "cadence": "daily",
979
+ "effective_date": "2026-01-01", "description": "x",
980
+ "sources": [src],
981
+ }],
982
+ }
983
+
984
+
985
+ def test_s3_json_non_fatal_status_degrades_not_fails():
986
+ """A present artifact carrying a benign producer status (no_recent_sf_run)
987
+ degrades — it is NOT a coverage failure and NOT a missing diagnostic."""
988
+ inv = _s3_json_row({"non_fatal_statuses": ["no_recent_sf_run"]})
989
+ s3 = StubS3()
990
+ s3.put("b", "k.json", json.dumps(
991
+ {"status": "no_recent_sf_run", "coverage_pct": 0.0}).encode())
992
+ res = check_inventory("daily", today=date(2026, 6, 1), inventory=inv, s3_client=s3)
993
+ assert res[0].status == "degraded"
994
+ assert "no_recent_sf_run" in res[0].detail
995
+
996
+
997
+ def test_s3_json_non_fatal_status_ok_passes_normally():
998
+ """status=ok + coverage passing → ok (non_fatal_statuses doesn't interfere)."""
999
+ inv = _s3_json_row({"non_fatal_statuses": ["no_recent_sf_run"]})
1000
+ s3 = StubS3()
1001
+ s3.put("b", "k.json", json.dumps(
1002
+ {"status": "ok", "coverage_pct": 100.0}).encode())
1003
+ res = check_inventory("daily", today=date(2026, 6, 1), inventory=inv, s3_client=s3)
1004
+ assert res[0].status == "ok"
1005
+
1006
+
1007
+ def test_s3_json_status_not_in_non_fatal_set_still_fails():
1008
+ """A failing status NOT in non_fatal_statuses (here: low coverage) still
1009
+ fails — only the listed benign statuses degrade."""
1010
+ inv = _s3_json_row({"non_fatal_statuses": ["no_recent_sf_run"]})
1011
+ s3 = StubS3()
1012
+ s3.put("b", "k.json", json.dumps(
1013
+ {"status": "ok", "coverage_pct": 50.0}).encode())
1014
+ res = check_inventory("daily", today=date(2026, 6, 1), inventory=inv, s3_client=s3)
1015
+ assert res[0].status == "fail"
1016
+
1017
+
1018
+ def test_missing_artifact_still_fails_even_with_non_fatal_statuses():
1019
+ """Absence is a real failure: always-emit (producer) means a missing
1020
+ object = diagnostic never ran, distinct from a benign present status."""
1021
+ inv = _s3_json_row({"non_fatal_statuses": ["no_recent_sf_run"]})
1022
+ s3 = StubS3() # nothing put
1023
+ res = check_inventory("daily", today=date(2026, 6, 1), inventory=inv, s3_client=s3)
1024
+ assert res[0].status == "fail"
1025
+
1026
+
1027
+ def test_row_non_fatal_degrades_on_source_failure():
1028
+ """A row marked non_fatal: true degrades instead of failing when its
1029
+ source fails (Phase 1c: pipeline_execution success_rate is a diagnostic)."""
1030
+ inv = _s3_json_row()
1031
+ inv["inventory"][0]["non_fatal"] = True
1032
+ inv["inventory"][0]["id"] = "pipeline_execution"
1033
+ s3 = StubS3()
1034
+ s3.put("b", "k.json", json.dumps({"coverage_pct": 10.0}).encode())
1035
+ res = check_inventory("daily", today=date(2026, 6, 1), inventory=inv, s3_client=s3)
1036
+ assert res[0].status == "degraded"
1037
+
1038
+
1039
+ def test_format_report_separates_degraded_from_actions():
1040
+ from alpha_engine_lib.transparency import CheckResult, format_report
1041
+ results = [
1042
+ CheckResult("ok_row", "daily", "ok", "fine", "2026-01-01"),
1043
+ CheckResult("bad_row", "daily", "fail", "broke", "2026-01-01"),
1044
+ CheckResult("degr_row", "daily", "degraded", "no upstream data", "2026-01-01"),
1045
+ ]
1046
+ report = format_report(results)
1047
+ assert "Degraded: 1" in report
1048
+ assert "[DEGR] degr_row" in report
1049
+ # degraded must NOT appear under ACTIONS NEEDED
1050
+ actions = report.split("ACTIONS NEEDED:")[1].split("DEGRADED")[0]
1051
+ assert "degr_row" not in actions
1052
+ assert "bad_row" in actions
1053
+ assert "DEGRADED (non-fatal" in report
1054
+
1055
+
1056
+ def test_real_inventory_pipeline_execution_is_non_fatal():
1057
+ inv = load_inventory()
1058
+ row = next(r for r in inv["inventory"] if r["id"] == "pipeline_execution")
1059
+ assert row.get("non_fatal") is True
1060
+
1061
+
1062
+ def test_real_inventory_agent_decisions_degrades_on_no_recent_sf_run():
1063
+ inv = load_inventory()
1064
+ row = next(r for r in inv["inventory"] if r["id"] == "agent_decisions")
1065
+ src = next(s for s in row["sources"] if s["kind"] == "s3_json")
1066
+ assert "no_recent_sf_run" in src.get("non_fatal_statuses", [])