alpha-engine-lib 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. alpha_engine_lib/__init__.py +3 -0
  2. alpha_engine_lib/agent_schemas.py +663 -0
  3. alpha_engine_lib/alerts.py +576 -0
  4. alpha_engine_lib/arcticdb.py +340 -0
  5. alpha_engine_lib/collector_results.py +69 -0
  6. alpha_engine_lib/cost.py +665 -0
  7. alpha_engine_lib/dates.py +273 -0
  8. alpha_engine_lib/decision_capture.py +462 -0
  9. alpha_engine_lib/ec2_spot.py +363 -0
  10. alpha_engine_lib/email_sender.py +206 -0
  11. alpha_engine_lib/eval_artifacts.py +361 -0
  12. alpha_engine_lib/logging.py +303 -0
  13. alpha_engine_lib/model_pricing.yaml +73 -0
  14. alpha_engine_lib/pillars.py +756 -0
  15. alpha_engine_lib/pipeline_status/__init__.py +70 -0
  16. alpha_engine_lib/pipeline_status/read.py +541 -0
  17. alpha_engine_lib/pipeline_status/registry.py +368 -0
  18. alpha_engine_lib/pipeline_status/templates.py +120 -0
  19. alpha_engine_lib/preflight.py +444 -0
  20. alpha_engine_lib/rag/__init__.py +39 -0
  21. alpha_engine_lib/rag/db.py +96 -0
  22. alpha_engine_lib/rag/embeddings.py +63 -0
  23. alpha_engine_lib/rag/migrations/0001_content_tsv.sql +39 -0
  24. alpha_engine_lib/rag/rerank.py +377 -0
  25. alpha_engine_lib/rag/retrieval.py +465 -0
  26. alpha_engine_lib/rag/schema.sql +65 -0
  27. alpha_engine_lib/reconcile.py +203 -0
  28. alpha_engine_lib/secrets.py +186 -0
  29. alpha_engine_lib/sources/__init__.py +35 -0
  30. alpha_engine_lib/sources/protocols.py +227 -0
  31. alpha_engine_lib/ssm_log_capture.py +274 -0
  32. alpha_engine_lib/telegram.py +165 -0
  33. alpha_engine_lib/trading_calendar.py +236 -0
  34. alpha_engine_lib/transparency.py +746 -0
  35. alpha_engine_lib/transparency_inventory.yaml +260 -0
  36. alpha_engine_lib/universe.py +83 -0
  37. alpha_engine_lib-0.32.0.dist-info/METADATA +217 -0
  38. alpha_engine_lib-0.32.0.dist-info/RECORD +40 -0
  39. alpha_engine_lib-0.32.0.dist-info/WHEEL +5 -0
  40. alpha_engine_lib-0.32.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,3 @@
1
"""alpha-engine-lib — shared utilities for Alpha Engine modules."""

# Version string exposed as ``alpha_engine_lib.__version__``.
__version__ = "0.32.0"
@@ -0,0 +1,663 @@
1
+ """LLM-output Pydantic schemas — shared contract surface for agents
2
+ across alpha-engine-research, replay tooling in alpha-engine-backtester,
3
+ and any future consumer that needs to validate agent output against the
4
+ canonical shape.
5
+
6
+ Why these live in the shared lib (not in alpha-engine-research):
7
+
8
+ Replay harness invocation isomorphism — the replay path in
9
+ alpha-engine-backtester needs to call ``with_structured_output(Schema)``
10
+ using the EXACT same Pydantic schema the production agent used.
11
+ Without a shared lib, backtester would either need a heavy cross-repo
12
+ dependency on research (pulls langgraph + every research dep) or have
13
+ to vendor a local copy that drifts. This submodule is the canonical
14
+ home — research re-exports from here so existing call sites keep
15
+ working unchanged.
16
+
17
+ What's here:
18
+
19
+ - 14 ``LLMOutput`` and supporting schemas used in
20
+ ``langchain_anthropic.ChatAnthropic.with_structured_output(...)``
21
+ calls across the research pipeline.
22
+ - The literal types they reference (``RegimeLiteral``,
23
+ ``CIORawDecisionLiteral``, ``StanceLiteral``, ``CIORuleTagLiteral``).
24
+
25
+ What's NOT here (intentionally):
26
+
27
+ - State-machine objects (``SectorTeamOutput``, ``MacroEconomistOutput``,
28
+ ``CIOOutput``, ``InvestmentThesis``) — these are research-internal
29
+ state types, not LLM-output contracts. They live in
30
+ ``alpha-engine-research/graph/state_schemas.py`` and stay there.
31
+ - Domain types coupled to research's tool layer (``ToolCall``,
32
+ ``ExitEvent``, ``PopulationRotationEvent``).
33
+
34
+ Schema-validation discipline: every LLM-output class here has
35
+ ``model_config = ConfigDict(extra="allow")`` because LLM outputs may
36
+ include additional fields (forward-compatible drift). Validators that
37
+ defend against observed LLM failure modes (e.g. ``selected_decisions``
38
+ returned as a JSON-encoded string) move with the class.
39
+ """
40
+
41
+ from __future__ import annotations
42
+
43
+ import json
44
+ import logging
45
+ from typing import Literal
46
+
47
+ from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
48
+
49
+
50
+ # ── Literals ─────────────────────────────────────────────────────────────
51
+
52
+
53
# Closed set of regime labels. Used in this module as the type of
# ``MacroEconomistRawOutput.market_regime`` and
# ``MacroCriticOutput.suggested_regime``; any other string fails validation.
RegimeLiteral = Literal["bull", "neutral", "bear", "caution"]
"""Macro market regime — output of the macro_economist agent and the
macro critic. Drives sector_modifiers downstream and the executor's
graduated drawdown gate."""
57
+
58
+
59
# Type of ``CIORawDecision.decision``: the only three strings that validate.
CIORawDecisionLiteral = Literal["ADVANCE", "REJECT", "NO_ADVANCE_DEADLOCK"]
"""CIO emits the literal ``NO_ADVANCE_DEADLOCK`` for low-conviction
picks that don't clear the floor; post-processing in the research
layer's ``_parse_cio_response`` may synthesize ``ADVANCE_FORCED`` to
fill below-floor open slots, but that synthesis happens AFTER the LLM
extraction, so the raw schema only enumerates the three values the LLM
is allowed to emit."""
66
+
67
+
68
# NOTE(review): ``StanceLiteral`` repeats these four names by hand rather than
# being derived from this tuple, so the two must be edited together and kept in
# the same order.
STANCE_NAMES: tuple[str, ...] = ("momentum", "value", "quality", "catalyst")
"""Canonical ordering of stance names. Source of truth for both the
``StanceLiteral`` type and ``StanceLoadings`` field iteration order.
Pinning the tuple lets tests assert equality rather than set-equality,
surfacing unintended reordering. Iteration is also the tie-break order
for ``StanceLoadings.argmax()``."""
74
+
75
+
76
# Discrete stance label; return type of ``StanceLoadings.argmax()``.
# Lists the same four names, in the same order, as ``STANCE_NAMES``.
StanceLiteral = Literal["momentum", "value", "quality", "catalyst"]
"""Per-pick investment stance — the shared vocabulary across the
stance-taxonomy arc. Routes downstream executor gating:

- ``momentum`` — trend-following; ticker has strong recent price
  action (20d ret > 0, MA50 > MA200, RSI 40-70). Executor applies the
  standard momentum_veto (block if 20d < -X%).
- ``value`` — contrarian; quality business at discounted price
  after sell-off. Executor inverts momentum_veto (requires drawdown to
  qualify) and applies smaller sizing (0.7×) + wider ATR stops (3× vs
  2×). 30d time-bounded — exit if no bounce.
- ``quality`` — defensive; stable earnings, hold-through-cycle.
  Executor relaxes momentum_veto threshold (-15% vs -5%), applies 0.8×
  sizing, disables time decay (longer hold), tighter sector cap.
- ``catalyst`` — event-driven; specific upcoming catalyst (earnings
  beat, FDA approval, M&A) drives thesis. Executor skips momentum_veto
  entirely but requires ``catalyst_date`` (within 30 days) AND applies
  0.6× sizing (event-driven = higher variance) + hard exit on
  catalyst_date+3d if no follow-through.

Origin: 2026-05-11 stance taxonomy arc (private plan at
``alpha-engine-docs/private/stance-taxonomy-arc-260511.md``).

**Stance is DERIVED downstream of agents, not declared by them.**
The sector-team agents (quant + qual + peer review) focus on alpha
generation; a heuristic stance classifier in ``alpha-engine-predictor``
reads per-ticker features (momentum_20d, vol, fundamental ratios,
upcoming earnings) + FMP catalyst calendar and emits the stance label
on ``predictions.json``. The executor consumes ``pred_data["stance"]``
when applying stance-conditional gating. Rationale: factor models at
AQR / BlackRock / Barra derive loadings from data rather than asking
analysts to self-tag; that's the institutional pattern. Adding a 5th
declaration task to the agents would also degrade focus on their
core alpha-generation work.

Closed set of 4 chosen deliberately — small enough for decisive
classification, large enough to cover real strategies. There is no
"mixed" / "other" option on the discrete label because picks ARE
naturally mixed — the ``StanceLoadings`` continuous emission below
captures the mixed exposure faithfully; ``StanceLiteral`` is the
``argmax(loadings)`` convenience label for simple consumers.
"""
118
+
119
+
120
class StanceLoadings(BaseModel):
    """Continuous per-stance loadings — institutional factor-model
    pattern. Each field is in ``[0, 1]``; the four fields sum to ``1.0``.

    Most picks have mixed factor exposure (e.g., 0.65 momentum +
    0.20 quality + 0.10 value + 0.05 catalyst). Discrete labels force
    an artificial single-choice when reality is mixed; this model
    captures the mix faithfully.

    Producer: the heuristic stance classifier in alpha-engine-predictor
    (``model/stance_classifier.py``). Smooth functions over per-ticker
    features produce raw scores; ``softmax`` normalizes to a proper
    probability distribution.

    Consumers:
    - **Simple consumers** (executor v1, dashboards): use
      ``StanceLiteral`` (``argmax(loadings)``) instead and route to
      one gate. No need to read this model.
    - **Nuanced consumers** (backtester per-loading attribution,
      future weighted-gate executor v2, future ML stance classifier):
      read this model and weight gates / sizing / attribution by
      each loading.

    The discrete-vs-continuous split lets us ship the simple
    consumer path now (v1 routes by argmax) while leaving the
    institutional-grade continuous data on predictions.json for
    later sophistication (no future schema migration required).
    """

    # Unlike the LLM-output schemas in this module, unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    momentum: float = Field(ge=0.0, le=1.0, description="Trending-up factor loading")
    value: float = Field(ge=0.0, le=1.0, description="Oversold-but-defensible factor loading")
    quality: float = Field(ge=0.0, le=1.0, description="Low-vol / defensive factor loading")
    catalyst: float = Field(ge=0.0, le=1.0, description="Event-driven factor loading")

    @model_validator(mode="after")
    def _check_sum_to_one(self):
        """Reject loadings whose total is not strictly inside (0.999, 1.001).

        The window absorbs float round-off from the producer's softmax and
        its 6-decimal rounding on serialization.

        Raises:
            ValueError: when the four loadings do not sum to 1.0 within
                the tolerance.
        """
        # Plain left-to-right addition; the accepted window is open on both ends.
        loading_total = self.momentum + self.value + self.quality + self.catalyst
        if 0.999 < loading_total < 1.001:
            return self
        raise ValueError(
            f"stance_loadings must sum to 1.0 (±1e-3); got {loading_total:.6f}"
        )

    def argmax(self) -> StanceLiteral:
        """Return the name of the stance with the largest loading.

        Candidates are scanned in ``STANCE_NAMES`` order and only a strictly
        larger loading displaces the current leader, so ties resolve to the
        earliest name in that canonical order.
        """
        leader = STANCE_NAMES[0]
        leader_weight = getattr(self, leader)
        for candidate in STANCE_NAMES[1:]:
            candidate_weight = getattr(self, candidate)
            if candidate_weight > leader_weight:
                leader, leader_weight = candidate, candidate_weight
        return leader  # type: ignore[return-value]
179
+
180
+
181
# Element type of ``CIORawDecision.rule_tags``; nine allowed tag strings.
# NOTE(review): the text below dates untagged legacy artifacts to "before the
# v0.7.0 prompt update", while the ``rule_tags`` field description on
# ``CIORawDecision`` says "prompts < v1.3.0" — confirm which version is right.
# NOTE(review): the text below also says "list[str] | None"; the field is
# declared as ``list[CIORuleTagLiteral] | None``.
CIORuleTagLiteral = Literal[
    "qual_veto",
    "quant_veto",
    "dual_score_floor",
    "rr_asymmetry",
    "macro_alignment",
    "portfolio_fit",
    "catalyst_specificity",
    "prior_continuity",
    "other",
]
"""Per-decision attribution tag identifying which rule(s) drove the
CIO's verdict. Vocabulary mirrors the prompt's EVALUATION CRITERIA
(items 1-5) plus two implicit veto gates and a continuity tag:

- ``qual_veto`` — qual_score < 50 trip
- ``quant_veto`` — quant_score < 50 trip
- ``dual_score_floor`` — both quant + qual < 60 with no compensating R/R
- ``rr_asymmetry`` — R/R-ratio framing as primary justification
- ``macro_alignment`` — sector under/overweight as primary factor
- ``portfolio_fit`` — diversification, concentration, or already-held
- ``catalyst_specificity`` — time-bound, named catalyst as primary factor
- ``prior_continuity`` — prior IC continuity (rolled-over advance)
- ``other`` — escape hatch for non-fitting reasoning

Multiple tags per decision are allowed (a REJECT can be both
``qual_veto`` AND ``macro_alignment``). Optional list[str] | None on
the schema — None means the LLM didn't emit tags (legacy artifacts
from before the v0.7.0 prompt update). Backtester analysis (per-tag
precision over time) reads this field to surface which gates are
systematically over- or under-rejecting."""
212
+
213
+
214
+ # ── Quant analyst (sector_quant) ─────────────────────────────────────────
215
+
216
+
217
class QuantPick(BaseModel):
    """One ranked candidate from the quant analyst's ReAct loop."""

    # extra="allow": keys beyond the declared fields are kept, not rejected.
    model_config = ConfigDict(extra="allow")

    # ``ticker`` and ``quant_score`` are required; the rest default to empty.
    ticker: str
    quant_score: float = Field(ge=0, le=100)  # inclusive 0-100 bound
    rationale: str = ""
    key_metrics: dict = Field(default_factory=dict)  # untyped dict; contents not validated here
226
+
227
+
228
class QuantAnalystOutput(BaseModel):
    """Wrapper for the quant ReAct agent's structured response.

    LangGraph ``create_react_agent(response_format=...)`` runs an extra
    LLM call after the tool-loop terminates to extract this typed shape
    from the conversation."""

    # extra="allow": keys beyond the declared fields are kept, not rejected.
    model_config = ConfigDict(extra="allow")

    # Defaults to an empty list, so a response without picks still validates.
    ranked_picks: list[QuantPick] = Field(default_factory=list)
238
+
239
+
240
+ # ── Qual analyst (sector_qual) ───────────────────────────────────────────
241
+
242
+
243
class QualAssessment(BaseModel):
    """One per-ticker qualitative assessment from the qual analyst."""

    # extra="allow": keys beyond the declared fields are kept, not rejected.
    model_config = ConfigDict(extra="allow")

    # Only ``ticker`` is required; every other field has an empty/None default.
    ticker: str
    # None is accepted; a supplied value must lie in the inclusive 0-100 range.
    qual_score: float | None = Field(default=None, ge=0, le=100)
    bull_case: str = ""
    bear_case: str = ""
    catalysts: list[str] = Field(default_factory=list)
    risks: list[str] = Field(default_factory=list)
    # Integer-typed (unlike ``qual_score``); None or inclusive 0-100.
    conviction: int | None = Field(default=None, ge=0, le=100)
255
+
256
+
257
class QualAnalystOutput(BaseModel):
    """Wrapper for the qual ReAct agent's structured response.

    ``additional_candidate`` is the qual-side proposal that the peer-review
    quant gate then accepts or rejects (see :class:`QuantAcceptanceVerdict`).
    """

    # extra="allow": keys beyond the declared fields are kept, not rejected.
    model_config = ConfigDict(extra="allow")

    # Both fields are optional: an empty list and no proposal are the defaults.
    assessments: list[QualAssessment] = Field(default_factory=list)
    additional_candidate: QualAssessment | None = None
268
+
269
+
270
+ # ── Peer review (sector_peer_review) ─────────────────────────────────────
271
+
272
+
273
class QuantAcceptanceVerdict(BaseModel):
    """Side-LLM call: peer_review's quant analyst rules on whether to
    accept the qual analyst's added candidate."""

    # extra="allow": keys beyond the declared fields are kept, not rejected.
    model_config = ConfigDict(extra="allow")

    accept: bool  # required; there is no default verdict
    reason: str = ""
281
+
282
+
283
class JointFinalizationDecision(BaseModel):
    """One per-ticker decision from peer_review's joint finalization.

    Per-ticker rationale enables LLM-as-judge eval to score the
    synthesis reasoning at decision granularity (rather than one
    rationale string covering all 2-3 picks). Composes with the
    LLM-as-judge workstream (ROADMAP Phase 2 P1).
    """

    # extra="allow": keys beyond the declared fields are kept, not rejected.
    model_config = ConfigDict(extra="allow")

    ticker: str  # required
    # Optional (defaults to ""); the description text is part of the emitted
    # field schema, so treat it as prompt text rather than as a comment.
    rationale: str = Field(
        default="",
        description=(
            "Why this ticker was selected — name R/R reasoning, score "
            "asymmetry, catalyst. 1-2 sentences."
        ),
    )
302
+
303
+
304
class JointSelectionOutput(BaseModel):
    """Side-LLM call: peer_review's two-pass joint finalization, Pass 1.

    Pass 1 emits the ticker-list selection + cross-pick context only,
    deferring per-ticker rationale generation to Pass 2 (which fans out
    to one bounded ``JointFinalizationDecision`` call per selected
    ticker). Two-pass design replaces the prior single-pass
    ``JointFinalizationOutput`` call after 2026-05-03 + 2026-05-06
    truncation incidents where Haiku-emitted rationales blew past
    ``max_tokens_strategic`` mid-emission and the entire selection was
    lost. With the selection separated, Pass 1's output is bounded by
    construction (N tickers × ~10 tokens + a 1-2 sentence team
    rationale = ~200 tokens), eliminating the truncation class for the
    selection step regardless of model verbosity drift.

    The legacy ``JointFinalizationOutput`` schema below stays for
    replay-harness compatibility against historical artifacts.
    """

    # extra="allow": keys beyond the declared fields are kept, not rejected.
    model_config = ConfigDict(extra="allow")

    # Both fields are optional and default to empty; this class has no
    # string-to-list salvage validator (``JointFinalizationOutput`` does), so a
    # JSON-encoded string here fails normal list validation.
    selected_tickers: list[str] = Field(
        default_factory=list,
        description=(
            "Array of selected ticker symbols (e.g. ['NVDA', 'PLTR', "
            "'RKLB']). One entry per pick, structured array (NOT a "
            "JSON-encoded string)."
        ),
    )
    team_rationale: str = Field(
        default="",
        description=(
            "Cross-pick rationale — sector concentration, regime fit "
            "across the slate, asymmetry mix. 1-2 sentences."
        ),
    )
340
+
341
+
342
class JointFinalizationOutput(BaseModel):
    """Side-LLM call: peer_review's joint quant+qual finalization, picks
    the team's 2-3 final recommendations from the merged candidate set.

    Per-ticker rationale lives on each ``selected_decisions`` entry;
    ``team_rationale`` carries cross-pick context (sector concentration,
    regime fit across the slate).

    LEGACY single-pass schema. Production peer_review now uses the
    two-pass flow (``JointSelectionOutput`` + per-ticker
    ``JointFinalizationDecision`` calls). This schema stays for replay
    harness invocation against historical artifacts pre-cutover."""

    model_config = ConfigDict(extra="allow")

    selected_decisions: list[JointFinalizationDecision] = Field(
        default_factory=list,
        description=(
            "Array of per-ticker selection decisions. Return one entry "
            "per selected ticker as a structured array (NOT a single "
            "JSON-encoded string). Each entry must be a JSON object "
            "with `ticker` and `rationale` fields."
        ),
    )
    team_rationale: str = Field(
        default="",
        description=(
            "Cross-pick rationale — sector concentration, regime fit "
            "across the slate, asymmetry mix. 1-2 sentences."
        ),
    )

    @field_validator("selected_decisions", mode="before")
    @classmethod
    def _parse_string_as_list(cls, v):
        """Salvage ``selected_decisions`` when it arrives as a JSON string.

        Guards against an observed Sonnet failure mode (first seen
        2026-05-03 in SF ``eval-pipeline-validation-2``): the model sends
        the array JSON-encoded inside a string even though the tool spec
        declares a list.

        Runs before pydantic's own list validation:

        - a non-string value is passed through untouched;
        - a string that decodes to a JSON list is replaced by that list,
          and a warning is logged so the drift stays observable;
        - any other string is passed through unchanged, so pydantic's
          normal list-type error still fires.

        Parse-and-continue is deliberate: a hard failure here would waste
        the whole (~$5) Research run.
        """
        if not isinstance(v, str):
            return v
        try:
            decoded = json.loads(v)
        except json.JSONDecodeError:
            # Not JSON at all: let the regular type error surface.
            return v
        if not isinstance(decoded, list):
            # Valid JSON of the wrong shape: same treatment.
            return v
        logging.getLogger(__name__).warning(
            "[joint_finalization_schema] LLM returned "
            "selected_decisions as JSON-string of length %d "
            "instead of a structured array; parsed-and-continued "
            "(see schema-vs-LLM drift class).",
            len(v),
        )
        return decoded
408
+
409
+
410
+ # ── Macro economist + critic ─────────────────────────────────────────────
411
+
412
+
413
class MacroEconomistRawOutput(BaseModel):
    """Wrapping schema for ``run_macro_agent`` output. The agent emits
    free-form prose (``report_md``) interleaved with a JSON block
    carrying structured fields; ``with_structured_output`` extracts both."""

    model_config = ConfigDict(extra="allow")

    report_md: str = ""
    market_regime: RegimeLiteral = "neutral"
    sector_modifiers: dict[str, float] = Field(default_factory=dict)
    sector_ratings: dict[str, dict] = Field(default_factory=dict)
    key_theme: str = ""
    material_changes: list[str] = Field(default_factory=list)

    @field_validator("sector_modifiers")
    @classmethod
    def clamp_modifiers(cls, v: dict[str, float]) -> dict[str, float]:
        """Require every sector modifier to lie in the inclusive [0.70, 1.30] band.

        Despite the name, nothing is clamped: the mapping is returned
        unchanged when all values are in band, and the first out-of-band
        value raises. The original note says this mirrors the band that
        research's ``MacroEconomistOutput`` enforces.

        Raises:
            ValueError: naming the first sector whose modifier falls
                outside the band.
        """
        for sector_name, modifier in v.items():
            if 0.70 <= float(modifier) <= 1.30:
                continue
            raise ValueError(
                f"sector_modifiers[{sector_name!r}]={modifier} outside [0.70, 1.30]"
            )
        return v
437
+
438
+
439
class MacroCriticOutput(BaseModel):
    """Reflection-loop critic output for the macro agent.

    The critic accepts or revises the macro_economist's draft. ``revise``
    triggers another macro_economist call; ``accept`` ends the loop.
    """

    # extra="allow": keys beyond the declared fields are kept, not rejected.
    model_config = ConfigDict(extra="allow")

    action: Literal["accept", "revise"]  # required; no default action
    critique: str = ""
    # Optional; when present it must be one of the ``RegimeLiteral`` values.
    suggested_regime: RegimeLiteral | None = None
451
+
452
+
453
+ # ── Held-stock thesis update (sector_team) ───────────────────────────────
454
+
455
+
456
class HeldThesisUpdateLLMOutput(BaseModel):
    """LLM-extraction shape for ``_update_thesis_for_held_stock``.

    Intentionally narrative-only — NO score fields. The held-stock LLM
    update path must NOT overwrite prior_scores; the existing strip-nulls
    merge logic exists today specifically because the LLM occasionally
    emits ``final_score: null``. By omitting score fields from the schema
    entirely, the LLM cannot emit them, and the strip-nulls workaround
    becomes unnecessary.

    Field-level ``description`` strings are propagated by
    ``with_structured_output()`` into the tool-input schema the LLM sees,
    so the per-field length/count guidance previously inlined in the
    prompt body lives here now (audit finding F1, PR B 2026-05-02).
    """

    # NOTE(review): with extra="allow" an undeclared key such as ``final_score``
    # would still be accepted and kept if the LLM sent one; the schema only
    # stops advertising score fields — confirm the merge logic ignores extras.
    model_config = ConfigDict(extra="allow")

    # Every field is optional. The length/count guidance lives only in the
    # description text; no max_length constraint enforces it.
    bull_case: str = Field(default="", description="Bull case narrative — 1-2 sentences, ~200 chars.")
    bear_case: str = Field(default="", description="Bear case narrative — 1-2 sentences, ~200 chars.")
    catalysts: list[str] = Field(default_factory=list, description="Up to 5 catalysts.")
    risks: list[str] = Field(default_factory=list, description="Up to 5 risks.")
    conviction: int | None = Field(default=None, ge=0, le=100, description="Strength of view (0-100). ≥70 high, 40-69 moderate, <40 low.")
    conviction_rationale: str = Field(default="", description="Why this conviction level — ~100 chars.")
    thesis_summary: str = ""
    triggers_response: str = ""
482
+
483
+
484
+ # ── CIO (ic_cio) ─────────────────────────────────────────────────────────
485
+
486
+
487
class CIORawDecision(BaseModel):
    """One CIO decision as emitted by the LLM (pre-post-processing).

    Note ``decision`` (LLM-emitted) vs ``thesis_type`` (post-processed,
    used in research's ``CIODecision``): the LLM never emits ``HOLD``
    directly — ``HOLD`` is what the post-processing maps ``REJECT`` to
    for held tickers in the current population. The two shapes are kept
    separate so each can describe its own contract precisely.
    """

    # extra="allow": keys beyond the declared fields are kept, not rejected.
    model_config = ConfigDict(extra="allow")

    # ``ticker`` and ``decision`` are required; everything else is optional.
    ticker: str
    decision: CIORawDecisionLiteral
    # NOTE(review): the bound is ge=0 although the description calls the rank
    # 1-based, so rank=0 validates — confirm whether ge=1 is intended.
    rank: int | None = Field(default=None, ge=0, description="1-based rank for ADVANCE picks; null for REJECT / NO_ADVANCE_DEADLOCK.")
    conviction: int | None = Field(default=None, ge=0, le=100, description="Strength of view (0-100).")
    rationale: str = Field(default="", description="Why this decision — name R/R reasoning (sub-scores, rr_ratio, catalyst).")
    # Each tag must be a ``CIORuleTagLiteral`` value. The "≥1 tag" rule in the
    # description is not enforced: None and an empty list both validate.
    rule_tags: list[CIORuleTagLiteral] | None = Field(
        default=None,
        description=(
            "Which gating rule(s) drove this decision. ≥1 tag per decision "
            "in v1.3.0+ prompts; multiple tags allowed (a REJECT can be "
            "both qual_veto AND macro_alignment). None on legacy artifacts "
            "from prompts < v1.3.0."
        ),
    )
    # "Required for ADVANCE" is stated only in the description; no validator in
    # this class ties ``entry_thesis`` to ``decision``.
    entry_thesis: HeldThesisUpdateLLMOutput | None = Field(default=None, description="Required for ADVANCE; null for REJECT / NO_ADVANCE_DEADLOCK.")
514
+
515
+
516
class CIORawOutput(BaseModel):
    """Wrapper for the CIO agent's structured response. The list-of-
    decisions shape mirrors what ``_parse_cio_response`` consumes today
    via balanced-brace JSON extraction.

    ``min_length=1`` is propagated to the LLM via the structured-output
    tool schema description AND validated by the SDK parser. Caught
    2026-05-02: PR B's strip of the CIO prompt's inline JSON example
    let Sonnet emit ``decisions: []`` because the structural cue that
    "one entry per candidate" was lost. The prompt fix (config #21,
    explicit OUTPUT REQUIREMENT block) addresses the LLM-side cue;
    this constraint is the schema-side defense — empty list now
    surfaces as a parsing_error at the call boundary rather than as a
    later "empty decisions" raise inside ``run_cio``.
    """

    # ``validate_default=True`` ensures the ``min_length=1`` constraint
    # fires even when ``decisions`` falls back to ``default_factory=list``.
    # Pydantic v2 skips default validation by default; without this the
    # empty-list rejection only triggers when a caller explicitly passes
    # ``decisions=[]`` — defeating the schema-side defense.
    model_config = ConfigDict(extra="allow", validate_default=True)

    # The empty default exists only so an omitted field reports a
    # min_length violation; combined with validate_default above, building
    # this model without at least one decision always fails validation.
    decisions: list[CIORawDecision] = Field(
        default_factory=list,
        min_length=1,
        description="One entry per input candidate. Never empty — every candidate must receive a decision (ADVANCE / REJECT / NO_ADVANCE_DEADLOCK).",
    )
544
+
545
+
546
+ # ── LLM-as-judge eval ────────────────────────────────────────────────────
547
+
548
+
549
class RubricDimensionScore(BaseModel):
    """One dimension's score from the eval judge.

    Score is integer 1-5 per the rubric anchors (see
    eval_rubric_*.txt prompts in alpha-engine-config). The ``reasoning``
    string carries the judge's per-dimension justification — used by
    the dashboard's quality-trend page to surface WHY scores dropped,
    not just THAT they dropped.
    """

    # extra="allow": keys beyond the declared fields are kept, not rejected.
    model_config = ConfigDict(extra="allow")

    # All three fields are required (none declares a default). ``dimension``
    # is a free-form string: it is not checked against any rubric name list.
    dimension: str = Field(description="Rubric dimension name (e.g. 'numerical_grounding', 'signal_calibration').")
    score: int = Field(ge=1, le=5, description="Integer score 1-5 per the rubric anchors.")
    reasoning: str = Field(description="1-2 sentence justification citing specific artifact content that drove the score.")
564
+
565
+
566
class RubricEvalLLMOutput(BaseModel):
    """LLM-extraction shape for the eval judge call.

    The judge LLM (Haiku or Sonnet) produces this against a rubric
    prompt + DecisionArtifact pair. Wrapped in ``RubricEvalArtifact``
    by ``evals.judge.evaluate_artifact`` before persisting to S3.
    """

    # ``validate_default=True`` makes the ``min_length=1`` constraint on
    # ``dimension_scores`` fire even when the field is omitted and falls back
    # to ``default_factory=list``. Pydantic v2 does not validate defaults
    # unless asked, so without it a judge response that left the field out
    # would validate with zero scores. Same defense as ``CIORawOutput``.
    model_config = ConfigDict(extra="allow", validate_default=True)

    dimension_scores: list[RubricDimensionScore] = Field(
        default_factory=list,
        min_length=1,
        description=(
            "Array of per-dimension score entries. Return one entry "
            "per rubric dimension as a structured array (NOT a single "
            "JSON-encoded string). Each entry must be a JSON object "
            "with `dimension`, `score`, and `reasoning` fields. Order "
            "matches the rubric prompt's dimension list."
        ),
    )
    # Required: no default is declared.
    overall_reasoning: str = Field(
        description="1-2 sentence cross-dimension summary — strongest signal + most concerning gap.",
    )

    @field_validator("dimension_scores", mode="before")
    @classmethod
    def _parse_string_as_list(cls, v):
        """Salvage ``dimension_scores`` when it arrives as a JSON string.

        Guards against an observed Haiku failure mode (first surfaced
        2026-05-03 in a judge_only smoke run): the model sends the array
        JSON-encoded inside a string even though the tool spec declares a
        list. Same pattern as the validator on
        ``JointFinalizationOutput.selected_decisions``.

        Runs before pydantic's own list validation:

        - a non-string value (including the empty default) is passed
          through untouched;
        - a string that decodes to a JSON list is replaced by that list,
          and a warning is logged so the drift stays observable;
        - any other string is passed through unchanged, so pydantic's
          normal list-type error still fires.

        Parse-and-continue is deliberate: the alternative is a wasted
        judge call and a missing eval datapoint.
        """
        if isinstance(v, str):
            try:
                parsed = json.loads(v)
            except json.JSONDecodeError:
                # Not JSON: fall through so the regular type error surfaces.
                return v
            if isinstance(parsed, list):
                logging.getLogger(__name__).warning(
                    "[rubric_eval_schema] LLM returned "
                    "dimension_scores as JSON-string of length %d "
                    "instead of a structured array; parsed-and-continued "
                    "(see schema-vs-LLM drift class).",
                    len(v),
                )
                return parsed
        return v
624
+
625
+
626
+ # ── agent_id → SchemaClass dispatch ──────────────────────────────────────
627
+
628
+
629
# Lookup table read by ``resolve_schema_for_agent``; keys are agent_id bases
# without any ``:suffix``.
# NOTE(review): ``sector_peer_review`` resolves to ``JointFinalizationOutput``,
# which its own docstring marks as the LEGACY single-pass schema; the two-pass
# ``JointSelectionOutput`` is not registered here — confirm that replay of
# post-cutover artifacts does not need it.
SCHEMA_BY_AGENT_ID_BASE: dict[str, type[BaseModel]] = {
    "sector_quant": QuantAnalystOutput,
    "sector_qual": QualAnalystOutput,
    "sector_peer_review": JointFinalizationOutput,
    "macro_economist": MacroEconomistRawOutput,
    "ic_cio": CIORawOutput,
    "thesis_update": HeldThesisUpdateLLMOutput,
}
"""Dispatch map for replay tooling and any other consumer that needs
to resolve an agent_id family to its canonical output schema. The
key is the agent_id base (the part before the first colon — e.g.
``sector_quant`` for ``sector_quant:technology``).

``sector_peer_review`` is mapped to JointFinalizationOutput (the
finalization step's output) rather than QuantAcceptanceVerdict (the
intermediate quant gate); the finalization output is what carries
the team's 2-3 final picks downstream and is the meaningful surface
for replay concordance.

QuantAcceptanceVerdict and MacroCriticOutput are intentionally NOT in
this map — they're side-LLM utility calls inside larger agents, not
the agent's final output."""
651
+
652
+
653
def resolve_schema_for_agent(agent_id: str) -> type[BaseModel] | None:
    """Return the registered LLM-output schema class for ``agent_id``.

    Only the family part of the id — everything before the first ``:`` —
    is used for the lookup, so ``"macro_economist"``,
    ``"sector_quant:technology"`` and ``"thesis_update:AAPL"`` are all
    accepted forms. A falsy ``agent_id`` (``None`` or ``""``) is treated
    as the empty family.

    Returns:
        The schema class from ``SCHEMA_BY_AGENT_ID_BASE``, or ``None``
        when the family has no registered schema.
    """
    # partition() yields the whole string as the head when no colon is present.
    family, _sep, _suffix = (agent_id or "").partition(":")
    return SCHEMA_BY_AGENT_ID_BASE.get(family)