forgecraft-mcp 1.2.0 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +525 -525
- package/dist/cli/help.js +44 -44
- package/dist/registry/renderer-skeletons.js +92 -92
- package/dist/shared/gs-score-logger.js +6 -6
- package/dist/tools/add-module.js +123 -123
- package/dist/tools/advice-registry.js +18 -18
- package/dist/tools/check-cascade-report.js +64 -64
- package/dist/tools/configure-mcp.d.ts +3 -0
- package/dist/tools/configure-mcp.d.ts.map +1 -1
- package/dist/tools/configure-mcp.js +10 -0
- package/dist/tools/configure-mcp.js.map +1 -1
- package/dist/tools/forgecraft-dispatch.d.ts.map +1 -1
- package/dist/tools/forgecraft-dispatch.js +3 -0
- package/dist/tools/forgecraft-dispatch.js.map +1 -1
- package/dist/tools/forgecraft-schema-params.d.ts +9 -0
- package/dist/tools/forgecraft-schema-params.d.ts.map +1 -1
- package/dist/tools/forgecraft-schema-params.js +21 -0
- package/dist/tools/forgecraft-schema-params.js.map +1 -1
- package/dist/tools/forgecraft-schema.d.ts +9 -0
- package/dist/tools/forgecraft-schema.d.ts.map +1 -1
- package/dist/tools/refresh-output.js +14 -14
- package/dist/tools/scaffold-spec-stubs.js +115 -115
- package/dist/tools/scaffold-templates.js +62 -62
- package/dist/tools/setup-artifact-writers.d.ts +30 -0
- package/dist/tools/setup-artifact-writers.d.ts.map +1 -1
- package/dist/tools/setup-artifact-writers.js +120 -8
- package/dist/tools/setup-artifact-writers.js.map +1 -1
- package/dist/tools/setup-phase1.d.ts +3 -0
- package/dist/tools/setup-phase1.d.ts.map +1 -1
- package/dist/tools/setup-phase1.js +79 -35
- package/dist/tools/setup-phase1.js.map +1 -1
- package/dist/tools/setup-phase2.d.ts +2 -0
- package/dist/tools/setup-phase2.d.ts.map +1 -1
- package/dist/tools/setup-phase2.js +10 -1
- package/dist/tools/setup-phase2.js.map +1 -1
- package/dist/tools/setup-project.d.ts +18 -0
- package/dist/tools/setup-project.d.ts.map +1 -1
- package/dist/tools/setup-project.js +77 -1
- package/dist/tools/setup-project.js.map +1 -1
- package/dist/tools/spec-parser-tags.d.ts +9 -0
- package/dist/tools/spec-parser-tags.d.ts.map +1 -1
- package/dist/tools/spec-parser-tags.js +92 -0
- package/dist/tools/spec-parser-tags.js.map +1 -1
- package/package.json +89 -86
- package/templates/analytics/instructions.yaml +37 -37
- package/templates/analytics/mcp-servers.yaml +11 -11
- package/templates/analytics/structure.yaml +25 -25
- package/templates/api/instructions.yaml +231 -231
- package/templates/api/mcp-servers.yaml +22 -13
- package/templates/api/nfr.yaml +23 -23
- package/templates/api/review.yaml +103 -103
- package/templates/api/structure.yaml +34 -34
- package/templates/api/verification.yaml +132 -132
- package/templates/cli/instructions.yaml +31 -31
- package/templates/cli/mcp-servers.yaml +11 -11
- package/templates/cli/review.yaml +53 -53
- package/templates/cli/structure.yaml +16 -16
- package/templates/data-lineage/instructions.yaml +28 -28
- package/templates/data-lineage/mcp-servers.yaml +22 -22
- package/templates/data-pipeline/instructions.yaml +84 -84
- package/templates/data-pipeline/mcp-servers.yaml +13 -13
- package/templates/data-pipeline/nfr.yaml +39 -39
- package/templates/data-pipeline/structure.yaml +23 -23
- package/templates/fintech/hooks.yaml +55 -55
- package/templates/fintech/instructions.yaml +112 -112
- package/templates/fintech/mcp-servers.yaml +13 -13
- package/templates/fintech/nfr.yaml +46 -46
- package/templates/fintech/playbook.yaml +210 -210
- package/templates/fintech/verification.yaml +239 -239
- package/templates/game/instructions.yaml +289 -289
- package/templates/game/mcp-servers.yaml +38 -38
- package/templates/game/nfr.yaml +64 -64
- package/templates/game/playbook.yaml +214 -214
- package/templates/game/review.yaml +97 -97
- package/templates/game/structure.yaml +67 -67
- package/templates/game/verification.yaml +174 -174
- package/templates/healthcare/instructions.yaml +42 -42
- package/templates/healthcare/mcp-servers.yaml +13 -13
- package/templates/healthcare/nfr.yaml +47 -47
- package/templates/hipaa/instructions.yaml +41 -41
- package/templates/hipaa/mcp-servers.yaml +13 -13
- package/templates/infra/instructions.yaml +104 -104
- package/templates/infra/mcp-servers.yaml +20 -20
- package/templates/infra/nfr.yaml +46 -46
- package/templates/infra/review.yaml +65 -65
- package/templates/infra/structure.yaml +25 -25
- package/templates/library/instructions.yaml +36 -36
- package/templates/library/mcp-servers.yaml +20 -20
- package/templates/library/review.yaml +56 -56
- package/templates/library/structure.yaml +19 -19
- package/templates/medallion-architecture/instructions.yaml +41 -41
- package/templates/medallion-architecture/mcp-servers.yaml +22 -22
- package/templates/ml/instructions.yaml +85 -85
- package/templates/ml/mcp-servers.yaml +11 -11
- package/templates/ml/nfr.yaml +39 -39
- package/templates/ml/structure.yaml +25 -25
- package/templates/ml/verification.yaml +156 -156
- package/templates/mobile/instructions.yaml +44 -44
- package/templates/mobile/mcp-servers.yaml +11 -11
- package/templates/mobile/nfr.yaml +49 -49
- package/templates/mobile/structure.yaml +27 -27
- package/templates/mobile/verification.yaml +121 -121
- package/templates/observability-xray/instructions.yaml +40 -40
- package/templates/observability-xray/mcp-servers.yaml +15 -15
- package/templates/realtime/instructions.yaml +42 -42
- package/templates/realtime/mcp-servers.yaml +13 -13
- package/templates/soc2/instructions.yaml +41 -41
- package/templates/soc2/mcp-servers.yaml +24 -24
- package/templates/social/instructions.yaml +43 -43
- package/templates/social/mcp-servers.yaml +24 -24
- package/templates/state-machine/instructions.yaml +42 -42
- package/templates/state-machine/mcp-servers.yaml +11 -11
- package/templates/tools-registry.yaml +164 -164
- package/templates/universal/hooks.yaml +531 -531
- package/templates/universal/instructions.yaml +1692 -1692
- package/templates/universal/mcp-servers.yaml +50 -50
- package/templates/universal/nfr.yaml +197 -197
- package/templates/universal/reference.yaml +326 -326
- package/templates/universal/review.yaml +204 -204
- package/templates/universal/skills.yaml +262 -262
- package/templates/universal/structure.yaml +67 -67
- package/templates/universal/verification.yaml +416 -416
- package/templates/web-react/hooks.yaml +44 -44
- package/templates/web-react/instructions.yaml +207 -207
- package/templates/web-react/mcp-servers.yaml +20 -20
- package/templates/web-react/nfr.yaml +27 -27
- package/templates/web-react/review.yaml +94 -94
- package/templates/web-react/structure.yaml +46 -46
- package/templates/web-react/verification.yaml +126 -126
- package/templates/web-static/instructions.yaml +115 -115
- package/templates/web-static/mcp-servers.yaml +20 -20
- package/templates/web3/instructions.yaml +44 -44
- package/templates/web3/mcp-servers.yaml +11 -11
- package/templates/web3/verification.yaml +159 -159
- package/templates/zero-trust/instructions.yaml +41 -41
- package/templates/zero-trust/mcp-servers.yaml +15 -15
|
@@ -1,239 +1,239 @@
|
|
|
1
|
-
tag: FINTECH
|
|
2
|
-
section: verification
|
|
3
|
-
title: "Statistical Simulation + Heuristic Model Verification"
|
|
4
|
-
description: >
|
|
5
|
-
Financial models have two uncertainty dimensions. Stochastic uncertainty: price,
|
|
6
|
-
volume, and risk outputs depend on market regime and path-dependent sequences that
|
|
7
|
-
unit tests cannot cover. Heuristic uncertainty: hyperparameter values (lookback
|
|
8
|
-
windows, threshold multipliers, decay factors) must be found by search, not derived
|
|
9
|
-
analytically. This strategy uses statistically meaningful simulated datasets,
|
|
10
|
-
Monte Carlo scenario analysis, and constrained hyperparameter search with pruning.
|
|
11
|
-
uncertainty_levels:
|
|
12
|
-
- stochastic
|
|
13
|
-
- heuristic
|
|
14
|
-
completeness_ceiling: 0.85
|
|
15
|
-
|
|
16
|
-
phases:
|
|
17
|
-
|
|
18
|
-
- id: contract-definition
|
|
19
|
-
title: "Define Statistical Contracts and Parameter Bounds"
|
|
20
|
-
rationale: >
|
|
21
|
-
A financial model contract is a statistical bound, not an exact output.
|
|
22
|
-
"VaR at 95% confidence ≤ 2.5% of portfolio" is a valid contract.
|
|
23
|
-
"The model returns 0.031" is not. Contracts must hold across market regimes,
|
|
24
|
-
not just on the training period.
|
|
25
|
-
steps:
|
|
26
|
-
- id: define-statistical-invariants
|
|
27
|
-
instruction: >
|
|
28
|
-
For each model output, define statistical contracts:
|
|
29
|
-
- VaR (Value at Risk): max acceptable VaR at 95% and 99% confidence levels
|
|
30
|
-
- CVaR (Conditional VaR / Expected Shortfall): max acceptable CVaR at 95%
|
|
31
|
-
- Sharpe ratio: minimum acceptable Sharpe across rolling 90-day windows
|
|
32
|
-
- Max drawdown: maximum acceptable peak-to-trough drawdown
|
|
33
|
-
- Hit rate: for classification signals, minimum precision and recall
|
|
34
|
-
Store in docs/statistical-contracts.md.
|
|
35
|
-
contract: >
|
|
36
|
-
docs/statistical-contracts.md exists with one row per model output.
|
|
37
|
-
Each row has: metric, confidence level, bound (max or min), measurement window.
|
|
38
|
-
tools: ["filesystem"]
|
|
39
|
-
expected_output: "| VaR | 95% | ≤ 2.5% portfolio | 1-day holding period |"
|
|
40
|
-
pass_criterion: "File exists with ≥1 statistical bound per model output"
|
|
41
|
-
|
|
42
|
-
- id: define-parameter-search-space
|
|
43
|
-
instruction: >
|
|
44
|
-
For each tunable hyperparameter, define:
|
|
45
|
-
- Name (e.g., lookback_window, threshold_multiplier)
|
|
46
|
-
- Type (int, float, categorical)
|
|
47
|
-
- Valid range with hard bounds (e.g., lookback_window ∈ [5, 252] days)
|
|
48
|
-
- Prior (e.g., uniform, log-uniform, normal around domain-expert estimate)
|
|
49
|
-
- Forbidden regions (e.g., lookback_window < 5 yields noise-dominated, unstable estimates)
|
|
50
|
-
Store in docs/parameter-space.md.
|
|
51
|
-
contract: "docs/parameter-space.md exists with bounds and forbidden regions per parameter"
|
|
52
|
-
tools: ["filesystem"]
|
|
53
|
-
expected_output: "| lookback_window | int | [5, 252] | days | uniform | >252 causes data sparsity |"
|
|
54
|
-
pass_criterion: "Every tunable parameter has hard bounds and a forbidden-regions note"
|
|
55
|
-
|
|
56
|
-
- id: build-simulation-dataset
|
|
57
|
-
instruction: >
|
|
58
|
-
Generate or load a statistically meaningful simulation dataset:
|
|
59
|
-
- Minimum 1,000 trading days of price + volume data per instrument
|
|
60
|
-
- Must include at least 3 distinct market regimes: trending, mean-reverting, high-volatility
|
|
61
|
-
- Data must pass stationarity and regime-detection checks
|
|
62
|
-
- Use real historical data or a calibrated GBM/Heston model with documented parameters
|
|
63
|
-
Do NOT use synthetic uncalibrated random data — this produces misleading validation.
|
|
64
|
-
contract: >
|
|
65
|
-
simulation-dataset.parquet or simulation-dataset.csv exists.
|
|
66
|
-
Row count ≥ 1,000 per instrument. Regime labels column present.
|
|
67
|
-
Stationarity check (ADF) passes. Volatility distribution is documented.
|
|
68
|
-
tools: ["pandas", "numpy", "yfinance", "statsmodels.tsa.stattools.adfuller", "hmmlearn"]
|
|
69
|
-
expected_output: "simulation-dataset.csv: date, open, high, low, close, volume, regime_label"
|
|
70
|
-
pass_criterion: "Row count ≥ 1000; ADF p-value < 0.05 on returns series"
|
|
71
|
-
|
|
72
|
-
- id: simulation
|
|
73
|
-
title: "Monte Carlo Scenario Analysis Across Market Regimes"
|
|
74
|
-
rationale: >
|
|
75
|
-
A model that works on one market regime and fails on others is not production-grade.
|
|
76
|
-
Monte Carlo across regimes closes stochastic uncertainty by constructing a confidence
|
|
77
|
-
interval for every statistical contract across the full scenario space.
|
|
78
|
-
steps:
|
|
79
|
-
- id: run-monte-carlo-per-regime
|
|
80
|
-
instruction: >
|
|
81
|
-
For each market regime in the simulation dataset, run 10,000 Monte Carlo
|
|
82
|
-
iterations of the model. Each iteration uses a bootstrapped price path from
|
|
83
|
-
that regime. Record the distribution of each model output (VaR, CVaR, Sharpe, etc.).
|
|
84
|
-
Use multiprocessing — 10,000 iterations must complete in < 60 seconds.
|
|
85
|
-
contract: >
|
|
86
|
-
Monte Carlo output exists per regime with ≥ 10,000 iterations.
|
|
87
|
-
Runtime < 60 seconds on a 4-core machine.
|
|
88
|
-
tools: ["numpy.random.Generator.choice", "scipy.stats", "multiprocessing", "joblib"]
|
|
89
|
-
expected_output: "mc-results-{regime}.json: {n_iterations, metric_distributions: {VaR: {mean, p5, p95}, ...}}"
|
|
90
|
-
pass_criterion: "File exists per regime; n_iterations ≥ 10000"
|
|
91
|
-
|
|
92
|
-
- id: assert-statistical-contracts
|
|
93
|
-
instruction: >
|
|
94
|
-
For each statistical contract in docs/statistical-contracts.md, assert it holds
|
|
95
|
-
at the specified confidence level across all regimes:
|
|
96
|
-
- VaR: p95 of the VaR distribution ≤ contracted bound
|
|
97
|
-
- CVaR: same approach
|
|
98
|
-
- Sharpe: p5 of the Sharpe distribution ≥ contracted bound
|
|
99
|
-
Any contract that fails at p5/p95 is a FAIL — not just the mean.
|
|
100
|
-
contract: "All statistical contracts hold at the contracted confidence level across all regimes"
|
|
101
|
-
tools: ["python scripts/assert-contracts.py", "scipy.stats"]
|
|
102
|
-
expected_output: "contract-report.json: [{metric, regime, contracted_bound, p5_actual, p95_actual, result}]"
|
|
103
|
-
pass_criterion: "All result fields = PASS in contract-report.json"
|
|
104
|
-
|
|
105
|
-
- id: hyperparameter-search
|
|
106
|
-
title: "Constrained Heuristic Search with Pruning and Warm Runs"
|
|
107
|
-
rationale: >
|
|
108
|
-
Hyperparameters cannot be derived analytically. They must be found by search.
|
|
109
|
-
Random search without pruning wastes compute on clearly bad regions.
|
|
110
|
-
Pruning eliminates unpromising trials early. Warm runs reuse the best
|
|
111
|
-
checkpoint from a previous search run to avoid starting from scratch.
|
|
112
|
-
steps:
|
|
113
|
-
- id: warm-run-from-prior
|
|
114
|
-
instruction: >
|
|
115
|
-
Before launching a full hyperparameter search, load the best parameters
|
|
116
|
-
from the previous run (warm-run-checkpoint.json). If no checkpoint exists,
|
|
117
|
-
use domain-expert priors from docs/parameter-space.md as the starting point.
|
|
118
|
-
Run the model with the warm start and record its baseline performance.
|
|
119
|
-
contract: >
|
|
120
|
-
warm-run-checkpoint.json contains the best parameters from the last search.
|
|
121
|
-
Warm-start baseline performance is recorded before the new search begins.
|
|
122
|
-
tools: ["optuna", "hyperopt", "ray.tune", "json"]
|
|
123
|
-
expected_output: "warm-run-baseline.json: {params, sharpe, VaR, CVaR, regime_scores}"
|
|
124
|
-
pass_criterion: "warm-run-baseline.json exists; sharpe > 0 (model is not trivially broken)"
|
|
125
|
-
|
|
126
|
-
- id: run-pruned-search
|
|
127
|
-
instruction: >
|
|
128
|
-
Run a Bayesian hyperparameter search with early stopping (pruning):
|
|
129
|
-
- Use Optuna with a PercentilePruner (MedianPruner is its 50th-percentile special case) or HyperbandPruner
|
|
130
|
-
- Max 200 trials total; trials that fall below the 25th percentile of completed
|
|
131
|
-
trials' primary metric at the 25% epoch mark are pruned immediately
|
|
132
|
-
- Search within the bounds defined in docs/parameter-space.md
|
|
133
|
-
- Forbidden regions are enforced as hard constraints (trial returns -inf)
|
|
134
|
-
contract: >
|
|
135
|
-
Search completes in ≤ 200 trials. Best trial improves on warm-run baseline.
|
|
136
|
-
No forbidden-region parameter combinations appear in the top 10 trials.
|
|
137
|
-
tools: ["optuna", "optuna.pruners.HyperbandPruner", "optuna.samplers.TPESampler"]
|
|
138
|
-
expected_output: "search-results.json: {best_params, best_score, n_trials, n_pruned, improvement_over_baseline}"
|
|
139
|
-
pass_criterion: "best_score > warm_run_baseline_score; no forbidden regions in top 10"
|
|
140
|
-
|
|
141
|
-
- id: validate-best-params-out-of-sample
|
|
142
|
-
instruction: >
|
|
143
|
-
Take the best parameters from the search and run them on a held-out
|
|
144
|
-
out-of-sample dataset (a market period not present in the simulation dataset).
|
|
145
|
-
The model must meet all statistical contracts on this out-of-sample data.
|
|
146
|
-
In-sample performance that does not generalize is overfitting — report it as FAIL.
|
|
147
|
-
contract: >
|
|
148
|
-
Out-of-sample dataset covers ≥ 252 trading days not in the training set.
|
|
149
|
-
All statistical contracts hold on the out-of-sample data.
|
|
150
|
-
tools: ["pandas", "scripts/run-oos-validation.py"]
|
|
151
|
-
expected_output: "oos-validation.json: {params, oos_sharpe, oos_VaR, oos_CVaR, oos_max_drawdown, all_contracts_pass}"
|
|
152
|
-
pass_criterion: "all_contracts_pass = true in oos-validation.json"
|
|
153
|
-
|
|
154
|
-
- id: simulation-invariants
|
|
155
|
-
title: "Post-Run Simulation Invariants (Silent Bug Detection)"
|
|
156
|
-
rationale: >
|
|
157
|
-
Financial simulations fail in two directions, both silently.
|
|
158
|
-
Category A (Silent Loss): strategy runs, earns nothing — unlinked accounting, stuck state machine.
|
|
159
|
-
Category B (Silent Gain): inflated returns from accounting errors or look-ahead leaks.
|
|
160
|
-
These 6 invariants run at finalize() and print alongside standard metrics.
|
|
161
|
-
steps:
|
|
162
|
-
- id: pnl-decomposition
|
|
163
|
-
instruction: >
|
|
164
|
-
Assert: fee_income + price_income == total_pnl ± 1%.
|
|
165
|
-
A gap > 1% means an accounting component is unlinked — it exists in the ledger
|
|
166
|
-
but never flows into the reported total. Category A: Silent Loss Bug.
|
|
167
|
-
contract: "abs((fee_income + price_income) - total_pnl) / abs(total_pnl) < 0.01"
|
|
168
|
-
tools: ["simulation harness finalize()"]
|
|
169
|
-
expected_output: "[INVARIANT PASS] P&L decomposition: fee_income + price_income ≈ total_pnl"
|
|
170
|
-
pass_criterion: "Ratio within 1% tolerance; printed in finalize output"
|
|
171
|
-
|
|
172
|
-
- id: fee-time-ratio
|
|
173
|
-
instruction: >
|
|
174
|
-
Assert: total_fees / active_hours is above the minimum activity threshold.
|
|
175
|
-
Near-zero fees after 1000h of 'running' = instrument was never created.
|
|
176
|
-
The strategy ran against nothing. Category A: Silent Loss Bug.
|
|
177
|
-
contract: "total_fees / active_hours > min_fee_rate_per_hour (config)"
|
|
178
|
-
tools: ["simulation harness finalize()"]
|
|
179
|
-
expected_output: "[INVARIANT PASS] Fee-time ratio: fees proportional to active time"
|
|
180
|
-
pass_criterion: "Fee rate above threshold; flagged as [INVARIANT FAIL] if below"
|
|
181
|
-
|
|
182
|
-
- id: state-concentration
|
|
183
|
-
instruction: >
|
|
184
|
-
Assert: no single non-productive state consumes >80% of simulation time.
|
|
185
|
-
A stuck state machine is not a conservative strategy — it is a broken one.
|
|
186
|
-
Category A: Silent Loss Bug.
|
|
187
|
-
contract: "max(time_per_state[non_productive_states]) / total_time < 0.80"
|
|
188
|
-
tools: ["simulation harness finalize()"]
|
|
189
|
-
expected_output: "[INVARIANT PASS] State concentration: no stuck non-productive state"
|
|
190
|
-
pass_criterion: "All non-productive states below 80% time threshold"
|
|
191
|
-
|
|
192
|
-
- id: return-plausibility
|
|
193
|
-
instruction: >
|
|
194
|
-
For a market-neutral strategy, assert: annual_return < 200% AND
|
|
195
|
-
total_pnl / fee_income < 10×. Returns exceeding these bounds are almost
|
|
196
|
-
certainly bugs, not alpha. Category B: Silent Gain Bug.
|
|
197
|
-
contract: "annualized_return < 2.0 AND total_pnl / fee_income < 10.0"
|
|
198
|
-
tools: ["simulation harness finalize()"]
|
|
199
|
-
expected_output: "[INVARIANT PASS] Return plausibility: within market-neutral bounds"
|
|
200
|
-
pass_criterion: "Both bounds satisfied; failing bound printed with actual values"
|
|
201
|
-
|
|
202
|
-
- id: delta-neutrality
|
|
203
|
-
instruction: >
|
|
204
|
-
Assert: avg |net_delta| while in the primary running state is below the
|
|
205
|
-
delta tolerance threshold (config). High delta = hedge broken or bootstrap bypassed.
|
|
206
|
-
Category B: Silent Gain Bug — bootstrapping errors inflate returns mid-simulation.
|
|
207
|
-
contract: "mean(abs(net_delta) | state == primary_running_state) < delta_tolerance"
|
|
208
|
-
tools: ["simulation harness finalize()"]
|
|
209
|
-
expected_output: "[INVARIANT PASS] Delta neutrality: avg |net_delta| within tolerance"
|
|
210
|
-
pass_criterion: "Average absolute delta below configured tolerance"
|
|
211
|
-
|
|
212
|
-
- id: instrument-balance
|
|
213
|
-
instruction: >
|
|
214
|
-
Assert: max(instrument_notionals) / min(instrument_notionals) < 3×.
|
|
215
|
-
If one sub-strategy is 5× larger than another at finalize, allocation logic failed.
|
|
216
|
-
Category B: Silent Gain Bug — one instrument dominates and distorts aggregate returns.
|
|
217
|
-
contract: "max(instrument_notionals) / min(instrument_notionals) < 3.0"
|
|
218
|
-
tools: ["simulation harness finalize()"]
|
|
219
|
-
expected_output: "[INVARIANT PASS] Instrument balance: all sub-strategies within 3× of each other"
|
|
220
|
-
pass_criterion: "Notional ratio below 3×; actual ratio printed regardless"
|
|
221
|
-
|
|
222
|
-
- id: evidence
|
|
223
|
-
title: "Persist Backtests, Param Checkpoints, and Contract Reports"
|
|
224
|
-
rationale: >
|
|
225
|
-
Financial model evidence is audit-critical. Regulators and risk teams require
|
|
226
|
-
reproducible backtests with exact parameter sets and dataset provenance.
|
|
227
|
-
steps:
|
|
228
|
-
- id: commit-simulation-artifacts
|
|
229
|
-
instruction: >
|
|
230
|
-
Commit to docs/backtests/:
|
|
231
|
-
- contract-report.json (all statistical contract assertions)
|
|
232
|
-
- oos-validation.json (out-of-sample results)
|
|
233
|
-
- simulation-dataset metadata (NOT the full dataset if large — commit a hash and S3/GCS URI)
|
|
234
|
-
- warm-run-checkpoint.json (best params for next run warm start)
|
|
235
|
-
Include dataset SHA-256 hash in the commit message for reproducibility.
|
|
236
|
-
contract: "docs/backtests/ exists with contract-report.json and oos-validation.json"
|
|
237
|
-
tools: ["git", "sha256sum"]
|
|
238
|
-
expected_output: "Committed files with dataset hash in commit message"
|
|
239
|
-
pass_criterion: "Files present in docs/backtests/; contract-report parsed successfully"
|
|
1
|
+
tag: FINTECH
|
|
2
|
+
section: verification
|
|
3
|
+
title: "Statistical Simulation + Heuristic Model Verification"
|
|
4
|
+
description: >
|
|
5
|
+
Financial models have two uncertainty dimensions. Stochastic uncertainty: price,
|
|
6
|
+
volume, and risk outputs depend on market regime and path-dependent sequences that
|
|
7
|
+
unit tests cannot cover. Heuristic uncertainty: hyperparameter values (lookback
|
|
8
|
+
windows, threshold multipliers, decay factors) must be found by search, not derived
|
|
9
|
+
analytically. This strategy uses statistically meaningful simulated datasets,
|
|
10
|
+
Monte Carlo scenario analysis, and constrained hyperparameter search with pruning.
|
|
11
|
+
uncertainty_levels:
|
|
12
|
+
- stochastic
|
|
13
|
+
- heuristic
|
|
14
|
+
completeness_ceiling: 0.85
|
|
15
|
+
|
|
16
|
+
phases:
|
|
17
|
+
|
|
18
|
+
- id: contract-definition
|
|
19
|
+
title: "Define Statistical Contracts and Parameter Bounds"
|
|
20
|
+
rationale: >
|
|
21
|
+
A financial model contract is a statistical bound, not an exact output.
|
|
22
|
+
"VaR at 95% confidence ≤ 2.5% of portfolio" is a valid contract.
|
|
23
|
+
"The model returns 0.031" is not. Contracts must hold across market regimes,
|
|
24
|
+
not just on the training period.
|
|
25
|
+
steps:
|
|
26
|
+
- id: define-statistical-invariants
|
|
27
|
+
instruction: >
|
|
28
|
+
For each model output, define statistical contracts:
|
|
29
|
+
- VaR (Value at Risk): max acceptable VaR at 95% and 99% confidence levels
|
|
30
|
+
- CVaR (Conditional VaR / Expected Shortfall): max acceptable CVaR at 95%
|
|
31
|
+
- Sharpe ratio: minimum acceptable Sharpe across rolling 90-day windows
|
|
32
|
+
- Max drawdown: maximum acceptable peak-to-trough drawdown
|
|
33
|
+
- Hit rate: for classification signals, minimum precision and recall
|
|
34
|
+
Store in docs/statistical-contracts.md.
|
|
35
|
+
contract: >
|
|
36
|
+
docs/statistical-contracts.md exists with one row per model output.
|
|
37
|
+
Each row has: metric, confidence level, bound (max or min), measurement window.
|
|
38
|
+
tools: ["filesystem"]
|
|
39
|
+
expected_output: "| VaR | 95% | ≤ 2.5% portfolio | 1-day holding period |"
|
|
40
|
+
pass_criterion: "File exists with ≥1 statistical bound per model output"
|
|
41
|
+
|
|
42
|
+
- id: define-parameter-search-space
|
|
43
|
+
instruction: >
|
|
44
|
+
For each tunable hyperparameter, define:
|
|
45
|
+
- Name (e.g., lookback_window, threshold_multiplier)
|
|
46
|
+
- Type (int, float, categorical)
|
|
47
|
+
- Valid range with hard bounds (e.g., lookback_window ∈ [5, 252] days)
|
|
48
|
+
- Prior (e.g., uniform, log-uniform, normal around domain-expert estimate)
|
|
49
|
+
- Forbidden regions (e.g., lookback_window < 5 yields noise-dominated, unstable estimates)
|
|
50
|
+
Store in docs/parameter-space.md.
|
|
51
|
+
contract: "docs/parameter-space.md exists with bounds and forbidden regions per parameter"
|
|
52
|
+
tools: ["filesystem"]
|
|
53
|
+
expected_output: "| lookback_window | int | [5, 252] | days | uniform | >252 causes data sparsity |"
|
|
54
|
+
pass_criterion: "Every tunable parameter has hard bounds and a forbidden-regions note"
|
|
55
|
+
|
|
56
|
+
- id: build-simulation-dataset
|
|
57
|
+
instruction: >
|
|
58
|
+
Generate or load a statistically meaningful simulation dataset:
|
|
59
|
+
- Minimum 1,000 trading days of price + volume data per instrument
|
|
60
|
+
- Must include at least 3 distinct market regimes: trending, mean-reverting, high-volatility
|
|
61
|
+
- Data must pass stationarity and regime-detection checks
|
|
62
|
+
- Use real historical data or a calibrated GBM/Heston model with documented parameters
|
|
63
|
+
Do NOT use synthetic uncalibrated random data — this produces misleading validation.
|
|
64
|
+
contract: >
|
|
65
|
+
simulation-dataset.parquet or simulation-dataset.csv exists.
|
|
66
|
+
Row count ≥ 1,000 per instrument. Regime labels column present.
|
|
67
|
+
Stationarity check (ADF) passes. Volatility distribution is documented.
|
|
68
|
+
tools: ["pandas", "numpy", "yfinance", "statsmodels.tsa.stattools.adfuller", "hmmlearn"]
|
|
69
|
+
expected_output: "simulation-dataset.csv: date, open, high, low, close, volume, regime_label"
|
|
70
|
+
pass_criterion: "Row count ≥ 1000; ADF p-value < 0.05 on returns series"
|
|
71
|
+
|
|
72
|
+
- id: simulation
|
|
73
|
+
title: "Monte Carlo Scenario Analysis Across Market Regimes"
|
|
74
|
+
rationale: >
|
|
75
|
+
A model that works on one market regime and fails on others is not production-grade.
|
|
76
|
+
Monte Carlo across regimes closes stochastic uncertainty by constructing a confidence
|
|
77
|
+
interval for every statistical contract across the full scenario space.
|
|
78
|
+
steps:
|
|
79
|
+
- id: run-monte-carlo-per-regime
|
|
80
|
+
instruction: >
|
|
81
|
+
For each market regime in the simulation dataset, run 10,000 Monte Carlo
|
|
82
|
+
iterations of the model. Each iteration uses a bootstrapped price path from
|
|
83
|
+
that regime. Record the distribution of each model output (VaR, CVaR, Sharpe, etc.).
|
|
84
|
+
Use multiprocessing — 10,000 iterations must complete in < 60 seconds.
|
|
85
|
+
contract: >
|
|
86
|
+
Monte Carlo output exists per regime with ≥ 10,000 iterations.
|
|
87
|
+
Runtime < 60 seconds on a 4-core machine.
|
|
88
|
+
tools: ["numpy.random.Generator.choice", "scipy.stats", "multiprocessing", "joblib"]
|
|
89
|
+
expected_output: "mc-results-{regime}.json: {n_iterations, metric_distributions: {VaR: {mean, p5, p95}, ...}}"
|
|
90
|
+
pass_criterion: "File exists per regime; n_iterations ≥ 10000"
|
|
91
|
+
|
|
92
|
+
- id: assert-statistical-contracts
|
|
93
|
+
instruction: >
|
|
94
|
+
For each statistical contract in docs/statistical-contracts.md, assert it holds
|
|
95
|
+
at the specified confidence level across all regimes:
|
|
96
|
+
- VaR: p95 of the VaR distribution ≤ contracted bound
|
|
97
|
+
- CVaR: same approach
|
|
98
|
+
- Sharpe: p5 of the Sharpe distribution ≥ contracted bound
|
|
99
|
+
Any contract that fails at p5/p95 is a FAIL — not just the mean.
|
|
100
|
+
contract: "All statistical contracts hold at the contracted confidence level across all regimes"
|
|
101
|
+
tools: ["python scripts/assert-contracts.py", "scipy.stats"]
|
|
102
|
+
expected_output: "contract-report.json: [{metric, regime, contracted_bound, p5_actual, p95_actual, result}]"
|
|
103
|
+
pass_criterion: "All result fields = PASS in contract-report.json"
|
|
104
|
+
|
|
105
|
+
- id: hyperparameter-search
|
|
106
|
+
title: "Constrained Heuristic Search with Pruning and Warm Runs"
|
|
107
|
+
rationale: >
|
|
108
|
+
Hyperparameters cannot be derived analytically. They must be found by search.
|
|
109
|
+
Random search without pruning wastes compute on clearly bad regions.
|
|
110
|
+
Pruning eliminates unpromising trials early. Warm runs reuse the best
|
|
111
|
+
checkpoint from a previous search run to avoid starting from scratch.
|
|
112
|
+
steps:
|
|
113
|
+
- id: warm-run-from-prior
|
|
114
|
+
instruction: >
|
|
115
|
+
Before launching a full hyperparameter search, load the best parameters
|
|
116
|
+
from the previous run (warm-run-checkpoint.json). If no checkpoint exists,
|
|
117
|
+
use domain-expert priors from docs/parameter-space.md as the starting point.
|
|
118
|
+
Run the model with the warm start and record its baseline performance.
|
|
119
|
+
contract: >
|
|
120
|
+
warm-run-checkpoint.json contains the best parameters from the last search.
|
|
121
|
+
Warm-start baseline performance is recorded before the new search begins.
|
|
122
|
+
tools: ["optuna", "hyperopt", "ray.tune", "json"]
|
|
123
|
+
expected_output: "warm-run-baseline.json: {params, sharpe, VaR, CVaR, regime_scores}"
|
|
124
|
+
pass_criterion: "warm-run-baseline.json exists; sharpe > 0 (model is not trivially broken)"
|
|
125
|
+
|
|
126
|
+
- id: run-pruned-search
|
|
127
|
+
instruction: >
|
|
128
|
+
Run a Bayesian hyperparameter search with early stopping (pruning):
|
|
129
|
+
- Use Optuna with a MedianPruner or HyperbandPruner
|
|
130
|
+
- Max 200 trials total; trials that fall below the 25th percentile of completed
|
|
131
|
+
trials' primary metric at the 25% epoch mark are pruned immediately
|
|
132
|
+
- Search within the bounds defined in docs/parameter-space.md
|
|
133
|
+
- Forbidden regions are enforced as hard constraints (trial returns -inf)
|
|
134
|
+
contract: >
|
|
135
|
+
Search completes in ≤ 200 trials. Best trial improves on warm-run baseline.
|
|
136
|
+
No forbidden-region parameter combinations appear in the top 10 trials.
|
|
137
|
+
tools: ["optuna", "optuna.pruners.HyperbandPruner", "optuna.samplers.TPESampler"]
|
|
138
|
+
expected_output: "search-results.json: {best_params, best_score, n_trials, n_pruned, improvement_over_baseline}"
|
|
139
|
+
pass_criterion: "best_score > warm_run_baseline_score; no forbidden regions in top 10"
|
|
140
|
+
|
|
141
|
+
- id: validate-best-params-out-of-sample
|
|
142
|
+
instruction: >
|
|
143
|
+
Take the best parameters from the search and run them on a held-out
|
|
144
|
+
out-of-sample dataset (a market period not present in the simulation dataset).
|
|
145
|
+
The model must meet all statistical contracts on this out-of-sample data.
|
|
146
|
+
In-sample performance that does not generalize is overfitting — report it as FAIL.
|
|
147
|
+
contract: >
|
|
148
|
+
Out-of-sample dataset covers ≥ 252 trading days not in the training set.
|
|
149
|
+
All statistical contracts hold on the out-of-sample data.
|
|
150
|
+
tools: ["pandas", "scripts/run-oos-validation.py"]
|
|
151
|
+
expected_output: "oos-validation.json: {params, oos_sharpe, oos_VaR, oos_CVaR, oos_max_drawdown, all_contracts_pass}"
|
|
152
|
+
pass_criterion: "all_contracts_pass = true in oos-validation.json"
|
|
153
|
+
|
|
154
|
+
- id: simulation-invariants
|
|
155
|
+
title: "Post-Run Simulation Invariants (Silent Bug Detection)"
|
|
156
|
+
rationale: >
|
|
157
|
+
Financial simulations fail in two directions, both silently.
|
|
158
|
+
Category A (Silent Loss): strategy runs, earns nothing — unlinked accounting, stuck state machine.
|
|
159
|
+
Category B (Silent Gain): inflated returns from accounting errors or look-ahead leaks.
|
|
160
|
+
These 6 invariants run at finalize() and print alongside standard metrics.
|
|
161
|
+
steps:
|
|
162
|
+
- id: pnl-decomposition
|
|
163
|
+
instruction: >
|
|
164
|
+
Assert: fee_income + price_income == total_pnl ± 1%.
|
|
165
|
+
A gap > 1% means an accounting component is unlinked — it exists in the ledger
|
|
166
|
+
but never flows into the reported total. Category A: Silent Loss Bug.
|
|
167
|
+
contract: "abs((fee_income + price_income) - total_pnl) / abs(total_pnl) < 0.01"
|
|
168
|
+
tools: ["simulation harness finalize()"]
|
|
169
|
+
expected_output: "[INVARIANT PASS] P&L decomposition: fee_income + price_income ≈ total_pnl"
|
|
170
|
+
pass_criterion: "Ratio within 1% tolerance; printed in finalize output"
|
|
171
|
+
|
|
172
|
+
- id: fee-time-ratio
|
|
173
|
+
instruction: >
|
|
174
|
+
Assert: total_fees / active_hours is above the minimum activity threshold.
|
|
175
|
+
Near-zero fees after 1000h of 'running' = instrument was never created.
|
|
176
|
+
The strategy ran against nothing. Category A: Silent Loss Bug.
|
|
177
|
+
contract: "total_fees / active_hours > min_fee_rate_per_hour (config)"
|
|
178
|
+
tools: ["simulation harness finalize()"]
|
|
179
|
+
expected_output: "[INVARIANT PASS] Fee-time ratio: fees proportional to active time"
|
|
180
|
+
pass_criterion: "Fee rate above threshold; flagged as [INVARIANT FAIL] if below"
|
|
181
|
+
|
|
182
|
+
- id: state-concentration
|
|
183
|
+
instruction: >
|
|
184
|
+
Assert: no single non-productive state consumes >80% of simulation time.
|
|
185
|
+
A stuck state machine is not a conservative strategy — it is a broken one.
|
|
186
|
+
Category A: Silent Loss Bug.
|
|
187
|
+
contract: "max(time_per_state[non_productive_states]) / total_time < 0.80"
|
|
188
|
+
tools: ["simulation harness finalize()"]
|
|
189
|
+
expected_output: "[INVARIANT PASS] State concentration: no stuck non-productive state"
|
|
190
|
+
pass_criterion: "All non-productive states below 80% time threshold"
|
|
191
|
+
|
|
192
|
+
- id: return-plausibility
|
|
193
|
+
instruction: >
|
|
194
|
+
For a market-neutral strategy, assert: annualized_return < 200% AND
|
|
195
|
+
total_pnl / fee_income < 10×. Returns exceeding these bounds are almost
|
|
196
|
+
certainly bugs, not alpha. Category B: Silent Gain Bug.
|
|
197
|
+
contract: "annualized_return < 2.0 AND total_pnl / fee_income < 10.0"
|
|
198
|
+
tools: ["simulation harness finalize()"]
|
|
199
|
+
expected_output: "[INVARIANT PASS] Return plausibility: within market-neutral bounds"
|
|
200
|
+
pass_criterion: "Both bounds satisfied; failing bound printed with actual values"
|
|
201
|
+
|
|
202
|
+
- id: delta-neutrality
|
|
203
|
+
instruction: >
|
|
204
|
+
Assert: avg |net_delta| while in the primary running state is below the
|
|
205
|
+
delta tolerance threshold (config). High delta = hedge broken or bootstrap bypassed.
|
|
206
|
+
Category B: Silent Gain Bug — bootstrapping errors inflate returns mid-simulation.
|
|
207
|
+
contract: "mean(abs(net_delta) | state == primary_running_state) < delta_tolerance"
|
|
208
|
+
tools: ["simulation harness finalize()"]
|
|
209
|
+
expected_output: "[INVARIANT PASS] Delta neutrality: avg |net_delta| within tolerance"
|
|
210
|
+
pass_criterion: "Average absolute delta below configured tolerance"
|
|
211
|
+
|
|
212
|
+
- id: instrument-balance
|
|
213
|
+
instruction: >
|
|
214
|
+
Assert: max(instrument_notionals) / min(instrument_notionals) < 3×.
|
|
215
|
+
If one sub-strategy's notional is 3× or more that of another at finalize, allocation logic failed.
|
|
216
|
+
Category B: Silent Gain Bug — one instrument dominates and distorts aggregate returns.
|
|
217
|
+
contract: "max(instrument_notionals) / min(instrument_notionals) < 3.0"
|
|
218
|
+
tools: ["simulation harness finalize()"]
|
|
219
|
+
expected_output: "[INVARIANT PASS] Instrument balance: all sub-strategies within 3× of each other"
|
|
220
|
+
pass_criterion: "Notional ratio below 3×; actual ratio printed regardless"
|
|
221
|
+
|
|
222
|
+
- id: evidence
|
|
223
|
+
title: "Persist Backtests, Param Checkpoints, and Contract Reports"
|
|
224
|
+
rationale: >
|
|
225
|
+
Financial model evidence is audit-critical. Regulators and risk teams require
|
|
226
|
+
reproducible backtests with exact parameter sets and dataset provenance.
|
|
227
|
+
steps:
|
|
228
|
+
- id: commit-simulation-artifacts
|
|
229
|
+
instruction: >
|
|
230
|
+
Commit to docs/backtests/:
|
|
231
|
+
- contract-report.json (all statistical contract assertions)
|
|
232
|
+
- oos-validation.json (out-of-sample results)
|
|
233
|
+
- simulation-dataset metadata (NOT the full dataset if large — commit a hash and S3/GCS URI)
|
|
234
|
+
- warm-run-checkpoint.json (best params for next run warm start)
|
|
235
|
+
Include dataset SHA-256 hash in the commit message for reproducibility.
|
|
236
|
+
contract: "docs/backtests/ exists with contract-report.json and oos-validation.json"
|
|
237
|
+
tools: ["git", "sha256sum"]
|
|
238
|
+
expected_output: "Committed files with dataset hash in commit message"
|
|
239
|
+
pass_criterion: "Files present in docs/backtests/; contract-report parsed successfully"
|