forgecraft-mcp 1.2.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. package/README.md +525 -525
  2. package/dist/cli/help.js +44 -44
  3. package/dist/registry/renderer-skeletons.js +92 -92
  4. package/dist/shared/gs-score-logger.js +6 -6
  5. package/dist/tools/add-module.js +123 -123
  6. package/dist/tools/advice-registry.js +18 -18
  7. package/dist/tools/check-cascade-report.js +64 -64
  8. package/dist/tools/configure-mcp.d.ts +3 -0
  9. package/dist/tools/configure-mcp.d.ts.map +1 -1
  10. package/dist/tools/configure-mcp.js +10 -0
  11. package/dist/tools/configure-mcp.js.map +1 -1
  12. package/dist/tools/forgecraft-dispatch.d.ts.map +1 -1
  13. package/dist/tools/forgecraft-dispatch.js +3 -0
  14. package/dist/tools/forgecraft-dispatch.js.map +1 -1
  15. package/dist/tools/forgecraft-schema-params.d.ts +9 -0
  16. package/dist/tools/forgecraft-schema-params.d.ts.map +1 -1
  17. package/dist/tools/forgecraft-schema-params.js +21 -0
  18. package/dist/tools/forgecraft-schema-params.js.map +1 -1
  19. package/dist/tools/forgecraft-schema.d.ts +9 -0
  20. package/dist/tools/forgecraft-schema.d.ts.map +1 -1
  21. package/dist/tools/refresh-output.js +14 -14
  22. package/dist/tools/scaffold-spec-stubs.js +115 -115
  23. package/dist/tools/scaffold-templates.js +62 -62
  24. package/dist/tools/setup-artifact-writers.d.ts +30 -0
  25. package/dist/tools/setup-artifact-writers.d.ts.map +1 -1
  26. package/dist/tools/setup-artifact-writers.js +120 -8
  27. package/dist/tools/setup-artifact-writers.js.map +1 -1
  28. package/dist/tools/setup-phase1.d.ts +3 -0
  29. package/dist/tools/setup-phase1.d.ts.map +1 -1
  30. package/dist/tools/setup-phase1.js +79 -35
  31. package/dist/tools/setup-phase1.js.map +1 -1
  32. package/dist/tools/setup-phase2.d.ts +2 -0
  33. package/dist/tools/setup-phase2.d.ts.map +1 -1
  34. package/dist/tools/setup-phase2.js +10 -1
  35. package/dist/tools/setup-phase2.js.map +1 -1
  36. package/dist/tools/setup-project.d.ts +18 -0
  37. package/dist/tools/setup-project.d.ts.map +1 -1
  38. package/dist/tools/setup-project.js +77 -1
  39. package/dist/tools/setup-project.js.map +1 -1
  40. package/dist/tools/spec-parser-tags.d.ts +9 -0
  41. package/dist/tools/spec-parser-tags.d.ts.map +1 -1
  42. package/dist/tools/spec-parser-tags.js +92 -0
  43. package/dist/tools/spec-parser-tags.js.map +1 -1
  44. package/package.json +89 -86
  45. package/templates/analytics/instructions.yaml +37 -37
  46. package/templates/analytics/mcp-servers.yaml +11 -11
  47. package/templates/analytics/structure.yaml +25 -25
  48. package/templates/api/instructions.yaml +231 -231
  49. package/templates/api/mcp-servers.yaml +22 -13
  50. package/templates/api/nfr.yaml +23 -23
  51. package/templates/api/review.yaml +103 -103
  52. package/templates/api/structure.yaml +34 -34
  53. package/templates/api/verification.yaml +132 -132
  54. package/templates/cli/instructions.yaml +31 -31
  55. package/templates/cli/mcp-servers.yaml +11 -11
  56. package/templates/cli/review.yaml +53 -53
  57. package/templates/cli/structure.yaml +16 -16
  58. package/templates/data-lineage/instructions.yaml +28 -28
  59. package/templates/data-lineage/mcp-servers.yaml +22 -22
  60. package/templates/data-pipeline/instructions.yaml +84 -84
  61. package/templates/data-pipeline/mcp-servers.yaml +13 -13
  62. package/templates/data-pipeline/nfr.yaml +39 -39
  63. package/templates/data-pipeline/structure.yaml +23 -23
  64. package/templates/fintech/hooks.yaml +55 -55
  65. package/templates/fintech/instructions.yaml +112 -112
  66. package/templates/fintech/mcp-servers.yaml +13 -13
  67. package/templates/fintech/nfr.yaml +46 -46
  68. package/templates/fintech/playbook.yaml +210 -210
  69. package/templates/fintech/verification.yaml +239 -239
  70. package/templates/game/instructions.yaml +289 -289
  71. package/templates/game/mcp-servers.yaml +38 -38
  72. package/templates/game/nfr.yaml +64 -64
  73. package/templates/game/playbook.yaml +214 -214
  74. package/templates/game/review.yaml +97 -97
  75. package/templates/game/structure.yaml +67 -67
  76. package/templates/game/verification.yaml +174 -174
  77. package/templates/healthcare/instructions.yaml +42 -42
  78. package/templates/healthcare/mcp-servers.yaml +13 -13
  79. package/templates/healthcare/nfr.yaml +47 -47
  80. package/templates/hipaa/instructions.yaml +41 -41
  81. package/templates/hipaa/mcp-servers.yaml +13 -13
  82. package/templates/infra/instructions.yaml +104 -104
  83. package/templates/infra/mcp-servers.yaml +20 -20
  84. package/templates/infra/nfr.yaml +46 -46
  85. package/templates/infra/review.yaml +65 -65
  86. package/templates/infra/structure.yaml +25 -25
  87. package/templates/library/instructions.yaml +36 -36
  88. package/templates/library/mcp-servers.yaml +20 -20
  89. package/templates/library/review.yaml +56 -56
  90. package/templates/library/structure.yaml +19 -19
  91. package/templates/medallion-architecture/instructions.yaml +41 -41
  92. package/templates/medallion-architecture/mcp-servers.yaml +22 -22
  93. package/templates/ml/instructions.yaml +85 -85
  94. package/templates/ml/mcp-servers.yaml +11 -11
  95. package/templates/ml/nfr.yaml +39 -39
  96. package/templates/ml/structure.yaml +25 -25
  97. package/templates/ml/verification.yaml +156 -156
  98. package/templates/mobile/instructions.yaml +44 -44
  99. package/templates/mobile/mcp-servers.yaml +11 -11
  100. package/templates/mobile/nfr.yaml +49 -49
  101. package/templates/mobile/structure.yaml +27 -27
  102. package/templates/mobile/verification.yaml +121 -121
  103. package/templates/observability-xray/instructions.yaml +40 -40
  104. package/templates/observability-xray/mcp-servers.yaml +15 -15
  105. package/templates/realtime/instructions.yaml +42 -42
  106. package/templates/realtime/mcp-servers.yaml +13 -13
  107. package/templates/soc2/instructions.yaml +41 -41
  108. package/templates/soc2/mcp-servers.yaml +24 -24
  109. package/templates/social/instructions.yaml +43 -43
  110. package/templates/social/mcp-servers.yaml +24 -24
  111. package/templates/state-machine/instructions.yaml +42 -42
  112. package/templates/state-machine/mcp-servers.yaml +11 -11
  113. package/templates/tools-registry.yaml +164 -164
  114. package/templates/universal/hooks.yaml +531 -531
  115. package/templates/universal/instructions.yaml +1692 -1692
  116. package/templates/universal/mcp-servers.yaml +50 -50
  117. package/templates/universal/nfr.yaml +197 -197
  118. package/templates/universal/reference.yaml +326 -326
  119. package/templates/universal/review.yaml +204 -204
  120. package/templates/universal/skills.yaml +262 -262
  121. package/templates/universal/structure.yaml +67 -67
  122. package/templates/universal/verification.yaml +416 -416
  123. package/templates/web-react/hooks.yaml +44 -44
  124. package/templates/web-react/instructions.yaml +207 -207
  125. package/templates/web-react/mcp-servers.yaml +20 -20
  126. package/templates/web-react/nfr.yaml +27 -27
  127. package/templates/web-react/review.yaml +94 -94
  128. package/templates/web-react/structure.yaml +46 -46
  129. package/templates/web-react/verification.yaml +126 -126
  130. package/templates/web-static/instructions.yaml +115 -115
  131. package/templates/web-static/mcp-servers.yaml +20 -20
  132. package/templates/web3/instructions.yaml +44 -44
  133. package/templates/web3/mcp-servers.yaml +11 -11
  134. package/templates/web3/verification.yaml +159 -159
  135. package/templates/zero-trust/instructions.yaml +41 -41
  136. package/templates/zero-trust/mcp-servers.yaml +15 -15
@@ -1,239 +1,239 @@
1
- tag: FINTECH
2
- section: verification
3
- title: "Statistical Simulation + Heuristic Model Verification"
4
- description: >
5
- Financial models have two uncertainty dimensions. Stochastic uncertainty: price,
6
- volume, and risk outputs depend on market regime and path-dependent sequences that
7
- unit tests cannot cover. Heuristic uncertainty: hyperparameter values (lookback
8
- windows, threshold multipliers, decay factors) must be found by search, not derived
9
- analytically. This strategy uses statistically meaningful simulated datasets,
10
- Monte Carlo scenario analysis, and constrained hyperparameter search with pruning.
11
- uncertainty_levels:
12
- - stochastic
13
- - heuristic
14
- completeness_ceiling: 0.85
15
-
16
- phases:
17
-
18
- - id: contract-definition
19
- title: "Define Statistical Contracts and Parameter Bounds"
20
- rationale: >
21
- A financial model contract is a statistical bound, not an exact output.
22
- "VaR at 95% confidence ≤ 2.5% of portfolio" is a valid contract.
23
- "The model returns 0.031" is not. Contracts must hold across market regimes,
24
- not just on the training period.
25
- steps:
26
- - id: define-statistical-invariants
27
- instruction: >
28
- For each model output, define statistical contracts:
29
- - VaR (Value at Risk): max acceptable VaR at 95% and 99% confidence levels
30
- - CVaR (Conditional VaR / Expected Shortfall): max acceptable CVaR at 95%
31
- - Sharpe ratio: minimum acceptable Sharpe across rolling 90-day windows
32
- - Max drawdown: maximum acceptable peak-to-trough drawdown
33
- - Hit rate: for classification signals, minimum precision and recall
34
- Store in docs/statistical-contracts.md.
35
- contract: >
36
- docs/statistical-contracts.md exists with one row per model output.
37
- Each row has: metric, confidence level, bound (max or min), measurement window.
38
- tools: ["filesystem"]
39
- expected_output: "| VaR | 95% | ≤ 2.5% portfolio | 1-day holding period |"
40
- pass_criterion: "File exists with ≥1 statistical bound per model output"
41
-
42
- - id: define-parameter-search-space
43
- instruction: >
44
- For each tunable hyperparameter, define:
45
- - Name (e.g., lookback_window, threshold_multiplier)
46
- - Type (int, float, categorical)
47
- - Valid range with hard bounds (e.g., lookback_window ∈ [5, 252] days)
48
- - Prior (e.g., uniform, log-uniform, normal around domain-expert estimate)
49
- - Forbidden regions (e.g., lookback_window < 5 causes lookahead bias)
50
- Store in docs/parameter-space.md.
51
- contract: "docs/parameter-space.md exists with bounds and forbidden regions per parameter"
52
- tools: ["filesystem"]
53
- expected_output: "| lookback_window | int | [5, 252] | days | uniform | >252 causes data sparsity |"
54
- pass_criterion: "Every tunable parameter has hard bounds and a forbidden-regions note"
55
-
56
- - id: build-simulation-dataset
57
- instruction: >
58
- Generate or load a statistically meaningful simulation dataset:
59
- - Minimum 1,000 trading days of price + volume data per instrument
60
- - Must include at least 3 distinct market regimes: trending, mean-reverting, high-volatility
61
- - Data must pass stationarity and regime-detection checks
62
- - Use real historical data or a calibrated GBM/Heston model with documented parameters
63
- Do NOT use synthetic uncalibrated random data — this produces misleading validation.
64
- contract: >
65
- simulation-dataset.parquet or simulation-dataset.csv exists.
66
- Row count ≥ 1,000 per instrument. Regime labels column present.
67
- Stationarity check (ADF) passes. Volatility distribution is documented.
68
- tools: ["pandas", "numpy", "yfinance", "scipy.stats.adfuller", "hmmlearn"]
69
- expected_output: "simulation-dataset.csv: date, open, high, low, close, volume, regime_label"
70
- pass_criterion: "Row count ≥ 1000; ADF p-value < 0.05 on returns series"
71
-
72
- - id: simulation
73
- title: "Monte Carlo Scenario Analysis Across Market Regimes"
74
- rationale: >
75
- A model that works on one market regime and fails on others is not production-grade.
76
- Monte Carlo across regimes closes stochastic uncertainty by constructing a confidence
77
- interval for every statistical contract across the full scenario space.
78
- steps:
79
- - id: run-monte-carlo-per-regime
80
- instruction: >
81
- For each market regime in the simulation dataset, run 10,000 Monte Carlo
82
- iterations of the model. Each iteration uses a bootstrapped price path from
83
- that regime. Record the distribution of each model output (VaR, CVaR, Sharpe, etc.).
84
- Use multiprocessing — 10,000 iterations must complete in < 60 seconds.
85
- contract: >
86
- Monte Carlo output exists per regime with ≥ 10,000 iterations.
87
- Runtime < 60 seconds on a 4-core machine.
88
- tools: ["numpy.random.bootstrap", "scipy.stats", "multiprocessing", "joblib"]
89
- expected_output: "mc-results-{regime}.json: {n_iterations, metric_distributions: {VaR: {mean, p5, p95}, ...}}"
90
- pass_criterion: "File exists per regime; n_iterations ≥ 10000"
91
-
92
- - id: assert-statistical-contracts
93
- instruction: >
94
- For each statistical contract in docs/statistical-contracts.md, assert it holds
95
- at the specified confidence level across all regimes:
96
- - VaR: p95 of the VaR distribution ≤ contracted bound
97
- - CVaR: same approach
98
- - Sharpe: p5 of the Sharpe distribution ≥ contracted bound
99
- Any contract that fails at p5/p95 is a FAIL — not just the mean.
100
- contract: "All statistical contracts hold at the contracted confidence level across all regimes"
101
- tools: ["python scripts/assert-contracts.py", "scipy.stats"]
102
- expected_output: "contract-report.json: [{metric, regime, contracted_bound, p5_actual, p95_actual, result}]"
103
- pass_criterion: "All result fields = PASS in contract-report.json"
104
-
105
- - id: hyperparameter-search
106
- title: "Constrained Heuristic Search with Pruning and Warm Runs"
107
- rationale: >
108
- Hyperparameters cannot be derived analytically. They must be found by search.
109
- Random search without pruning wastes compute on clearly bad regions.
110
- Pruning eliminates unpromising trials early. Warm runs reuse the best
111
- checkpoint from a previous search run to avoid starting from scratch.
112
- steps:
113
- - id: warm-run-from-prior
114
- instruction: >
115
- Before launching a full hyperparameter search, load the best parameters
116
- from the previous run (warm-run-checkpoint.json). If no checkpoint exists,
117
- use domain-expert priors from docs/parameter-space.md as the starting point.
118
- Run the model with the warm start and record its baseline performance.
119
- contract: >
120
- warm-run-checkpoint.json contains the best parameters from the last search.
121
- Warm-start baseline performance is recorded before the new search begins.
122
- tools: ["optuna", "hyperopt", "ray.tune", "json"]
123
- expected_output: "warm-run-baseline.json: {params, sharpe, VaR, CVaR, regime_scores}"
124
- pass_criterion: "warm-run-baseline.json exists; sharpe > 0 (model is not trivially broken)"
125
-
126
- - id: run-pruned-search
127
- instruction: >
128
- Run a Bayesian hyperparameter search with early stopping (pruning):
129
- - Use Optuna with a MedianPruner or HyperbandPruner
130
- - Max 200 trials total; trials that fall below the 25th percentile of completed
131
- trials' primary metric at the 25% epoch mark are pruned immediately
132
- - Search within the bounds defined in docs/parameter-space.md
133
- - Forbidden regions are enforced as hard constraints (trial returns -inf)
134
- contract: >
135
- Search completes in ≤ 200 trials. Best trial improves on warm-run baseline.
136
- No forbidden-region parameter combinations appear in the top 10 trials.
137
- tools: ["optuna", "optuna.pruners.HyperbandPruner", "optuna.samplers.TPESampler"]
138
- expected_output: "search-results.json: {best_params, best_score, n_trials, n_pruned, improvement_over_baseline}"
139
- pass_criterion: "best_score > warm_run_baseline_score; no forbidden regions in top 10"
140
-
141
- - id: validate-best-params-out-of-sample
142
- instruction: >
143
- Take the best parameters from the search and run them on a held-out
144
- out-of-sample dataset (a market period not present in the simulation dataset).
145
- The model must meet all statistical contracts on this out-of-sample data.
146
- In-sample performance that does not generalize is overfitting — report it as FAIL.
147
- contract: >
148
- Out-of-sample dataset covers ≥ 252 trading days not in the training set.
149
- All statistical contracts hold on the out-of-sample data.
150
- tools: ["pandas", "scripts/run-oos-validation.py"]
151
- expected_output: "oos-validation.json: {params, oos_sharpe, oos_VaR, oos_CVaR, oos_max_drawdown, all_contracts_pass}"
152
- pass_criterion: "all_contracts_pass = true in oos-validation.json"
153
-
154
- - id: simulation-invariants
155
- title: "Post-Run Simulation Invariants (Silent Bug Detection)"
156
- rationale: >
157
- Financial simulations fail in two directions, both silently.
158
- Category A (Silent Loss): strategy runs, earns nothing — unlinked accounting, stuck state machine.
159
- Category B (Silent Gain): inflated returns from accounting errors or look-ahead leaks.
160
- These 6 invariants run at finalize() and print alongside standard metrics.
161
- steps:
162
- - id: pnl-decomposition
163
- instruction: >
164
- Assert: fee_income + price_income == total_pnl ± 1%.
165
- A gap > 1% means an accounting component is unlinked — it exists in the ledger
166
- but never flows into the reported total. Category A: Silent Loss Bug.
167
- contract: "abs((fee_income + price_income) - total_pnl) / abs(total_pnl) < 0.01"
168
- tools: ["simulation harness finalize()"]
169
- expected_output: "[INVARIANT PASS] P&L decomposition: fee_income + price_income ≈ total_pnl"
170
- pass_criterion: "Ratio within 1% tolerance; printed in finalize output"
171
-
172
- - id: fee-time-ratio
173
- instruction: >
174
- Assert: total_fees / active_hours is above the minimum activity threshold.
175
- Near-zero fees after 1000h of 'running' = instrument was never created.
176
- The strategy ran against nothing. Category A: Silent Loss Bug.
177
- contract: "total_fees / active_hours > min_fee_rate_per_hour (config)"
178
- tools: ["simulation harness finalize()"]
179
- expected_output: "[INVARIANT PASS] Fee-time ratio: fees proportional to active time"
180
- pass_criterion: "Fee rate above threshold; flagged as [INVARIANT FAIL] if below"
181
-
182
- - id: state-concentration
183
- instruction: >
184
- Assert: no single non-productive state consumes >80% of simulation time.
185
- A stuck state machine is not a conservative strategy — it is a broken one.
186
- Category A: Silent Loss Bug.
187
- contract: "max(time_per_state[non_productive_states]) / total_time < 0.80"
188
- tools: ["simulation harness finalize()"]
189
- expected_output: "[INVARIANT PASS] State concentration: no stuck non-productive state"
190
- pass_criterion: "All non-productive states below 80% time threshold"
191
-
192
- - id: return-plausibility
193
- instruction: >
194
- For a market-neutral strategy, assert: annual_return < 200% AND
195
- total_pnl / fee_income < 10×. Returns exceeding these bounds are almost
196
- certainly bugs, not alpha. Category B: Silent Gain Bug.
197
- contract: "annualized_return < 2.0 AND total_pnl / fee_income < 10.0"
198
- tools: ["simulation harness finalize()"]
199
- expected_output: "[INVARIANT PASS] Return plausibility: within market-neutral bounds"
200
- pass_criterion: "Both bounds satisfied; failing bound printed with actual values"
201
-
202
- - id: delta-neutrality
203
- instruction: >
204
- Assert: avg |net_delta| while in the primary running state is below the
205
- delta tolerance threshold (config). High delta = hedge broken or bootstrap bypassed.
206
- Category B: Silent Gain Bug — bootstrapping errors inflate returns mid-simulation.
207
- contract: "mean(abs(net_delta) | state == primary_running_state) < delta_tolerance"
208
- tools: ["simulation harness finalize()"]
209
- expected_output: "[INVARIANT PASS] Delta neutrality: avg |net_delta| within tolerance"
210
- pass_criterion: "Average absolute delta below configured tolerance"
211
-
212
- - id: instrument-balance
213
- instruction: >
214
- Assert: max(instrument_notionals) / min(instrument_notionals) < 3×.
215
- If one sub-strategy is 5× larger than another at finalize, allocation logic failed.
216
- Category B: Silent Gain Bug — one instrument dominates and distorts aggregate returns.
217
- contract: "max(instrument_notionals) / min(instrument_notionals) < 3.0"
218
- tools: ["simulation harness finalize()"]
219
- expected_output: "[INVARIANT PASS] Instrument balance: all sub-strategies within 3× of each other"
220
- pass_criterion: "Notional ratio below 3×; actual ratio printed regardless"
221
-
222
- - id: evidence
223
- title: "Persist Backtests, Param Checkpoints, and Contract Reports"
224
- rationale: >
225
- Financial model evidence is audit-critical. Regulators and risk teams require
226
- reproducible backtests with exact parameter sets and dataset provenance.
227
- steps:
228
- - id: commit-simulation-artifacts
229
- instruction: >
230
- Commit to docs/backtests/:
231
- - contract-report.json (all statistical contract assertions)
232
- - oos-validation.json (out-of-sample results)
233
- - simulation-dataset metadata (NOT the full dataset if large — commit a hash and S3/GCS URI)
234
- - warm-run-checkpoint.json (best params for next run warm start)
235
- Include dataset SHA-256 hash in the commit message for reproducibility.
236
- contract: "docs/backtests/ exists with contract-report.json and oos-validation.json"
237
- tools: ["git", "sha256sum"]
238
- expected_output: "Committed files with dataset hash in commit message"
239
- pass_criterion: "Files present in docs/backtests/; contract-report parsed successfully"
1
+ tag: FINTECH
2
+ section: verification
3
+ title: "Statistical Simulation + Heuristic Model Verification"
4
+ description: >
5
+ Financial models have two uncertainty dimensions. Stochastic uncertainty: price,
6
+ volume, and risk outputs depend on market regime and path-dependent sequences that
7
+ unit tests cannot cover. Heuristic uncertainty: hyperparameter values (lookback
8
+ windows, threshold multipliers, decay factors) must be found by search, not derived
9
+ analytically. This strategy uses statistically meaningful simulated datasets,
10
+ Monte Carlo scenario analysis, and constrained hyperparameter search with pruning.
11
+ uncertainty_levels:
12
+ - stochastic
13
+ - heuristic
14
+ completeness_ceiling: 0.85
15
+
16
+ phases:
17
+
18
+ - id: contract-definition
19
+ title: "Define Statistical Contracts and Parameter Bounds"
20
+ rationale: >
21
+ A financial model contract is a statistical bound, not an exact output.
22
+ "VaR at 95% confidence ≤ 2.5% of portfolio" is a valid contract.
23
+ "The model returns 0.031" is not. Contracts must hold across market regimes,
24
+ not just on the training period.
25
+ steps:
26
+ - id: define-statistical-invariants
27
+ instruction: >
28
+ For each model output, define statistical contracts:
29
+ - VaR (Value at Risk): max acceptable VaR at 95% and 99% confidence levels
30
+ - CVaR (Conditional VaR / Expected Shortfall): max acceptable CVaR at 95%
31
+ - Sharpe ratio: minimum acceptable Sharpe across rolling 90-day windows
32
+ - Max drawdown: maximum acceptable peak-to-trough drawdown
33
+ - Hit rate: for classification signals, minimum precision and recall
34
+ Store in docs/statistical-contracts.md.
35
+ contract: >
36
+ docs/statistical-contracts.md exists with one row per model output.
37
+ Each row has: metric, confidence level, bound (max or min), measurement window.
38
+ tools: ["filesystem"]
39
+ expected_output: "| VaR | 95% | ≤ 2.5% portfolio | 1-day holding period |"
40
+ pass_criterion: "File exists with ≥1 statistical bound per model output"
41
+
42
+ - id: define-parameter-search-space
43
+ instruction: >
44
+ For each tunable hyperparameter, define:
45
+ - Name (e.g., lookback_window, threshold_multiplier)
46
+ - Type (int, float, categorical)
47
+ - Valid range with hard bounds (e.g., lookback_window ∈ [5, 252] days)
48
+ - Prior (e.g., uniform, log-uniform, normal around domain-expert estimate)
49
+ - Forbidden regions (e.g., lookback_window < 5 causes lookahead bias)
50
+ Store in docs/parameter-space.md.
51
+ contract: "docs/parameter-space.md exists with bounds and forbidden regions per parameter"
52
+ tools: ["filesystem"]
53
+ expected_output: "| lookback_window | int | [5, 252] | days | uniform | >252 causes data sparsity |"
54
+ pass_criterion: "Every tunable parameter has hard bounds and a forbidden-regions note"
55
+
56
+ - id: build-simulation-dataset
57
+ instruction: >
58
+ Generate or load a statistically meaningful simulation dataset:
59
+ - Minimum 1,000 trading days of price + volume data per instrument
60
+ - Must include at least 3 distinct market regimes: trending, mean-reverting, high-volatility
61
+ - Data must pass stationarity and regime-detection checks
62
+ - Use real historical data or a calibrated GBM/Heston model with documented parameters
63
+ Do NOT use synthetic uncalibrated random data — this produces misleading validation.
64
+ contract: >
65
+ simulation-dataset.parquet or simulation-dataset.csv exists.
66
+ Row count ≥ 1,000 per instrument. Regime labels column present.
67
+ Stationarity check (ADF) passes. Volatility distribution is documented.
68
+ tools: ["pandas", "numpy", "yfinance", "scipy.stats.adfuller", "hmmlearn"]
69
+ expected_output: "simulation-dataset.csv: date, open, high, low, close, volume, regime_label"
70
+ pass_criterion: "Row count ≥ 1000; ADF p-value < 0.05 on returns series"
71
+
72
+ - id: simulation
73
+ title: "Monte Carlo Scenario Analysis Across Market Regimes"
74
+ rationale: >
75
+ A model that works on one market regime and fails on others is not production-grade.
76
+ Monte Carlo across regimes closes stochastic uncertainty by constructing a confidence
77
+ interval for every statistical contract across the full scenario space.
78
+ steps:
79
+ - id: run-monte-carlo-per-regime
80
+ instruction: >
81
+ For each market regime in the simulation dataset, run 10,000 Monte Carlo
82
+ iterations of the model. Each iteration uses a bootstrapped price path from
83
+ that regime. Record the distribution of each model output (VaR, CVaR, Sharpe, etc.).
84
+ Use multiprocessing — 10,000 iterations must complete in < 60 seconds.
85
+ contract: >
86
+ Monte Carlo output exists per regime with ≥ 10,000 iterations.
87
+ Runtime < 60 seconds on a 4-core machine.
88
+ tools: ["numpy.random.bootstrap", "scipy.stats", "multiprocessing", "joblib"]
89
+ expected_output: "mc-results-{regime}.json: {n_iterations, metric_distributions: {VaR: {mean, p5, p95}, ...}}"
90
+ pass_criterion: "File exists per regime; n_iterations ≥ 10000"
91
+
92
+ - id: assert-statistical-contracts
93
+ instruction: >
94
+ For each statistical contract in docs/statistical-contracts.md, assert it holds
95
+ at the specified confidence level across all regimes:
96
+ - VaR: p95 of the VaR distribution ≤ contracted bound
97
+ - CVaR: same approach
98
+ - Sharpe: p5 of the Sharpe distribution ≥ contracted bound
99
+ Any contract that fails at p5/p95 is a FAIL — not just the mean.
100
+ contract: "All statistical contracts hold at the contracted confidence level across all regimes"
101
+ tools: ["python scripts/assert-contracts.py", "scipy.stats"]
102
+ expected_output: "contract-report.json: [{metric, regime, contracted_bound, p5_actual, p95_actual, result}]"
103
+ pass_criterion: "All result fields = PASS in contract-report.json"
104
+
105
+ - id: hyperparameter-search
106
+ title: "Constrained Heuristic Search with Pruning and Warm Runs"
107
+ rationale: >
108
+ Hyperparameters cannot be derived analytically. They must be found by search.
109
+ Random search without pruning wastes compute on clearly bad regions.
110
+ Pruning eliminates unpromising trials early. Warm runs reuse the best
111
+ checkpoint from a previous search run to avoid starting from scratch.
112
+ steps:
113
+ - id: warm-run-from-prior
114
+ instruction: >
115
+ Before launching a full hyperparameter search, load the best parameters
116
+ from the previous run (warm-run-checkpoint.json). If no checkpoint exists,
117
+ use domain-expert priors from docs/parameter-space.md as the starting point.
118
+ Run the model with the warm start and record its baseline performance.
119
+ contract: >
120
+ warm-run-checkpoint.json contains the best parameters from the last search.
121
+ Warm-start baseline performance is recorded before the new search begins.
122
+ tools: ["optuna", "hyperopt", "ray.tune", "json"]
123
+ expected_output: "warm-run-baseline.json: {params, sharpe, VaR, CVaR, regime_scores}"
124
+ pass_criterion: "warm-run-baseline.json exists; sharpe > 0 (model is not trivially broken)"
125
+
126
+ - id: run-pruned-search
127
+ instruction: >
128
+ Run a Bayesian hyperparameter search with early stopping (pruning):
129
+ - Use Optuna with a MedianPruner or HyperbandPruner
130
+ - Max 200 trials total; trials that fall below the 25th percentile of completed
131
+ trials' primary metric at the 25% epoch mark are pruned immediately
132
+ - Search within the bounds defined in docs/parameter-space.md
133
+ - Forbidden regions are enforced as hard constraints (trial returns -inf)
134
+ contract: >
135
+ Search completes in ≤ 200 trials. Best trial improves on warm-run baseline.
136
+ No forbidden-region parameter combinations appear in the top 10 trials.
137
+ tools: ["optuna", "optuna.pruners.HyperbandPruner", "optuna.samplers.TPESampler"]
138
+ expected_output: "search-results.json: {best_params, best_score, n_trials, n_pruned, improvement_over_baseline}"
139
+ pass_criterion: "best_score > warm_run_baseline_score; no forbidden regions in top 10"
140
+
141
+ - id: validate-best-params-out-of-sample
142
+ instruction: >
143
+ Take the best parameters from the search and run them on a held-out
144
+ out-of-sample dataset (a market period not present in the simulation dataset).
145
+ The model must meet all statistical contracts on this out-of-sample data.
146
+ In-sample performance that does not generalize is overfitting — report it as FAIL.
147
+ contract: >
148
+ Out-of-sample dataset covers ≥ 252 trading days not in the training set.
149
+ All statistical contracts hold on the out-of-sample data.
150
+ tools: ["pandas", "scripts/run-oos-validation.py"]
151
+ expected_output: "oos-validation.json: {params, oos_sharpe, oos_VaR, oos_CVaR, oos_max_drawdown, all_contracts_pass}"
152
+ pass_criterion: "all_contracts_pass = true in oos-validation.json"
153
+
154
+ - id: simulation-invariants
155
+ title: "Post-Run Simulation Invariants (Silent Bug Detection)"
156
+ rationale: >
157
+ Financial simulations fail in two directions, both silently.
158
+ Category A (Silent Loss): strategy runs, earns nothing — unlinked accounting, stuck state machine.
159
+ Category B (Silent Gain): inflated returns from accounting errors or look-ahead leaks.
160
+ These 6 invariants run at finalize() and print alongside standard metrics.
161
+ steps:
162
+ - id: pnl-decomposition
163
+ instruction: >
164
+ Assert: fee_income + price_income == total_pnl ± 1%.
165
+ A gap > 1% means an accounting component is unlinked — it exists in the ledger
166
+ but never flows into the reported total. Category A: Silent Loss Bug.
167
+ contract: "abs((fee_income + price_income) - total_pnl) / abs(total_pnl) < 0.01"
168
+ tools: ["simulation harness finalize()"]
169
+ expected_output: "[INVARIANT PASS] P&L decomposition: fee_income + price_income ≈ total_pnl"
170
+ pass_criterion: "Ratio within 1% tolerance; printed in finalize output"
171
+
172
+ - id: fee-time-ratio
173
+ instruction: >
174
+ Assert: total_fees / active_hours is above the minimum activity threshold.
175
+ Near-zero fees after 1000h of 'running' = instrument was never created.
176
+ The strategy ran against nothing. Category A: Silent Loss Bug.
177
+ contract: "total_fees / active_hours > min_fee_rate_per_hour (config)"
178
+ tools: ["simulation harness finalize()"]
179
+ expected_output: "[INVARIANT PASS] Fee-time ratio: fees proportional to active time"
180
+ pass_criterion: "Fee rate above threshold; flagged as [INVARIANT FAIL] if below"
181
+
182
+ - id: state-concentration
183
+ instruction: >
184
+ Assert: no single non-productive state consumes >80% of simulation time.
185
+ A stuck state machine is not a conservative strategy — it is a broken one.
186
+ Category A: Silent Loss Bug.
187
+ contract: "max(time_per_state[non_productive_states]) / total_time < 0.80"
188
+ tools: ["simulation harness finalize()"]
189
+ expected_output: "[INVARIANT PASS] State concentration: no stuck non-productive state"
190
+ pass_criterion: "All non-productive states below 80% time threshold"
191
+
192
+ - id: return-plausibility
193
+ instruction: >
194
+ For a market-neutral strategy, assert: annual_return < 200% AND
195
+ total_pnl / fee_income < 10×. Returns exceeding these bounds are almost
196
+ certainly bugs, not alpha. Category B: Silent Gain Bug.
197
+ contract: "annualized_return < 2.0 AND total_pnl / fee_income < 10.0"
198
+ tools: ["simulation harness finalize()"]
199
+ expected_output: "[INVARIANT PASS] Return plausibility: within market-neutral bounds"
200
+ pass_criterion: "Both bounds satisfied; failing bound printed with actual values"
201
+
202
+ - id: delta-neutrality
203
+ instruction: >
204
+ Assert: avg |net_delta| while in the primary running state is below the
205
+ delta tolerance threshold (config). High delta = hedge broken or bootstrap bypassed.
206
+ Category B: Silent Gain Bug — bootstrapping errors inflate returns mid-simulation.
207
+ contract: "mean(abs(net_delta) | state == primary_running_state) < delta_tolerance"
208
+ tools: ["simulation harness finalize()"]
209
+ expected_output: "[INVARIANT PASS] Delta neutrality: avg |net_delta| within tolerance"
210
+ pass_criterion: "Average absolute delta below configured tolerance"
211
+
212
+ - id: instrument-balance
213
+ instruction: >
214
+ Assert: max(instrument_notionals) / min(instrument_notionals) < 3×.
215
+ If one sub-strategy is 5× larger than another at finalize, allocation logic failed.
216
+ Category B: Silent Gain Bug — one instrument dominates and distorts aggregate returns.
217
+ contract: "max(instrument_notionals) / min(instrument_notionals) < 3.0"
218
+ tools: ["simulation harness finalize()"]
219
+ expected_output: "[INVARIANT PASS] Instrument balance: all sub-strategies within 3× of each other"
220
+ pass_criterion: "Notional ratio below 3×; actual ratio printed regardless"
221
+
222
+ - id: evidence
223
+ title: "Persist Backtests, Param Checkpoints, and Contract Reports"
224
+ rationale: >
225
+ Financial model evidence is audit-critical. Regulators and risk teams require
226
+ reproducible backtests with exact parameter sets and dataset provenance.
227
+ steps:
228
+ - id: commit-simulation-artifacts
229
+ instruction: >
230
+ Commit to docs/backtests/:
231
+ - contract-report.json (all statistical contract assertions)
232
+ - oos-validation.json (out-of-sample results)
233
+ - simulation-dataset metadata (NOT the full dataset if large — commit a hash and S3/GCS URI)
234
+ - warm-run-checkpoint.json (best params for next run warm start)
235
+ Include dataset SHA-256 hash in the commit message for reproducibility.
236
+ contract: "docs/backtests/ exists with contract-report.json and oos-validation.json"
237
+ tools: ["git", "sha256sum"]
238
+ expected_output: "Committed files with dataset hash in commit message"
239
+ pass_criterion: "Files present in docs/backtests/; contract-report parsed successfully"