kalshi-trading-bot-cli 2.1.6 → 2.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/backtest/discovery.ts +110 -8
- package/src/backtest/metrics.ts +235 -8
- package/src/backtest/renderer.ts +91 -3
- package/src/backtest/types.ts +92 -1
- package/src/commands/analyze.ts +140 -39
- package/src/commands/backtest.ts +90 -29
- package/src/commands/help.ts +9 -3
- package/src/commands/index.ts +8 -0
- package/src/commands/parse-args.ts +23 -1
- package/src/commands/review.ts +28 -17
- package/src/scan/octagon-client.ts +130 -2
- package/src/tools/v2/portfolio-review.ts +1 -1
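package/package.json
CHANGED
(the diff body for package/package.json is not present in this capture)
package/src/backtest/discovery.ts
CHANGED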
|
@@ -1,9 +1,92 @@
|
|
|
1
1
|
import type { Database } from 'bun:sqlite';
|
|
2
2
|
import { callKalshiApi } from '../tools/kalshi/api.js';
|
|
3
3
|
import type { KalshiMarket } from '../tools/kalshi/types.js';
|
|
4
|
+
import { fetchAllOctagonEvents } from '../scan/octagon-events-api.js';
|
|
4
5
|
|
|
5
6
|
const CONCURRENCY = 10;
|
|
6
7
|
|
|
8
|
+
/**
 * Origin of the backtest universe: `'api'` or `'local'`.
 */
export type UniverseSource = 'api' | 'local';

/** One event in the backtest universe. */
export interface UniverseEntry {
  /** Event ticker identifying the event. */
  event_ticker: string;
  /** Category label for the event, or `null` when none is known. */
  category: string | null;
}

/** A resolved backtest universe together with a record of where it came from. */
export interface Universe {
  /** Events that make up the universe. */
  events: UniverseEntry[];
  /** Which source produced `events`. */
  source: UniverseSource;
  /** Human-readable one-line summary of the universe. */
  description: string;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Work out which events a backtest run will score.
 *
 * Sources:
 * - `api` (the default): every event returned by `fetchAllOctagonEvents`,
 *   i.e. Octagon's covered-events list. This is systematic and reproducible
 *   across machines because it does not depend on what this install happened
 *   to analyze earlier. `hasHistory=true` is intentionally NOT requested: a
 *   prior audit found that flag silently dropped 373 of 662 events. Events
 *   without a usable snapshot are instead filtered downstream, where
 *   `selectSnapshotByDate` returns null and they are skipped cheaply.
 * - `local`: the legacy behavior, reading from the local `octagon_reports`
 *   log. It mirrors past usage of this machine rather than a defined
 *   universe; handy for offline runs and for comparing with older backtests.
 *
 * KNOWN LIMITATION: the API reports *today's* covered universe, not the
 * universe as of the entry date. Events that lost coverage mid-window simply
 * disappear from the backtest (survivorship bias at the universe level). A
 * true point-in-time universe needs `events?as_of=<date>` support upstream.
 *
 * @param db   Database handle; only queried on the `local` path.
 * @param opts Optional `source` (defaults to `'api'`) and `category` filter.
 *             On the `api` path the category is matched as a case-insensitive
 *             substring; on the `local` path it is handed to `buildEventQuery`.
 * @returns The resolved events plus the source and a descriptive summary line.
 */
export async function resolveUniverse(
  db: Database,
  opts?: { source?: UniverseSource; category?: string },
): Promise<Universe> {
  const source = opts?.source ?? 'api';
  const categoryFilter = opts?.category;

  if (source !== 'api') {
    // Legacy local path
    const { query, params } = buildEventQuery('', categoryFilter);
    const rows = db.query(query).all(params) as Array<{ event_ticker: string; category: string | null }>;
    const localEvents = rows.map((row) => ({ event_ticker: row.event_ticker, category: row.category }));
    return {
      events: localEvents,
      source,
      description: `${rows.length} events from local octagon_reports (NON-SYSTEMATIC — reflects past usage of this machine)`,
    };
  }

  const covered = await fetchAllOctagonEvents();
  const mapped: UniverseEntry[] = covered.map((evt) => {
    return { event_ticker: evt.event_ticker, category: evt.series_category ?? null };
  });

  let events = mapped;
  if (categoryFilter) {
    const needle = categoryFilter.toLowerCase();
    // Entries with a null category never match a category filter.
    events = mapped.filter((entry) => Boolean(entry.category?.toLowerCase().includes(needle)));
  }

  return {
    events,
    source,
    description: `${events.length} events from Octagon API (systematic universe)`,
  };
}
|
|
72
|
+
|
|
73
|
+
/**
 * Fetch the Kalshi market payload for every event in the universe, once per
 * distinct `event_ticker`.
 *
 * The returned map is keyed by `event_ticker`, so `discoverSettledMarkets` and
 * `discoverOpenMarkets` can both read from it instead of each re-fetching
 * every event payload — which halves the Kalshi call count.
 *
 * Tickers are de-duplicated before fan-out: a ticker that appears more than
 * once in `universe` would otherwise be fetched repeatedly only to overwrite
 * the same map key.
 *
 * @param universe Events whose payloads should be loaded.
 * @returns Map from `event_ticker` to the markets returned by `fetchEventMarkets`.
 */
export async function fetchEventPayloads(
  universe: UniverseEntry[],
): Promise<Map<string, KalshiMarket[]>> {
  const uniqueTickers = Array.from(new Set(universe.map((entry) => entry.event_ticker)));
  const out = new Map<string, KalshiMarket[]>();
  await parallelMap(uniqueTickers, async (ticker) => {
    const markets = await fetchEventMarkets(ticker);
    out.set(ticker, markets);
  }, CONCURRENCY);
  return out;
}
|
|
89
|
+
|
|
7
90
|
export interface SettledMarket {
|
|
8
91
|
ticker: string;
|
|
9
92
|
event_ticker: string;
|
|
@@ -103,13 +186,23 @@ export async function parallelMap<T, R>(
|
|
|
103
186
|
*/
|
|
104
187
|
export async function discoverSettledMarkets(
|
|
105
188
|
db: Database,
|
|
106
|
-
opts?: {
|
|
189
|
+
opts?: {
|
|
190
|
+
category?: string;
|
|
191
|
+
/** Pre-resolved universe + payloads (Phase 4 path). When omitted, falls back to the legacy local SQL path. */
|
|
192
|
+
universe?: Universe;
|
|
193
|
+
payloads?: Map<string, KalshiMarket[]>;
|
|
194
|
+
},
|
|
107
195
|
): Promise<SettledMarket[]> {
|
|
108
|
-
|
|
109
|
-
|
|
196
|
+
let events: Array<{ event_ticker: string; category: string | null }>;
|
|
197
|
+
if (opts?.universe) {
|
|
198
|
+
events = opts.universe.events;
|
|
199
|
+
} else {
|
|
200
|
+
const { query, params } = buildEventQuery('', opts?.category);
|
|
201
|
+
events = db.query(query).all(params) as Array<{ event_ticker: string; category: string | null }>;
|
|
202
|
+
}
|
|
110
203
|
|
|
111
204
|
const batchResults = await parallelMap(events, async ({ event_ticker, category: cat }) => {
|
|
112
|
-
const markets = await fetchEventMarkets(event_ticker);
|
|
205
|
+
const markets = opts?.payloads?.get(event_ticker) ?? await fetchEventMarkets(event_ticker);
|
|
113
206
|
const settled: SettledMarket[] = [];
|
|
114
207
|
|
|
115
208
|
for (const m of markets) {
|
|
@@ -137,13 +230,22 @@ export async function discoverSettledMarkets(
|
|
|
137
230
|
*/
|
|
138
231
|
export async function discoverOpenMarkets(
|
|
139
232
|
db: Database,
|
|
140
|
-
opts?: {
|
|
233
|
+
opts?: {
|
|
234
|
+
category?: string;
|
|
235
|
+
universe?: Universe;
|
|
236
|
+
payloads?: Map<string, KalshiMarket[]>;
|
|
237
|
+
},
|
|
141
238
|
): Promise<OpenMarket[]> {
|
|
142
|
-
|
|
143
|
-
|
|
239
|
+
let events2: Array<{ event_ticker: string; category: string | null }>;
|
|
240
|
+
if (opts?.universe) {
|
|
241
|
+
events2 = opts.universe.events;
|
|
242
|
+
} else {
|
|
243
|
+
const { query: q2, params: p2 } = buildEventQuery('', opts?.category);
|
|
244
|
+
events2 = db.query(q2).all(p2) as Array<{ event_ticker: string; category: string | null }>;
|
|
245
|
+
}
|
|
144
246
|
|
|
145
247
|
const batchResults = await parallelMap(events2, async ({ event_ticker, category: cat }) => {
|
|
146
|
-
const markets = await fetchEventMarkets(event_ticker);
|
|
248
|
+
const markets = opts?.payloads?.get(event_ticker) ?? await fetchEventMarkets(event_ticker);
|
|
147
249
|
const open: OpenMarket[] = [];
|
|
148
250
|
|
|
149
251
|
for (const m of markets) {
|
package/src/backtest/metrics.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { ScoredSignal, BacktestResult } from './types.js';
|
|
1
|
+
import type { ScoredSignal, BacktestResult, LegMetrics } from './types.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* Skill score: how much better Octagon is vs the market as a forecaster.
|
|
@@ -44,6 +44,68 @@ export function bootstrapCI(
|
|
|
44
44
|
return [stats[lo], stats[hi]];
|
|
45
45
|
}
|
|
46
46
|
|
|
47
|
+
/**
 * Cluster bootstrap — resamples GROUPS with replacement, not individual rows.
 *
 * Use when the unit of risk is the group rather than the row. Kalshi events
 * are multi-outcome: a single "Will the Fed cut N times?" event has a ladder
 * of NO contracts that all settle together, so five rows from one event are
 * really *one* underlying outcome. A row-level bootstrap (`bootstrapCI`)
 * treats those rows as independent and its interval shrinks with √(row
 * count); the effective sample size is closer to the event count, so the
 * honest interval is wider.
 *
 * Each iteration draws `groups.length` groups with replacement, concatenates
 * their indices and applies `statFn` to the pooled index list. The indices
 * refer to per-row data that `statFn` captures in its closure.
 *
 * Indices are appended one at a time instead of via `pooled.push(...group)`:
 * spreading a very large group into call arguments can exceed the engine's
 * argument-count limit and throw a `RangeError`.
 *
 * @param groups     One index list per cluster.
 * @param statFn     Statistic evaluated on the pooled indices of one resample.
 * @param iterations Number of bootstrap resamples (default 10,000).
 * @param alpha      Two-sided significance level (default 0.05 → 95% interval).
 * @returns `[lower, upper]` percentile bounds; `[0, 0]` when `groups` is empty.
 * @throws Error when `iterations` is not a positive finite integer or `alpha`
 *         is not a finite number strictly between 0 and 1.
 */
export function clusterBootstrapCI(
  groups: number[][],
  statFn: (sampleIndices: number[]) => number,
  iterations = 10_000,
  alpha = 0.05,
): [number, number] {
  if (groups.length === 0) return [0, 0];
  if (!Number.isFinite(iterations) || !Number.isInteger(iterations) || iterations <= 0) {
    throw new Error(`clusterBootstrapCI: iterations must be a finite integer > 0, got ${iterations}`);
  }
  if (!Number.isFinite(alpha) || alpha <= 0 || alpha >= 1) {
    throw new Error(`clusterBootstrapCI: alpha must be a finite number in (0, 1), got ${alpha}`);
  }

  const groupCount = groups.length;
  const stats: number[] = [];
  for (let iter = 0; iter < iterations; iter++) {
    const pooled: number[] = [];
    for (let draw = 0; draw < groupCount; draw++) {
      const picked = groups[Math.floor(Math.random() * groupCount)];
      for (const idx of picked) pooled.push(idx);
    }
    // A resample made only of empty groups contributes a neutral 0.
    stats.push(pooled.length === 0 ? 0 : statFn(pooled));
  }

  stats.sort((a, b) => a - b);
  // Percentile positions, clamped into the valid index range.
  const lastIdx = stats.length - 1;
  const lo = Math.min(Math.max(0, Math.floor((alpha / 2) * stats.length)), lastIdx);
  const hi = Math.min(Math.max(0, Math.floor((1 - alpha / 2) * stats.length)), lastIdx);
  return [stats[lo], stats[hi]];
}
|
|
93
|
+
|
|
94
|
+
/**
 * Partition item positions by `event_ticker`.
 *
 * Produces one index list per distinct ticker, in order of first appearance,
 * with indices ascending inside each list. The result is the `groups` input
 * for `clusterBootstrapCI` when rows belonging to the same event are
 * correlated (multi-outcome ladders, mutually-exclusive option sets).
 *
 * @param items Rows carrying an `event_ticker`.
 * @returns Index lists, one per event.
 */
function groupIndicesByEvent<T extends { event_ticker: string }>(items: T[]): number[][] {
  const buckets = items.reduce((acc, item, position) => {
    const existing = acc.get(item.event_ticker);
    if (existing === undefined) {
      acc.set(item.event_ticker, [position]);
    } else {
      existing.push(position);
    }
    return acc;
  }, new Map<string, number[]>());
  return Array.from(buckets.values());
}
|
|
108
|
+
|
|
47
109
|
/**
|
|
48
110
|
* Compute Brier score: ((forecast/100) - (outcome/100))²
|
|
49
111
|
* Both forecast and outcome are on 0-100 scale.
|
|
@@ -52,10 +114,162 @@ function brier(forecast: number, outcome: number): number {
|
|
|
52
114
|
return ((forecast / 100) - (outcome / 100)) ** 2;
|
|
53
115
|
}
|
|
54
116
|
|
|
117
|
+
/**
 * Entry-price bands (in cents) used by the within-band skill calculation.
 * Each band carries a display label plus its `lo` and `hi` bounds.
 */
const PRICE_BANDS: Array<{ label: string; lo: number; hi: number }> = (
  [
    [5, 20],
    [20, 40],
    [40, 60],
    [60, 80],
    [80, 95],
  ] as Array<[number, number]>
).map(([lo, hi]) => ({ label: `${lo}-${hi}¢`, lo, hi }));
|
|
125
|
+
|
|
126
|
+
/**
 * Zero-skill baselines evaluated on the same post-filter rows as the model.
 *
 * Motivation: Kalshi events are multi-outcome, so most resolved contracts
 * settle NO (one YES outcome, many NOs per event). A model that always picks
 * NO therefore scores a high hit rate by structure alone. The always-NO and
 * always-YES baselines expose that structural tilt so any model skill beyond
 * it becomes visible.
 *
 * Within-band skill: inside each entry-price band, the ROI of the model's NO
 * bets minus the always-NO ROI, then capital-weighted across bands by the
 * model's NO-side capital. Comparing inside a band also controls for the
 * model's entry-price mix.
 *
 * NOTE(review): "model NO bet" here is any signal with `edge_pp < 0`; no
 * minimum-edge threshold is applied, unlike `computeLegMetrics`. Confirm that
 * this is intended.
 * NOTE(review): a band with universe rows but no model NO bets still gets a
 * breakdown row, with `model_no_roi` 0 and `skill_delta_pp` equal to minus
 * the baseline ROI (in pp). It carries zero weight in the aggregate.
 *
 * @param signals Scored signals forming the comparison universe.
 * @returns Universe-wide always-NO / always-YES ROI and hit rate, the
 *          capital-weighted within-band skill (pp), and the per-band rows.
 */
function computeBaselines(signals: ScoredSignal[]): BacktestResult['baselines'] {
  type Leg = { pnl: number; capital: number; hit: boolean };

  // Guarded division shared by every ROI / rate below.
  const ratio = (numerator: number, denominator: number): number =>
    denominator > 0 ? numerator / denominator : 0;

  // Buy NO at entry: stake is (100 - entry YES price); value now is (100 - current YES price).
  const noPnl = (sig: ScoredSignal): Leg => {
    const capital = (100 - sig.market_then) / 100;
    const settlement = 100 - sig.market_now;
    return {
      pnl: (settlement - (100 - sig.market_then)) / 100,
      capital,
      hit: sig.market_now < sig.market_then,
    };
  };
  // Buy YES at entry: stake is the entry YES price; value now is the current YES price.
  const yesPnl = (sig: ScoredSignal): Leg => ({
    pnl: (sig.market_now - sig.market_then) / 100,
    capital: sig.market_then / 100,
    hit: sig.market_now > sig.market_then,
  });

  // Universe-wide always-NO / always-YES on the same post-filter rows.
  let noP = 0;
  let noC = 0;
  let noHits = 0;
  let yesP = 0;
  let yesC = 0;
  let yesHits = 0;
  for (const sig of signals) {
    const noLeg = noPnl(sig);
    noP += noLeg.pnl;
    noC += noLeg.capital;
    if (noLeg.hit) noHits++;

    const yesLeg = yesPnl(sig);
    yesP += yesLeg.pnl;
    yesC += yesLeg.capital;
    if (yesLeg.hit) yesHits++;
  }

  // Per-band comparison of the model's NO bets against always-NO.
  const breakdown: BacktestResult['baselines']['within_band_breakdown'] = [];
  let weightedDeltaNumer = 0;
  let weightedDeltaDenom = 0;
  for (const { label, lo, hi } of PRICE_BANDS) {
    const members = signals.filter((sig) => sig.market_then >= lo && sig.market_then < hi);
    if (members.length === 0) continue;

    let modelPnl = 0;
    let modelCap = 0;
    let modelCount = 0;
    let basePnl = 0;
    let baseCap = 0;
    for (const sig of members) {
      const noLeg = noPnl(sig);
      basePnl += noLeg.pnl;
      baseCap += noLeg.capital;
      // A model NO bet is a signal with negative edge; its stored pnl/capital are used as-is.
      if (sig.edge_pp < 0) {
        modelPnl += sig.pnl;
        modelCap += sig.capital;
        modelCount++;
      }
    }

    const modelRoi = ratio(modelPnl, modelCap);
    const baselineRoi = ratio(basePnl, baseCap);
    const deltaPp = (modelRoi - baselineRoi) * 100;
    breakdown.push({
      band: label,
      model_no_roi: modelRoi,
      always_no_roi: baselineRoi,
      skill_delta_pp: deltaPp,
      n_model: modelCount,
      n_universe: members.length,
    });
    weightedDeltaNumer += deltaPp * modelCap;
    weightedDeltaDenom += modelCap;
  }

  return {
    always_no_roi: ratio(noP, noC),
    always_no_hit_rate: ratio(noHits, signals.length),
    always_yes_roi: ratio(yesP, yesC),
    always_yes_hit_rate: ratio(yesHits, signals.length),
    within_band_skill_pp: ratio(weightedDeltaNumer, weightedDeltaDenom),
    within_band_breakdown: breakdown,
  };
}
|
|
214
|
+
|
|
215
|
+
/**
 * Scorecard for a single leg (resolved-only or unresolved-only signals).
 *
 * Only "edge" signals count: non-zero `edge_pp` whose magnitude is at least
 * `minEdgePp`. A hit means the market moved in the direction of the edge
 * (up for positive edge, down for negative edge). ROI is total P&L divided by
 * total capital. The hit-rate interval comes from `clusterBootstrapCI` over
 * groups built by `groupIndicesByEvent`.
 *
 * @param signals   Signals belonging to this leg.
 * @param minEdgePp Minimum absolute edge for a signal to count.
 * @returns Edge-signal count, hit rate with its interval, P&L, ROI and capital.
 */
function computeLegMetrics(signals: ScoredSignal[], minEdgePp: number): LegMetrics {
  const bets = signals.filter((sig) => sig.edge_pp !== 0 && Math.abs(sig.edge_pp) >= minEdgePp);
  const betCount = bets.length;

  // 1 when the market moved the way the edge pointed, otherwise 0.
  const outcomes = bets.map((sig): number => {
    const movedInFavor = sig.edge_pp > 0
      ? sig.market_now > sig.market_then
      : sig.market_now < sig.market_then;
    return movedInFavor ? 1 : 0;
  });
  const hitCount = outcomes.reduce((acc, outcome) => acc + outcome, 0);
  const hitRate = betCount > 0 ? hitCount / betCount : 0;

  const clusters = groupIndicesByEvent(bets);
  let hitRateCI: [number, number] = [0, 0];
  if (betCount > 0) {
    hitRateCI = clusterBootstrapCI(clusters, (sample) => {
      let total = 0;
      for (const i of sample) total += outcomes[i];
      return sample.length > 0 ? total / sample.length : 0;
    });
  }

  let pnl = 0;
  let totalCapital = 0;
  for (const sig of bets) {
    pnl += sig.pnl;
    totalCapital += sig.capital;
  }
  const roi = totalCapital > 0 ? pnl / totalCapital : 0;

  return {
    edge_signals: betCount,
    edge_hit_rate: hitRate,
    hit_rate_ci: hitRateCI,
    flat_bet_pnl: pnl,
    flat_bet_roi: roi,
    total_capital: totalCapital,
  };
}
|
|
250
|
+
|
|
251
|
+
/**
 * All-zero leg scorecard, used by `computeMetrics` for both legs when there
 * are no signals.
 *
 * NOTE(review): this is one shared object (including its `hit_rate_ci`
 * array), so a consumer that mutates a returned leg would affect every other
 * empty result — confirm that callers treat it as read-only.
 */
const EMPTY_LEG: LegMetrics = {
  edge_signals: 0,
  edge_hit_rate: 0,
  hit_rate_ci: [0, 0],
  flat_bet_pnl: 0,
  flat_bet_roi: 0,
  total_capital: 0,
};
|
|
259
|
+
|
|
260
|
+
/**
 * All-zero baselines with an empty per-band breakdown, used by
 * `computeMetrics` when there are no signals.
 *
 * NOTE(review): this is one shared object (including its
 * `within_band_breakdown` array) — confirm that callers treat it as
 * read-only.
 */
const EMPTY_BASELINES: BacktestResult['baselines'] = {
  always_no_roi: 0,
  always_no_hit_rate: 0,
  always_yes_roi: 0,
  always_yes_hit_rate: 0,
  within_band_skill_pp: 0,
  within_band_breakdown: [],
};
|
|
268
|
+
|
|
55
269
|
/**
|
|
56
270
|
* Compute all backtest metrics from a unified list of scored signals.
|
|
57
271
|
*/
|
|
58
|
-
export function computeMetrics(signals: ScoredSignal[], minEdgePp = 0.5): Omit<BacktestResult, 'subscription_notice'> {
|
|
272
|
+
export function computeMetrics(signals: ScoredSignal[], minEdgePp = 0.5): Omit<BacktestResult, 'subscription_notice' | 'signals_dropped_no_volume' | 'universe_source' | 'universe_size' | 'universe_description' | 'fee_model' | 'flat_bet_pnl_net' | 'flat_bet_roi_net'> {
|
|
59
273
|
const n = signals.length;
|
|
60
274
|
if (n === 0) {
|
|
61
275
|
return {
|
|
@@ -75,6 +289,9 @@ export function computeMetrics(signals: ScoredSignal[], minEdgePp = 0.5): Omit<B
|
|
|
75
289
|
flat_bet_roi: 0,
|
|
76
290
|
total_capital: 0,
|
|
77
291
|
signals: [],
|
|
292
|
+
baselines: EMPTY_BASELINES,
|
|
293
|
+
resolved_metrics: EMPTY_LEG,
|
|
294
|
+
unresolved_metrics: EMPTY_LEG,
|
|
78
295
|
};
|
|
79
296
|
}
|
|
80
297
|
|
|
@@ -84,10 +301,14 @@ export function computeMetrics(signals: ScoredSignal[], minEdgePp = 0.5): Omit<B
|
|
|
84
301
|
const brierOctagon = brierOctagonScores.reduce((a, b) => a + b, 0) / n;
|
|
85
302
|
const brierMarket = brierMarketScores.reduce((a, b) => a + b, 0) / n;
|
|
86
303
|
|
|
87
|
-
// Skill score with bootstrap CI
|
|
304
|
+
// Skill score with EVENT-CLUSTERED bootstrap CI. Why clustered: multi-
|
|
305
|
+
// outcome events (Fed-cut ladders, election option sets, price strikes)
|
|
306
|
+
// settle as a block — N contracts from one event aren't N independent
|
|
307
|
+
// observations. Row-level bootstrap shrinks the CI with sqrt(N rows)
|
|
308
|
+
// when the right denominator is sqrt(N events).
|
|
88
309
|
const skillScore = computeSkillScore(brierOctagon, brierMarket);
|
|
89
|
-
const
|
|
90
|
-
const skillCI =
|
|
310
|
+
const eventGroups = groupIndicesByEvent(signals);
|
|
311
|
+
const skillCI = clusterBootstrapCI(eventGroups, (sample) => {
|
|
91
312
|
let sumOctagon = 0;
|
|
92
313
|
let sumMarket = 0;
|
|
93
314
|
for (const idx of sample) {
|
|
@@ -112,13 +333,16 @@ export function computeMetrics(signals: ScoredSignal[], minEdgePp = 0.5): Omit<B
|
|
|
112
333
|
});
|
|
113
334
|
const hitRate = edgeCount > 0 ? hits.length / edgeCount : 0;
|
|
114
335
|
|
|
115
|
-
//
|
|
336
|
+
// Event-clustered hit rate CI on the EDGE signals only.
|
|
116
337
|
const hitRateData = edgeSignals.map(s => {
|
|
117
338
|
if (s.edge_pp > 0) return s.market_now > s.market_then ? 1 : 0;
|
|
118
339
|
return s.market_now < s.market_then ? 1 : 0;
|
|
119
340
|
});
|
|
120
|
-
const
|
|
121
|
-
|
|
341
|
+
const edgeEventGroups = groupIndicesByEvent(edgeSignals);
|
|
342
|
+
const hitRateCI = clusterBootstrapCI(edgeEventGroups, (sample) => {
|
|
343
|
+
let sum = 0;
|
|
344
|
+
for (const idx of sample) sum += hitRateData[idx];
|
|
345
|
+
return sample.length > 0 ? sum / sample.length : 0;
|
|
122
346
|
});
|
|
123
347
|
|
|
124
348
|
// P&L and capital-weighted ROI (matches Supabase methodology):
|
|
@@ -161,5 +385,8 @@ export function computeMetrics(signals: ScoredSignal[], minEdgePp = 0.5): Omit<B
|
|
|
161
385
|
flat_bet_roi: roi,
|
|
162
386
|
total_capital: totalCapital,
|
|
163
387
|
signals,
|
|
388
|
+
baselines: computeBaselines(signals),
|
|
389
|
+
resolved_metrics: computeLegMetrics(signals.filter((s) => s.resolved), minEdgePp),
|
|
390
|
+
unresolved_metrics: computeLegMetrics(signals.filter((s) => !s.resolved), minEdgePp),
|
|
164
391
|
};
|
|
165
392
|
}
|
package/src/backtest/renderer.ts
CHANGED
|
@@ -5,6 +5,16 @@ export interface FormatOpts {
|
|
|
5
5
|
minEdge?: number; // 0-1 scale, default 0.005 (0.5pp)
|
|
6
6
|
}
|
|
7
7
|
|
|
8
|
+
/**
 * Render a fractional ROI (0.123 = 12.3%) as a percentage with one decimal.
 * Values that are >= 0 get an explicit leading `+`; negative values keep the
 * minus sign produced by number formatting.
 */
function fmtRoi(roi: number): string {
  const percent = (roi * 100).toFixed(1);
  if (roi >= 0) {
    return `+${percent}%`;
  }
  return `${percent}%`;
}
|
|
12
|
+
|
|
13
|
+
/**
 * Render a percentage-point delta with one decimal and a `pp` suffix.
 * Values that are >= 0 get an explicit leading `+`.
 */
function fmtPp(pp: number): string {
  const prefix = pp >= 0 ? '+' : '';
  return [prefix, pp.toFixed(1), 'pp'].join('');
}
|
|
17
|
+
|
|
8
18
|
/**
|
|
9
19
|
* Format complete backtest result for terminal display.
|
|
10
20
|
*/
|
|
@@ -17,6 +27,20 @@ export function formatBacktestHuman(result: BacktestResult, opts?: FormatOpts):
|
|
|
17
27
|
|
|
18
28
|
const lines: string[] = [];
|
|
19
29
|
lines.push(`Octagon Backtest — ${result.days}-day lookback (${fromStr} – ${toStr})`);
|
|
30
|
+
lines.push(`Universe: ${result.universe_description}`);
|
|
31
|
+
let feeHeader: string;
|
|
32
|
+
switch (result.fee_model) {
|
|
33
|
+
case 'none':
|
|
34
|
+
feeHeader = 'none — output is GROSS (pre-fee)';
|
|
35
|
+
break;
|
|
36
|
+
case 'taker':
|
|
37
|
+
feeHeader = 'taker (entries charged Kalshi taker fee = 0.07·p·(1−p))';
|
|
38
|
+
break;
|
|
39
|
+
case 'maker':
|
|
40
|
+
feeHeader = 'maker (free-entry execution assumption — net P&L equals gross)';
|
|
41
|
+
break;
|
|
42
|
+
}
|
|
43
|
+
lines.push(`Fee model: ${feeHeader}`);
|
|
20
44
|
lines.push('══════════════════════════════════════════════════════════');
|
|
21
45
|
lines.push('');
|
|
22
46
|
|
|
@@ -47,9 +71,73 @@ export function formatBacktestHuman(result: BacktestResult, opts?: FormatOpts):
|
|
|
47
71
|
// lines.push('');
|
|
48
72
|
lines.push(` Edge signals ${result.edge_signals} (min edge: ${minEdgePp}pp)`);
|
|
49
73
|
if (result.edge_signals > 0) {
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
74
|
+
// Resolved settles at 0/100 — realized. Unresolved is marked to the
|
|
75
|
+
// current Kalshi price — paper P&L that can reverse. Splitting them
|
|
76
|
+
// makes it visible when one leg is carrying a weak other.
|
|
77
|
+
const r = result.resolved_metrics;
|
|
78
|
+
const u = result.unresolved_metrics;
|
|
79
|
+
if (r.edge_signals > 0) {
|
|
80
|
+
lines.push('');
|
|
81
|
+
lines.push(' RESOLVED (realized P&L)');
|
|
82
|
+
lines.push(` Hit rate ${(r.edge_hit_rate * 100).toFixed(1)}% [95% CI: ${(r.hit_rate_ci[0] * 100).toFixed(1)}% to ${(r.hit_rate_ci[1] * 100).toFixed(1)}%, event-clustered] n=${r.edge_signals}`);
|
|
83
|
+
lines.push(` Flat-bet P&L ${fmtRoi(r.flat_bet_roi)} ROI (${r.flat_bet_pnl >= 0 ? '+' : ''}$${r.flat_bet_pnl.toFixed(2)} on $${r.total_capital.toFixed(2)} capital)`);
|
|
84
|
+
}
|
|
85
|
+
if (u.edge_signals > 0) {
|
|
86
|
+
lines.push('');
|
|
87
|
+
lines.push(' UNRESOLVED (mark-to-market — paper P&L)');
|
|
88
|
+
lines.push(` Directional drift ${(u.edge_hit_rate * 100).toFixed(1)}% [95% CI: ${(u.hit_rate_ci[0] * 100).toFixed(1)}% to ${(u.hit_rate_ci[1] * 100).toFixed(1)}%, event-clustered] n=${u.edge_signals}`);
|
|
89
|
+
lines.push(` M2M P&L ${fmtRoi(u.flat_bet_roi)} ROI (${u.flat_bet_pnl >= 0 ? '+' : ''}$${u.flat_bet_pnl.toFixed(2)} on $${u.total_capital.toFixed(2)} capital)`);
|
|
90
|
+
}
|
|
91
|
+
if (r.edge_signals > 0 && u.edge_signals > 0) {
|
|
92
|
+
lines.push('');
|
|
93
|
+
lines.push(' COMBINED (both legs blended — interpret with care)');
|
|
94
|
+
lines.push(` Hit rate ${(result.edge_hit_rate * 100).toFixed(1)}% [95% CI: ${(result.hit_rate_ci[0] * 100).toFixed(1)}% to ${(result.hit_rate_ci[1] * 100).toFixed(1)}%, event-clustered]`);
|
|
95
|
+
lines.push(` Flat-bet P&L ${fmtRoi(result.flat_bet_roi)} ROI (${result.flat_bet_pnl >= 0 ? '+' : ''}$${result.flat_bet_pnl.toFixed(2)} on $${result.total_capital.toFixed(2)} capital)`);
|
|
96
|
+
}
|
|
97
|
+
// Fee drag — show only when --fees is on so existing output is unchanged.
|
|
98
|
+
if (result.fee_model !== 'none' && result.flat_bet_pnl !== result.flat_bet_pnl_net) {
|
|
99
|
+
const feeDrag = result.flat_bet_pnl - result.flat_bet_pnl_net;
|
|
100
|
+
lines.push('');
|
|
101
|
+
lines.push(` Fees applied (${result.fee_model})`);
|
|
102
|
+
lines.push(` Gross P&L ${result.flat_bet_pnl >= 0 ? '+' : ''}$${result.flat_bet_pnl.toFixed(2)} (${fmtRoi(result.flat_bet_roi)} ROI)`);
|
|
103
|
+
lines.push(` Fee drag -$${feeDrag.toFixed(2)}`);
|
|
104
|
+
lines.push(` Net P&L ${result.flat_bet_pnl_net >= 0 ? '+' : ''}$${result.flat_bet_pnl_net.toFixed(2)} (${fmtRoi(result.flat_bet_roi_net)} ROI)`);
|
|
105
|
+
} else if (r.edge_signals === 0 && u.edge_signals === 0) {
|
|
106
|
+
// No edge signals on either leg — fall back to the old single-line view.
|
|
107
|
+
lines.push(` Hit rate ${(result.edge_hit_rate * 100).toFixed(1)}% [95% CI: ${(result.hit_rate_ci[0] * 100).toFixed(1)}% to ${(result.hit_rate_ci[1] * 100).toFixed(1)}%]`);
|
|
108
|
+
lines.push(` Flat-bet P&L ${result.flat_bet_pnl >= 0 ? '+' : ''}$${result.flat_bet_pnl.toFixed(2)} (ROI: ${fmtRoi(result.flat_bet_roi)})`);
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
// ─── Zero-skill baselines ─────────────────────────────────────────────
|
|
113
|
+
// The headline ROI / hit rate can look strong purely from the universe's
|
|
114
|
+
// structural NO tilt (multi-outcome events resolve mostly NO). These two
|
|
115
|
+
// baselines run the same post-filter universe under zero-skill strategies
|
|
116
|
+
// so the user can see whether the model adds anything.
|
|
117
|
+
const b = result.baselines;
|
|
118
|
+
if (result.signals.length > 0) {
|
|
119
|
+
lines.push('');
|
|
120
|
+
lines.push(' Zero-skill baselines (same universe, no model):');
|
|
121
|
+
lines.push(` Always-NO ROI ${fmtRoi(b.always_no_roi)} hit rate ${(b.always_no_hit_rate * 100).toFixed(1)}%`);
|
|
122
|
+
lines.push(` Always-YES ROI ${fmtRoi(b.always_yes_roi)} hit rate ${(b.always_yes_hit_rate * 100).toFixed(1)}%`);
|
|
123
|
+
lines.push(` Within-band skill ${fmtPp(b.within_band_skill_pp)} (model NO-ROI minus always-NO ROI, capital-weighted across entry-price bands)`);
|
|
124
|
+
// Per-band breakdown when at least one band has model bets
|
|
125
|
+
if (b.within_band_breakdown.some((r) => r.n_model > 0)) {
|
|
126
|
+
lines.push('');
|
|
127
|
+
lines.push(' Per-band skill breakdown:');
|
|
128
|
+
lines.push(` ${'Band'.padEnd(8)} ${'Model NO ROI'.padStart(13)} ${'Always-NO ROI'.padStart(14)} ${'Delta'.padStart(9)} ${'n_model'.padStart(7)} ${'n_total'.padStart(7)}`);
|
|
129
|
+
for (const row of b.within_band_breakdown) {
|
|
130
|
+
if (row.n_universe === 0) continue;
|
|
131
|
+
const delta = `${row.skill_delta_pp >= 0 ? '+' : ''}${row.skill_delta_pp.toFixed(1)}pp`;
|
|
132
|
+
lines.push(` ${row.band.padEnd(8)} ${fmtRoi(row.model_no_roi).padStart(13)} ${fmtRoi(row.always_no_roi).padStart(14)} ${delta.padStart(9)} ${String(row.n_model).padStart(7)} ${String(row.n_universe).padStart(7)}`);
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
// Coverage cost of the strict (no lifetime-volume look-ahead) volume gate.
|
|
138
|
+
if (result.signals_dropped_no_volume > 0) {
|
|
139
|
+
lines.push('');
|
|
140
|
+
lines.push(` Signals dropped: ${result.signals_dropped_no_volume} (no per-contract volume in Octagon snapshot; lifetime-volume fallback removed to avoid look-ahead bias)`);
|
|
53
141
|
}
|
|
54
142
|
|
|
55
143
|
// Resolved detail table
|