slice-tournament-zoo 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/types.ts ADDED
@@ -0,0 +1,305 @@
1
+ /**
2
+ * STZ domain types — the data model that the whole harness reads/writes.
3
+ *
4
+ * Design anchor: N1 (auditability) + N6 (determinism/replay). Every decision
5
+ * the harness makes must be reconstructible from the markdown tree + git +
6
+ * the `state.json` event sequence. These types are the schema for that state.
7
+ */
8
+
9
+ /** The 8 phases of the per-slice pipeline (F1). Order is significant. */
10
+ export const PHASES = [
11
+ "elicitation",
12
+ "research",
13
+ "ground-truth-validation",
14
+ "standards",
15
+ "test-authoring",
16
+ "planning",
17
+ "tournament",
18
+ "judgment",
19
+ ] as const;
20
+
21
+ export type Phase = (typeof PHASES)[number];
22
+
23
+ /** Lifecycle status of a single phase within a slice. */
24
+ export type PhaseStatus = "pending" | "running" | "done" | "failed";
25
+
26
+ /** Where the bounded failure-escalation FSM (F14) currently sits. */
27
+ export type EscalationStage =
28
+ | "normal"
29
+ | "grpo-retry"
30
+ | "replan"
31
+ | "halted";
32
+
33
+ /** Specimen identifiers default to a..d for N=4 (F6). */
34
+ export type SpecimenId = string;
35
+
36
+ /**
37
+ * A machine-checkable success predicate (F2). Elicitation may not exit until
38
+ * every quantitative success criterion is expressed as one of these — no
39
+ * prose-only acceptance.
40
+ */
41
+ export interface DonePredicate {
42
+ /** Stable id, e.g. "p95_latency". */
43
+ id: string;
44
+ /** The predicate expression, e.g. "p95_latency_ms < 200". */
45
+ expr: string;
46
+ /** How it is checked: a sealed test, a metric threshold, a schema match. */
47
+ kind: "test" | "metric" | "schema";
48
+ }
49
+
50
+ /** Trace tier declared per slice (F11). */
51
+ export type TraceTier = "minimal" | "otel";
52
+
53
+ /**
54
+ * Per-slice manifest frontmatter (F5). The orchestrator loads this summary,
55
+ * never the full slice body (N2 progressive disclosure).
56
+ */
57
+ export interface SliceManifest {
58
+ id: string; // "slice-01"
59
+ name: string; // "elicitation-subagent"
60
+ /** Interface contract the slice implements (F4). Prose/signature surface. */
61
+ contract: string;
62
+ donePredicates: DonePredicate[];
63
+ traceTier: TraceTier;
64
+ /** Complexity estimate 1..5 (F15) — drives budgeting. */
65
+ complexity: number;
66
+ /** Slice ids this one depends on (DAG ordering, F5). */
67
+ dependsOn: string[];
68
+ /** Judge config (F7): votes per pairwise comparison. */
69
+ judge: { votesPerPair: number };
70
+ /** ~200-token summary for progressive disclosure (N2). */
71
+ summary: string;
72
+ }
73
+
74
+ /** Result of running one specimen through the eval-gate (F7 stage 1). */
75
+ export interface EvalResult {
76
+ specimen: SpecimenId;
77
+ /** Did it pass the sealed held-out suite? Gate failures are eliminated. */
78
+ passedGate: boolean;
79
+ /** 0..1 fraction of sealed tests passed. */
80
+ testPassRate: number;
81
+ /** 0..1 code coverage (F11). */
82
+ coverage: number;
83
+ /** 0..1 mutation survival rate; lower is better (F11). NOTE(review): the name says "score", which usually means kill rate (higher is better) — confirm which one producers write. */
84
+ mutationScore: number;
85
+ /** Hack-pattern findings (F10/L3). Non-empty ⇒ disqualified. */
86
+ hackFindings: HackFinding[];
87
+ }
88
+
89
+ /** A single anti-reward-hacking finding (F10 / L3). */
90
+ export interface HackFinding {
91
+ specimen: SpecimenId;
92
+ pattern: HackPattern;
93
+ /** File + line where the pattern was detected. */
94
+ location: string;
95
+ /** Remediation context re-injected into the next prompt on replan (F14). */
96
+ remediation: string;
97
+ }
98
+
99
+ export type HackPattern =
100
+ | "hardcoded-test-input"
101
+ | "assertion-mutation"
102
+ | "test-skip"
103
+ | "fixture-keyed-branch"
104
+ | "network-bypass";
105
+
106
+ /** A single pairwise judge vote (F7 stage 2). */
107
+ export interface PairwiseVote {
108
+ a: SpecimenId;
109
+ b: SpecimenId;
110
+ /** Winner of this vote. */
111
+ winner: SpecimenId;
112
+ }
113
+
114
+ /** GRPO group-relative advantage for one specimen (F8). */
115
+ export interface Advantage {
116
+ specimen: SpecimenId;
117
+ reward: number;
118
+ advantage: number;
119
+ }
120
+
121
+ /** Final ranking output of the judgment phase. */
122
+ export interface Judgment {
123
+ /** Specimens ordered best→worst among gate-passers. */
124
+ ranking: SpecimenId[];
125
+ winner: SpecimenId | null;
126
+ advantages: Advantage[];
127
+ votes: PairwiseVote[];
128
+ }
129
+
130
+ /** One persisted LLM/subagent call for replay (N6). */
131
+ export interface CallRecord {
132
+ id: string;
133
+ phase: Phase;
134
+ role: "specimen" | "judge" | "test-author" | "documenter" | "elicitor" | "researcher" | "planner";
135
+ model: string;
136
+ temperature: number;
137
+ seed: number | null;
138
+ promptTokens: number;
139
+ completionTokens: number;
140
+ /** Monotonic sequence index for deterministic replay ordering. */
141
+ seq: number;
142
+ }
143
+
144
+ /** Per-slice budget derived from complexity (F15, N5). */
145
+ export interface Budget {
146
+ tokenCap: number;
147
+ wallClockMs: number;
148
+ tokensSpent: number;
149
+ }
150
+
151
+ /** A structured state-transition event (N1: replayable event sequence). */
152
+ export interface StateEvent {
153
+ seq: number;
154
+ phase: Phase | "lifecycle";
155
+ kind: string;
156
+ detail: string;
157
+ }
158
+
159
+ /**
160
+ * `state.json` — the per-slice durable checkpoint (F16). Combined with git and
161
+ * the markdown tree it is sufficient to replay any decision (N1) and to resume
162
+ * after a crash from the last committed phase.
163
+ */
164
+ export interface SliceState {
165
+ schemaVersion: 1;
166
+ sliceId: string;
167
+ currentPhase: Phase;
168
+ phaseStatus: Record<Phase, PhaseStatus>;
169
+ escalation: EscalationStage;
170
+ /** GRPO retries consumed so far; `replanCount` below counts replans (ceilings enforced by F14). */
171
+ retryCount: number;
172
+ replanCount: number;
173
+ activeSpecimens: SpecimenId[];
174
+ budget: Budget;
175
+ /** Append-only event log — the replay spine. */
176
+ events: StateEvent[];
177
+ /** Running count of ledger calls — a number, not the records themselves (full records under 90-audit/calls). */
178
+ callCount: number;
179
+ /** Set when escalation reaches "halted". */
180
+ failureReport: string | null;
181
+ }
182
+
183
+ // ── Project-level pipeline (multi-slice driver) ─────────────────────────────
184
+
185
+ /**
186
+ * Project-level pipeline phases — run ONCE for the whole project, before and
187
+ * around the per-slice tournaments. Some names overlap with the per-slice
188
+ * `PHASES` (elicitation, research, standards) but the scope differs: project
189
+ * preparation vs. one slice's tournament. Kept as a separate constant list on purpose; do not overload `PHASES`.
190
+ */
191
+ export const PROJECT_PHASES = [
192
+ "elicitation",
193
+ "research",
194
+ "ground-truth",
195
+ "standards",
196
+ "testing-conventions",
197
+ "slice-disaggregation",
198
+ ] as const;
199
+
200
+ export type ProjectPhase = (typeof PROJECT_PHASES)[number];
201
+ export type ProjectPhaseStatus = "pending" | "done";
202
+
203
+ /** Per-slice rollup status as seen by the project driver. */
204
+ export type SliceRunStatus = "pending" | "running" | "done" | "halted";
205
+
206
+ /** One slice as registered in the project DAG. A thin pointer; the full
207
+ * SliceManifest still lives at 40-slices/<id>/manifest.json. */
208
+ export interface ProjectSliceEntry {
209
+ id: string;
210
+ name: string;
211
+ /** Slice ids this one depends on — reuses SliceManifest.dependsOn semantics. */
212
+ dependsOn: string[];
213
+ }
214
+
215
+ /** 00-intent/project.json — the declarative project spec + slice DAG. */
216
+ export interface ProjectManifest {
217
+ schemaVersion: 1;
218
+ projectId: string;
219
+ name: string;
220
+ summary: string;
221
+ slices: ProjectSliceEntry[];
222
+ }
223
+
224
+ /** A project-level structured event (N1 replay spine, project scope). */
225
+ export interface ProjectStateEvent {
226
+ seq: number;
227
+ phase: ProjectPhase | "lifecycle" | "slice";
228
+ kind: string;
229
+ detail: string;
230
+ }
231
+
232
+ /**
233
+ * 90-audit/project-state.json — the mutable project driver state. `sliceStatus`
234
+ * is a cache; `project-status` re-derives the authoritative value from each
235
+ * slice's own 40-slices/<id>/state.json (no dual-write, no drift).
236
+ */
237
+ export interface ProjectState {
238
+ schemaVersion: 1;
239
+ projectId: string;
240
+ phaseStatus: Record<ProjectPhase, ProjectPhaseStatus>;
241
+ sliceStatus: Record<string, SliceRunStatus>;
242
+ events: ProjectStateEvent[];
243
+ }
244
+
245
+ // ── Run configuration (0.3.0 — captured during elicitation, consumed
246
+ // downstream) ─────────────────────────────────────────────────────────────
247
+
248
+ /** How finely `/stz:slice` breaks the work into vertical slices. */
249
+ export type SlicingGranularity = "coarse" | "balanced" | "fine";
250
+
251
+ /** Mutation-testing bar for `/stz:tests`. */
252
+ export type MutationPolicy = "off" | "lenient" | "standard" | "strict";
253
+
254
+ /** Conventions/lint bar for `/stz:standards`. */
255
+ export type ConventionStrictness = "relaxed" | "standard" | "strict";
256
+
257
+ /** The per-role subagents whose model can be chosen up front. */
258
+ export const STZ_ROLES = [
259
+ "planning",
260
+ "research",
261
+ "execution",
262
+ "testing",
263
+ "validation",
264
+ "judging",
265
+ ] as const;
266
+ export type StzRole = (typeof STZ_ROLES)[number];
267
+
268
+ /** Strictness bar applied to standards and testing conventions. */
269
+ export interface StrictnessConfig {
270
+ /** Coverage target in [0, 1] — `/stz:tests` strategy + per-slice eval. */
271
+ coverageTarget: number;
272
+ mutationPolicy: MutationPolicy;
273
+ conventions: ConventionStrictness;
274
+ }
275
+
276
+ /**
277
+ * 00-intent/run-config.json — the run configuration the user sets during
278
+ * `/stz:new`, applied downstream: `granularity` → `/stz:slice`, `fanout` → the
279
+ * specimen count N in `/stz:run`, `models` → the per-role subagent model
280
+ * overrides, `strictness` → `/stz:standards` and `/stz:tests`.
281
+ *
282
+ * `models` values are FREE-FORM strings (the get-shit-done "Other" pattern):
283
+ * the suggested combos use spawn aliases (`opus`/`sonnet`/`haiku`/`fable`) so
284
+ * they drop straight into an Agent `model` override, but any string is allowed.
285
+ */
286
+ export interface RunConfig {
287
+ schemaVersion: 1;
288
+ granularity: SlicingGranularity;
289
+ /** Specimens per tournament (N). Clamped to [2, 8] elsewhere — this type alone accepts any number. */
290
+ fanout: number;
291
+ models: Record<StzRole, string>;
292
+ strictness: StrictnessConfig;
293
+ /**
294
+ * Dark-factory mode (0.4.0). When true the pipeline runs end-to-end with no
295
+ * human in the loop: the orchestrator skips every interactive gate it can
296
+ * legitimately skip (the `/stz:slice` "approve as-is" gate and the `/stz:run`
297
+ * winner-approval gate) and drives every phase → per-slice run → summary
298
+ * autonomously, surfacing only the final completion report. The one gate it
299
+ * may NOT skip is the F2 done-predicate confirmation in elicitation — done
300
+ * predicates are never auto-invented for a run that has none. Off by
301
+ * default; offered at the end of elicitation and flippable at any point via
302
+ * `stz bridge project-dark-factory` (the invoke-anytime flag).
303
+ */
304
+ darkFactory: boolean;
305
+ }