@reynsu/reactlens-diagnosis-prompts 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -0
- package/dist/diagnosis.d.ts +247 -0
- package/dist/diagnosis.d.ts.map +1 -0
- package/dist/diagnosis.js +82 -0
- package/dist/diagnosis.js.map +1 -0
- package/dist/eval-metrics.d.ts +40 -0
- package/dist/eval-metrics.d.ts.map +1 -0
- package/dist/eval-metrics.js +53 -0
- package/dist/eval-metrics.js.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +9 -0
- package/dist/index.js.map +1 -0
- package/dist/prompts.d.ts +3 -0
- package/dist/prompts.d.ts.map +1 -0
- package/dist/prompts.js +185 -0
- package/dist/prompts.js.map +1 -0
- package/package.json +44 -0
- package/src/diagnosis.ts +107 -0
- package/src/eval-metrics.ts +79 -0
- package/src/index.ts +34 -0
- package/src/prompts.ts +186 -0
package/README.md
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# @reynsu/reactlens-diagnosis-prompts
|
|
2
|
+
|
|
3
|
+
Diagnosis prompts + `Diagnosis` Zod schema + pure eval-metrics helpers for the reactlens / nativelens AI failure-diagnosis loop.
|
|
4
|
+
|
|
5
|
+
Published under `@reynsu/` because the `@reactlens` npm scope is unclaimed. Inside the reactlens monorepo this package is still referenced via local imports; a follow-up swap PR will switch reactlens to consume the published version (mirroring the [`@reynsu/nativelens-event-protocol` migration](https://github.com/reynsu/nativelens-protocol)).
|
|
6
|
+
|
|
7
|
+
## What's in the box
|
|
8
|
+
|
|
9
|
+
- **`DIAGNOSE_SYSTEM_PROMPT`** — the system message that primes the agent on its job, the schema, and the confidence calibration.
|
|
10
|
+
- **`CLASSIFY_BUG_RUBRIC`** — the long-form classification rubric (`real-bug` / `test-bug` / `flaky` / `env-issue`), including the test-bug vs flaky disambiguation that traps overconfident classifiers.
|
|
11
|
+
- **`DiagnosisSchema`** + `Diagnosis` type — the canonical structured output. Schema-first; consumers parse agent output with `DiagnosisSchema.safeParse(...)`.
|
|
12
|
+
- **`buildUserMessage(failure)`** — pure user-message builder (title, spec, error fence, component snapshot, truncation). Golden-testable.
|
|
13
|
+
- **`degradedDiagnosis(gitCtx?)`** — fallback when the agent exhausts retries. Deliberately emits `env-issue` / `low` so the operator notices.
|
|
14
|
+
- **`TruthSchema`, `parseTruth`, `compareToTruth`, `aggregateMetrics`** — pure scoring helpers for the eval harness. `falseConfidenceRate` is surfaced explicitly as the primary regression signal.
|
|
15
|
+
|
|
16
|
+
## What's NOT in the box (kept per-host)
|
|
17
|
+
|
|
18
|
+
- Agent runner implementations (Anthropic SDK, OpenAI, etc.) — consumers provide their own.
|
|
19
|
+
- Sandboxing logic for the eval pipeline (loading `component.tsx` / `spec.ts` from disk).
|
|
20
|
+
- Git context gathering (host-specific paths + tooling).
|
|
21
|
+
- The eval cases themselves (reactlens ships its cases under `tests/diagnostic-eval/cases/`; nativelens will ship its own).
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
```ts
|
|
26
|
+
import {
|
|
27
|
+
DIAGNOSE_SYSTEM_PROMPT,
|
|
28
|
+
CLASSIFY_BUG_RUBRIC,
|
|
29
|
+
DiagnosisSchema,
|
|
30
|
+
buildUserMessage,
|
|
31
|
+
degradedDiagnosis,
|
|
32
|
+
type FailedTest,
|
|
33
|
+
type Diagnosis,
|
|
34
|
+
} from '@reynsu/reactlens-diagnosis-prompts';
|
|
35
|
+
|
|
36
|
+
const failure: FailedTest = {
|
|
37
|
+
testId: 'cart-flow:tap-add',
|
|
38
|
+
testTitle: 'cart shows declined banner',
|
|
39
|
+
specFile: '/abs/path/cart.spec.ts',
|
|
40
|
+
errorMessage: 'Timed out waiting for [data-testid="declined-banner"]',
|
|
41
|
+
componentSnapshot: { /* host-shaped tree */ },
|
|
42
|
+
};
|
|
43
|
+
|
|
44
|
+
const userMessage = buildUserMessage(failure);
|
|
45
|
+
// Feed [DIAGNOSE_SYSTEM_PROMPT, CLASSIFY_BUG_RUBRIC] as system messages and
|
|
46
|
+
// `userMessage` as the user turn to whichever LLM-runner your host wires up.
|
|
47
|
+
// Parse the agent's final message with DiagnosisSchema.
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Versioning
|
|
51
|
+
|
|
52
|
+
Pre-1.0: minor bumps may include breaking schema changes; pin to the exact minor in dependents. The prompts themselves may change without a semver-significant bump (they're tuned against the eval set); only `Diagnosis` schema changes are semver-meaningful.
|
|
53
|
+
|
|
54
|
+
## Development
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pnpm install
|
|
58
|
+
pnpm test
|
|
59
|
+
pnpm typecheck
|
|
60
|
+
pnpm build
|
|
61
|
+
```
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
export declare const ClassificationSchema: z.ZodEnum<["real-bug", "test-bug", "flaky", "env-issue"]>;
|
|
3
|
+
export type Classification = z.infer<typeof ClassificationSchema>;
|
|
4
|
+
export declare const ConfidenceSchema: z.ZodEnum<["high", "medium", "low"]>;
|
|
5
|
+
export type Confidence = z.infer<typeof ConfidenceSchema>;
|
|
6
|
+
export declare const PatchEntrySchema: z.ZodObject<{
|
|
7
|
+
file: z.ZodString;
|
|
8
|
+
oldStr: z.ZodString;
|
|
9
|
+
newStr: z.ZodString;
|
|
10
|
+
rationale: z.ZodString;
|
|
11
|
+
}, "strip", z.ZodTypeAny, {
|
|
12
|
+
file: string;
|
|
13
|
+
oldStr: string;
|
|
14
|
+
newStr: string;
|
|
15
|
+
rationale: string;
|
|
16
|
+
}, {
|
|
17
|
+
file: string;
|
|
18
|
+
oldStr: string;
|
|
19
|
+
newStr: string;
|
|
20
|
+
rationale: string;
|
|
21
|
+
}>;
|
|
22
|
+
export type PatchEntry = z.infer<typeof PatchEntrySchema>;
|
|
23
|
+
export declare const GitContextEntrySchema: z.ZodObject<{
|
|
24
|
+
sha: z.ZodString;
|
|
25
|
+
author: z.ZodString;
|
|
26
|
+
date: z.ZodString;
|
|
27
|
+
message: z.ZodString;
|
|
28
|
+
}, "strip", z.ZodTypeAny, {
|
|
29
|
+
message: string;
|
|
30
|
+
sha: string;
|
|
31
|
+
author: string;
|
|
32
|
+
date: string;
|
|
33
|
+
}, {
|
|
34
|
+
message: string;
|
|
35
|
+
sha: string;
|
|
36
|
+
author: string;
|
|
37
|
+
date: string;
|
|
38
|
+
}>;
|
|
39
|
+
export type GitContextEntry = z.infer<typeof GitContextEntrySchema>;
|
|
40
|
+
export declare const GitContextSchema: z.ZodObject<{
|
|
41
|
+
componentLastChanged: z.ZodOptional<z.ZodObject<{
|
|
42
|
+
sha: z.ZodString;
|
|
43
|
+
author: z.ZodString;
|
|
44
|
+
date: z.ZodString;
|
|
45
|
+
message: z.ZodString;
|
|
46
|
+
}, "strip", z.ZodTypeAny, {
|
|
47
|
+
message: string;
|
|
48
|
+
sha: string;
|
|
49
|
+
author: string;
|
|
50
|
+
date: string;
|
|
51
|
+
}, {
|
|
52
|
+
message: string;
|
|
53
|
+
sha: string;
|
|
54
|
+
author: string;
|
|
55
|
+
date: string;
|
|
56
|
+
}>>;
|
|
57
|
+
specLastChanged: z.ZodOptional<z.ZodObject<{
|
|
58
|
+
sha: z.ZodString;
|
|
59
|
+
author: z.ZodString;
|
|
60
|
+
date: z.ZodString;
|
|
61
|
+
message: z.ZodString;
|
|
62
|
+
}, "strip", z.ZodTypeAny, {
|
|
63
|
+
message: string;
|
|
64
|
+
sha: string;
|
|
65
|
+
author: string;
|
|
66
|
+
date: string;
|
|
67
|
+
}, {
|
|
68
|
+
message: string;
|
|
69
|
+
sha: string;
|
|
70
|
+
author: string;
|
|
71
|
+
date: string;
|
|
72
|
+
}>>;
|
|
73
|
+
}, "strip", z.ZodTypeAny, {
|
|
74
|
+
componentLastChanged?: {
|
|
75
|
+
message: string;
|
|
76
|
+
sha: string;
|
|
77
|
+
author: string;
|
|
78
|
+
date: string;
|
|
79
|
+
} | undefined;
|
|
80
|
+
specLastChanged?: {
|
|
81
|
+
message: string;
|
|
82
|
+
sha: string;
|
|
83
|
+
author: string;
|
|
84
|
+
date: string;
|
|
85
|
+
} | undefined;
|
|
86
|
+
}, {
|
|
87
|
+
componentLastChanged?: {
|
|
88
|
+
message: string;
|
|
89
|
+
sha: string;
|
|
90
|
+
author: string;
|
|
91
|
+
date: string;
|
|
92
|
+
} | undefined;
|
|
93
|
+
specLastChanged?: {
|
|
94
|
+
message: string;
|
|
95
|
+
sha: string;
|
|
96
|
+
author: string;
|
|
97
|
+
date: string;
|
|
98
|
+
} | undefined;
|
|
99
|
+
}>;
|
|
100
|
+
export type GitContext = z.infer<typeof GitContextSchema>;
|
|
101
|
+
export declare const DiagnosisSchema: z.ZodObject<{
|
|
102
|
+
classification: z.ZodEnum<["real-bug", "test-bug", "flaky", "env-issue"]>;
|
|
103
|
+
confidence: z.ZodEnum<["high", "medium", "low"]>;
|
|
104
|
+
rootCause: z.ZodString;
|
|
105
|
+
evidence: z.ZodArray<z.ZodString, "many">;
|
|
106
|
+
suggestedFix: z.ZodString;
|
|
107
|
+
patch: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
108
|
+
file: z.ZodString;
|
|
109
|
+
oldStr: z.ZodString;
|
|
110
|
+
newStr: z.ZodString;
|
|
111
|
+
rationale: z.ZodString;
|
|
112
|
+
}, "strip", z.ZodTypeAny, {
|
|
113
|
+
file: string;
|
|
114
|
+
oldStr: string;
|
|
115
|
+
newStr: string;
|
|
116
|
+
rationale: string;
|
|
117
|
+
}, {
|
|
118
|
+
file: string;
|
|
119
|
+
oldStr: string;
|
|
120
|
+
newStr: string;
|
|
121
|
+
rationale: string;
|
|
122
|
+
}>, "many">>;
|
|
123
|
+
gitContext: z.ZodOptional<z.ZodObject<{
|
|
124
|
+
componentLastChanged: z.ZodOptional<z.ZodObject<{
|
|
125
|
+
sha: z.ZodString;
|
|
126
|
+
author: z.ZodString;
|
|
127
|
+
date: z.ZodString;
|
|
128
|
+
message: z.ZodString;
|
|
129
|
+
}, "strip", z.ZodTypeAny, {
|
|
130
|
+
message: string;
|
|
131
|
+
sha: string;
|
|
132
|
+
author: string;
|
|
133
|
+
date: string;
|
|
134
|
+
}, {
|
|
135
|
+
message: string;
|
|
136
|
+
sha: string;
|
|
137
|
+
author: string;
|
|
138
|
+
date: string;
|
|
139
|
+
}>>;
|
|
140
|
+
specLastChanged: z.ZodOptional<z.ZodObject<{
|
|
141
|
+
sha: z.ZodString;
|
|
142
|
+
author: z.ZodString;
|
|
143
|
+
date: z.ZodString;
|
|
144
|
+
message: z.ZodString;
|
|
145
|
+
}, "strip", z.ZodTypeAny, {
|
|
146
|
+
message: string;
|
|
147
|
+
sha: string;
|
|
148
|
+
author: string;
|
|
149
|
+
date: string;
|
|
150
|
+
}, {
|
|
151
|
+
message: string;
|
|
152
|
+
sha: string;
|
|
153
|
+
author: string;
|
|
154
|
+
date: string;
|
|
155
|
+
}>>;
|
|
156
|
+
}, "strip", z.ZodTypeAny, {
|
|
157
|
+
componentLastChanged?: {
|
|
158
|
+
message: string;
|
|
159
|
+
sha: string;
|
|
160
|
+
author: string;
|
|
161
|
+
date: string;
|
|
162
|
+
} | undefined;
|
|
163
|
+
specLastChanged?: {
|
|
164
|
+
message: string;
|
|
165
|
+
sha: string;
|
|
166
|
+
author: string;
|
|
167
|
+
date: string;
|
|
168
|
+
} | undefined;
|
|
169
|
+
}, {
|
|
170
|
+
componentLastChanged?: {
|
|
171
|
+
message: string;
|
|
172
|
+
sha: string;
|
|
173
|
+
author: string;
|
|
174
|
+
date: string;
|
|
175
|
+
} | undefined;
|
|
176
|
+
specLastChanged?: {
|
|
177
|
+
message: string;
|
|
178
|
+
sha: string;
|
|
179
|
+
author: string;
|
|
180
|
+
date: string;
|
|
181
|
+
} | undefined;
|
|
182
|
+
}>>;
|
|
183
|
+
}, "strip", z.ZodTypeAny, {
|
|
184
|
+
classification: "real-bug" | "test-bug" | "flaky" | "env-issue";
|
|
185
|
+
confidence: "high" | "medium" | "low";
|
|
186
|
+
rootCause: string;
|
|
187
|
+
evidence: string[];
|
|
188
|
+
suggestedFix: string;
|
|
189
|
+
patch?: {
|
|
190
|
+
file: string;
|
|
191
|
+
oldStr: string;
|
|
192
|
+
newStr: string;
|
|
193
|
+
rationale: string;
|
|
194
|
+
}[] | undefined;
|
|
195
|
+
gitContext?: {
|
|
196
|
+
componentLastChanged?: {
|
|
197
|
+
message: string;
|
|
198
|
+
sha: string;
|
|
199
|
+
author: string;
|
|
200
|
+
date: string;
|
|
201
|
+
} | undefined;
|
|
202
|
+
specLastChanged?: {
|
|
203
|
+
message: string;
|
|
204
|
+
sha: string;
|
|
205
|
+
author: string;
|
|
206
|
+
date: string;
|
|
207
|
+
} | undefined;
|
|
208
|
+
} | undefined;
|
|
209
|
+
}, {
|
|
210
|
+
classification: "real-bug" | "test-bug" | "flaky" | "env-issue";
|
|
211
|
+
confidence: "high" | "medium" | "low";
|
|
212
|
+
rootCause: string;
|
|
213
|
+
evidence: string[];
|
|
214
|
+
suggestedFix: string;
|
|
215
|
+
patch?: {
|
|
216
|
+
file: string;
|
|
217
|
+
oldStr: string;
|
|
218
|
+
newStr: string;
|
|
219
|
+
rationale: string;
|
|
220
|
+
}[] | undefined;
|
|
221
|
+
gitContext?: {
|
|
222
|
+
componentLastChanged?: {
|
|
223
|
+
message: string;
|
|
224
|
+
sha: string;
|
|
225
|
+
author: string;
|
|
226
|
+
date: string;
|
|
227
|
+
} | undefined;
|
|
228
|
+
specLastChanged?: {
|
|
229
|
+
message: string;
|
|
230
|
+
sha: string;
|
|
231
|
+
author: string;
|
|
232
|
+
date: string;
|
|
233
|
+
} | undefined;
|
|
234
|
+
} | undefined;
|
|
235
|
+
}>;
|
|
236
|
+
export type Diagnosis = z.infer<typeof DiagnosisSchema>;
|
|
237
|
+
export type FailedTest = {
|
|
238
|
+
testId: string;
|
|
239
|
+
testTitle: string;
|
|
240
|
+
specFile: string;
|
|
241
|
+
errorMessage?: string;
|
|
242
|
+
componentFile?: string;
|
|
243
|
+
componentSnapshot?: unknown;
|
|
244
|
+
};
|
|
245
|
+
export declare function buildUserMessage(failure: FailedTest): string;
|
|
246
|
+
export declare function degradedDiagnosis(gitContext?: GitContext): Diagnosis;
|
|
247
|
+
//# sourceMappingURL=diagnosis.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"diagnosis.d.ts","sourceRoot":"","sources":["../src/diagnosis.ts"],"names":[],"mappings":"AAQA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,eAAO,MAAM,oBAAoB,2DAAyD,CAAC;AAC3F,MAAM,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,oBAAoB,CAAC,CAAC;AAElE,eAAO,MAAM,gBAAgB,sCAAoC,CAAC;AAClE,MAAM,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAE1D,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;EAK3B,CAAC;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAE1D,eAAO,MAAM,qBAAqB;;;;;;;;;;;;;;;EAKhC,CAAC;AACH,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,qBAAqB,CAAC,CAAC;AAEpE,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAG3B,CAAC;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAE1D,eAAO,MAAM,eAAe;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAQ1B,CAAC;AACH,MAAM,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,eAAe,CAAC,CAAC;AAKxD,MAAM,MAAM,UAAU,GAAG;IACvB,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE,MAAM,CAAC;IACjB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,iBAAiB,CAAC,EAAE,OAAO,CAAC;CAC7B,CAAC;AAKF,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,UAAU,GAAG,MAAM,CA4B5D;AAKD,wBAAgB,iBAAiB,CAAC,UAAU,GAAE,UAAe,GAAG,SAAS,CASxE"}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
// Canonical Diagnosis output shape and the pure helpers that surround it
|
|
2
|
+
// (user-message builder, degraded fallback). Schema-first: `DiagnosisSchema`
|
|
3
|
+
// is the source of truth — `Diagnosis` is `z.infer<typeof DiagnosisSchema>`.
|
|
4
|
+
//
|
|
5
|
+
// Consumers (reactlens, nativelens) provide their own agent runner, supply
|
|
6
|
+
// per-host fields on FailedTest (e.g. a `componentSnapshot` shape that fits
|
|
7
|
+
// their probe), and treat the returned Diagnosis as opaque-but-typed.
|
|
8
|
+
import { z } from 'zod';
|
|
9
|
+
/** The four failure categories a diagnosis may assign. */
export const ClassificationSchema = z.enum([
  'real-bug',
  'test-bug',
  'flaky',
  'env-issue',
]);

/** How strongly the agent stands behind its classification. */
export const ConfidenceSchema = z.enum([
  'high',
  'medium',
  'low',
]);

/**
 * One proposed edit: replace `oldStr` with `newStr` inside `file`, with a
 * `rationale` for the change. All four fields are required strings.
 */
export const PatchEntrySchema = z.object({
  file: z.string(),
  oldStr: z.string(),
  newStr: z.string(),
  rationale: z.string(),
});

/** A single commit record: sha, author, date and message, all plain strings. */
export const GitContextEntrySchema = z.object({
  sha: z.string(),
  author: z.string(),
  date: z.string(),
  message: z.string(),
});

/** Optional last-change commit records for the component and for the spec. */
export const GitContextSchema = z.object({
  componentLastChanged: GitContextEntrySchema.optional(),
  specLastChanged: GitContextEntrySchema.optional(),
});

/**
 * Canonical structured output of the diagnosis agent.
 *
 * `rootCause` must be a non-empty string; `evidence` may be an empty array and
 * `suggestedFix` may be an empty string. `patch` and `gitContext` are optional.
 */
export const DiagnosisSchema = z.object({
  classification: ClassificationSchema,
  confidence: ConfidenceSchema,
  rootCause: z.string().min(1),
  evidence: z.array(z.string()),
  suggestedFix: z.string(),
  patch: z.array(PatchEntrySchema).optional(),
  gitContext: GitContextSchema.optional(),
});
|
|
36
|
+
// Builds the user-message string fed to the diagnosis agent alongside the
|
|
37
|
+
// system prompts. Pure — produces the exact bytes that go on the wire so
|
|
38
|
+
// the prompt surface can be diffed and golden-tested.
|
|
39
|
+
export function buildUserMessage(failure) {
|
|
40
|
+
const lines = [];
|
|
41
|
+
lines.push(`# Failure to diagnose`);
|
|
42
|
+
lines.push(``);
|
|
43
|
+
lines.push(`Test: ${failure.testTitle}`);
|
|
44
|
+
lines.push(`Spec: ${failure.specFile}`);
|
|
45
|
+
if (failure.componentFile !== undefined)
|
|
46
|
+
lines.push(`Component (probable): ${failure.componentFile}`);
|
|
47
|
+
if (failure.errorMessage !== undefined) {
|
|
48
|
+
lines.push(``);
|
|
49
|
+
lines.push(`# Error`);
|
|
50
|
+
lines.push('```');
|
|
51
|
+
lines.push(failure.errorMessage);
|
|
52
|
+
lines.push('```');
|
|
53
|
+
}
|
|
54
|
+
if (failure.componentSnapshot !== undefined) {
|
|
55
|
+
lines.push(``);
|
|
56
|
+
lines.push(`# Component snapshot at failure`);
|
|
57
|
+
lines.push('```json');
|
|
58
|
+
const snippet = JSON.stringify(failure.componentSnapshot, null, 2);
|
|
59
|
+
// Truncate enormous trees to keep token usage sane. 30k chars is the
|
|
60
|
+
// empirically-tuned ceiling — beyond it the agent's classification
|
|
61
|
+
// accuracy starts to degrade in eval.
|
|
62
|
+
lines.push(snippet.length > 30_000 ? snippet.slice(0, 30_000) + '\n…(truncated)' : snippet);
|
|
63
|
+
lines.push('```');
|
|
64
|
+
}
|
|
65
|
+
lines.push(``);
|
|
66
|
+
lines.push(`Read the spec, the component, and any other context you need. Output a single JSON object matching the Diagnosis schema as the FINAL message.`);
|
|
67
|
+
return lines.join('\n');
|
|
68
|
+
}
|
|
69
|
+
// Fallback when the agent loop exhausts retries without producing a valid
|
|
70
|
+
// Diagnosis. We DELIBERATELY return `env-issue` / `low` so the operator
|
|
71
|
+
// notices: this is the calibration failure mode Principle 2 forbids.
|
|
72
|
+
export function degradedDiagnosis(gitContext = {}) {
|
|
73
|
+
return {
|
|
74
|
+
classification: 'env-issue',
|
|
75
|
+
confidence: 'low',
|
|
76
|
+
rootCause: 'diagnosis agent failed to produce a valid output',
|
|
77
|
+
evidence: ['agent returned non-JSON or schema-mismatched output twice'],
|
|
78
|
+
suggestedFix: 'rerun the diagnosis with --verbose; if it persists, file an issue with the trace',
|
|
79
|
+
...(Object.keys(gitContext).length > 0 ? { gitContext } : {}),
|
|
80
|
+
};
|
|
81
|
+
}
|
|
82
|
+
//# sourceMappingURL=diagnosis.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"diagnosis.js","sourceRoot":"","sources":["../src/diagnosis.ts"],"names":[],"mappings":"AAAA,yEAAyE;AACzE,6EAA6E;AAC7E,6EAA6E;AAC7E,EAAE;AACF,2EAA2E;AAC3E,4EAA4E;AAC5E,sEAAsE;AAEtE,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,MAAM,CAAC,MAAM,oBAAoB,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,UAAU,EAAE,OAAO,EAAE,WAAW,CAAC,CAAC,CAAC;AAG3F,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC;AAGlE,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE;IAChB,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE;CACtB,CAAC,CAAC;AAGH,MAAM,CAAC,MAAM,qBAAqB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC5C,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE;IACf,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE;IAChB,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE;CACpB,CAAC,CAAC;AAGH,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IACvC,oBAAoB,EAAE,qBAAqB,CAAC,QAAQ,EAAE;IACtD,eAAe,EAAE,qBAAqB,CAAC,QAAQ,EAAE;CAClD,CAAC,CAAC;AAGH,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,cAAc,EAAE,oBAAoB;IACpC,UAAU,EAAE,gBAAgB;IAC5B,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IAC5B,QAAQ,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC;IAC7B,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE;IACxB,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC,QAAQ,EAAE;IAC3C,UAAU,EAAE,gBAAgB,CAAC,QAAQ,EAAE;CACxC,CAAC,CAAC;AAeH,0EAA0E;AAC1E,yEAAyE;AACzE,sDAAsD;AACtD,MAAM,UAAU,gBAAgB,CAAC,OAAmB;IAClD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,KAAK,CAAC,IAAI,CAAC,uBAAuB,CAAC,CAAC;IACpC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,SAAS,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC;IACzC,KAAK,CAAC,IAAI,CAAC,SAAS,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;IACxC,IAAI,OAAO,CAAC,aAAa,KAAK,SAAS;QAAE,KAAK,CAAC,IAAI,CAAC,yBAAyB,OAAO,CAAC,aAAa,EAAE,CAAC,CAAC;IACtG,IAAI,OAAO,CAAC,YAAY,KAAK,SAAS,EAAE,CAAC;QACvC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACf,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QACtB,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAClB,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,CA
AC,CAAC;QACjC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACpB,CAAC;IACD,IAAI,OAAO,CAAC,iBAAiB,KAAK,SAAS,EAAE,CAAC;QAC5C,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACf,KAAK,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;QAC9C,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QACtB,MAAM,OAAO,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,iBAAiB,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;QACnE,qEAAqE;QACrE,mEAAmE;QACnE,sCAAsC;QACtC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,GAAG,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAC5F,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACpB,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,+IAA+I,CAAC,CAAC;IAC5J,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,0EAA0E;AAC1E,wEAAwE;AACxE,qEAAqE;AACrE,MAAM,UAAU,iBAAiB,CAAC,aAAyB,EAAE;IAC3D,OAAO;QACL,cAAc,EAAE,WAAW;QAC3B,UAAU,EAAE,KAAK;QACjB,SAAS,EAAE,kDAAkD;QAC7D,QAAQ,EAAE,CAAC,2DAA2D,CAAC;QACvE,YAAY,EAAE,kFAAkF;QAChG,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,UAAU,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KAC9D,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { type Classification, type Diagnosis } from './diagnosis.js';
|
|
3
|
+
export declare const TruthSchema: z.ZodObject<{
|
|
4
|
+
expectedClassification: z.ZodEnum<["real-bug", "test-bug", "flaky", "env-issue"]>;
|
|
5
|
+
minimumConfidence: z.ZodEnum<["high", "medium", "low"]>;
|
|
6
|
+
category: z.ZodOptional<z.ZodString>;
|
|
7
|
+
notes: z.ZodOptional<z.ZodString>;
|
|
8
|
+
}, "strip", z.ZodTypeAny, {
|
|
9
|
+
expectedClassification: "real-bug" | "test-bug" | "flaky" | "env-issue";
|
|
10
|
+
minimumConfidence: "high" | "medium" | "low";
|
|
11
|
+
category?: string | undefined;
|
|
12
|
+
notes?: string | undefined;
|
|
13
|
+
}, {
|
|
14
|
+
expectedClassification: "real-bug" | "test-bug" | "flaky" | "env-issue";
|
|
15
|
+
minimumConfidence: "high" | "medium" | "low";
|
|
16
|
+
category?: string | undefined;
|
|
17
|
+
notes?: string | undefined;
|
|
18
|
+
}>;
|
|
19
|
+
export type Truth = z.infer<typeof TruthSchema>;
|
|
20
|
+
export type CaseResult = {
|
|
21
|
+
name: string;
|
|
22
|
+
expected: Truth;
|
|
23
|
+
actual: Diagnosis;
|
|
24
|
+
correct: boolean;
|
|
25
|
+
falseConfidence: boolean;
|
|
26
|
+
};
|
|
27
|
+
export declare function parseTruth(raw: string): Truth;
|
|
28
|
+
export declare function compareToTruth(actual: Diagnosis, expected: Truth, name: string): CaseResult;
|
|
29
|
+
export type EvalMetrics = {
|
|
30
|
+
total: number;
|
|
31
|
+
correct: number;
|
|
32
|
+
accuracy: number;
|
|
33
|
+
falseConfidenceRate: number;
|
|
34
|
+
perClassification: Record<Classification, {
|
|
35
|
+
total: number;
|
|
36
|
+
correct: number;
|
|
37
|
+
}>;
|
|
38
|
+
};
|
|
39
|
+
export declare function aggregateMetrics(results: CaseResult[]): EvalMetrics;
|
|
40
|
+
//# sourceMappingURL=eval-metrics.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-metrics.d.ts","sourceRoot":"","sources":["../src/eval-metrics.ts"],"names":[],"mappings":"AAQA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAA0C,KAAK,cAAc,EAAE,KAAK,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAE7G,eAAO,MAAM,WAAW;;;;;;;;;;;;;;;EAKtB,CAAC;AACH,MAAM,MAAM,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,WAAW,CAAC,CAAC;AAEhD,MAAM,MAAM,UAAU,GAAG;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,KAAK,CAAC;IAChB,MAAM,EAAE,SAAS,CAAC;IAClB,OAAO,EAAE,OAAO,CAAC;IAIjB,eAAe,EAAE,OAAO,CAAC;CAC1B,CAAC;AAEF,wBAAgB,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,KAAK,CAE7C;AAED,wBAAgB,cAAc,CAAC,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,GAAG,UAAU,CAS3F;AAED,MAAM,MAAM,WAAW,GAAG;IACxB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IAEjB,mBAAmB,EAAE,MAAM,CAAC;IAG5B,iBAAiB,EAAE,MAAM,CAAC,cAAc,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;CAC/E,CAAC;AAEF,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,UAAU,EAAE,GAAG,WAAW,CAsBnE"}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
// Pure scoring helpers for the diagnostic-eval harness. No fs, no agent —
|
|
2
|
+
// consumers (reactlens, nativelens) drive their own per-host case loading
|
|
3
|
+
// and call into these for grading.
|
|
4
|
+
//
|
|
5
|
+
// `falseConfidenceRate` is surfaced explicitly because Principle 2's
|
|
6
|
+
// implicit cap on it (>=95% high-confidence accuracy) is the most useful
|
|
7
|
+
// regression signal in eval reports.
|
|
8
|
+
import { z } from 'zod';
|
|
9
|
+
import { ClassificationSchema, ConfidenceSchema } from './diagnosis.js';
|
|
10
|
+
/**
 * Ground-truth record for one eval case: the classification the agent is
 * expected to produce and a minimum confidence level, plus optional free-form
 * `category` and `notes` strings.
 */
export const TruthSchema = z.object({
  // Required fields.
  expectedClassification: ClassificationSchema,
  minimumConfidence: ConfidenceSchema,

  // Optional annotations.
  category: z.string().optional(),
  notes: z.string().optional(),
});
|
|
16
|
+
/**
 * Decodes the raw JSON text of a truth record and validates it with
 * `TruthSchema`.
 *
 * @param raw JSON text of the truth record.
 * @returns The value produced by `TruthSchema.parse`.
 * @throws When `raw` is not valid JSON (from `JSON.parse`) or when the decoded
 *   value fails schema validation (from `TruthSchema.parse`).
 */
export function parseTruth(raw) {
  const decoded = JSON.parse(raw);
  return TruthSchema.parse(decoded);
}
|
|
19
|
+
export function compareToTruth(actual, expected, name) {
|
|
20
|
+
const correct = actual.classification === expected.expectedClassification;
|
|
21
|
+
return {
|
|
22
|
+
name,
|
|
23
|
+
expected,
|
|
24
|
+
actual,
|
|
25
|
+
correct,
|
|
26
|
+
falseConfidence: !correct && actual.confidence === 'high',
|
|
27
|
+
};
|
|
28
|
+
}
|
|
29
|
+
export function aggregateMetrics(results) {
|
|
30
|
+
const total = results.length;
|
|
31
|
+
const correct = results.filter((r) => r.correct).length;
|
|
32
|
+
const falseConfident = results.filter((r) => r.falseConfidence).length;
|
|
33
|
+
const perClassification = {
|
|
34
|
+
'real-bug': { total: 0, correct: 0 },
|
|
35
|
+
'test-bug': { total: 0, correct: 0 },
|
|
36
|
+
flaky: { total: 0, correct: 0 },
|
|
37
|
+
'env-issue': { total: 0, correct: 0 },
|
|
38
|
+
};
|
|
39
|
+
for (const r of results) {
|
|
40
|
+
const bucket = perClassification[r.expected.expectedClassification];
|
|
41
|
+
bucket.total += 1;
|
|
42
|
+
if (r.correct)
|
|
43
|
+
bucket.correct += 1;
|
|
44
|
+
}
|
|
45
|
+
return {
|
|
46
|
+
total,
|
|
47
|
+
correct,
|
|
48
|
+
accuracy: total === 0 ? 0 : correct / total,
|
|
49
|
+
falseConfidenceRate: total === 0 ? 0 : falseConfident / total,
|
|
50
|
+
perClassification,
|
|
51
|
+
};
|
|
52
|
+
}
|
|
53
|
+
//# sourceMappingURL=eval-metrics.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-metrics.js","sourceRoot":"","sources":["../src/eval-metrics.ts"],"names":[],"mappings":"AAAA,0EAA0E;AAC1E,0EAA0E;AAC1E,mCAAmC;AACnC,EAAE;AACF,qEAAqE;AACrE,yEAAyE;AACzE,qCAAqC;AAErC,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,oBAAoB,EAAE,gBAAgB,EAAuC,MAAM,gBAAgB,CAAC;AAE7G,MAAM,CAAC,MAAM,WAAW,GAAG,CAAC,CAAC,MAAM,CAAC;IAClC,sBAAsB,EAAE,oBAAoB;IAC5C,iBAAiB,EAAE,gBAAgB;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC/B,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;CAC7B,CAAC,CAAC;AAcH,MAAM,UAAU,UAAU,CAAC,GAAW;IACpC,OAAO,WAAW,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;AAC5C,CAAC;AAED,MAAM,UAAU,cAAc,CAAC,MAAiB,EAAE,QAAe,EAAE,IAAY;IAC7E,MAAM,OAAO,GAAG,MAAM,CAAC,cAAc,KAAK,QAAQ,CAAC,sBAAsB,CAAC;IAC1E,OAAO;QACL,IAAI;QACJ,QAAQ;QACR,MAAM;QACN,OAAO;QACP,eAAe,EAAE,CAAC,OAAO,IAAI,MAAM,CAAC,UAAU,KAAK,MAAM;KAC1D,CAAC;AACJ,CAAC;AAaD,MAAM,UAAU,gBAAgB,CAAC,OAAqB;IACpD,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;IAC7B,MAAM,OAAO,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;IACxD,MAAM,cAAc,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,MAAM,CAAC;IACvE,MAAM,iBAAiB,GAAqC;QAC1D,UAAU,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE;QACpC,UAAU,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE;QACpC,KAAK,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE;QAC/B,WAAW,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE;KACtC,CAAC;IACF,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,MAAM,MAAM,GAAG,iBAAiB,CAAC,CAAC,CAAC,QAAQ,CAAC,sBAAsB,CAAC,CAAC;QACpE,MAAM,CAAC,KAAK,IAAI,CAAC,CAAC;QAClB,IAAI,CAAC,CAAC,OAAO;YAAE,MAAM,CAAC,OAAO,IAAI,CAAC,CAAC;IACrC,CAAC;IACD,OAAO;QACL,KAAK;QACL,OAAO;QACP,QAAQ,EAAE,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,GAAG,KAAK;QAC3C,mBAAmB,EAAE,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,cAAc,GAAG,KAAK;QAC7D,iBAAiB;KAClB,CAAC;AACJ,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
export { DIAGNOSE_SYSTEM_PROMPT, CLASSIFY_BUG_RUBRIC } from './prompts.js';
|
|
2
|
+
export { ClassificationSchema, ConfidenceSchema, PatchEntrySchema, GitContextEntrySchema, GitContextSchema, DiagnosisSchema, type Classification, type Confidence, type PatchEntry, type GitContextEntry, type GitContext, type Diagnosis, type FailedTest, buildUserMessage, degradedDiagnosis, } from './diagnosis.js';
|
|
3
|
+
export { TruthSchema, type Truth, type CaseResult, type EvalMetrics, parseTruth, compareToTruth, aggregateMetrics, } from './eval-metrics.js';
|
|
4
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,sBAAsB,EAAE,mBAAmB,EAAE,MAAM,cAAc,CAAC;AAE3E,OAAO,EAEL,oBAAoB,EACpB,gBAAgB,EAChB,gBAAgB,EAChB,qBAAqB,EACrB,gBAAgB,EAChB,eAAe,EAEf,KAAK,cAAc,EACnB,KAAK,UAAU,EACf,KAAK,UAAU,EACf,KAAK,eAAe,EACpB,KAAK,UAAU,EACf,KAAK,SAAS,EACd,KAAK,UAAU,EAEf,gBAAgB,EAChB,iBAAiB,GAClB,MAAM,gBAAgB,CAAC;AAExB,OAAO,EACL,WAAW,EACX,KAAK,KAAK,EACV,KAAK,UAAU,EACf,KAAK,WAAW,EAChB,UAAU,EACV,cAAc,EACd,gBAAgB,GACjB,MAAM,mBAAmB,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
// Public surface — keep narrow. Consumers should import from the root only.
|
|
2
|
+
export { DIAGNOSE_SYSTEM_PROMPT, CLASSIFY_BUG_RUBRIC } from './prompts.js';
|
|
3
|
+
export {
|
|
4
|
+
// Schemas
|
|
5
|
+
ClassificationSchema, ConfidenceSchema, PatchEntrySchema, GitContextEntrySchema, GitContextSchema, DiagnosisSchema,
|
|
6
|
+
// Pure helpers
|
|
7
|
+
buildUserMessage, degradedDiagnosis, } from './diagnosis.js';
|
|
8
|
+
export { TruthSchema, parseTruth, compareToTruth, aggregateMetrics, } from './eval-metrics.js';
|
|
9
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,4EAA4E;AAE5E,OAAO,EAAE,sBAAsB,EAAE,mBAAmB,EAAE,MAAM,cAAc,CAAC;AAE3E,OAAO;AACL,UAAU;AACV,oBAAoB,EACpB,gBAAgB,EAChB,gBAAgB,EAChB,qBAAqB,EACrB,gBAAgB,EAChB,eAAe;AASf,eAAe;AACf,gBAAgB,EAChB,iBAAiB,GAClB,MAAM,gBAAgB,CAAC;AAExB,OAAO,EACL,WAAW,EAIX,UAAU,EACV,cAAc,EACd,gBAAgB,GACjB,MAAM,mBAAmB,CAAC"}
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
export declare const DIAGNOSE_SYSTEM_PROMPT = "# reactlens \u2014 diagnose a Playwright failure\n\nYou are the diagnosis agent for reactlens. A Playwright test just failed. You have access to:\n\n- The failing **spec** (`specPath` below) \u2014 read it.\n- The **component** under test (`componentPath`) \u2014 read it. This is the code the user wrote.\n- The **trace summary** \u2014 Playwright's error message + locator chain.\n- The **component snapshot at failure** \u2014 the React tree at the moment the assertion failed, including props and hooks. This is the unique signal you have that other tools don't.\n- Optional **git context** for both files: who changed what most recently, and when.\n\nYou have access to the `Read`, `Glob`, `Grep`, and `Bash` tools to look around. Use them deliberately \u2014 every read costs the user money.\n\n## Your job\n\nOutput a single JSON object matching this schema (plain JSON; no markdown fence):\n\n```json\n{\n \"classification\": \"real-bug\" | \"test-bug\" | \"flaky\" | \"env-issue\",\n \"confidence\": \"high\" | \"medium\" | \"low\",\n \"rootCause\": \"one sentence\",\n \"evidence\": [\"bullet 1\", \"bullet 2\", ...],\n \"suggestedFix\": \"human-readable description of what to change\",\n \"patch\": [{\n \"file\": \"path\",\n \"oldStr\": \"exact text to replace\",\n \"newStr\": \"replacement text\",\n \"rationale\": \"why this fixes it\"\n }]\n}\n```\n\n`patch` is OPTIONAL \u2014 only include it when you can produce a concrete edit you're confident about. If you can't, omit it and say what additional information would let you produce one (in `suggestedFix`).\n\n## Classification rubric\n\nSee `classify-bug.md` (you should consult it). Headline rules:\n\n- **`real-bug`** \u2014 code regressed; spec is correct. The behavior the spec asserted was correct yesterday and is wrong today. Best signal: spec hasn't changed, component recently changed.\n- **`test-bug`** \u2014 code is fine; spec is stale or wrong. 
Best signal: component hasn't changed, spec recently changed; or the spec asserts something the source code never actually does.\n- **`flaky`** \u2014 neither code nor spec changed; the failure is non-deterministic. Best signal: timing-related error (waiting for element, race), no recent commits to either file.\n- **`env-issue`** \u2014 the failure is infrastructure (port conflict, missing env var, browser binary missing). Best signal: error mentions ports, env vars, browser launch, network unreachable.\n\n## Confidence calibration (CRITICAL)\n\nWe measure your accuracy at each confidence level. Do not lie.\n\n- **`high`** \u2014 you have direct evidence in the snapshot AND/OR a recent commit that explains the behavior. Example: \"the snapshot shows `cvv: '12'` \u2014 only 2 chars \u2014 but the component schema requires `min(3)`. The validation error visible in the snapshot is what the spec is asserting against.\"\n- **`medium`** \u2014 strong but indirect signal. Eg \"the spec hasn't changed in 3 weeks, the component changed yesterday \u2014 but I haven't found the actual bug in the diff.\"\n- **`low`** \u2014 informed guess. Use this freely; it's better than a wrong `high`.\n\nIf you cannot produce evidence, use `low`. Never inflate.\n\n## What to read\n\nIn order:\n1. The component snapshot at failure (you'll get this in the user message inline). Note components, props, hook values.\n2. The spec (`Read specPath`).\n3. The component (`Read componentPath`).\n4. Anything the spec or component imports that's relevant (`Read` selectively).\n5. `git log -p` on either file if you need to see what changed recently.\n\nDo not read the entire codebase. Stop reading when you have enough to commit to an answer.\n\n## Output format\n\nFinal message MUST be only the JSON object \u2014 no prose, no markdown, no code fence. Earlier messages can include reasoning. The system parses your last message as JSON.\n";
|
|
2
|
+
export declare const CLASSIFY_BUG_RUBRIC = "# reactlens \u2014 bug classification rubric\n\nThis document defines, with examples, when to choose each classification. Diagnosis agent: consult this when in doubt. Calibration is measured against the eval set in `tests/diagnostic-eval/cases/`.\n\n## `real-bug`\n\n**Definition:** the code regressed and the spec is correctly asserting the prior, correct behavior.\n\n**Strongest signals:**\n- Component file has a recent commit (today, this week) and the spec hasn't changed in much longer.\n- The component snapshot at failure shows props/state inconsistent with the source's invariants \u2014 eg the source says `if (data.length === 0) return Empty` but the snapshot shows `data.length === 0` AND the rendered subtree doesn't include `<Empty />`.\n- The snapshot shows a thrown error in a render path the spec was exercising.\n\n**Examples:**\n\n> Spec: \"checkout shows declined banner for cards starting with 4000\".\n> Component snapshot: status='success' (not 'declined') after submitting a 4000-card.\n> Component diff (last commit): the developer changed the API check from `startsWith('4000')` to `startsWith('5000')`.\n> \u2192 `real-bug`, `high` confidence. Patch: revert the prefix check.\n\n> Spec: \"dashboard shows empty state when there are no orders\".\n> Component snapshot: orders=[] but `<empty />` is not in the tree.\n> Component: the developer added an `early-return null` above the empty check that fires unconditionally.\n> \u2192 `real-bug`, `high`. 
Patch: remove the early return.\n\n## `test-bug`\n\n**Definition:** the code is correct; the spec is stale, makes a wrong assumption, or asserts something the component doesn't do.\n\n**Strongest signals:**\n- Spec file has a recent commit and the component hasn't changed.\n- The spec asserts a `data-testid` or `text` that doesn't exist anywhere in the component source.\n- The spec assumes a route, a label, or an order-of-operations that contradicts the source.\n- The snapshot shows the component is in the state the test wanted \u2014 but the assertion is wrong.\n\n**Examples:**\n\n> Spec: `await expect(page.getByTestId('checkout-fail')).toBeVisible()`.\n> Component source: there is `data-testid=\"checkout-network-error\"` and `data-testid=\"checkout-declined\"` but no `checkout-fail`.\n> \u2192 `test-bug`, `high`. Patch: change the selector to whichever testid was intended.\n\n> Spec was copy-pasted from another spec and still asserts the wrong page title.\n> \u2192 `test-bug`, `high`. Patch: update the title.\n\n## `flaky`\n\n**Definition:** non-deterministic failure unrelated to a code change. Includes ordering failures \u2014 failures that depend on what *other* specs did, not on the code or this spec.\n\n**Strongest signals:**\n- Failure mode is \"waiting for X, timed out\". X exists in source.\n- Neither file has changed recently.\n- Multiple consecutive runs against unchanged code show different outcomes.\n- Snapshot shows the component DID render the asserted state \u2014 just after the timeout.\n- Snapshot or runtime evidence shows state coming from **outside this spec's own actions** \u2014 eg `localStorage` populated with keys this spec never writes, an authenticated session this spec didn't sign in for, a database row another spec inserted. Often paired with an `error.txt` / trace line that says the failure correlates with run order (\"passes in isolation, fails after spec X\").\n\nUse sparingly. 
\"I don't know\" is more often a `low`-confidence other-classification than `flaky`.\n\n**Examples:**\n\n> Spec: clicks submit, asserts redirect within 5s. Sometimes the redirect takes 6s due to a queue.\n> \u2192 `flaky`, `medium`. Suggested fix: increase timeout for this spec, or add a deterministic gate (`waitForResponse`).\n\n> Spec: `await page.goto('/cart'); await expect(getByTestId('cart-empty')).toBeVisible()`.\n> Component snapshot: `hooks.state = [{id: 'abc'}]` (items array populated).\n> error.txt: \"Browser storage at time of failure: localStorage = {cart: '[{id:abc}]'}. Failure correlates with running after 'adds item to cart' spec. No spec or component change in 60 days.\"\n> Component on disk reads from localStorage on mount; spec doesn't call `localStorage.clear()`.\n> \u2192 `flaky` (ordering / state leak), `high`. The localStorage state did not come from this spec's actions \u2014 it leaked from another spec running first in the same worker. Even though the spec is also structurally fragile, the proximate cause is the order.\n> Suggested fix: add `test.beforeEach(({page}) => page.evaluate(() => localStorage.clear()))` to this spec or the file's parent describe. Optionally file an issue against the spec that wrote the leaked data.\n\n## Disambiguating `test-bug` vs `flaky` (ordering) when both fit\n\nSome failures look structurally like test-bugs (the spec doesn't `beforeEach(localStorage.clear())`, doesn't reset a global, doesn't re-seed the database) while *also* having runtime evidence of an ordering issue (snapshot shows state this spec never wrote; trace says the failure correlates with another spec running first). This is the most common false-confidence trap in the eval set, so be explicit.\n\n**Prefer `flaky` when:**\n- The component snapshot at failure contains state values this spec's own actions cannot explain. 
Eg `hooks.state = [{id:'abc'}]` but the spec never wrote that id; `props.user.email = '...'` but the spec didn't log anyone in.\n- The error trace or `error.txt` mentions the failure correlates with run order (\"passes in isolation, fails after X\"), failure rate is below 100%, or no recent changes to either file.\n- The same spec passes when run alone (a strong tell \u2014 if you only see \"fails in suite, passes alone\" in the trace, lean `flaky`).\n\n**Prefer `test-bug` when:**\n- The state the spec depends on is conventionally set up by the framework (eg `storageState`, fixtures) and *this* spec is the one that should have configured it.\n- The spec asserts a `data-testid` / text / route that doesn't exist anywhere in the source \u2014 the bug is in the assertion itself, not in the runtime state.\n- Multiple specs in the same file fail the same way regardless of order.\n\nThe structural shortcoming (no `beforeEach(clear)`) is real and worth fixing in both cases. But if the runtime evidence shows the leak came from *elsewhere*, that's the proximate cause \u2014 and the proximate cause is what `flaky` captures. Don't punish the symptom-bearing spec for being the one that exposed the order-dependency.\n\nConfidence in this disambiguation should track the directness of the evidence: snapshot hook values + an error.txt that names the suspected upstream spec is `high`; just \"no recent commits + spec doesn't clear state\" is `medium`.\n\n## `env-issue`\n\n**Definition:** the failure is infrastructure, not application logic.\n\n**Strongest signals:**\n- Error mentions: \"browserType.launch\", \"Executable doesn't exist\", \"EADDRINUSE\", \"ECONNREFUSED\", \"missing env var\".\n- All tests fail (not just one).\n- The component snapshot is empty / probe never connected.\n\n**Examples:**\n\n> Error: `Executable doesn't exist at .../chrome-headless-shell`.\n> \u2192 `env-issue`, `high`. 
Patch: `pnpm exec playwright install chromium`.\n\n> Error: `connect ECONNREFUSED 127.0.0.1:5173` on every test.\n> \u2192 `env-issue`, `high`. The dev server didn't start.\n\n## When you genuinely can't tell\n\nChoose the classification you'd guess at, drop confidence to `low`, and in `suggestedFix` say what evidence would let you upgrade.\n";
|
|
3
|
+
//# sourceMappingURL=prompts.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompts.d.ts","sourceRoot":"","sources":["../src/prompts.ts"],"names":[],"mappings":"AASA,eAAO,MAAM,sBAAsB,2yHAmElC,CAAC;AAEF,eAAO,MAAM,mBAAmB,k4OA2G/B,CAAC"}
|
package/dist/prompts.js
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
// Canonical diagnosis system prompts. Authored in markdown for readability;
|
|
2
|
+
// exported as plain strings so consumers don't need a markdown loader.
|
|
3
|
+
//
|
|
4
|
+
// These are reactlens-flavoured (Playwright + DOM). Nativelens consumers
|
|
5
|
+
// will eventually ship a parallel set keyed on Detox + React Native idioms.
|
|
6
|
+
// Both versions are intended to share the same `Diagnosis` output schema
|
|
7
|
+
// and `classification` rubric — only the surrounding "what tools you have"
|
|
8
|
+
// scaffolding diverges.
|
|
9
|
+
export const DIAGNOSE_SYSTEM_PROMPT = `# reactlens — diagnose a Playwright failure
|
|
10
|
+
|
|
11
|
+
You are the diagnosis agent for reactlens. A Playwright test just failed. You have access to:
|
|
12
|
+
|
|
13
|
+
- The failing **spec** (\`specPath\` below) — read it.
|
|
14
|
+
- The **component** under test (\`componentPath\`) — read it. This is the code the user wrote.
|
|
15
|
+
- The **trace summary** — Playwright's error message + locator chain.
|
|
16
|
+
- The **component snapshot at failure** — the React tree at the moment the assertion failed, including props and hooks. This is the unique signal you have that other tools don't.
|
|
17
|
+
- Optional **git context** for both files: who changed what most recently, and when.
|
|
18
|
+
|
|
19
|
+
You have access to the \`Read\`, \`Glob\`, \`Grep\`, and \`Bash\` tools to look around. Use them deliberately — every read costs the user money.
|
|
20
|
+
|
|
21
|
+
## Your job
|
|
22
|
+
|
|
23
|
+
Output a single JSON object matching this schema (plain JSON; no markdown fence):
|
|
24
|
+
|
|
25
|
+
\`\`\`json
|
|
26
|
+
{
|
|
27
|
+
"classification": "real-bug" | "test-bug" | "flaky" | "env-issue",
|
|
28
|
+
"confidence": "high" | "medium" | "low",
|
|
29
|
+
"rootCause": "one sentence",
|
|
30
|
+
"evidence": ["bullet 1", "bullet 2", ...],
|
|
31
|
+
"suggestedFix": "human-readable description of what to change",
|
|
32
|
+
"patch": [{
|
|
33
|
+
"file": "path",
|
|
34
|
+
"oldStr": "exact text to replace",
|
|
35
|
+
"newStr": "replacement text",
|
|
36
|
+
"rationale": "why this fixes it"
|
|
37
|
+
}]
|
|
38
|
+
}
|
|
39
|
+
\`\`\`
|
|
40
|
+
|
|
41
|
+
\`patch\` is OPTIONAL — only include it when you can produce a concrete edit you're confident about. If you can't, omit it and say what additional information would let you produce one (in \`suggestedFix\`).
|
|
42
|
+
|
|
43
|
+
## Classification rubric
|
|
44
|
+
|
|
45
|
+
See \`classify-bug.md\` (you should consult it). Headline rules:
|
|
46
|
+
|
|
47
|
+
- **\`real-bug\`** — code regressed; spec is correct. The behavior the spec asserted was correct yesterday and is wrong today. Best signal: spec hasn't changed, component recently changed.
|
|
48
|
+
- **\`test-bug\`** — code is fine; spec is stale or wrong. Best signal: component hasn't changed, spec recently changed; or the spec asserts something the source code never actually does.
|
|
49
|
+
- **\`flaky\`** — neither code nor spec changed; the failure is non-deterministic. Best signal: timing-related error (waiting for element, race), no recent commits to either file.
|
|
50
|
+
- **\`env-issue\`** — the failure is infrastructure (port conflict, missing env var, browser binary missing). Best signal: error mentions ports, env vars, browser launch, network unreachable.
|
|
51
|
+
|
|
52
|
+
## Confidence calibration (CRITICAL)
|
|
53
|
+
|
|
54
|
+
We measure your accuracy at each confidence level. Do not lie.
|
|
55
|
+
|
|
56
|
+
- **\`high\`** — you have direct evidence in the snapshot AND/OR a recent commit that explains the behavior. Example: "the snapshot shows \`cvv: '12'\` — only 2 chars — but the component schema requires \`min(3)\`. The validation error visible in the snapshot is what the spec is asserting against."
|
|
57
|
+
- **\`medium\`** — strong but indirect signal. Eg "the spec hasn't changed in 3 weeks, the component changed yesterday — but I haven't found the actual bug in the diff."
|
|
58
|
+
- **\`low\`** — informed guess. Use this freely; it's better than a wrong \`high\`.
|
|
59
|
+
|
|
60
|
+
If you cannot produce evidence, use \`low\`. Never inflate.
|
|
61
|
+
|
|
62
|
+
## What to read
|
|
63
|
+
|
|
64
|
+
In order:
|
|
65
|
+
1. The component snapshot at failure (you'll get this in the user message inline). Note components, props, hook values.
|
|
66
|
+
2. The spec (\`Read specPath\`).
|
|
67
|
+
3. The component (\`Read componentPath\`).
|
|
68
|
+
4. Anything the spec or component imports that's relevant (\`Read\` selectively).
|
|
69
|
+
5. \`git log -p\` on either file if you need to see what changed recently.
|
|
70
|
+
|
|
71
|
+
Do not read the entire codebase. Stop reading when you have enough to commit to an answer.
|
|
72
|
+
|
|
73
|
+
## Output format
|
|
74
|
+
|
|
75
|
+
Final message MUST be only the JSON object — no prose, no markdown, no code fence. Earlier messages can include reasoning. The system parses your last message as JSON.
|
|
76
|
+
`;
|
|
77
|
+
export const CLASSIFY_BUG_RUBRIC = `# reactlens — bug classification rubric
|
|
78
|
+
|
|
79
|
+
This document defines, with examples, when to choose each classification. Diagnosis agent: consult this when in doubt. Calibration is measured against the eval set in \`tests/diagnostic-eval/cases/\`.
|
|
80
|
+
|
|
81
|
+
## \`real-bug\`
|
|
82
|
+
|
|
83
|
+
**Definition:** the code regressed and the spec is correctly asserting the prior, correct behavior.
|
|
84
|
+
|
|
85
|
+
**Strongest signals:**
|
|
86
|
+
- Component file has a recent commit (today, this week) and the spec hasn't changed in much longer.
|
|
87
|
+
- The component snapshot at failure shows props/state inconsistent with the source's invariants — eg the source says \`if (data.length === 0) return Empty\` but the snapshot shows \`data.length === 0\` AND the rendered subtree doesn't include \`<Empty />\`.
|
|
88
|
+
- The snapshot shows a thrown error in a render path the spec was exercising.
|
|
89
|
+
|
|
90
|
+
**Examples:**
|
|
91
|
+
|
|
92
|
+
> Spec: "checkout shows declined banner for cards starting with 4000".
|
|
93
|
+
> Component snapshot: status='success' (not 'declined') after submitting a 4000-card.
|
|
94
|
+
> Component diff (last commit): the developer changed the API check from \`startsWith('4000')\` to \`startsWith('5000')\`.
|
|
95
|
+
> → \`real-bug\`, \`high\` confidence. Patch: revert the prefix check.
|
|
96
|
+
|
|
97
|
+
> Spec: "dashboard shows empty state when there are no orders".
|
|
98
|
+
> Component snapshot: orders=[] but \`<empty />\` is not in the tree.
|
|
99
|
+
> Component: the developer added an \`early-return null\` above the empty check that fires unconditionally.
|
|
100
|
+
> → \`real-bug\`, \`high\`. Patch: remove the early return.
|
|
101
|
+
|
|
102
|
+
## \`test-bug\`
|
|
103
|
+
|
|
104
|
+
**Definition:** the code is correct; the spec is stale, makes a wrong assumption, or asserts something the component doesn't do.
|
|
105
|
+
|
|
106
|
+
**Strongest signals:**
|
|
107
|
+
- Spec file has a recent commit and the component hasn't changed.
|
|
108
|
+
- The spec asserts a \`data-testid\` or \`text\` that doesn't exist anywhere in the component source.
|
|
109
|
+
- The spec assumes a route, a label, or an order-of-operations that contradicts the source.
|
|
110
|
+
- The snapshot shows the component is in the state the test wanted — but the assertion is wrong.
|
|
111
|
+
|
|
112
|
+
**Examples:**
|
|
113
|
+
|
|
114
|
+
> Spec: \`await expect(page.getByTestId('checkout-fail')).toBeVisible()\`.
|
|
115
|
+
> Component source: there is \`data-testid="checkout-network-error"\` and \`data-testid="checkout-declined"\` but no \`checkout-fail\`.
|
|
116
|
+
> → \`test-bug\`, \`high\`. Patch: change the selector to whichever testid was intended.
|
|
117
|
+
|
|
118
|
+
> Spec was copy-pasted from another spec and still asserts the wrong page title.
|
|
119
|
+
> → \`test-bug\`, \`high\`. Patch: update the title.
|
|
120
|
+
|
|
121
|
+
## \`flaky\`
|
|
122
|
+
|
|
123
|
+
**Definition:** non-deterministic failure unrelated to a code change. Includes ordering failures — failures that depend on what *other* specs did, not on the code or this spec.
|
|
124
|
+
|
|
125
|
+
**Strongest signals:**
|
|
126
|
+
- Failure mode is "waiting for X, timed out". X exists in source.
|
|
127
|
+
- Neither file has changed recently.
|
|
128
|
+
- Multiple consecutive runs against unchanged code show different outcomes.
|
|
129
|
+
- Snapshot shows the component DID render the asserted state — just after the timeout.
|
|
130
|
+
- Snapshot or runtime evidence shows state coming from **outside this spec's own actions** — eg \`localStorage\` populated with keys this spec never writes, an authenticated session this spec didn't sign in for, a database row another spec inserted. Often paired with an \`error.txt\` / trace line that says the failure correlates with run order ("passes in isolation, fails after spec X").
|
|
131
|
+
|
|
132
|
+
Use sparingly. "I don't know" is more often a \`low\`-confidence other-classification than \`flaky\`.
|
|
133
|
+
|
|
134
|
+
**Examples:**
|
|
135
|
+
|
|
136
|
+
> Spec: clicks submit, asserts redirect within 5s. Sometimes the redirect takes 6s due to a queue.
|
|
137
|
+
> → \`flaky\`, \`medium\`. Suggested fix: increase timeout for this spec, or add a deterministic gate (\`waitForResponse\`).
|
|
138
|
+
|
|
139
|
+
> Spec: \`await page.goto('/cart'); await expect(getByTestId('cart-empty')).toBeVisible()\`.
|
|
140
|
+
> Component snapshot: \`hooks.state = [{id: 'abc'}]\` (items array populated).
|
|
141
|
+
> error.txt: "Browser storage at time of failure: localStorage = {cart: '[{id:abc}]'}. Failure correlates with running after 'adds item to cart' spec. No spec or component change in 60 days."
|
|
142
|
+
> Component on disk reads from localStorage on mount; spec doesn't call \`localStorage.clear()\`.
|
|
143
|
+
> → \`flaky\` (ordering / state leak), \`high\`. The localStorage state did not come from this spec's actions — it leaked from another spec running first in the same worker. Even though the spec is also structurally fragile, the proximate cause is the order.
|
|
144
|
+
> Suggested fix: add \`test.beforeEach(({page}) => page.evaluate(() => localStorage.clear()))\` to this spec or the file's parent describe. Optionally file an issue against the spec that wrote the leaked data.
|
|
145
|
+
|
|
146
|
+
## Disambiguating \`test-bug\` vs \`flaky\` (ordering) when both fit
|
|
147
|
+
|
|
148
|
+
Some failures look structurally like test-bugs (the spec doesn't \`beforeEach(localStorage.clear())\`, doesn't reset a global, doesn't re-seed the database) while *also* having runtime evidence of an ordering issue (snapshot shows state this spec never wrote; trace says the failure correlates with another spec running first). This is the most common false-confidence trap in the eval set, so be explicit.
|
|
149
|
+
|
|
150
|
+
**Prefer \`flaky\` when:**
|
|
151
|
+
- The component snapshot at failure contains state values this spec's own actions cannot explain. Eg \`hooks.state = [{id:'abc'}]\` but the spec never wrote that id; \`props.user.email = '...'\` but the spec didn't log anyone in.
|
|
152
|
+
- The error trace or \`error.txt\` mentions the failure correlates with run order ("passes in isolation, fails after X"), failure rate is below 100%, or no recent changes to either file.
|
|
153
|
+
- The same spec passes when run alone (a strong tell — if you only see "fails in suite, passes alone" in the trace, lean \`flaky\`).
|
|
154
|
+
|
|
155
|
+
**Prefer \`test-bug\` when:**
|
|
156
|
+
- The state the spec depends on is conventionally set up by the framework (eg \`storageState\`, fixtures) and *this* spec is the one that should have configured it.
|
|
157
|
+
- The spec asserts a \`data-testid\` / text / route that doesn't exist anywhere in the source — the bug is in the assertion itself, not in the runtime state.
|
|
158
|
+
- Multiple specs in the same file fail the same way regardless of order.
|
|
159
|
+
|
|
160
|
+
The structural shortcoming (no \`beforeEach(clear)\`) is real and worth fixing in both cases. But if the runtime evidence shows the leak came from *elsewhere*, that's the proximate cause — and the proximate cause is what \`flaky\` captures. Don't punish the symptom-bearing spec for being the one that exposed the order-dependency.
|
|
161
|
+
|
|
162
|
+
Confidence in this disambiguation should track the directness of the evidence: snapshot hook values + an error.txt that names the suspected upstream spec is \`high\`; just "no recent commits + spec doesn't clear state" is \`medium\`.
|
|
163
|
+
|
|
164
|
+
## \`env-issue\`
|
|
165
|
+
|
|
166
|
+
**Definition:** the failure is infrastructure, not application logic.
|
|
167
|
+
|
|
168
|
+
**Strongest signals:**
|
|
169
|
+
- Error mentions: "browserType.launch", "Executable doesn't exist", "EADDRINUSE", "ECONNREFUSED", "missing env var".
|
|
170
|
+
- All tests fail (not just one).
|
|
171
|
+
- The component snapshot is empty / probe never connected.
|
|
172
|
+
|
|
173
|
+
**Examples:**
|
|
174
|
+
|
|
175
|
+
> Error: \`Executable doesn't exist at .../chrome-headless-shell\`.
|
|
176
|
+
> → \`env-issue\`, \`high\`. Patch: \`pnpm exec playwright install chromium\`.
|
|
177
|
+
|
|
178
|
+
> Error: \`connect ECONNREFUSED 127.0.0.1:5173\` on every test.
|
|
179
|
+
> → \`env-issue\`, \`high\`. The dev server didn't start.
|
|
180
|
+
|
|
181
|
+
## When you genuinely can't tell
|
|
182
|
+
|
|
183
|
+
Choose the classification you'd guess at, drop confidence to \`low\`, and in \`suggestedFix\` say what evidence would let you upgrade.
|
|
184
|
+
`;
|
|
185
|
+
//# sourceMappingURL=prompts.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompts.js","sourceRoot":"","sources":["../src/prompts.ts"],"names":[],"mappings":"AAAA,4EAA4E;AAC5E,uEAAuE;AACvE,EAAE;AACF,yEAAyE;AACzE,4EAA4E;AAC5E,yEAAyE;AACzE,2EAA2E;AAC3E,wBAAwB;AAExB,MAAM,CAAC,MAAM,sBAAsB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAmErC,CAAC;AAEF,MAAM,CAAC,MAAM,mBAAmB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA2GlC,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@reynsu/reactlens-diagnosis-prompts",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Canonical diagnosis prompts + Zod Diagnosis schema + pure eval-metrics for the reactlens / nativelens AI failure-diagnosis loop. Pure helpers only — no LLM SDK dep.",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"type": "module",
|
|
7
|
+
"files": [
|
|
8
|
+
"dist",
|
|
9
|
+
"src",
|
|
10
|
+
"!src/**/*.test.ts"
|
|
11
|
+
],
|
|
12
|
+
"main": "./dist/index.js",
|
|
13
|
+
"types": "./dist/index.d.ts",
|
|
14
|
+
"exports": {
|
|
15
|
+
".": {
|
|
16
|
+
"types": "./dist/index.d.ts",
|
|
17
|
+
"import": "./dist/index.js"
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
"scripts": {
|
|
21
|
+
"build": "tsc -p tsconfig.build.json",
|
|
22
|
+
"test": "vitest run",
|
|
23
|
+
"test:watch": "vitest",
|
|
24
|
+
"typecheck": "tsc --noEmit",
|
|
25
|
+
"prepublishOnly": "npm run typecheck && npm test && npm run build"
|
|
26
|
+
},
|
|
27
|
+
"dependencies": {
|
|
28
|
+
"zod": "^3.23.8"
|
|
29
|
+
},
|
|
30
|
+
"devDependencies": {
|
|
31
|
+
"typescript": "~5.9.2",
|
|
32
|
+
"vitest": "^2.1.0"
|
|
33
|
+
},
|
|
34
|
+
"publishConfig": {
|
|
35
|
+
"access": "public"
|
|
36
|
+
},
|
|
37
|
+
"repository": {
|
|
38
|
+
"type": "git",
|
|
39
|
+
"url": "git+https://github.com/reynsu/reactlens-prompts.git"
|
|
40
|
+
},
|
|
41
|
+
"pnpm": {
|
|
42
|
+
"onlyBuiltDependencies": ["esbuild"]
|
|
43
|
+
}
|
|
44
|
+
}
|
package/src/diagnosis.ts
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
// Canonical Diagnosis output shape and the pure helpers that surround it
|
|
2
|
+
// (user-message builder, degraded fallback). Schema-first: `DiagnosisSchema`
|
|
3
|
+
// is the source of truth — `Diagnosis` is `z.infer<typeof DiagnosisSchema>`.
|
|
4
|
+
//
|
|
5
|
+
// Consumers (reactlens, nativelens) provide their own agent runner, supply
|
|
6
|
+
// per-host fields on FailedTest (e.g. a `componentSnapshot` shape that fits
|
|
7
|
+
// their probe), and treat the returned Diagnosis as opaque-but-typed.
|
|
8
|
+
|
|
9
|
+
import { z } from 'zod';
|
|
10
|
+
|
|
11
|
+
// Allowed verdicts, in the order the diagnosis prompt lists them.
const CLASSIFICATION_VALUES = ['real-bug', 'test-bug', 'flaky', 'env-issue'] as const;

/** Validates the `classification` field of a diagnosis. */
export const ClassificationSchema = z.enum(CLASSIFICATION_VALUES);
/** Union of the literal classification strings accepted by `ClassificationSchema`. */
export type Classification = z.infer<typeof ClassificationSchema>;

// Allowed self-reported confidence levels.
const CONFIDENCE_VALUES = ['high', 'medium', 'low'] as const;

/** Validates the `confidence` field of a diagnosis. */
export const ConfidenceSchema = z.enum(CONFIDENCE_VALUES);
/** Union of the literal confidence strings accepted by `ConfidenceSchema`. */
export type Confidence = z.infer<typeof ConfidenceSchema>;
|
|
16
|
+
|
|
17
|
+
// One concrete edit proposed by the agent. All four fields are required
// strings; no further constraints (e.g. non-empty) are enforced here.
const patchEntryShape = {
  // Path of the file to edit.
  file: z.string(),
  // Text to be replaced.
  oldStr: z.string(),
  // Replacement text.
  newStr: z.string(),
  // Why the edit fixes the failure.
  rationale: z.string(),
};

/** Schema for a single entry of a diagnosis' optional `patch` array. */
export const PatchEntrySchema = z.object(patchEntryShape);
/** Static type of a value accepted by `PatchEntrySchema`. */
export type PatchEntry = z.infer<typeof PatchEntrySchema>;
|
|
24
|
+
|
|
25
|
+
// A single commit record. Every field is a plain string; the schema does not
// validate the sha or date format.
// NOTE(review): presumably `date` is an ISO-8601 timestamp — confirm with the producer.
const gitContextEntryShape = {
  sha: z.string(),
  author: z.string(),
  date: z.string(),
  message: z.string(),
};

/** Schema describing one commit (sha, author, date, message). */
export const GitContextEntrySchema = z.object(gitContextEntryShape);
/** Static type of a value accepted by `GitContextEntrySchema`. */
export type GitContextEntry = z.infer<typeof GitContextEntrySchema>;
|
|
32
|
+
|
|
33
|
+
// Most recent change to each of the two files involved in a failure.
// Either side may be absent, so an empty object is a valid GitContext.
const gitContextShape = {
  componentLastChanged: GitContextEntrySchema.optional(),
  specLastChanged: GitContextEntrySchema.optional(),
};

/** Schema for the optional git history attached to a diagnosis. */
export const GitContextSchema = z.object(gitContextShape);
/** Static type of a value accepted by `GitContextSchema`. */
export type GitContext = z.infer<typeof GitContextSchema>;
|
|
38
|
+
|
|
39
|
+
// Shape of the JSON object the diagnosis agent must produce.
const diagnosisShape = {
  classification: ClassificationSchema,
  confidence: ConfidenceSchema,
  // Must be a non-empty string.
  rootCause: z.string().min(1),
  // Any number of evidence strings, including none.
  evidence: z.array(z.string()),
  suggestedFix: z.string(),
  // Concrete edits; omitted when the agent cannot propose one.
  patch: z.array(PatchEntrySchema).optional(),
  // Git history for the spec/component; omitted when unavailable.
  gitContext: GitContextSchema.optional(),
};

/**
 * Source-of-truth schema for a diagnosis. The `Diagnosis` type is derived
 * from it rather than declared separately.
 */
export const DiagnosisSchema = z.object(diagnosisShape);
/** Static type of a value accepted by `DiagnosisSchema`. */
export type Diagnosis = z.infer<typeof DiagnosisSchema>;
|
|
49
|
+
|
|
50
|
+
/**
 * Input to the diagnosis prompt: everything known about one failed test.
 *
 * `componentSnapshot` is deliberately `unknown` — reactlens (DOM) and
 * nativelens (React Native tree) serialise different shapes, and each
 * consumer narrows the value on its own side.
 */
export type FailedTest = {
  /** Identifier of the failed test. */
  testId: string;
  /** Test title; rendered as the `Test:` line of the user message. */
  testTitle: string;
  /** Spec file path; rendered as the `Spec:` line of the user message. */
  specFile: string;
  /** Error text; when present it is rendered inside an `# Error` code fence. */
  errorMessage?: string;
  /** Probable component file; rendered as `Component (probable):` when present. */
  componentFile?: string;
  /** Host-specific component tree captured at failure; JSON-serialised into the message. */
  componentSnapshot?: unknown;
};
|
|
61
|
+
|
|
62
|
+
// Truncate enormous trees to keep token usage sane. 30k chars is the
// empirically-tuned ceiling — beyond it the agent's classification
// accuracy starts to degrade in eval.
const SNAPSHOT_CHAR_LIMIT = 30_000;

// Serialises a component snapshot for the prompt without ever throwing.
function renderSnapshot(snapshot: unknown): string {
  let snippet: string | undefined;
  try {
    snippet = JSON.stringify(snapshot, null, 2);
  } catch {
    // Circular structures and BigInt values make JSON.stringify throw. A
    // fixed placeholder keeps the output deterministic across engines.
    return '(snapshot could not be serialised to JSON)';
  }
  // JSON.stringify returns undefined (not a string) for functions and symbols.
  if (snippet === undefined) return '(snapshot could not be serialised to JSON)';
  if (snippet.length <= SNAPSHOT_CHAR_LIMIT) return snippet;

  // Don't cut a surrogate pair in half: if the last kept code unit is a high
  // surrogate, drop it so the message stays well-formed UTF-16.
  let end = SNAPSHOT_CHAR_LIMIT;
  const lastUnit = snippet.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
  return snippet.slice(0, end) + '\n…(truncated)';
}

/**
 * Builds the user-message string fed to the diagnosis agent alongside the
 * system prompts. Pure — produces the exact bytes that go on the wire so the
 * prompt surface can be diffed and golden-tested.
 *
 * Layout: a heading, the test title and spec path, then — each only when the
 * corresponding field is defined — the probable component path, an `# Error`
 * fenced block, and a `# Component snapshot at failure` fenced JSON block,
 * followed by the closing instruction.
 *
 * A snapshot that cannot be represented as JSON (function, symbol, circular
 * reference, BigInt) is replaced by a placeholder line instead of throwing.
 *
 * @param failure - The failed test to describe.
 * @returns The newline-joined message text.
 */
export function buildUserMessage(failure: FailedTest): string {
  const lines: string[] = [
    '# Failure to diagnose',
    '',
    `Test: ${failure.testTitle}`,
    `Spec: ${failure.specFile}`,
  ];

  if (failure.componentFile !== undefined) {
    lines.push(`Component (probable): ${failure.componentFile}`);
  }

  if (failure.errorMessage !== undefined) {
    lines.push('', '# Error', '```', failure.errorMessage, '```');
  }

  if (failure.componentSnapshot !== undefined) {
    lines.push(
      '',
      '# Component snapshot at failure',
      '```json',
      renderSnapshot(failure.componentSnapshot),
      '```',
    );
  }

  lines.push(
    '',
    'Read the spec, the component, and any other context you need. Output a single JSON object matching the Diagnosis schema as the FINAL message.',
  );
  return lines.join('\n');
}
|
|
94
|
+
|
|
95
|
+
/**
 * Fallback diagnosis for when the agent loop exhausts its retries without
 * producing a valid `Diagnosis`.
 *
 * We DELIBERATELY return `env-issue` / `low` so the operator notices the
 * degraded run. Reporting anything stronger would be a confident answer with
 * no evidence behind it — the calibration failure mode Principle 2 forbids.
 *
 * @param gitContext Git context to carry over. Attached to the result only
 *   when it has at least one own enumerable key.
 * @returns A fixed low-confidence diagnosis; the `gitContext` key is omitted
 *   entirely when the argument is empty.
 */
export function degradedDiagnosis(gitContext: GitContext = {}): Diagnosis {
  const hasGitContext = Object.keys(gitContext).length > 0;
  return {
    classification: 'env-issue',
    confidence: 'low',
    rootCause: 'diagnosis agent failed to produce a valid output',
    evidence: ['agent returned non-JSON or schema-mismatched output twice'],
    suggestedFix: 'rerun the diagnosis with --verbose; if it persists, file an issue with the trace',
    // Spread an empty object rather than writing `gitContext: undefined`, so
    // the key is absent when there is nothing to report.
    ...(hasGitContext ? { gitContext } : {}),
  };
}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
// Pure scoring helpers for the diagnostic-eval harness. No fs, no agent —
|
|
2
|
+
// consumers (reactlens, nativelens) drive their own per-host case loading
|
|
3
|
+
// and call into these for grading.
|
|
4
|
+
//
|
|
5
|
+
// `falseConfidenceRate` is surfaced explicitly because Principle 2's
|
|
6
|
+
// implicit cap on it (>=95% high-confidence accuracy) is the most useful
|
|
7
|
+
// regression signal in eval reports.
|
|
8
|
+
|
|
9
|
+
import { z } from 'zod';
|
|
10
|
+
import { ClassificationSchema, ConfidenceSchema, type Classification, type Diagnosis } from './diagnosis.js';
|
|
11
|
+
|
|
12
|
+
/** Zod schema for the ground-truth record of a single eval case. */
export const TruthSchema = z.object({
  // Classification the diagnosis is graded against.
  expectedClassification: ClassificationSchema,
  // NOTE(review): the grading helpers in this module do not read this field —
  // confirm where (or whether) the confidence floor is enforced.
  minimumConfidence: ConfidenceSchema,
  // Optional free-form annotations; not used for grading in this module.
  category: z.string().optional(),
  notes: z.string().optional(),
});

/** Parsed shape of a truth record, inferred from `TruthSchema`. */
export type Truth = z.infer<typeof TruthSchema>;
|
|
19
|
+
|
|
20
|
+
/** Outcome of grading one eval case against its ground truth. */
export type CaseResult = {
  /** Name of the eval case. */
  name: string;
  /** Ground truth the case was graded against. */
  expected: Truth;
  /** Diagnosis that was graded. */
  actual: Diagnosis;
  /** True when the actual classification equals the expected one. */
  correct: boolean;
  // True when the diagnosis is wrong but the agent reported `high` confidence.
  // This is the calibration failure mode Principle 2 forbids: an unconfident
  // wrong answer beats a confident wrong one, so we only ship if this is
  // vanishingly rare.
  falseConfidence: boolean;
};
|
|
30
|
+
|
|
31
|
+
/**
 * Decodes the JSON text of a truth record and validates it against
 * `TruthSchema`.
 *
 * @param raw JSON text of the truth record.
 * @returns The validated truth record.
 * @throws SyntaxError when `raw` is not valid JSON.
 * @throws ZodError when the decoded value does not satisfy `TruthSchema`.
 */
export function parseTruth(raw: string): Truth {
  // Keep the decoded value as `unknown` so nothing reads it before validation.
  const decoded: unknown = JSON.parse(raw);
  return TruthSchema.parse(decoded);
}
|
|
34
|
+
|
|
35
|
+
/**
 * Grades one diagnosis against its ground truth.
 *
 * Only the classification is compared; `expected.minimumConfidence` is not
 * consulted here. A wrong classification reported with `high` confidence is
 * flagged as `falseConfidence`.
 *
 * @param actual Diagnosis produced by the agent.
 * @param expected Ground truth for the case.
 * @param name Name of the eval case, copied into the result.
 * @returns The graded result, carrying both inputs unchanged.
 */
export function compareToTruth(actual: Diagnosis, expected: Truth, name: string): CaseResult {
  const correct = actual.classification === expected.expectedClassification;
  const falseConfidence = !correct && actual.confidence === 'high';
  return { name, expected, actual, correct, falseConfidence };
}
|
|
45
|
+
|
|
46
|
+
/** Aggregate scores over a set of graded eval cases. */
export type EvalMetrics = {
  /** Number of graded cases. */
  total: number;
  /** Number of cases whose classification was correct. */
  correct: number;
  /** `correct / total`; reported as 0 when there are no cases. */
  accuracy: number;
  // Wrong diagnoses delivered with `high` confidence, divided by total.
  falseConfidenceRate: number;
  // Bucketed by expected classification so per-category recall is visible
  // (e.g. "we get real-bug right 90% of the time but flaky only 40%").
  perClassification: Record<Classification, { total: number; correct: number }>;
};
|
|
56
|
+
|
|
57
|
+
/**
 * Folds graded cases into aggregate metrics.
 *
 * Cases are bucketed by their EXPECTED classification, so each bucket's
 * `correct / total` reads as recall for that category. Both rates are
 * reported as 0 for an empty input rather than `NaN`.
 *
 * @param results Graded cases; not mutated.
 * @returns Totals, overall accuracy, false-confidence rate and per-class counts.
 */
export function aggregateMetrics(results: readonly CaseResult[]): EvalMetrics {
  const perClassification: EvalMetrics['perClassification'] = {
    'real-bug': { total: 0, correct: 0 },
    'test-bug': { total: 0, correct: 0 },
    flaky: { total: 0, correct: 0 },
    'env-issue': { total: 0, correct: 0 },
  };

  // One pass gathers the overall counters and the per-class buckets together.
  let correct = 0;
  let falseConfident = 0;
  for (const result of results) {
    const bucket = perClassification[result.expected.expectedClassification];
    bucket.total += 1;
    if (result.correct) {
      correct += 1;
      bucket.correct += 1;
    }
    if (result.falseConfidence) falseConfident += 1;
  }

  const total = results.length;
  return {
    total,
    correct,
    accuracy: total === 0 ? 0 : correct / total,
    falseConfidenceRate: total === 0 ? 0 : falseConfident / total,
    perClassification,
  };
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
// Public surface — keep narrow. Consumers should import from the root only.
|
|
2
|
+
|
|
3
|
+
export { DIAGNOSE_SYSTEM_PROMPT, CLASSIFY_BUG_RUBRIC } from './prompts.js';
|
|
4
|
+
|
|
5
|
+
export {
|
|
6
|
+
// Schemas
|
|
7
|
+
ClassificationSchema,
|
|
8
|
+
ConfidenceSchema,
|
|
9
|
+
PatchEntrySchema,
|
|
10
|
+
GitContextEntrySchema,
|
|
11
|
+
GitContextSchema,
|
|
12
|
+
DiagnosisSchema,
|
|
13
|
+
// Types
|
|
14
|
+
type Classification,
|
|
15
|
+
type Confidence,
|
|
16
|
+
type PatchEntry,
|
|
17
|
+
type GitContextEntry,
|
|
18
|
+
type GitContext,
|
|
19
|
+
type Diagnosis,
|
|
20
|
+
type FailedTest,
|
|
21
|
+
// Pure helpers
|
|
22
|
+
buildUserMessage,
|
|
23
|
+
degradedDiagnosis,
|
|
24
|
+
} from './diagnosis.js';
|
|
25
|
+
|
|
26
|
+
export {
|
|
27
|
+
TruthSchema,
|
|
28
|
+
type Truth,
|
|
29
|
+
type CaseResult,
|
|
30
|
+
type EvalMetrics,
|
|
31
|
+
parseTruth,
|
|
32
|
+
compareToTruth,
|
|
33
|
+
aggregateMetrics,
|
|
34
|
+
} from './eval-metrics.js';
|
package/src/prompts.ts
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
// Canonical diagnosis system prompts. Authored in markdown for readability;
|
|
2
|
+
// exported as plain strings so consumers don't need a markdown loader.
|
|
3
|
+
//
|
|
4
|
+
// These are reactlens-flavoured (Playwright + DOM). Nativelens consumers
|
|
5
|
+
// will eventually ship a parallel set keyed on Detox + React Native idioms.
|
|
6
|
+
// Both versions are intended to share the same `Diagnosis` output schema
|
|
7
|
+
// and `classification` rubric — only the surrounding "what tools you have"
|
|
8
|
+
// scaffolding diverges.
|
|
9
|
+
|
|
10
|
+
/**
 * System prompt for the diagnosis agent: the inputs and tools it has, the
 * JSON output shape, the headline classification rules, the confidence
 * calibration and the reading order. Authored as markdown, exported as a
 * plain string.
 */
export const DIAGNOSE_SYSTEM_PROMPT = `# reactlens — diagnose a Playwright failure

You are the diagnosis agent for reactlens. A Playwright test just failed. You have access to:

- The failing **spec** (\`specPath\` below) — read it.
- The **component** under test (\`componentPath\`) — read it. This is the code the user wrote.
- The **trace summary** — Playwright's error message + locator chain.
- The **component snapshot at failure** — the React tree at the moment the assertion failed, including props and hooks. This is the unique signal you have that other tools don't.
- Optional **git context** for both files: who changed what most recently, and when.

You have access to the \`Read\`, \`Glob\`, \`Grep\`, and \`Bash\` tools to look around. Use them deliberately — every read costs the user money.

## Your job

Output a single JSON object matching this schema (plain JSON; no markdown fence):

\`\`\`json
{
"classification": "real-bug" | "test-bug" | "flaky" | "env-issue",
"confidence": "high" | "medium" | "low",
"rootCause": "one sentence",
"evidence": ["bullet 1", "bullet 2", ...],
"suggestedFix": "human-readable description of what to change",
"patch": [{
"file": "path",
"oldStr": "exact text to replace",
"newStr": "replacement text",
"rationale": "why this fixes it"
}]
}
\`\`\`

\`patch\` is OPTIONAL — only include it when you can produce a concrete edit you're confident about. If you can't, omit it and say what additional information would let you produce one (in \`suggestedFix\`).

## Classification rubric

See \`classify-bug.md\` (you should consult it). Headline rules:

- **\`real-bug\`** — code regressed; spec is correct. The behavior the spec asserted was correct yesterday and is wrong today. Best signal: spec hasn't changed, component recently changed.
- **\`test-bug\`** — code is fine; spec is stale or wrong. Best signal: component hasn't changed, spec recently changed; or the spec asserts something the source code never actually does.
- **\`flaky\`** — neither code nor spec changed; the failure is non-deterministic. Best signal: timing-related error (waiting for element, race), no recent commits to either file.
- **\`env-issue\`** — the failure is infrastructure (port conflict, missing env var, browser binary missing). Best signal: error mentions ports, env vars, browser launch, network unreachable.

## Confidence calibration (CRITICAL)

We measure your accuracy at each confidence level. Do not lie.

- **\`high\`** — you have direct evidence in the snapshot AND/OR a recent commit that explains the behavior. Example: "the snapshot shows \`cvv: '12'\` — only 2 chars — but the component schema requires \`min(3)\`. The validation error visible in the snapshot is what the spec is asserting against."
- **\`medium\`** — strong but indirect signal. Eg "the spec hasn't changed in 3 weeks, the component changed yesterday — but I haven't found the actual bug in the diff."
- **\`low\`** — informed guess. Use this freely; it's better than a wrong \`high\`.

If you cannot produce evidence, use \`low\`. Never inflate.

## What to read

In order:
1. The component snapshot at failure (you'll get this in the user message inline). Note components, props, hook values.
2. The spec (\`Read specPath\`).
3. The component (\`Read componentPath\`).
4. Anything the spec or component imports that's relevant (\`Read\` selectively).
5. \`git log -p\` on either file if you need to see what changed recently.

Do not read the entire codebase. Stop reading when you have enough to commit to an answer.

## Output format

Final message MUST be only the JSON object — no prose, no markdown, no code fence. Earlier messages can include reasoning. The system parses your last message as JSON.
`;
|
|
78
|
+
|
|
79
|
+
/**
 * Long-form classification rubric for the diagnosis agent: definition,
 * strongest signals and worked examples for each of the four classifications,
 * plus the `test-bug` vs `flaky` disambiguation. Authored as markdown,
 * exported as a plain string.
 */
export const CLASSIFY_BUG_RUBRIC = `# reactlens — bug classification rubric

This document defines, with examples, when to choose each classification. Diagnosis agent: consult this when in doubt. Calibration is measured against the eval set in \`tests/diagnostic-eval/cases/\`.

## \`real-bug\`

**Definition:** the code regressed and the spec is correctly asserting the prior, correct behavior.

**Strongest signals:**
- Component file has a recent commit (today, this week) and the spec hasn't changed in much longer.
- The component snapshot at failure shows props/state inconsistent with the source's invariants — eg the source says \`if (data.length === 0) return Empty\` but the snapshot shows \`data.length === 0\` AND the rendered subtree doesn't include \`<Empty />\`.
- The snapshot shows a thrown error in a render path the spec was exercising.

**Examples:**

> Spec: "checkout shows declined banner for cards starting with 4000".
> Component snapshot: status='success' (not 'declined') after submitting a 4000-card.
> Component diff (last commit): the developer changed the API check from \`startsWith('4000')\` to \`startsWith('5000')\`.
> → \`real-bug\`, \`high\` confidence. Patch: revert the prefix check.

> Spec: "dashboard shows empty state when there are no orders".
> Component snapshot: orders=[] but \`<Empty />\` is not in the tree.
> Component: the developer added an \`early-return null\` above the empty check that fires unconditionally.
> → \`real-bug\`, \`high\`. Patch: remove the early return.

## \`test-bug\`

**Definition:** the code is correct; the spec is stale, makes a wrong assumption, or asserts something the component doesn't do.

**Strongest signals:**
- Spec file has a recent commit and the component hasn't changed.
- The spec asserts a \`data-testid\` or \`text\` that doesn't exist anywhere in the component source.
- The spec assumes a route, a label, or an order-of-operations that contradicts the source.
- The snapshot shows the component is in the state the test wanted — but the assertion is wrong.

**Examples:**

> Spec: \`await expect(page.getByTestId('checkout-fail')).toBeVisible()\`.
> Component source: there is \`data-testid="checkout-network-error"\` and \`data-testid="checkout-declined"\` but no \`checkout-fail\`.
> → \`test-bug\`, \`high\`. Patch: change the selector to whichever testid was intended.

> Spec was copy-pasted from another spec and still asserts the wrong page title.
> → \`test-bug\`, \`high\`. Patch: update the title.

## \`flaky\`

**Definition:** non-deterministic failure unrelated to a code change. Includes ordering failures — failures that depend on what *other* specs did, not on the code or this spec.

**Strongest signals:**
- Failure mode is "waiting for X, timed out". X exists in source.
- Neither file has changed recently.
- Multiple consecutive runs against unchanged code show different outcomes.
- Snapshot shows the component DID render the asserted state — just after the timeout.
- Snapshot or runtime evidence shows state coming from **outside this spec's own actions** — eg \`localStorage\` populated with keys this spec never writes, an authenticated session this spec didn't sign in for, a database row another spec inserted. Often paired with an \`error.txt\` / trace line that says the failure correlates with run order ("passes in isolation, fails after spec X").

Use sparingly. "I don't know" is more often a \`low\`-confidence other-classification than \`flaky\`.

**Examples:**

> Spec: clicks submit, asserts redirect within 5s. Sometimes the redirect takes 6s due to a queue.
> → \`flaky\`, \`medium\`. Suggested fix: increase timeout for this spec, or add a deterministic gate (\`waitForResponse\`).

> Spec: \`await page.goto('/cart'); await expect(getByTestId('cart-empty')).toBeVisible()\`.
> Component snapshot: \`hooks.state = [{id: 'abc'}]\` (items array populated).
> error.txt: "Browser storage at time of failure: localStorage = {cart: '[{id:abc}]'}. Failure correlates with running after 'adds item to cart' spec. No spec or component change in 60 days."
> Component on disk reads from localStorage on mount; spec doesn't call \`localStorage.clear()\`.
> → \`flaky\` (ordering / state leak), \`high\`. The localStorage state did not come from this spec's actions — it leaked from another spec running first in the same worker. Even though the spec is also structurally fragile, the proximate cause is the order.
> Suggested fix: add \`test.beforeEach(({page}) => page.evaluate(() => localStorage.clear()))\` to this spec or the file's parent describe. Optionally file an issue against the spec that wrote the leaked data.

## Disambiguating \`test-bug\` vs \`flaky\` (ordering) when both fit

Some failures look structurally like test-bugs (the spec doesn't \`beforeEach(localStorage.clear())\`, doesn't reset a global, doesn't re-seed the database) while *also* having runtime evidence of an ordering issue (snapshot shows state this spec never wrote; trace says the failure correlates with another spec running first). This is the most common false-confidence trap in the eval set, so be explicit.

**Prefer \`flaky\` when:**
- The component snapshot at failure contains state values this spec's own actions cannot explain. Eg \`hooks.state = [{id:'abc'}]\` but the spec never wrote that id; \`props.user.email = '...'\` but the spec didn't log anyone in.
- The error trace or \`error.txt\` mentions the failure correlates with run order ("passes in isolation, fails after X"), failure rate is below 100%, or no recent changes to either file.
- The same spec passes when run alone (a strong tell — if you only see "fails in suite, passes alone" in the trace, lean \`flaky\`).

**Prefer \`test-bug\` when:**
- The state the spec depends on is conventionally set up by the framework (eg \`storageState\`, fixtures) and *this* spec is the one that should have configured it.
- The spec asserts a \`data-testid\` / text / route that doesn't exist anywhere in the source — the bug is in the assertion itself, not in the runtime state.
- Multiple specs in the same file fail the same way regardless of order.

The structural shortcoming (no \`beforeEach(clear)\`) is real and worth fixing in both cases. But if the runtime evidence shows the leak came from *elsewhere*, that's the proximate cause — and the proximate cause is what \`flaky\` captures. Don't punish the symptom-bearing spec for being the one that exposed the order-dependency.

Confidence in this disambiguation should track the directness of the evidence: snapshot hook values + an error.txt that names the suspected upstream spec is \`high\`; just "no recent commits + spec doesn't clear state" is \`medium\`.

## \`env-issue\`

**Definition:** the failure is infrastructure, not application logic.

**Strongest signals:**
- Error mentions: "browserType.launch", "Executable doesn't exist", "EADDRINUSE", "ECONNREFUSED", "missing env var".
- All tests fail (not just one).
- The component snapshot is empty / probe never connected.

**Examples:**

> Error: \`Executable doesn't exist at .../chrome-headless-shell\`.
> → \`env-issue\`, \`high\`. Patch: \`pnpm exec playwright install chromium\`.

> Error: \`connect ECONNREFUSED 127.0.0.1:5173\` on every test.
> → \`env-issue\`, \`high\`. The dev server didn't start.

## When you genuinely can't tell

Choose the classification you'd guess at, drop confidence to \`low\`, and in \`suggestedFix\` say what evidence would let you upgrade.
`;
|