comfy-qa 1.5.0 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/agent/qa-research.ts +83 -37
package/package.json
CHANGED
package/src/agent/qa-research.ts
CHANGED
|
@@ -80,46 +80,48 @@ interface ResearchResults {
|
|
|
80
80
|
const ANTHROPIC_KEY = process.env.ANTHROPIC_API_KEY_QA ?? process.env.ANTHROPIC_API_KEY ?? "";
|
|
81
81
|
const OPENROUTER_KEY = process.env.OPENROUTER_API_KEY ?? "";
|
|
82
82
|
|
|
83
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
84
|
+
|
|
85
|
+
const anthropicClient = ANTHROPIC_KEY ? new Anthropic({ apiKey: ANTHROPIC_KEY, timeout: 60_000 }) : null;
|
|
86
|
+
|
|
83
87
|
async function callLLM(system: string, messages: any[]): Promise<string> {
|
|
84
|
-
if (
|
|
88
|
+
if (anthropicClient) {
|
|
85
89
|
try {
|
|
86
|
-
const res = await
|
|
90
|
+
const res = await anthropicClient.messages.create({
|
|
91
|
+
model: "claude-sonnet-4-20250514",
|
|
92
|
+
max_tokens: 8192,
|
|
93
|
+
system,
|
|
94
|
+
messages,
|
|
95
|
+
});
|
|
96
|
+
return res.content?.[0]?.type === "text" ? res.content[0].text : "";
|
|
97
|
+
} catch (err: any) {
|
|
98
|
+
console.log(` ⚠ Anthropic SDK: ${err.message?.slice(0, 80)}`);
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
if (OPENROUTER_KEY) {
|
|
103
|
+
try {
|
|
104
|
+
const controller = new AbortController();
|
|
105
|
+
const timer = setTimeout(() => controller.abort(), 60_000);
|
|
106
|
+
const res = await fetch("https://openrouter.ai/api/v1/chat/completions", {
|
|
87
107
|
method: "POST",
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
"anthropic-version": "2023-06-01",
|
|
91
|
-
"content-type": "application/json",
|
|
92
|
-
},
|
|
108
|
+
signal: controller.signal,
|
|
109
|
+
headers: { Authorization: `Bearer ${OPENROUTER_KEY}`, "content-type": "application/json" },
|
|
93
110
|
body: JSON.stringify({
|
|
94
|
-
model: "claude-sonnet-4-20250514",
|
|
111
|
+
model: "anthropic/claude-sonnet-4-20250514",
|
|
112
|
+
messages: [{ role: "system", content: system }, ...messages],
|
|
95
113
|
max_tokens: 8192,
|
|
96
|
-
system,
|
|
97
|
-
messages,
|
|
98
114
|
}),
|
|
99
115
|
});
|
|
116
|
+
clearTimeout(timer);
|
|
100
117
|
const json = (await res.json()) as any;
|
|
101
|
-
return json.
|
|
102
|
-
} catch {
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
if (OPENROUTER_KEY) {
|
|
106
|
-
const res = await fetch("https://openrouter.ai/api/v1/chat/completions", {
|
|
107
|
-
method: "POST",
|
|
108
|
-
headers: {
|
|
109
|
-
Authorization: `Bearer ${OPENROUTER_KEY}`,
|
|
110
|
-
"content-type": "application/json",
|
|
111
|
-
},
|
|
112
|
-
body: JSON.stringify({
|
|
113
|
-
model: "anthropic/claude-sonnet-4-20250514",
|
|
114
|
-
messages: [{ role: "system", content: system }, ...messages],
|
|
115
|
-
max_tokens: 2048,
|
|
116
|
-
}),
|
|
117
|
-
});
|
|
118
|
-
const json = (await res.json()) as any;
|
|
119
|
-
return json.choices?.[0]?.message?.content ?? "";
|
|
118
|
+
return json.choices?.[0]?.message?.content ?? "";
|
|
119
|
+
} catch (err: any) {
|
|
120
|
+
console.log(` ⚠ OpenRouter: ${err.message?.slice(0, 60)}`);
|
|
121
|
+
}
|
|
120
122
|
}
|
|
121
123
|
|
|
122
|
-
|
|
124
|
+
return "";
|
|
123
125
|
}
|
|
124
126
|
|
|
125
127
|
// ---------------------------------------------------------------------------
|
|
@@ -229,20 +231,33 @@ async function testOperation(
|
|
|
229
231
|
try {
|
|
230
232
|
const state = await captureState(page);
|
|
231
233
|
|
|
232
|
-
const systemPrompt = `You are a QA tester
|
|
234
|
+
const systemPrompt = `You are a QA tester recording a video demo of a website.
|
|
233
235
|
|
|
234
236
|
Product: ${checklist.product}
|
|
235
237
|
|
|
236
238
|
RULES:
|
|
237
239
|
- Headless browser, NO URL bar. Use {"type": "goto", "text": "url"} to navigate.
|
|
238
240
|
- Use simple CSS selectors. Maximum 5 actions.
|
|
239
|
-
- Set "success": true
|
|
240
|
-
-
|
|
241
|
-
|
|
241
|
+
- Set "success": true if the success criteria is met in the current state.
|
|
242
|
+
- ALWAYS include at least 1 visual action (safeMove, hover, scroll) so the video shows
|
|
243
|
+
something happening. Even if content is already visible, move the cursor to it so the
|
|
244
|
+
viewer's eye is drawn to the relevant element.
|
|
245
|
+
- For "read" operations: use safeMove or hover to highlight the relevant element.
|
|
246
|
+
- For "create"/"update"/"delete" operations: perform the actual action (click, type).
|
|
247
|
+
- On retry, try a different selector approach.
|
|
248
|
+
|
|
249
|
+
Action types:
|
|
250
|
+
- {"type": "goto", "text": "url"} — navigate (use absolute URL)
|
|
251
|
+
- {"type": "safeMove", "selector": "..."} — move cursor to element (visual)
|
|
252
|
+
- {"type": "hover", "selector": "..."} — hover over element (visual)
|
|
253
|
+
- {"type": "scroll", "value": 300} — scroll down N pixels (visual)
|
|
254
|
+
- {"type": "click", "selector": "..."} — click element
|
|
255
|
+
- {"type": "type", "selector": "...", "text": "..."} — type text
|
|
256
|
+
- {"type": "wait", "value": 1000} — wait N ms
|
|
242
257
|
|
|
243
258
|
Respond with ONLY JSON:
|
|
244
259
|
{
|
|
245
|
-
"actions": [{"type": "
|
|
260
|
+
"actions": [{"type": "safeMove", "selector": "h1"}],
|
|
246
261
|
"success": true/false,
|
|
247
262
|
"observation": "what I see"
|
|
248
263
|
}`;
|
|
@@ -464,6 +479,34 @@ function generateScorecardHtml(results: ResearchResults, checklist: Checklist):
|
|
|
464
479
|
</body></html>`;
|
|
465
480
|
}
|
|
466
481
|
|
|
482
|
+
/**
|
|
483
|
+
* Build a multi-sentence scorecard narration that takes ~12-15s to read
|
|
484
|
+
* so the scorecard stays on screen long enough to be readable.
|
|
485
|
+
*/
|
|
486
|
+
function buildScorecardNarration(results: ResearchResults, checklist: Checklist): string {
|
|
487
|
+
const parts: string[] = [];
|
|
488
|
+
parts.push(`Here are the final QA results for ${checklist.product}.`);
|
|
489
|
+
parts.push(`Out of ${results.totalOperations} operations tested, ${results.totalPassed} passed, giving an overall score of ${results.scorePercent} percent.`);
|
|
490
|
+
|
|
491
|
+
const passed = results.features.filter(f => f.passed === f.total);
|
|
492
|
+
const partial = results.features.filter(f => f.passed < f.total);
|
|
493
|
+
|
|
494
|
+
if (passed.length > 0) {
|
|
495
|
+
const names = passed.map(f => f.name).join(", ");
|
|
496
|
+
parts.push(`The following features work as expected: ${names}.`);
|
|
497
|
+
}
|
|
498
|
+
if (partial.length > 0) {
|
|
499
|
+
const details = partial.map(f => {
|
|
500
|
+
const failedOps = f.operations.filter(o => !o.success).map(o => o.id).join(", ");
|
|
501
|
+
return `${f.name} scored ${f.score} — failing operations include ${failedOps}`;
|
|
502
|
+
}).join("; ");
|
|
503
|
+
parts.push(`Partial coverage in: ${details}.`);
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
parts.push(`This video serves as evidence of the current product state. Failing operations are demonstrated as bugs to be fixed.`);
|
|
507
|
+
return parts.join(" ");
|
|
508
|
+
}
|
|
509
|
+
|
|
467
510
|
function escapeHtml(s: string): string {
|
|
468
511
|
return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
|
|
469
512
|
}
|
|
@@ -631,7 +674,7 @@ test("${slug} QA evidence", async ({ page }) => {
|
|
|
631
674
|
${segments.join("\n")}
|
|
632
675
|
|
|
633
676
|
// Render the full scorecard as the last segment (visible for ~8s)
|
|
634
|
-
.segment(
|
|
677
|
+
.segment(${JSON.stringify(buildScorecardNarration(results, checklist))}, {
|
|
635
678
|
setup: async () => {
|
|
636
679
|
await page.setContent(SCORECARD_HTML, { waitUntil: "domcontentloaded" });
|
|
637
680
|
await page.waitForTimeout(500);
|
|
@@ -670,7 +713,10 @@ async function runSpec(specPath: string, label: string): Promise<{ ok: boolean;
|
|
|
670
713
|
console.log(`\n${label}\n Running: bunx playwright test ${specPath}\n`);
|
|
671
714
|
try {
|
|
672
715
|
const result = await $`bunx playwright test ${specPath} --reporter=list 2>&1`.text();
|
|
673
|
-
|
|
716
|
+
// Match Playwright's summary line: "N passed" or "N failed"
|
|
717
|
+
const passMatch = result.match(/(\d+) passed/);
|
|
718
|
+
const failMatch = result.match(/(\d+) failed/);
|
|
719
|
+
const ok = passMatch !== null && (failMatch === null || parseInt(failMatch[1]) === 0);
|
|
674
720
|
console.log(result.slice(-1000));
|
|
675
721
|
return { ok, output: result };
|
|
676
722
|
} catch (err: any) {
|