qualia-framework 5.1.0 → 5.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +50 -26
- package/agents/builder.md +8 -0
- package/agents/plan-checker.md +10 -1
- package/agents/planner.md +1 -1
- package/agents/qa-browser.md +10 -0
- package/agents/research-synthesizer.md +10 -0
- package/agents/researcher.md +38 -2
- package/agents/roadmapper.md +10 -0
- package/agents/verifier.md +15 -3
- package/agents/visual-evaluator.md +1 -1
- package/bin/install.js +42 -0
- package/bin/state.js +155 -133
- package/docs/archive/session-report-2026-04-18.md +199 -0
- package/docs/archive/v4.0.0-review.md +288 -0
- package/docs/instruction-budget-audit.md +113 -0
- package/docs/polish-loop-supervised-run.md +111 -0
- package/guide.md +11 -4
- package/hooks/session-start.js +1 -1
- package/package.json +5 -2
- package/rules/architecture.md +125 -0
- package/rules/infrastructure.md +1 -2
- package/rules/speed.md +55 -0
- package/skills/qualia-help/SKILL.md +1 -1
- package/skills/qualia-hook-gen/SKILL.md +206 -0
- package/skills/qualia-map/SKILL.md +1 -1
- package/skills/qualia-milestone/SKILL.md +1 -1
- package/skills/qualia-new/SKILL.md +2 -2
- package/skills/qualia-optimize/REFERENCE.md +65 -2
- package/skills/qualia-optimize/SKILL.md +26 -1
- package/skills/qualia-polish/SKILL.md +3 -3
- package/skills/qualia-polish-loop/REFERENCE.md +1 -1
- package/skills/qualia-polish-loop/SKILL.md +3 -3
- package/skills/qualia-polish-loop/fixtures/broken.html +2 -2
- package/skills/qualia-polish-loop/scripts/loop.mjs +26 -5
- package/skills/qualia-polish-loop/scripts/playwright-capture.mjs +14 -5
- package/skills/qualia-polish-loop/scripts/score.mjs +1 -1
- package/skills/qualia-postmortem/SKILL.md +1 -1
- package/skills/qualia-prd/SKILL.md +199 -0
- package/skills/qualia-quick/SKILL.md +1 -1
- package/skills/qualia-research/SKILL.md +5 -3
- package/skills/qualia-road/SKILL.md +15 -5
- package/skills/qualia-task/SKILL.md +1 -1
- package/templates/PRODUCT.md +1 -1
- package/tests/bin.test.sh +155 -8
- package/tests/skills.test.sh +143 -0
- package/tests/slop-detect.test.sh +160 -0
- package/docs/playwright-loop-review-2026-05-03.md +0 -65
- /package/{rules → qualia-design}/design-brand.md +0 -0
- /package/{rules → qualia-design}/design-laws.md +0 -0
- /package/{rules → qualia-design}/design-product.md +0 -0
- /package/{rules → qualia-design}/design-reference.md +0 -0
- /package/{rules → qualia-design}/design-rubric.md +0 -0
- /package/{rules → qualia-design}/frontend.md +0 -0
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# Qualia Framework — skill smoke tests
|
|
3
|
+
# Verifies every skills/*/SKILL.md is well-formed:
|
|
4
|
+
# - YAML frontmatter present and parseable
|
|
5
|
+
# - name field matches folder name
|
|
6
|
+
# - description present and substantive
|
|
7
|
+
# - description has trigger phrases (or skill is disable-model-invocation)
|
|
8
|
+
# - body has at least one h1 heading and at least one "## " section
|
|
9
|
+
#
|
|
10
|
+
# Run: bash tests/skills.test.sh
|
|
11
|
+
|
|
12
|
+
PASS=0
|
|
13
|
+
FAIL=0
|
|
14
|
+
SKILLS_DIR="$(cd "$(dirname "$0")/../skills" && pwd)"
|
|
15
|
+
|
|
16
|
+
# Skills allowed to ship without trigger phrases — disable-model-invocation
|
|
17
|
+
# skills only fire on explicit slash command, so triggers are optional.
|
|
18
|
+
SKIP_TRIGGER_CHECK=("qualia-road" "qualia-handoff")
|
|
19
|
+
|
|
20
|
+
pass() {
|
|
21
|
+
echo " ✓ $1"
|
|
22
|
+
PASS=$((PASS + 1))
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
fail_case() {
|
|
26
|
+
echo " ✗ $1"
|
|
27
|
+
echo " $2"
|
|
28
|
+
FAIL=$((FAIL + 1))
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
# Report whether a skill is exempt from the trigger-phrase check.
# Globals:   SKIP_TRIGGER_CHECK (read) - array of exempt skill folder names
# Arguments: $1 - skill folder name to look up
# Returns:   0 if $1 is listed in SKIP_TRIGGER_CHECK, 1 otherwise
is_in_skip_list() {
  local needle="$1"
  # Declared local so the loop variable does not clobber a global of the
  # same name in the calling script.
  local entry
  for entry in "${SKIP_TRIGGER_CHECK[@]}"; do
    [ "$entry" = "$needle" ] && return 0
  done
  return 1
}
|
|
38
|
+
|
|
39
|
+
echo "skills.test.sh — smoke tests for every skills/*/SKILL.md"
|
|
40
|
+
echo ""
|
|
41
|
+
|
|
42
|
+
for skill_dir in "$SKILLS_DIR"/*/; do
|
|
43
|
+
name=$(basename "$skill_dir")
|
|
44
|
+
skill_md="$skill_dir/SKILL.md"
|
|
45
|
+
|
|
46
|
+
# Existence
|
|
47
|
+
if [ ! -f "$skill_md" ]; then
|
|
48
|
+
fail_case "$name" "SKILL.md not found at $skill_md"
|
|
49
|
+
continue
|
|
50
|
+
fi
|
|
51
|
+
|
|
52
|
+
# Frontmatter present
|
|
53
|
+
if ! head -1 "$skill_md" | grep -q "^---$"; then
|
|
54
|
+
fail_case "$name: frontmatter" "first line is not '---'"
|
|
55
|
+
continue
|
|
56
|
+
fi
|
|
57
|
+
if ! sed -n '2,30p' "$skill_md" | grep -q "^---$"; then
|
|
58
|
+
fail_case "$name: frontmatter" "no closing --- within first 30 lines"
|
|
59
|
+
continue
|
|
60
|
+
fi
|
|
61
|
+
|
|
62
|
+
# name field matches folder
|
|
63
|
+
fm_name=$(grep "^name:" "$skill_md" | head -1 | sed 's/^name:[[:space:]]*//' | tr -d '"')
|
|
64
|
+
if [ "$fm_name" != "$name" ]; then
|
|
65
|
+
fail_case "$name: name field" "frontmatter says name=\"$fm_name\", folder is \"$name\""
|
|
66
|
+
continue
|
|
67
|
+
fi
|
|
68
|
+
pass "$name: frontmatter name matches folder"
|
|
69
|
+
|
|
70
|
+
# description field present + substantive
|
|
71
|
+
fm_desc=$(grep "^description:" "$skill_md" | head -1 | sed 's/^description:[[:space:]]*//')
|
|
72
|
+
desc_len=${#fm_desc}
|
|
73
|
+
if [ "$desc_len" -lt 50 ]; then
|
|
74
|
+
fail_case "$name: description" "description is $desc_len chars, expected >= 50"
|
|
75
|
+
continue
|
|
76
|
+
fi
|
|
77
|
+
pass "$name: description present (${desc_len} chars)"
|
|
78
|
+
|
|
79
|
+
# Trigger phrases (unless disable-model-invocation or in transitional skip list)
|
|
80
|
+
if ! is_in_skip_list "$name"; then
|
|
81
|
+
has_disable=$(grep -c "disable-model-invocation:[[:space:]]*true" "$skill_md")
|
|
82
|
+
if [ "$has_disable" = "0" ]; then
|
|
83
|
+
if echo "$fm_desc" | grep -qiE "trigger|when user|use when|invoke|says|use this|fire on|user types"; then
|
|
84
|
+
pass "$name: description has trigger guidance"
|
|
85
|
+
else
|
|
86
|
+
fail_case "$name: triggers" "description lacks trigger phrases (Trigger:/Use when:/'says'/etc.) and skill is not disable-model-invocation"
|
|
87
|
+
fi
|
|
88
|
+
fi
|
|
89
|
+
fi
|
|
90
|
+
|
|
91
|
+
# Body has an h1 heading
|
|
92
|
+
h1_count=$(grep -cE "^# " "$skill_md")
|
|
93
|
+
if [ "$h1_count" -ge 1 ]; then
|
|
94
|
+
pass "$name: body has h1 heading"
|
|
95
|
+
else
|
|
96
|
+
fail_case "$name: body" "no h1 heading (^# ) in body"
|
|
97
|
+
fi
|
|
98
|
+
|
|
99
|
+
# Body has at least one section (any ## heading)
|
|
100
|
+
h2_count=$(grep -cE "^## " "$skill_md")
|
|
101
|
+
if [ "$h2_count" -ge 1 ]; then
|
|
102
|
+
pass "$name: body has section heading (${h2_count} found)"
|
|
103
|
+
else
|
|
104
|
+
fail_case "$name: body" "no '## ' section heading; every skill needs at least one"
|
|
105
|
+
fi
|
|
106
|
+
|
|
107
|
+
# Cache-aware spawn audit (per rules/grounding.md):
|
|
108
|
+
# Every spawn to a CUSTOM (qualia-*) agent must anchor the prompt with
|
|
109
|
+
# `@~/.claude/agents/{name}.md` (either `Role: @...` or `Read your role:
|
|
110
|
+
# @...` — both forms accepted). The role file is session-stable; placing
|
|
111
|
+
# it first lets Anthropic's prompt cache reuse the prefix across spawns
|
|
112
|
+
# (documented 81-90% cost reduction). If task-specific content lands
|
|
113
|
+
# before the role anchor, the entire prefix recomputes on every spawn.
|
|
114
|
+
#
|
|
115
|
+
# Built-in subagent types (Explore, general-purpose, Plan, etc.) have
|
|
116
|
+
# stable system-prompt baselines on Anthropic's side; no Role anchor
|
|
117
|
+
# required. We count only `subagent_type="qualia-*"` spawns.
|
|
118
|
+
#
|
|
119
|
+
# Some skills follow progressive-disclosure discipline (e.g.
|
|
120
|
+
# qualia-polish-loop) and put the literal spawn template in REFERENCE.md
|
|
121
|
+
# while SKILL.md mentions the spawn in prose. We scan both.
|
|
122
|
+
custom_spawn_count=$(grep -c 'subagent_type="qualia-' "$skill_md")
|
|
123
|
+
ref_md="$skill_dir/REFERENCE.md"
|
|
124
|
+
if [ -f "$ref_md" ]; then
|
|
125
|
+
custom_spawn_count=$((custom_spawn_count + $(grep -c 'subagent_type="qualia-' "$ref_md")))
|
|
126
|
+
fi
|
|
127
|
+
if [ "${custom_spawn_count:-0}" -gt 0 ]; then
|
|
128
|
+
role_count=$(grep -cE '@~/\.claude/agents/' "$skill_md")
|
|
129
|
+
if [ -f "$ref_md" ]; then
|
|
130
|
+
role_count=$((role_count + $(grep -cE '@~/\.claude/agents/' "$ref_md")))
|
|
131
|
+
fi
|
|
132
|
+
if [ "${role_count:-0}" -ge "$custom_spawn_count" ]; then
|
|
133
|
+
pass "$name: spawn audit ($custom_spawn_count custom spawn(s), all role-anchored for cache)"
|
|
134
|
+
else
|
|
135
|
+
fail_case "$name: spawn audit" "$custom_spawn_count custom spawn(s) but only ${role_count:-0} '@~/.claude/agents/' anchors — prompt cache will miss"
|
|
136
|
+
fi
|
|
137
|
+
fi
|
|
138
|
+
done
|
|
139
|
+
|
|
140
|
+
echo ""
|
|
141
|
+
echo "=== Results: $PASS passed, $FAIL failed ==="
|
|
142
|
+
|
|
143
|
+
[ "$FAIL" = "0" ]
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# Qualia Framework — bin/slop-detect.mjs behavior tests
|
|
3
|
+
# Verifies the AI-tells gatekeeper actually catches what it claims to catch.
|
|
4
|
+
#
|
|
5
|
+
# Run: bash tests/slop-detect.test.sh
|
|
6
|
+
|
|
7
|
+
PASS=0
|
|
8
|
+
FAIL=0
|
|
9
|
+
SLOP_DETECT="$(cd "$(dirname "$0")/../bin" && pwd)/slop-detect.mjs"
|
|
10
|
+
NODE="${NODE:-node}"
|
|
11
|
+
|
|
12
|
+
# Scratch-directory management.
#
# Callers use `TMP=$(mktmp)`, which runs mktmp in a command-substitution
# subshell. Anything mktmp appends to an array there is lost when the
# subshell exits, so per-directory bookkeeping inside mktmp can never reach
# the EXIT trap. Instead, one root directory is created here in the parent
# shell and every scratch directory is created beneath it; removing the root
# removes them all.
TMP_ROOT=$(mktemp -d) || { echo "mktemp -d failed" >&2; exit 1; }
TMP_DIRS=("$TMP_ROOT")

# Remove every directory registered in TMP_DIRS. Safe to call repeatedly.
# Globals: TMP_DIRS (read)
# Returns: 0 always, so a missing directory never affects the exit status.
cleanup() {
  local d
  for d in "${TMP_DIRS[@]}"; do
    [ -d "$d" ] && rm -rf -- "$d"
  done
  return 0
}
trap cleanup EXIT

# Create a fresh scratch directory under TMP_ROOT.
# Globals: TMP_ROOT (read)
# Outputs: the new directory's path on stdout
# Returns: mktemp's exit status
mktmp() {
  mktemp -d "$TMP_ROOT/case.XXXXXX"
}
|
|
26
|
+
|
|
27
|
+
pass() {
|
|
28
|
+
echo " ✓ $1"
|
|
29
|
+
PASS=$((PASS + 1))
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
fail_case() {
|
|
33
|
+
echo " ✗ $1"
|
|
34
|
+
echo " $2"
|
|
35
|
+
FAIL=$((FAIL + 1))
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
echo "slop-detect.test.sh — bin/slop-detect.mjs behavioral tests"
|
|
39
|
+
echo ""
|
|
40
|
+
|
|
41
|
+
# ── Sanity: file exists and parses ────────────────────────────────────
|
|
42
|
+
if [ ! -f "$SLOP_DETECT" ]; then
|
|
43
|
+
fail_case "slop-detect exists" "$SLOP_DETECT not found"
|
|
44
|
+
echo "=== Results: $PASS passed, $FAIL failed ==="
|
|
45
|
+
exit 1
|
|
46
|
+
fi
|
|
47
|
+
pass "slop-detect.mjs exists at expected path"
|
|
48
|
+
|
|
49
|
+
# `node --check` prints nothing and exits 0 on valid JS; on a parse error it
# exits non-zero and writes the diagnostic to stderr. Run it once, keep the
# output, and surface the first diagnostic line in the failure detail.
# (Piping into `head` would make the pipeline report head's exit status
# instead of node's, so the check must not be wrapped in a pipeline.)
# $NODE is left unquoted on purpose so it may carry extra flags.
if CHECK_OUT=$($NODE --check "$SLOP_DETECT" 2>&1); then
  pass "slop-detect.mjs syntax is valid"
else
  fail_case "syntax check" "node --check failed on $SLOP_DETECT: $(printf '%s\n' "$CHECK_OUT" | head -1)"
fi
|
|
59
|
+
|
|
60
|
+
# ── Clean file: should exit 0 ─────────────────────────────────────────
|
|
61
|
+
TMP=$(mktmp)
|
|
62
|
+
cat > "$TMP/clean.tsx" <<'EOF'
|
|
63
|
+
import { Button } from '@/components/ui/button';
|
|
64
|
+
|
|
65
|
+
export default function Page() {
|
|
66
|
+
return (
|
|
67
|
+
<div className="bg-surface text-foreground">
|
|
68
|
+
<h1 className="text-display">Welcome</h1>
|
|
69
|
+
<Button variant="primary">Continue setup</Button>
|
|
70
|
+
</div>
|
|
71
|
+
);
|
|
72
|
+
}
|
|
73
|
+
EOF
|
|
74
|
+
if $NODE "$SLOP_DETECT" "$TMP/clean.tsx" >/dev/null 2>&1; then
|
|
75
|
+
pass "exits 0 on a clean .tsx file"
|
|
76
|
+
else
|
|
77
|
+
fail_case "clean file" "exit non-zero on a deliberately clean file"
|
|
78
|
+
fi
|
|
79
|
+
|
|
80
|
+
# ── Em-dash detection (HIGH severity — reported, doesn't block) ──────
|
|
81
|
+
# Em-dash is HIGH not CRITICAL, so default exit is 0; we verify the
|
|
82
|
+
# FINDING is reported to stdout/stderr, not the exit code.
|
|
83
|
+
TMP2=$(mktmp)
|
|
84
|
+
cat > "$TMP2/emdash.tsx" <<'EOF'
|
|
85
|
+
export default function Page() {
|
|
86
|
+
return <p>Welcome — to our amazing platform</p>;
|
|
87
|
+
}
|
|
88
|
+
EOF
|
|
89
|
+
OUT=$($NODE "$SLOP_DETECT" "$TMP2/emdash.tsx" 2>&1 || true)
|
|
90
|
+
if echo "$OUT" | grep -qiE "em.?dash|—"; then
|
|
91
|
+
pass "reports em-dash finding (HIGH severity, non-blocking)"
|
|
92
|
+
else
|
|
93
|
+
fail_case "em-dash detection" "no em-dash mention in output: $(echo "$OUT" | head -c 120)"
|
|
94
|
+
fi
|
|
95
|
+
|
|
96
|
+
# ── Banned-font detection ─────────────────────────────────────────────
|
|
97
|
+
TMP3=$(mktmp)
|
|
98
|
+
cat > "$TMP3/font.css" <<'EOF'
|
|
99
|
+
body { font-family: "Inter", sans-serif; }
|
|
100
|
+
EOF
|
|
101
|
+
EXIT_CODE=0
|
|
102
|
+
$NODE "$SLOP_DETECT" "$TMP3/font.css" >/dev/null 2>&1 || EXIT_CODE=$?
|
|
103
|
+
if [ "$EXIT_CODE" = "1" ]; then
|
|
104
|
+
pass "exits 1 on banned font (Inter) in CSS"
|
|
105
|
+
else
|
|
106
|
+
fail_case "banned-font detection" "expected exit 1, got $EXIT_CODE"
|
|
107
|
+
fi
|
|
108
|
+
|
|
109
|
+
# ── Purple-blue gradient detection ────────────────────────────────────
|
|
110
|
+
TMP4=$(mktmp)
|
|
111
|
+
cat > "$TMP4/gradient.tsx" <<'EOF'
|
|
112
|
+
export default function Hero() {
|
|
113
|
+
return <div className="bg-gradient-to-r from-blue-500 to-purple-600">Hi</div>;
|
|
114
|
+
}
|
|
115
|
+
EOF
|
|
116
|
+
EXIT_CODE=0
|
|
117
|
+
$NODE "$SLOP_DETECT" "$TMP4/gradient.tsx" >/dev/null 2>&1 || EXIT_CODE=$?
|
|
118
|
+
if [ "$EXIT_CODE" = "1" ]; then
|
|
119
|
+
pass "exits 1 on purple-blue gradient (the #1 AI-design tell)"
|
|
120
|
+
else
|
|
121
|
+
fail_case "gradient detection" "expected exit 1, got $EXIT_CODE"
|
|
122
|
+
fi
|
|
123
|
+
|
|
124
|
+
# ── Existing fixture: skills/qualia-polish-loop/fixtures/broken.html ──
|
|
125
|
+
FIXTURE="$(cd "$(dirname "$0")/.." && pwd)/skills/qualia-polish-loop/fixtures/broken.html"
|
|
126
|
+
if [ -f "$FIXTURE" ]; then
|
|
127
|
+
EXIT_CODE=0
|
|
128
|
+
$NODE "$SLOP_DETECT" "$FIXTURE" >/dev/null 2>&1 || EXIT_CODE=$?
|
|
129
|
+
if [ "$EXIT_CODE" = "1" ]; then
|
|
130
|
+
pass "exits 1 on the broken.html fixture (designed to hit critical bans)"
|
|
131
|
+
else
|
|
132
|
+
fail_case "fixture detection" "broken.html fixture exited $EXIT_CODE; expected 1"
|
|
133
|
+
fi
|
|
134
|
+
else
|
|
135
|
+
echo " - broken.html fixture not present, skipping"
|
|
136
|
+
fi
|
|
137
|
+
|
|
138
|
+
# ── --json flag produces JSON output ─────────────────────────────────
|
|
139
|
+
TMP5=$(mktmp)
|
|
140
|
+
cp "$TMP3/font.css" "$TMP5/font.css"
|
|
141
|
+
JSON_OUT=$($NODE "$SLOP_DETECT" --json "$TMP5/font.css" 2>/dev/null || true)
|
|
142
|
+
if echo "$JSON_OUT" | head -1 | grep -qE "^[\{\[]"; then
|
|
143
|
+
pass "--json flag produces JSON-shaped output"
|
|
144
|
+
else
|
|
145
|
+
fail_case "--json output" "first line is not JSON-shaped: '$(echo "$JSON_OUT" | head -c 80)'"
|
|
146
|
+
fi
|
|
147
|
+
|
|
148
|
+
# ── Invocation error: path argument does not exist ───────────────────
|
|
149
|
+
EXIT_CODE=0
|
|
150
|
+
$NODE "$SLOP_DETECT" /nonexistent/path/that/cannot/exist >/dev/null 2>&1 || EXIT_CODE=$?
|
|
151
|
+
if [ "$EXIT_CODE" = "2" ] || [ "$EXIT_CODE" = "0" ]; then
|
|
152
|
+
pass "handles missing path gracefully (exit=$EXIT_CODE — 0=skip, 2=invocation error)"
|
|
153
|
+
else
|
|
154
|
+
fail_case "missing path" "unexpected exit $EXIT_CODE on /nonexistent path"
|
|
155
|
+
fi
|
|
156
|
+
|
|
157
|
+
echo ""
|
|
158
|
+
echo "=== Results: $PASS passed, $FAIL failed ==="
|
|
159
|
+
|
|
160
|
+
[ "$FAIL" = "0" ]
|
|
@@ -1,65 +0,0 @@
|
|
|
1
|
-
# Playwright Visual-Polish Loop — Adversarial Review 2026-05-03
|
|
2
|
-
|
|
3
|
-
## TL;DR
|
|
4
|
-
|
|
5
|
-
**Recommendation:** **NO-SHIP**
|
|
6
|
-
|
|
7
|
-
**Headline finding:** The feature does not exist in the repository. No `skills/qualia-polish-loop/` folder, no pilot results doc, no design notes, no v5.1.0 CHANGELOG entry, no version bump, no commits past the prompt-only commit `8e7d33d`. There is nothing to evaluate against the builder spec. The v5.1 "autonomous visual-polish loop" remains in the state declared at `CHANGELOG.md:280-285` — Deferred.
|
|
8
|
-
|
|
9
|
-
## Gate-by-gate verdict
|
|
10
|
-
|
|
11
|
-
| Gate | Status | Evidence |
|
|
12
|
-
|---|---|---|
|
|
13
|
-
| 1 — Builder claim integrity | **FAIL** | No claims to verify. `docs/playwright-loop-pilot-results.md` does not exist (`ls docs/playwright-loop-*` returns only `builder-prompt.md` + `tester-prompt.md`). `docs/playwright-loop-design-notes.md` does not exist. `git log 8e7d33d..HEAD` returns empty. `package.json:3` still reads `"version": "5.0.0"`. `CHANGELOG.md` has no `[5.1.0]` entry. |
|
|
14
|
-
| 2 — Framework regression | **PASS (degenerate)** | `npm test` reports 14 + 59 + 66 + 101 + 15 = **255 passing, 0 failed** across 5 suites. Pass solely because no code changed. Note: the spec claims "260+ tests"; actual baseline is 255. Spec figure is loose. |
|
|
15
|
-
| 3 — Skill structural validity | **FAIL** | `ls skills/qualia-polish-loop/` returns `No such file or directory`. SKILL.md, REFERENCE.md, `scripts/playwright-capture.mjs`, `scripts/score.mjs` — none exist. Gate cannot proceed. |
|
|
16
|
-
| 4 — Pilot results audit | **FAIL** | `docs/playwright-loop-pilot-results.md` does not exist. Scenario 1 / 2 / 3 unverifiable. No `qpl-N:` commit prefixes anywhere in `git log`. |
|
|
17
|
-
| 5 — Adversarial probes | **N/A — 0 PASS, 0 FAIL** | No artifact to probe. Each of 5a–5h marked INSUFFICIENT EVIDENCE: no skill installed → nothing to invoke → no behavior to observe. Re-run when builder ships. |
|
|
18
|
-
| 6 — Token cost reality | **FAIL** | No iterations executed. Token-budget claim ("≤100K per loop") in `playwright-loop-builder-prompt.md:108` and `:32` is unverifiable. |
|
|
19
|
-
| 7 — Security review | **FAIL** | Spec attack surface (Playwright MCP + Bash + Edit + Write + user-provided URL → shell) has no implementation to audit. Must be re-run post-build. |
|
|
20
|
-
| 8 — Doc accuracy | **FAIL** | No CHANGELOG v5.1.0 entry to verify (`grep -n "v5.1\|polish-loop" CHANGELOG.md` returns only the existing "Deferred to v5.1" section at lines 272/280/288/291). No design-notes doc to verify. |
|
|
21
|
-
|
|
22
|
-
## Critical findings (CRITICAL severity, must-fix before ship)
|
|
23
|
-
|
|
24
|
-
### C1 — Builder produced zero artifacts
|
|
25
|
-
|
|
26
|
-
- **Severity:** CRITICAL — matches Severity Rubric line "feature broken for >50% of users; ... wiring missing (component exists but unreachable)" trivially: nothing exists to wire or reach.
|
|
27
|
-
- **Evidence:**
|
|
28
|
-
- `git log --oneline -30` — most recent commit is `8e7d33d docs(v5.1-prep): Playwright visual-polish loop prompts (builder + reviewer)`. That commit added only the two prompt markdowns.
|
|
29
|
-
- `git status` — clean, branch `feat/env-empty-guard`, no uncommitted work.
|
|
30
|
-
- `ls skills/` — 32 skills, none named `qualia-polish-loop`.
|
|
31
|
-
- `ls docs/` — pilot-results.md and design-notes.md absent.
|
|
32
|
-
- `package.json:3` — `"version": "5.0.0"`.
|
|
33
|
-
- **Impact:** v5.1 cannot ship. Users invoking `/qualia-polish-loop` will hit a routing miss. The spec's success criteria (`playwright-loop-builder-prompt.md:164-173`) — items 1, 2, 3, 4, 5, 6 — all fail.
|
|
34
|
-
- **Action:** Re-run the builder prompt in a fresh session. Verify the agent actually executes (does not silently hallucinate completion).
|
|
35
|
-
|
|
36
|
-
## High findings (HIGH severity, fix in v5.1.1 patch)
|
|
37
|
-
|
|
38
|
-
None applicable — the feature must exist before HIGH-severity behavioral findings can be raised.
|
|
39
|
-
|
|
40
|
-
## Medium findings (MEDIUM severity, v5.2 backlog)
|
|
41
|
-
|
|
42
|
-
### M1 — Builder spec contains a verifiable factual error
|
|
43
|
-
|
|
44
|
-
- **Severity:** MEDIUM — "feature works but missing states; ... contract drift between docs and behavior" applied to the spec itself.
|
|
45
|
-
- **Evidence:** `playwright-loop-builder-prompt.md:9` claims the framework has "260+ tests." `npm test` totals on the current main of `feat/env-empty-guard` give **255**. Off-by-five against the stated baseline. Either tests were lost since the figure was written, or the figure was rounded up.
|
|
46
|
-
- **Impact:** Tester Gate 2 step 1 ("all suites pass with the same count or higher than v5.0.0 baseline (260 tests)") is impossible to satisfy as written. A future builder reading this spec literally will treat 255 as a regression.
|
|
47
|
-
- **Action:** Patch the spec line to `255` or run `git log --all --grep test` to find where the lost 5+ tests went and restore them before v5.1 begins.
|
|
48
|
-
|
|
49
|
-
## What works well (give credit honestly)
|
|
50
|
-
|
|
51
|
-
- **The prompt pair is well-engineered.** `playwright-loop-builder-prompt.md` is concrete, cites `file:line` for every integration point, lists 7 hard constraints with named failure modes, mandates 3 self-test scenarios with quantitative expected outcomes, and explicitly forbids silent workarounds (`docs/playwright-loop-builder-prompt.md:175-181`). This is the rare AI-build prompt that survives adversarial reading.
|
|
52
|
-
- **Tester prompt enforces grounding discipline.** Cites `rules/grounding.md`, mandates `file:line` evidence, prohibits hedging, caps tool budget at 50 calls (`playwright-loop-tester-prompt.md:204-205`). The reviewer-side rigor is in place; the builder-side execution is not.
|
|
53
|
-
- **CHANGELOG is honest about deferred work.** `CHANGELOG.md:280-291` already lists the visual-polish loop as v5.1 deferred, with accurate reasoning. The framework owner's documented intent and the current repo state are consistent — the spec was set up correctly; the build run did not happen.
|
|
54
|
-
- **Framework regression test still green.** 255/255 passing on the baseline (`npm test`). The reviewer harness is healthy and ready when the build lands.
|
|
55
|
-
|
|
56
|
-
## Recommended next steps
|
|
57
|
-
|
|
58
|
-
1. **Re-spawn the builder agent in a fresh session and verify it actually writes files.** The most likely failure mode is: builder session died, was killed, or hallucinated DONE without committing. Watch the session log; require `git log` output proving commits exist before declaring DONE.
|
|
59
|
-
2. **Patch the test-baseline figure in `playwright-loop-builder-prompt.md:9`** — change "260+ tests" to "255 tests" or audit the git history for missing tests. This unblocks Gate 2.
|
|
60
|
-
3. **Defer this review.** When the builder produces real artifacts, re-run `playwright-loop-tester-prompt.md` against them. The current review is a no-op except for documenting that the build run did not occur.
|
|
61
|
-
4. **Consider a builder pre-flight check.** Add a heartbeat to the builder agent: write `docs/.qpl-builder-started` when the session begins and `docs/.qpl-builder-progress` after every major file. The reviewer can then distinguish "builder didn't run" from "builder ran and failed silently" — a real failure mode given the spec's complexity (Playwright MCP install + Vercel preview deploy + 3 self-tests + commit discipline + slop-detect gate, all in one session).
|
|
62
|
-
|
|
63
|
-
---
|
|
64
|
-
|
|
65
|
-
**Reviewer note (honesty over signoff, per `playwright-loop-tester-prompt.md:213`):** This review took ~10 tool calls because the absence of artifacts halted Gates 3–8 immediately. The remaining 40 calls of budget are reserved for the next review pass when real artifacts exist. No CRITICAL findings beyond C1 are surfaceable at this stage; once the loop ships, expect Gate 5 (adversarial probes) and Gate 7 (security) to do most of the work.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|