@graypark/loophaus 3.3.0 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/loophaus.mjs CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
  // loophaus CLI — install, status, stats, uninstall
3
3
 
4
- import { resolve, dirname } from "node:path";
4
+ import { resolve, dirname, join } from "node:path";
5
5
  import { fileURLToPath } from "node:url";
6
6
  import { access } from "node:fs/promises";
7
7
 
@@ -46,6 +46,7 @@ Usage:
46
46
  npx @graypark/loophaus loops
47
47
  npx @graypark/loophaus worktree <create|remove|list>
48
48
  npx @graypark/loophaus parallel <prd.json> [--count N] [--base branch]
49
+ npx @graypark/loophaus quality [--story US-001]
49
50
  npx @graypark/loophaus sessions
50
51
  npx @graypark/loophaus resume <session-id>
51
52
  npx @graypark/loophaus --version
@@ -448,6 +449,48 @@ async function runParallelCmd() {
448
449
  }
449
450
  }
450
451
 
452
/**
 * `quality` subcommand.
 *
 * With `--story <id>`: evaluates that story in the current working directory
 * and prints its score, grade and a per-criterion bar chart.
 * Without `--story`: prints the best recorded attempt for every story found
 * by `readResults`.
 *
 * @returns {Promise<void>}
 */
async function runQuality() {
  const storyId = getFlag("--story");
  const cwd = process.cwd();

  if (storyId) {
    const { evaluateStory } = await import("../core/quality-scorer.mjs");
    const { read } = await import("../store/state-store.mjs");
    const state = await read(cwd);
    // Work on a copy: the auto-detected default below must not leak into the
    // loaded state object. `?.` tolerates a missing state (assumption: `read`
    // may return null/undefined when no state exists — confirm against store).
    const config = { ...(state?.qualityConfig ?? {}) };

    if (!config.typecheckCommand) {
      try {
        await access(join(cwd, "tsconfig.json"));
        config.typecheckCommand = "npx tsc --noEmit";
      } catch {
        // tsconfig.json is not accessible: leave the typecheck criterion out.
      }
    }

    const result = await evaluateStory(storyId, cwd, config);
    console.log(`Quality: ${storyId}`);
    console.log("\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500");
    console.log(`Score: ${result.score}/100 (${result.grade})`);
    for (const [k, v] of Object.entries(result.breakdown)) {
      // String.prototype.repeat truncates fractions, so round once to keep the
      // bar exactly 10 cells wide even for fractional criterion values.
      const filled = Math.max(0, Math.min(10, Math.round(v)));
      const bar = "\u2588".repeat(filled) + "\u2591".repeat(10 - filled);
      console.log(` ${k.padEnd(10)} ${bar} ${v}/10`);
    }
    return;
  }

  const { readResults } = await import("../core/quality-scorer.mjs");
  const results = await readResults(cwd);
  if (results.length === 0) {
    console.log("No quality results yet. Run /loop-plan first.");
    return;
  }

  console.log("Quality Results");
  console.log("\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550");

  // A Map keeps insertion order and is safe for arbitrary story ids
  // (a plain object would misbehave on keys such as "__proto__").
  const byStory = new Map();
  for (const r of results) {
    const attempts = byStory.get(r.storyId);
    if (attempts) {
      attempts.push(r);
    } else {
      byStory.set(r.storyId, [r]);
    }
  }

  for (const [sid, attempts] of byStory) {
    // Each group holds at least one attempt, so reduce needs no seed value.
    const best = attempts.reduce((a, b) => (a.score > b.score ? a : b));
    const icon = best.status === "keep" ? "\u2713" : best.status === "discard" ? "\u2717" : "~";
    console.log(` ${icon} ${sid} score: ${best.score} (${attempts.length} attempts)`);
  }
}
493
+
451
494
  try {
452
495
  switch (command) {
453
496
  case "install": await runInstall(); break;
@@ -460,6 +503,7 @@ try {
460
503
  case "compare": await runCompare(); break;
461
504
  case "worktree": await runWorktree(); break;
462
505
  case "parallel": await runParallelCmd(); break;
506
+ case "quality": await runQuality(); break;
463
507
  case "sessions": await runSessions(); break;
464
508
  case "resume": await runResume(); break;
465
509
  default:
@@ -172,7 +172,59 @@ Single loop, no worktrees:
172
172
  3. Each iteration: implement one story, verify, commit, update prd.json.
173
173
  4. Output `<promise>TASK COMPLETE</promise>` when ALL stories pass.
174
174
 
175
- ## Phase 5: Summary Report
175
+ ## Phase 5: Evaluate
176
+
177
+ After all stories are implemented (parallel or sequential), evaluate each:
178
+
179
+ For each story in prd.json:
180
+ 1. Run testCommand if defined
181
+ 2. Run typecheck if project has tsconfig.json: `npx tsc --noEmit`
182
+ 3. Run lint if project has eslint config: `npx eslint . --quiet`
183
+ 4. Check .loophaus/verify.sh if exists
184
+ 5. Analyze git diff size
185
+
186
+ Score each story 0-100. Record in `.loophaus/results.tsv`.
187
+
188
+ Display quality dashboard:
189
+ ```
190
+ Quality Evaluation
191
+ ──────────────────
192
+ US-001 Add login API score: 65 (D) <- needs refinement
193
+ US-002 Add auth middleware score: 92 (A) ✓
194
+ US-003 Add login UI score: 45 (F) <- needs refinement
195
+
196
+ Overall: 67/100 — threshold: 80
197
+ Stories needing refinement: 2
198
+ ```
199
+
200
+ ## Phase 6: Refine Loop (autoresearch pattern)
201
+
202
+ For each story below the quality threshold (default: 80):
203
+
204
+ LOOP (max 3 attempts per story):
205
+ 1. Git checkpoint: `git add -A && git commit -m "checkpoint: <story-id> attempt <N>"`
206
+ 2. Read the quality feedback (which criteria failed, error messages)
207
+ 3. Re-implement with a different approach, focusing on weak areas
208
+ 4. Re-evaluate (same criteria as Phase 5)
209
+ 5. If score improved -> KEEP (advance the commit)
210
+ If the score is unchanged or worse -> DISCARD (`git reset --hard` back to the checkpoint commit)
211
+ 6. Record attempt in .loophaus/results.tsv
212
+ 7. If score >= threshold -> DONE with this story
213
+ If max attempts reached -> move on (best-effort)
214
+
215
+ After all refinements:
216
+ ```
217
+ Refinement Complete
218
+ ───────────────────
219
+ US-001 65 -> 82 (B) ✓ (2 attempts)
220
+ US-003 45 -> 78 (C) (3 attempts, best effort)
221
+
222
+ Overall: 84/100 — PASS
223
+ ```
224
+
225
+ CRITICAL: The refine loop uses git reset --hard to discard bad attempts. This is the autoresearch pattern — safe because we always checkpoint first.
226
+
227
+ ## Phase 7: Summary Report
176
228
 
177
229
  After completion (parallel or sequential), output:
178
230
 
package/core/events.mjs CHANGED
@@ -15,6 +15,10 @@ export const EventType = {
15
15
  CHECKPOINT: "checkpoint",
16
16
  ERROR: "error",
17
17
  STATE_CHANGE: "state_change",
18
+ QUALITY_SCORE: "quality_score",
19
+ REFINE_ATTEMPT: "refine_attempt",
20
+ REFINE_KEEP: "refine_keep",
21
+ REFINE_DISCARD: "refine_discard",
18
22
  };
19
23
 
20
24
  export function filterByType(events, type) {
@@ -0,0 +1,136 @@
1
+ // core/quality-scorer.mjs
2
+ // Quality scoring for story implementations (autoresearch pattern: val_bpb -> quality score)
3
+
4
+ import { execFile } from "node:child_process";
5
+ import { promisify } from "node:util";
6
+ import { readFile, stat } from "node:fs/promises";
7
+ import { join } from "node:path";
8
+
9
+ const execFileAsync = promisify(execFile);
10
+
11
// Scoring criteria. `weight` is the criterion's share in the weighted average;
// `max` is the upper bound a criterion value is clamped to.
const CRITERIA = {
  tests: { weight: 3, max: 10 },
  typecheck: { weight: 2, max: 10 },
  lint: { weight: 1, max: 10 },
  verify: { weight: 2, max: 10 },
  diff: { weight: 1, max: 10 },
  custom: { weight: 1, max: 10 },
};

/**
 * Combines per-criterion results into a 0-100 score and a letter grade.
 *
 * Only criteria present in `results` (neither undefined nor null) take part,
 * so a missing criterion does not lower the score.
 *
 * @param {Object<string, number|{score?: number}>|null|undefined} results
 *   Criterion name -> value, either a number or an object with a `score`.
 * @returns {{score: number, grade: string, breakdown: Object<string, number>}}
 *   `score` is the rounded weighted percentage (0 when nothing was measured),
 *   `grade` is A (>=90), B (>=80), C (>=70), D (>=60) or F, and `breakdown`
 *   holds the clamped value of every criterion that was counted.
 */
export function scoreStory(results) {
  let totalWeight = 0;
  let weightedSum = 0;
  const breakdown = {};

  for (const [key, { weight, max }] of Object.entries(CRITERIA)) {
    const raw = results?.[key];
    if (raw === undefined || raw === null) continue;

    const value = Number(typeof raw === "number" ? raw : (raw.score ?? 0));
    // NaN would propagate through min/max and poison the overall score;
    // count an unusable value as the lowest possible result instead.
    const clamped = Number.isNaN(value) ? 0 : Math.max(0, Math.min(max, value));

    breakdown[key] = clamped;
    weightedSum += clamped * weight;
    totalWeight += max * weight;
  }

  const score = totalWeight > 0 ? Math.round((weightedSum / totalWeight) * 100) : 0;
  const grade = score >= 90 ? "A" : score >= 80 ? "B" : score >= 70 ? "C" : score >= 60 ? "D" : "F";

  return { score, grade, breakdown };
}
39
+
40
/**
 * Extracts the number of changed lines from `git diff --stat` output.
 *
 * The summary line only mentions insertions and/or deletions that actually
 * occurred, so each count is matched independently.
 *
 * @param {string} statOutput Raw stdout of `git diff --stat`.
 * @returns {number|null} Insertions plus deletions, or null when the last
 *   line carries neither count (for example an empty diff).
 */
function countChangedLines(statOutput) {
  const lines = statOutput.trim().split("\n");
  const summary = lines[lines.length - 1] || "";
  const insertions = summary.match(/(\d+) insertion/);
  const deletions = summary.match(/(\d+) deletion/);
  if (!insertions && !deletions) return null;

  const added = insertions ? Number.parseInt(insertions[1], 10) : 0;
  const removed = deletions ? Number.parseInt(deletions[1], 10) : 0;
  return added + removed;
}

/**
 * Runs the configured checks for one story and scores the outcome.
 *
 * Checks (each optional, skipped when not configured or unavailable):
 * - `config.testCommand`      -> `tests`: 10 on success, 0 on failure.
 * - `config.typecheckCommand` -> `typecheck`: 10 minus the number of stdout
 *   lines containing "error", floored at 0.
 * - `config.lintCommand`      -> `lint`: 10 minus the number of stdout lines
 *   containing "warning" or "error", floored at 0.
 * - `config.verifyScript`     -> `verify`: 10 on success, 0 on failure.
 * - `git diff --stat HEAD~1`  -> `diff`: smaller changes score higher.
 * - `<cwd>/.loophaus/quality.mjs` exporting `evaluate(storyId, cwd)` ->
 *   `custom`: the returned number, or the `score` of the returned object.
 *
 * Commands are executed through `sh -c` in `cwd`.
 *
 * @param {string} storyId Story identifier, passed to the custom evaluator.
 * @param {string} cwd Directory in which all checks run.
 * @param {object} [config] Command configuration (see above).
 * @returns {Promise<{storyId: string, results: object, score: number, grade: string, breakdown: object}>}
 */
export async function evaluateStory(storyId, cwd, config = {}) {
  // The default parameter only covers `undefined`; tolerate an explicit null.
  const settings = config ?? {};
  const results = {};

  if (settings.testCommand) {
    try {
      await execFileAsync("sh", ["-c", settings.testCommand], { cwd, timeout: 120_000 });
      results.tests = 10;
    } catch {
      results.tests = 0;
    }
  }

  if (settings.typecheckCommand) {
    try {
      await execFileAsync("sh", ["-c", settings.typecheckCommand], { cwd, timeout: 60_000 });
      results.typecheck = 10;
    } catch (err) {
      const errorCount = (err.stdout || "").split("\n").filter((l) => l.includes("error")).length;
      results.typecheck = Math.max(0, 10 - errorCount);
    }
  }

  if (settings.lintCommand) {
    try {
      await execFileAsync("sh", ["-c", settings.lintCommand], { cwd, timeout: 60_000 });
      results.lint = 10;
    } catch (err) {
      const problems = (err.stdout || "")
        .split("\n")
        .filter((l) => l.includes("warning") || l.includes("error")).length;
      results.lint = Math.max(0, 10 - problems);
    }
  }

  if (settings.verifyScript) {
    try {
      await execFileAsync("sh", ["-c", settings.verifyScript], { cwd, timeout: 60_000 });
      results.verify = 10;
    } catch {
      results.verify = 0;
    }
  }

  try {
    const { stdout } = await execFileAsync("git", ["diff", "--stat", "HEAD~1"], { cwd, timeout: 10_000 });
    const total = countChangedLines(stdout);
    if (total !== null) {
      results.diff = total < 100 ? 10 : total < 300 ? 8 : total < 500 ? 6 : total < 1000 ? 4 : 2;
    }
  } catch {
    // No git diff available (not a repository, no parent commit, ...):
    // leave the diff criterion out.
  }

  const customPath = join(cwd, ".loophaus", "quality.mjs");
  try {
    await stat(customPath);
    // import() takes a URL-like specifier; a raw absolute Windows path is
    // rejected, so convert the filesystem path to a file: URL first.
    const { pathToFileURL } = await import("node:url");
    const mod = await import(pathToFileURL(customPath).href);
    if (typeof mod.evaluate === "function") {
      const customResult = await mod.evaluate(storyId, cwd);
      results.custom = typeof customResult === "number" ? customResult : (customResult?.score ?? 0);
    }
  } catch {
    // Best effort: a missing or failing custom evaluator leaves the custom
    // criterion out rather than aborting the evaluation.
  }

  return { storyId, results, ...scoreStory(results) };
}
108
+
109
/**
 * Appends one attempt record to `<cwd>/.loophaus/results.tsv`, creating the
 * directory and the header row when the file does not exist yet.
 *
 * @param {{storyId: *, attempt: *, score: *, status: *, description?: *, commit?: *}} entry
 *   Record to append; missing fields are written as empty cells.
 * @param {string} [cwd] Project directory; defaults to `process.cwd()`.
 * @returns {Promise<void>}
 */
export async function logResult(entry, cwd) {
  const { appendFile, mkdir } = await import("node:fs/promises");
  const dir = join(cwd || process.cwd(), ".loophaus");
  const tsvPath = join(dir, "results.tsv");
  await mkdir(dir, { recursive: true });

  let header = "";
  try {
    await stat(tsvPath);
  } catch {
    header = "story_id\tattempt\tscore\tstatus\tdescription\tcommit\n";
  }

  // Tabs and line breaks inside a value would shift columns or split the row,
  // so collapse them to a single space; null/undefined become an empty cell.
  const cell = (value) => String(value ?? "").replace(/[\t\r\n]+/g, " ");
  const row = [entry.storyId, entry.attempt, entry.score, entry.status, entry.description, entry.commit]
    .map(cell)
    .join("\t");

  // Header and first row go out in one append so they cannot be separated.
  await appendFile(tsvPath, `${header}${row}\n`, "utf-8");
}
123
+
124
/**
 * Reads every recorded attempt from `<cwd>/.loophaus/results.tsv`.
 *
 * The first line is treated as the header and skipped. Blank lines are
 * ignored and both LF and CRLF line endings are accepted.
 *
 * @param {string} [cwd] Project directory; defaults to `process.cwd()`.
 * @returns {Promise<Array<{storyId: string, attempt: number, score: number, status: string, description: string, commit: string}>>}
 *   Parsed rows in file order, or an empty array when the file cannot be read.
 */
export async function readResults(cwd) {
  const tsvPath = join(cwd || process.cwd(), ".loophaus", "results.tsv");

  let raw;
  try {
    raw = await readFile(tsvPath, "utf-8");
  } catch {
    // Best effort: no readable results file means no results yet.
    return [];
  }

  // The text is deliberately not trimmed as a whole: that would strip the
  // trailing tabs of the last row and drop its empty columns.
  return raw
    .split(/\r?\n/)
    .slice(1)
    .filter((line) => line.trim() !== "")
    .map((line) => {
      const [storyId, attempt, score, status, description = "", commit = ""] = line.split("\t");
      return {
        storyId,
        attempt: Number.parseInt(attempt, 10),
        score: Number.parseInt(score, 10),
        status,
        description,
        commit,
      };
    });
}
@@ -0,0 +1,29 @@
1
+ // core/refine-loop.mjs
2
+ // autoresearch keep/discard pattern for code quality improvement
3
+
4
/**
 * Decides whether a refinement attempt is kept.
 *
 * @param {number|string} newScore Score of the new attempt.
 * @param {number|string} baselineScore Score the attempt has to beat.
 * @returns {boolean} True only when the new score is strictly greater; a tie
 *   or an unparsable score yields false.
 */
export function shouldKeep(newScore, baselineScore) {
  // Force a numeric comparison: two strings (e.g. values read from a TSV)
  // would otherwise compare lexicographically, making "9" > "10" true.
  return Number(newScore) > Number(baselineScore);
}
7
+
8
/**
 * Builds the textual feedback prompt for the next refinement attempt.
 *
 * @param {{storyId: string, score: number, grade: string, breakdown?: Object<string, number>}} evaluation
 *   Evaluation of the current implementation; criteria scoring below 7 are
 *   listed as weak areas.
 * @param {Array<{attempt: number, score: number, status: string}>|null} [previousAttempts]
 *   Earlier attempts to summarise; null or omitted means none.
 * @returns {string} Newline-separated prompt without a trailing newline.
 */
export function generateFeedback(evaluation, previousAttempts = []) {
  const { storyId, score, grade, breakdown } = evaluation;

  // An evaluation without a breakdown simply has no weak areas to report.
  const weakAreas = Object.entries(breakdown ?? {})
    .filter(([, value]) => value < 7)
    .map(([name, value]) => `${name}: ${value}/10`);
  // The default parameter only covers `undefined`; tolerate an explicit null.
  const history = (previousAttempts ?? []).map((a) => `attempt ${a.attempt}: ${a.score} (${a.status})`);

  const parts = [`Story ${storyId} quality: ${score}/100 (${grade}).`];
  if (weakAreas.length > 0) {
    parts.push(`Weak areas: ${weakAreas.join(", ")}.`);
  }
  if (history.length > 0) {
    parts.push(`Previous attempts: ${history.join(", ")}.`);
  }
  parts.push("Improve the implementation. Focus on the weak areas. Try a different approach if the same strategy keeps failing.");
  return parts.join("\n");
}
24
+
25
/**
 * Selects the evaluations that fall below the quality threshold.
 *
 * @param {Array<{score: number}>|null|undefined} evaluations Story evaluations.
 * @param {number} [threshold=80] Minimum acceptable score.
 * @returns {Array<{score: number}>} A new array of the evaluations scoring
 *   below `threshold`, lowest score first; the input is not modified.
 */
export function identifyRefinementTargets(evaluations, threshold = 80) {
  // No evaluations (null/undefined) means there is nothing to refine.
  const belowThreshold = (evaluations ?? []).filter((e) => e.score < threshold);
  // `filter` returned a fresh array, so sorting in place is safe.
  return belowThreshold.sort((a, b) => a.score - b.score);
}
package/core/validate.mjs CHANGED
@@ -14,6 +14,9 @@ const STATE_OPTIONAL = {
14
14
  verifyScript: "string",
15
15
  startedAt: "string",
16
16
  cost: "object",
17
+ qualityThreshold: "number",
18
+ maxRefineAttempts: "number",
19
+ qualityConfig: "object",
17
20
  };
18
21
 
19
22
  export function validateState(obj) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@graypark/loophaus",
3
- "version": "3.3.0",
3
+ "version": "3.4.0",
4
4
  "type": "module",
5
5
  "description": "loophaus — Control plane for coding agents. Iterative dev loops with multi-agent orchestration.",
6
6
  "license": "MIT",
@@ -112,6 +112,14 @@ Score >= 3: parallel mode (worktrees by group). Score < 3: sequential mode.
112
112
  ## Phase 4B: Sequential Execution (score < 3)
113
113
  Create \`.loophaus/state.json\` and work through stories one at a time.
114
114
 
115
+ ## Phase 5: Evaluate
116
+ Score each story 0-100 (tests, typecheck, lint, verify, diff size). Record in \`.loophaus/results.tsv\`.
117
+
118
+ ## Phase 6: Refine Loop (autoresearch pattern)
119
+ For stories below quality threshold (default 80), loop up to 3 attempts:
120
+ 1. Checkpoint, 2. Re-implement weak areas, 3. Re-evaluate.
121
+ Keep if improved, discard (git reset) if not. Best-effort after max attempts.
122
+
115
123
  ## Rules
116
124
  - Present PRD for user approval before execution
117
125
  - Show parallelism score and recommendation
@@ -84,6 +84,14 @@ Score >= 3: parallel (worktrees). Score < 3: sequential.
84
84
  Parallel: create worktrees per group, distribute stories, run simultaneously, merge back.
85
85
  Sequential: single loop through stories in order.
86
86
 
87
+ ## Phase 5: Evaluate
88
+ Score each story 0-100 (tests, typecheck, lint, verify, diff size). Record in \`.loophaus/results.tsv\`.
89
+
90
+ ## Phase 6: Refine Loop (autoresearch pattern)
91
+ For stories below quality threshold (default 80), loop up to 3 attempts:
92
+ 1. Checkpoint, 2. Re-implement weak areas, 3. Re-evaluate.
93
+ Keep if improved, discard (git reset) if not. Best-effort after max attempts.
94
+
87
95
  Rules: present PRD for approval, show parallelism score, stop on merge conflicts.
88
96
  `,
89
97
  },
@@ -30,6 +30,8 @@ Ask **concise questions** for missing items. Max 3-5 per round, one round only.
30
30
  | **Constraints** | Must not break existing tests? Library restrictions? |
31
31
  | **When stuck** | Document? Skip? Suggest alternative? |
32
32
  | **Parallelism potential** | Multiple services? Independent file groups? |
33
+ | **Quality verification** | Which commands verify the work? (e.g. `npm test`, `npx tsc --noEmit`) |
34
+ | **Quality threshold** | What minimum quality score must each story reach? (default: 80/100) |
33
35
 
34
36
  ## Phase Design
35
37
 
@@ -9,6 +9,9 @@ const DEFAULT_STATE = {
9
9
  maxIterations: 20,
10
10
  currentIteration: 0,
11
11
  sessionId: "",
12
+ qualityThreshold: 80,
13
+ maxRefineAttempts: 3,
14
+ qualityConfig: null,
12
15
  };
13
16
 
14
17
  export function getStatePath(cwd, name) {