ralphctl 0.9.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.mjs +4421 -3168
- package/dist/manifest.json +6 -2
- package/dist/prompts/implement/template.md +2 -0
- package/dist/skills/ralphctl-code-review-and-quality/SKILL.md +250 -0
- package/dist/skills/ralphctl-debugging-and-error-recovery/SKILL.md +191 -0
- package/dist/skills/ralphctl-surgical-simplicity/SKILL.md +65 -0
- package/dist/skills/ralphctl-test-driven-development/SKILL.md +343 -0
- package/package.json +3 -1
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ralphctl-test-driven-development
|
|
3
|
+
description: Execute-phase skill — write the failing test before the code that makes it pass; reproduce bugs with a test before fixing them. Use for any logic change, bug fix, or behavioural modification.
|
|
4
|
+
license: MIT
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Test-Driven Development
|
|
8
|
+
|
|
9
|
+
> Concept from [Addy Osmani — "Test-Driven Development"](https://github.com/addyosmani/agent-skills)
|
|
10
|
+
> (agent-skills, MIT). Adapted for ralphctl's execute phase.
|
|
11
|
+
|
|
12
|
+
Write a failing test before writing the code that makes it pass. For bug fixes, reproduce the bug with a
|
|
13
|
+
test before attempting a fix. Tests are proof — "seems right" is not done. A codebase with good tests is an
|
|
14
|
+
AI agent's superpower; a codebase without tests is a liability.
|
|
15
|
+
|
|
16
|
+
## When this applies
|
|
17
|
+
|
|
18
|
+
- **Execute** — any new logic, bug fix, or behavioural change. Follow the RED→GREEN→REFACTOR cycle for each
|
|
19
|
+
unit of work. Run the project's narrow check after each step; emit `<task-complete>` once the task's
|
|
20
|
+
acceptance criteria are met. The harness runs the post-task verify gate — you do not own that verdict.
|
|
21
|
+
|
|
22
|
+
**When NOT to use:** Pure configuration changes, documentation updates, or static content changes with no
|
|
23
|
+
behavioural impact.
|
|
24
|
+
|
|
25
|
+
## The TDD Cycle
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
    RED                GREEN              REFACTOR
|
|
29
|
+
 Write a test    Write minimal code    Clean up the
|
|
30
|
+
 that fails  ──→  to make it pass  ──→  implementation  ──→  (repeat)
|
|
31
|
+
      │                  │                    │
|
|
32
|
+
      ▼                  ▼                    ▼
|
|
33
|
+
  Test FAILS        Test PASSES         Tests still PASS
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
### Step 1: RED — Write a Failing Test
|
|
37
|
+
|
|
38
|
+
Write the test first. It must fail. A test that passes immediately proves nothing.
|
|
39
|
+
|
|
40
|
+
```typescript
|
|
41
|
+
// RED: This test fails because createTask doesn't exist yet
|
|
42
|
+
describe('TaskService', () => {
|
|
43
|
+
it('creates a task with title and default status', async () => {
|
|
44
|
+
const task = await taskService.createTask({ title: 'Buy groceries' });
|
|
45
|
+
|
|
46
|
+
expect(task.id).toBeDefined();
|
|
47
|
+
expect(task.title).toBe('Buy groceries');
|
|
48
|
+
expect(task.status).toBe('pending');
|
|
49
|
+
expect(task.createdAt).toBeInstanceOf(Date);
|
|
50
|
+
});
|
|
51
|
+
});
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Step 2: GREEN — Make It Pass
|
|
55
|
+
|
|
56
|
+
Write the minimum code to make the test pass. Do not over-engineer:
|
|
57
|
+
|
|
58
|
+
```typescript
|
|
59
|
+
// GREEN: Minimal implementation
|
|
60
|
+
export async function createTask(input: { title: string }): Promise<Task> {
|
|
61
|
+
const task = {
|
|
62
|
+
id: generateId(),
|
|
63
|
+
title: input.title,
|
|
64
|
+
status: 'pending' as const,
|
|
65
|
+
createdAt: new Date(),
|
|
66
|
+
};
|
|
67
|
+
await db.tasks.insert(task);
|
|
68
|
+
return task;
|
|
69
|
+
}
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Step 3: REFACTOR — Clean Up
|
|
73
|
+
|
|
74
|
+
With tests green, improve the code without changing behaviour:
|
|
75
|
+
|
|
76
|
+
- Extract shared logic
|
|
77
|
+
- Improve naming
|
|
78
|
+
- Remove duplication
|
|
79
|
+
- Optimise if necessary
|
|
80
|
+
|
|
81
|
+
Run the project's narrow check after every refactor step to confirm nothing broke.
|
|
82
|
+
|
|
83
|
+
## The Prove-It Pattern (Bug Fixes)
|
|
84
|
+
|
|
85
|
+
When a bug is reported, **do not start by trying to fix it.** Start by writing a test that reproduces it.
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
Bug report arrives
|
|
89
|
+
│
|
|
90
|
+
▼
|
|
91
|
+
Write a test that demonstrates the bug
|
|
92
|
+
│
|
|
93
|
+
▼
|
|
94
|
+
Test FAILS (confirming the bug exists)
|
|
95
|
+
│
|
|
96
|
+
▼
|
|
97
|
+
Implement the fix
|
|
98
|
+
│
|
|
99
|
+
▼
|
|
100
|
+
Test PASSES (proving the fix works)
|
|
101
|
+
│
|
|
102
|
+
▼
|
|
103
|
+
Run the project's narrow check (no regressions in the affected scope)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**Example:**
|
|
107
|
+
|
|
108
|
+
```typescript
|
|
109
|
+
// Bug: "Completing a task doesn't update the completedAt timestamp"
|
|
110
|
+
|
|
111
|
+
// Step 1: Write the reproduction test (it should FAIL)
|
|
112
|
+
it('sets completedAt when task is completed', async () => {
|
|
113
|
+
const task = await taskService.createTask({ title: 'Test' });
|
|
114
|
+
const completed = await taskService.completeTask(task.id);
|
|
115
|
+
|
|
116
|
+
expect(completed.status).toBe('completed');
|
|
117
|
+
expect(completed.completedAt).toBeInstanceOf(Date); // This fails → bug confirmed
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
// Step 2: Fix the bug
|
|
121
|
+
export async function completeTask(id: string): Promise<Task> {
|
|
122
|
+
return db.tasks.update(id, {
|
|
123
|
+
status: 'completed',
|
|
124
|
+
completedAt: new Date(), // This was missing
|
|
125
|
+
});
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
// Step 3: Test passes → bug fixed, regression guarded
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## The Test Pyramid
|
|
132
|
+
|
|
133
|
+
Invest testing effort according to the pyramid — most tests should be small and fast, with progressively
|
|
134
|
+
fewer tests at higher levels:
|
|
135
|
+
|
|
136
|
+
```
|
|
137
|
+
          ╱╲
|
|
138
|
+
         ╱  ╲         E2E Tests (~5%)
|
|
139
|
+
        ╱    ╲        Full user flows, real system
|
|
140
|
+
       ╱──────╲
|
|
141
|
+
      ╱        ╲      Integration Tests (~15%)
|
|
142
|
+
     ╱          ╲     Component interactions, API boundaries
|
|
143
|
+
    ╱────────────╲
|
|
144
|
+
   ╱              ╲   Unit Tests (~80%)
|
|
145
|
+
  ╱                ╲  Pure logic, isolated, milliseconds each
|
|
146
|
+
 ╱──────────────────╲
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
**The Beyoncé Rule:** If you liked it, you should have put a test on it. Infrastructure changes,
|
|
150
|
+
refactoring, and migrations are not responsible for catching your bugs — your tests are. If a change
|
|
151
|
+
breaks your code and you did not have a test for it, that is on you.
|
|
152
|
+
|
|
153
|
+
### Test Sizes (Resource Model)
|
|
154
|
+
|
|
155
|
+
Beyond the pyramid levels, classify tests by what resources they consume:
|
|
156
|
+
|
|
157
|
+
| Size | Constraints | Speed | Example |
|
|
158
|
+
| ---------- | ------------------------------------------------------ | ------------ | --------------------------------------- |
|
|
159
|
+
| **Small** | Single process, no I/O, no network, no database | Milliseconds | Pure function tests, data transforms |
|
|
160
|
+
| **Medium** | Multi-process OK, localhost only, no external services | Seconds | API tests with test DB, component tests |
|
|
161
|
+
| **Large** | Multi-machine OK, external services allowed | Minutes | E2E tests, performance benchmarks |
|
|
162
|
+
|
|
163
|
+
Small tests should make up the vast majority of your suite. They are fast, reliable, and easy to debug
|
|
164
|
+
when they fail.
|
|
165
|
+
|
|
166
|
+
### Decision Guide
|
|
167
|
+
|
|
168
|
+
```
|
|
169
|
+
Is it pure logic with no side effects?
|
|
170
|
+
→ Unit test (small)
|
|
171
|
+
|
|
172
|
+
Does it cross a boundary (API, database, file system)?
|
|
173
|
+
→ Integration test (medium)
|
|
174
|
+
|
|
175
|
+
Is it a critical user flow that must work end-to-end?
|
|
176
|
+
→ E2E test (large) — limit these to critical paths
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
## Writing Good Tests
|
|
180
|
+
|
|
181
|
+
### Test State, Not Interactions
|
|
182
|
+
|
|
183
|
+
Assert on the _outcome_ of an operation, not on which methods were called internally. Tests that verify
|
|
184
|
+
method call sequences break when you refactor, even if the behaviour is unchanged.
|
|
185
|
+
|
|
186
|
+
```typescript
|
|
187
|
+
// Good: Tests what the function does (state-based)
|
|
188
|
+
it('returns tasks sorted by creation date, newest first', async () => {
|
|
189
|
+
const tasks = await listTasks({ sortBy: 'createdAt', sortOrder: 'desc' });
|
|
190
|
+
expect(tasks[0].createdAt.getTime()).toBeGreaterThan(tasks[1].createdAt.getTime());
|
|
191
|
+
});
|
|
192
|
+
|
|
193
|
+
// Bad: Tests how the function works internally (interaction-based)
|
|
194
|
+
it('calls db.query with ORDER BY created_at DESC', async () => {
|
|
195
|
+
await listTasks({ sortBy: 'createdAt', sortOrder: 'desc' });
|
|
196
|
+
expect(db.query).toHaveBeenCalledWith(expect.stringContaining('ORDER BY created_at DESC'));
|
|
197
|
+
});
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
### DAMP Over DRY in Tests
|
|
201
|
+
|
|
202
|
+
In production code, DRY (Don't Repeat Yourself) is usually right. In tests, **DAMP (Descriptive And
|
|
203
|
+
Meaningful Phrases)** is better. A test should read like a specification — each test should tell a
|
|
204
|
+
complete story without requiring the reader to trace through shared helpers.
|
|
205
|
+
|
|
206
|
+
```typescript
|
|
207
|
+
// DAMP: Each test is self-contained and readable
|
|
208
|
+
it('rejects tasks with empty titles', () => {
|
|
209
|
+
const input = { title: '', assignee: 'user-1' };
|
|
210
|
+
expect(() => createTask(input)).toThrow('Title is required');
|
|
211
|
+
});
|
|
212
|
+
|
|
213
|
+
it('trims whitespace from titles', () => {
|
|
214
|
+
const input = { title: ' Buy groceries ', assignee: 'user-1' };
|
|
215
|
+
const task = createTask(input);
|
|
216
|
+
expect(task.title).toBe('Buy groceries');
|
|
217
|
+
});
|
|
218
|
+
|
|
219
|
+
// Over-DRY: Shared setup obscures what each test actually verifies
|
|
220
|
+
// (Do not do this just to avoid repeating the input shape)
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
Duplication in tests is acceptable when it makes each test independently understandable.
|
|
224
|
+
|
|
225
|
+
### Prefer Real Implementations Over Mocks
|
|
226
|
+
|
|
227
|
+
Use the simplest test double that gets the job done. The more your tests use real code, the more
|
|
228
|
+
confidence they provide.
|
|
229
|
+
|
|
230
|
+
```
|
|
231
|
+
Preference order (most to least preferred):
|
|
232
|
+
1. Real implementation → Highest confidence, catches real bugs
|
|
233
|
+
2. Fake → In-memory version of a dependency (e.g., fake DB)
|
|
234
|
+
3. Stub → Returns canned data, no behaviour
|
|
235
|
+
4. Mock (interaction) → Verifies method calls — use sparingly
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
Use mocks only when the real implementation is too slow, non-deterministic, or has side effects you
|
|
239
|
+
cannot control (external APIs, email sending). Over-mocking creates tests that pass while production
|
|
240
|
+
breaks.
|
|
241
|
+
|
|
242
|
+
### Use the Arrange-Act-Assert Pattern
|
|
243
|
+
|
|
244
|
+
```typescript
|
|
245
|
+
it('marks overdue tasks when deadline has passed', () => {
|
|
246
|
+
// Arrange: Set up the test scenario
|
|
247
|
+
const task = createTask({
|
|
248
|
+
title: 'Test',
|
|
249
|
+
deadline: new Date('2025-01-01'),
|
|
250
|
+
});
|
|
251
|
+
|
|
252
|
+
// Act: Perform the action being tested
|
|
253
|
+
const result = checkOverdue(task, new Date('2025-01-02'));
|
|
254
|
+
|
|
255
|
+
// Assert: Verify the outcome
|
|
256
|
+
expect(result.isOverdue).toBe(true);
|
|
257
|
+
});
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
### One Assertion Per Concept
|
|
261
|
+
|
|
262
|
+
```typescript
|
|
263
|
+
// Good: Each test verifies one behaviour
|
|
264
|
+
it('rejects empty titles', () => { ... });
|
|
265
|
+
it('trims whitespace from titles', () => { ... });
|
|
266
|
+
it('enforces maximum title length', () => { ... });
|
|
267
|
+
|
|
268
|
+
// Bad: Everything in one test
|
|
269
|
+
it('validates titles correctly', () => {
|
|
270
|
+
expect(() => createTask({ title: '' })).toThrow();
|
|
271
|
+
expect(createTask({ title: ' hello ' }).title).toBe('hello');
|
|
272
|
+
expect(() => createTask({ title: 'a'.repeat(256) })).toThrow();
|
|
273
|
+
});
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
### Name Tests Descriptively
|
|
277
|
+
|
|
278
|
+
```typescript
|
|
279
|
+
// Good: Reads like a specification
|
|
280
|
+
describe('TaskService.completeTask', () => {
|
|
281
|
+
it('sets status to completed and records timestamp', ...);
|
|
282
|
+
it('throws NotFoundError for non-existent task', ...);
|
|
283
|
+
it('is idempotent — completing an already-completed task is a no-op', ...);
|
|
284
|
+
it('sends notification to task assignee', ...);
|
|
285
|
+
});
|
|
286
|
+
|
|
287
|
+
// Bad: Vague names
|
|
288
|
+
describe('TaskService', () => {
|
|
289
|
+
it('works', ...);
|
|
290
|
+
it('handles errors', ...);
|
|
291
|
+
it('test 3', ...);
|
|
292
|
+
});
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
## Anti-Patterns to Avoid
|
|
296
|
+
|
|
297
|
+
| Anti-Pattern | Problem | Fix |
|
|
298
|
+
| ------------------------------------- | ----------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------- |
|
|
299
|
+
| Testing implementation details | Tests break when refactoring even if behaviour is unchanged | Test inputs and outputs, not internal structure |
|
|
300
|
+
| Flaky tests (timing, order-dependent) | Erode trust in the test suite | Use deterministic assertions, isolate test state |
|
|
301
|
+
| Testing framework code | Wastes time testing third-party behaviour | Only test YOUR code |
|
|
302
|
+
| Snapshot abuse | Large snapshots nobody reviews, break on any change | Use snapshots sparingly and review every change |
|
|
303
|
+
| No test isolation | Tests pass individually but fail together | Each test sets up and tears down its own state |
|
|
304
|
+
| Mocking everything | Tests pass but production breaks | Prefer real implementations > fakes > stubs > mocks — mock only at boundaries where real deps are slow or non-deterministic |
|
|
305
|
+
|
|
306
|
+
## Common Rationalizations
|
|
307
|
+
|
|
308
|
+
| Rationalization | Reality |
|
|
309
|
+
| -------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
|
|
310
|
+
| "I'll write tests after the code works" | You won't. And tests written after the fact test implementation, not behaviour. |
|
|
311
|
+
| "This is too simple to test" | Simple code gets complicated. The test documents the expected behaviour. |
|
|
312
|
+
| "Tests slow me down" | Tests slow you down now. They speed you up every time you change the code later. |
|
|
313
|
+
| "I tested it manually" | Manual testing does not persist. Tomorrow's change might break it with no way to know. |
|
|
314
|
+
| "The code is self-explanatory" | Tests ARE the specification. They document what the code should do, not what it does. |
|
|
315
|
+
| "It's just a prototype" | Prototypes become production code. Tests from day one prevent the "test debt" crisis. |
|
|
316
|
+
| "Let me run the tests again just to be extra sure" | After a clean run, repeating the same command on unchanged code adds nothing. Run again after subsequent edits, not as reassurance. |
|
|
317
|
+
|
|
318
|
+
## Red Flags
|
|
319
|
+
|
|
320
|
+
- Writing code without any corresponding tests
|
|
321
|
+
- Tests that pass on the first run (they may not be testing what you think)
|
|
322
|
+
- "All tests pass" but no tests were actually run
|
|
323
|
+
- Bug fixes without a reproduction test
|
|
324
|
+
- Tests that verify framework behaviour instead of application behaviour
|
|
325
|
+
- Test names that do not describe the expected behaviour
|
|
326
|
+
- Skipping tests to make the suite pass
|
|
327
|
+
- Running the same test command twice in a row without any intervening code change
|
|
328
|
+
|
|
329
|
+
## Verification Checklist
|
|
330
|
+
|
|
331
|
+
Before emitting `<task-complete>`, confirm:
|
|
332
|
+
|
|
333
|
+
- Every new behaviour introduced by this task has a corresponding test
|
|
334
|
+
- Run the project's narrow check (consult the project's AI context file — `CLAUDE.md`, `AGENTS.md`,
|
|
335
|
+
or `.github/copilot-instructions.md` when present — for the exact test command) after each meaningful
|
|
336
|
+
change; confirm it is green
|
|
337
|
+
- Bug fixes include a reproduction test that failed before the fix
|
|
338
|
+
- Test names describe the behaviour being verified
|
|
339
|
+
- No tests were skipped or disabled to achieve a passing run
|
|
340
|
+
- Coverage for the changed scope has not decreased (if tracked by the project)
|
|
341
|
+
|
|
342
|
+
The harness runs the post-task verify gate after you signal completion — incremental narrow checks
|
|
343
|
+
during the work are your responsibility; the final gate verdict is the harness's.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ralphctl",
|
|
3
|
-
"version": "0.9.0",
|
|
3
|
+
"version": "0.10.1",
|
|
4
4
|
"description": "Agent harness for long-running AI coding tasks — orchestrates Claude Code, GitHub Copilot, and OpenAI Codex across repositories",
|
|
5
5
|
"homepage": "https://github.com/lukas-grigis/ralphctl",
|
|
6
6
|
"type": "module",
|
|
@@ -79,6 +79,7 @@
|
|
|
79
79
|
"scripts": {
|
|
80
80
|
"build": "tsup && tsx scripts/build-assets.ts",
|
|
81
81
|
"dev": "NODE_OPTIONS=--max-old-space-size=8192 tsx src/index.ts",
|
|
82
|
+
"dev:heap-snapshot": "mkdir -p .diagnostics && NODE_OPTIONS='--max-old-space-size=8192 --heapsnapshot-near-heap-limit=2 --diagnostic-dir=.diagnostics' tsx src/index.ts",
|
|
82
83
|
"start": "NODE_OPTIONS=--max-old-space-size=8192 tsx src/index.ts",
|
|
83
84
|
"typecheck": "tsc",
|
|
84
85
|
"test": "vitest run",
|
|
@@ -90,6 +91,7 @@
|
|
|
90
91
|
"verify:coverage": "pnpm coverage",
|
|
91
92
|
"coverage:unused": "tsx scripts/find-unused.ts",
|
|
92
93
|
"deadcode": "knip",
|
|
94
|
+
"skills:update": "tsx scripts/sync-skills.ts",
|
|
93
95
|
"lint": "eslint .",
|
|
94
96
|
"lint:fix": "eslint . --fix",
|
|
95
97
|
"format": "prettier --write .",
|