claudecode-omc 4.7.4 → 4.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +50 -0
- package/agents/test-engineer.md +74 -0
- package/bridge/cli.cjs +9335 -117
- package/dist/cli/index.js +201 -0
- package/dist/cli/index.js.map +1 -1
- package/dist/testing/analyzers/complexity.d.ts +18 -0
- package/dist/testing/analyzers/complexity.d.ts.map +1 -0
- package/dist/testing/analyzers/complexity.js +121 -0
- package/dist/testing/analyzers/complexity.js.map +1 -0
- package/dist/testing/analyzers/coverage.d.ts +13 -0
- package/dist/testing/analyzers/coverage.d.ts.map +1 -0
- package/dist/testing/analyzers/coverage.js +99 -0
- package/dist/testing/analyzers/coverage.js.map +1 -0
- package/dist/testing/analyzers/quality-scorer.d.ts +8 -0
- package/dist/testing/analyzers/quality-scorer.d.ts.map +1 -0
- package/dist/testing/analyzers/quality-scorer.js +128 -0
- package/dist/testing/analyzers/quality-scorer.js.map +1 -0
- package/dist/testing/analyzers/types.d.ts +56 -0
- package/dist/testing/analyzers/types.d.ts.map +1 -0
- package/dist/testing/analyzers/types.js +2 -0
- package/dist/testing/analyzers/types.js.map +1 -0
- package/dist/testing/cli/agent-integration.d.ts +20 -0
- package/dist/testing/cli/agent-integration.d.ts.map +1 -0
- package/dist/testing/cli/agent-integration.js +60 -0
- package/dist/testing/cli/agent-integration.js.map +1 -0
- package/dist/testing/cli/commands.d.ts +100 -0
- package/dist/testing/cli/commands.d.ts.map +1 -0
- package/dist/testing/cli/commands.js +250 -0
- package/dist/testing/cli/commands.js.map +1 -0
- package/dist/testing/cli/ultraqa-integration.d.ts +13 -0
- package/dist/testing/cli/ultraqa-integration.d.ts.map +1 -0
- package/dist/testing/cli/ultraqa-integration.js +68 -0
- package/dist/testing/cli/ultraqa-integration.js.map +1 -0
- package/dist/testing/detectors/go.d.ts +3 -0
- package/dist/testing/detectors/go.d.ts.map +1 -0
- package/dist/testing/detectors/go.js +38 -0
- package/dist/testing/detectors/go.js.map +1 -0
- package/dist/testing/detectors/index.d.ts +8 -0
- package/dist/testing/detectors/index.d.ts.map +1 -0
- package/dist/testing/detectors/index.js +46 -0
- package/dist/testing/detectors/index.js.map +1 -0
- package/dist/testing/detectors/package-json.d.ts +3 -0
- package/dist/testing/detectors/package-json.d.ts.map +1 -0
- package/dist/testing/detectors/package-json.js +52 -0
- package/dist/testing/detectors/package-json.js.map +1 -0
- package/dist/testing/detectors/python.d.ts +3 -0
- package/dist/testing/detectors/python.d.ts.map +1 -0
- package/dist/testing/detectors/python.js +37 -0
- package/dist/testing/detectors/python.js.map +1 -0
- package/dist/testing/detectors/rust.d.ts +3 -0
- package/dist/testing/detectors/rust.d.ts.map +1 -0
- package/dist/testing/detectors/rust.js +39 -0
- package/dist/testing/detectors/rust.js.map +1 -0
- package/dist/testing/generators/contract.d.ts +14 -0
- package/dist/testing/generators/contract.d.ts.map +1 -0
- package/dist/testing/generators/contract.js +163 -0
- package/dist/testing/generators/contract.js.map +1 -0
- package/dist/testing/generators/e2e.d.ts +34 -0
- package/dist/testing/generators/e2e.d.ts.map +1 -0
- package/dist/testing/generators/e2e.js +74 -0
- package/dist/testing/generators/e2e.js.map +1 -0
- package/dist/testing/generators/go.d.ts +12 -0
- package/dist/testing/generators/go.d.ts.map +1 -0
- package/dist/testing/generators/go.js +144 -0
- package/dist/testing/generators/go.js.map +1 -0
- package/dist/testing/generators/nodejs.d.ts +12 -0
- package/dist/testing/generators/nodejs.d.ts.map +1 -0
- package/dist/testing/generators/nodejs.js +37 -0
- package/dist/testing/generators/nodejs.js.map +1 -0
- package/dist/testing/generators/python.d.ts +12 -0
- package/dist/testing/generators/python.d.ts.map +1 -0
- package/dist/testing/generators/python.js +163 -0
- package/dist/testing/generators/python.js.map +1 -0
- package/dist/testing/generators/react.d.ts +12 -0
- package/dist/testing/generators/react.d.ts.map +1 -0
- package/dist/testing/generators/react.js +31 -0
- package/dist/testing/generators/react.js.map +1 -0
- package/dist/testing/generators/rust.d.ts +11 -0
- package/dist/testing/generators/rust.d.ts.map +1 -0
- package/dist/testing/generators/rust.js +138 -0
- package/dist/testing/generators/rust.js.map +1 -0
- package/dist/testing/index.d.ts +6 -0
- package/dist/testing/index.d.ts.map +1 -0
- package/dist/testing/index.js +11 -0
- package/dist/testing/index.js.map +1 -0
- package/dist/testing/integrations/autopilot.d.ts +42 -0
- package/dist/testing/integrations/autopilot.d.ts.map +1 -0
- package/dist/testing/integrations/autopilot.js +55 -0
- package/dist/testing/integrations/autopilot.js.map +1 -0
- package/dist/testing/integrations/cicd.d.ts +26 -0
- package/dist/testing/integrations/cicd.d.ts.map +1 -0
- package/dist/testing/integrations/cicd.js +162 -0
- package/dist/testing/integrations/cicd.js.map +1 -0
- package/dist/testing/integrations/giskard/behavioral-tests.d.ts +4 -0
- package/dist/testing/integrations/giskard/behavioral-tests.d.ts.map +1 -0
- package/dist/testing/integrations/giskard/behavioral-tests.js +66 -0
- package/dist/testing/integrations/giskard/behavioral-tests.js.map +1 -0
- package/dist/testing/integrations/giskard/types.d.ts +35 -0
- package/dist/testing/integrations/giskard/types.d.ts.map +1 -0
- package/dist/testing/integrations/giskard/types.js +2 -0
- package/dist/testing/integrations/giskard/types.js.map +1 -0
- package/dist/testing/integrations/promptfoo/config-generator.d.ts +5 -0
- package/dist/testing/integrations/promptfoo/config-generator.d.ts.map +1 -0
- package/dist/testing/integrations/promptfoo/config-generator.js +44 -0
- package/dist/testing/integrations/promptfoo/config-generator.js.map +1 -0
- package/dist/testing/integrations/promptfoo/types.d.ts +36 -0
- package/dist/testing/integrations/promptfoo/types.d.ts.map +1 -0
- package/dist/testing/integrations/promptfoo/types.js +2 -0
- package/dist/testing/integrations/promptfoo/types.js.map +1 -0
- package/dist/testing/integrations/ralph.d.ts +65 -0
- package/dist/testing/integrations/ralph.d.ts.map +1 -0
- package/dist/testing/integrations/ralph.js +69 -0
- package/dist/testing/integrations/ralph.js.map +1 -0
- package/dist/testing/performance/cache-manager.d.ts +16 -0
- package/dist/testing/performance/cache-manager.d.ts.map +1 -0
- package/dist/testing/performance/cache-manager.js +39 -0
- package/dist/testing/performance/cache-manager.js.map +1 -0
- package/dist/testing/performance/parallel-generator.d.ts +23 -0
- package/dist/testing/performance/parallel-generator.d.ts.map +1 -0
- package/dist/testing/performance/parallel-generator.js +31 -0
- package/dist/testing/performance/parallel-generator.js.map +1 -0
- package/dist/testing/types.d.ts +23 -0
- package/dist/testing/types.d.ts.map +1 -0
- package/dist/testing/types.js +2 -0
- package/dist/testing/types.js.map +1 -0
- package/docs/2026-03-06-llm-testing-system-phase1.md +0 -0
- package/docs/plans/2026-03-06-llm-testing-system-design.md +311 -0
- package/docs/plans/2026-03-06-llm-testing-system-phase1.md +1268 -0
- package/docs/plans/2026-03-06-llm-testing-system-phase2.md +3053 -0
- package/docs/plans/2026-03-06-llm-testing-system-phase3.md +1830 -0
- package/docs/testing/PHASE2.md +266 -0
- package/docs/testing/PHASE3.md +601 -0
- package/docs/testing/README.md +634 -0
- package/package.json +1 -1
- package/skills/test-gen/skill.md +531 -0
- package/skills/ultraqa.md +58 -0
|
@@ -0,0 +1,601 @@
|
|
|
1
|
+
# Phase 3: Advanced Testing Features
|
|
2
|
+
|
|
3
|
+
Phase 3 completes the OMC testing ecosystem with advanced integrations for LLM prompt testing, behavioral testing, E2E test generation, CI/CD automation, and test quality scoring.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Phase 3 adds:
|
|
8
|
+
- **Promptfoo Integration**: LLM prompt testing and evaluation
|
|
9
|
+
- **Giskard Behavioral Tests**: Robustness and perturbation testing
|
|
10
|
+
- **Playwright E2E Generation**: Generate Playwright tests from natural language user flow descriptions
|
|
11
|
+
- **CI/CD Templates**: GitHub Actions workflow generation
|
|
12
|
+
- **Test Quality Scoring**: Automated test quality assessment
|
|
13
|
+
- **Ralph/Autopilot Integration**: Automated testing loops
|
|
14
|
+
|
|
15
|
+
## Features
|
|
16
|
+
|
|
17
|
+
### 1. Promptfoo Integration
|
|
18
|
+
|
|
19
|
+
Generate Promptfoo configurations for testing LLM prompts with multiple providers and test cases.
|
|
20
|
+
|
|
21
|
+
**Command:**
|
|
22
|
+
```bash
|
|
23
|
+
omc test promptfoo <prompt-file> [options]
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
**Options:**
|
|
27
|
+
- `-p, --provider <provider>`: LLM provider (default: `anthropic:claude-3-5-sonnet-20241022`)
|
|
28
|
+
- `-o, --output <path>`: Output config file path (default: `./promptfoo.config.yaml`)
|
|
29
|
+
|
|
30
|
+
**Example:**
|
|
31
|
+
```bash
|
|
32
|
+
# Generate Promptfoo config for a code review prompt
|
|
33
|
+
omc test promptfoo src/prompts/code-review.txt
|
|
34
|
+
|
|
35
|
+
# Use a different provider
|
|
36
|
+
omc test promptfoo src/prompts/summarize.txt -p openai:gpt-4
|
|
37
|
+
|
|
38
|
+
# Custom output path
|
|
39
|
+
omc test promptfoo src/prompts/analyze.txt -o config/promptfoo.yaml
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
**Generated Config:**
|
|
43
|
+
```yaml
|
|
44
|
+
prompts:
|
|
45
|
+
- file://src/prompts/code-review.txt
|
|
46
|
+
providers:
|
|
47
|
+
- anthropic:claude-3-5-sonnet-20241022
|
|
48
|
+
tests: []
|
|
49
|
+
outputPath: ./promptfoo-results.json
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
**Usage:**
|
|
53
|
+
1. Generate config: `omc test promptfoo <prompt-file>`
|
|
54
|
+
2. Add test cases to the generated YAML
|
|
55
|
+
3. Run tests: `npx promptfoo eval`
|
|
56
|
+
4. View results: `npx promptfoo view`
|
|
57
|
+
|
|
58
|
+
### 2. E2E Test Generation
|
|
59
|
+
|
|
60
|
+
Generate Playwright E2E tests from natural language user flow descriptions.
|
|
61
|
+
|
|
62
|
+
**Command:**
|
|
63
|
+
```bash
|
|
64
|
+
omc test e2e <flow-description> [options]
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
**Options:**
|
|
68
|
+
- `-b, --base-url <url>`: Base URL (default: `http://localhost:3000`)
|
|
69
|
+
- `-n, --test-name <name>`: Test name (default: `User flow test`)
|
|
70
|
+
- `-o, --output <path>`: Output file (default: `./tests/e2e/user-flow.spec.ts`)
|
|
71
|
+
|
|
72
|
+
**Example:**
|
|
73
|
+
```bash
|
|
74
|
+
# Generate E2E test from flow description
|
|
75
|
+
omc test e2e "User logs in, navigates to dashboard, clicks on settings"
|
|
76
|
+
|
|
77
|
+
# Custom base URL and test name
|
|
78
|
+
omc test e2e "Admin creates new user" \
|
|
79
|
+
-b https://app.example.com \
|
|
80
|
+
-n "Admin user creation flow"
|
|
81
|
+
|
|
82
|
+
# Custom output path
|
|
83
|
+
omc test e2e "Checkout flow" -o tests/e2e/checkout.spec.ts
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
**Generated Test:**
|
|
87
|
+
```typescript
|
|
88
|
+
import { test, expect } from '@playwright/test';
|
|
89
|
+
|
|
90
|
+
test.describe('E2E Tests', () => {
|
|
91
|
+
test('User flow test', async ({ page }) => {
|
|
92
|
+
await page.goto('http://localhost:3000/login');
|
|
93
|
+
await page.goto('http://localhost:3000/dashboard');
|
|
94
|
+
await page.click('a[href="/settings"]');
|
|
95
|
+
});
|
|
96
|
+
});
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### 3. Giskard Behavioral Tests
|
|
100
|
+
|
|
101
|
+
Generate behavioral tests for robustness and perturbation testing.
|
|
102
|
+
|
|
103
|
+
**Command:**
|
|
104
|
+
```bash
|
|
105
|
+
omc test giskard <file> [options]
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
**Options:**
|
|
109
|
+
- `-t, --test-type <type>`: Test type - `perturbation` or `robustness` (default: `perturbation`)
|
|
110
|
+
- `-o, --output <path>`: Output file (default: `./tests/behavioral/perturbation.test.ts`)
|
|
111
|
+
|
|
112
|
+
**Example:**
|
|
113
|
+
```bash
|
|
114
|
+
# Generate perturbation tests
|
|
115
|
+
omc test giskard src/models/classifier.ts
|
|
116
|
+
|
|
117
|
+
# Generate robustness tests
|
|
118
|
+
omc test giskard src/models/sentiment.ts -t robustness
|
|
119
|
+
|
|
120
|
+
# Custom output
|
|
121
|
+
omc test giskard src/llm/prompt.ts -o tests/behavioral/prompt-robustness.test.ts
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
**Test Types:**
|
|
125
|
+
|
|
126
|
+
**Perturbation Tests**: Test model behavior under input variations
|
|
127
|
+
- Typos
|
|
128
|
+
- Negations
|
|
129
|
+
- Synonyms
|
|
130
|
+
- Case changes
|
|
131
|
+
|
|
132
|
+
**Robustness Tests**: Test that model behavior stays stable across formatting and size changes in the input
|
|
133
|
+
- Case sensitivity
|
|
134
|
+
- Whitespace handling
|
|
135
|
+
- Special characters
|
|
136
|
+
- Input length variations
|
|
137
|
+
|
|
138
|
+
**Generated Test:**
|
|
139
|
+
```typescript
|
|
140
|
+
// Generated Giskard behavioral tests
|
|
141
|
+
import { describe, it, expect } from 'vitest';
|
|
142
|
+
|
|
143
|
+
describe('Behavioral Tests', () => {
|
|
144
|
+
it('should still classify as expected', async () => {
|
|
145
|
+
// Original: sample input
|
|
146
|
+
// Perturbed (typo): smaple input
|
|
147
|
+
// TODO: Add test implementation
|
|
148
|
+
});
|
|
149
|
+
|
|
150
|
+
it('should still classify as expected', async () => {
|
|
151
|
+
// Original: sample input
|
|
152
|
+
// Perturbed (negation): not sample input
|
|
153
|
+
// TODO: Add test implementation
|
|
154
|
+
});
|
|
155
|
+
});
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### 4. CI/CD Workflow Generation
|
|
159
|
+
|
|
160
|
+
Generate GitHub Actions workflows for automated testing.
|
|
161
|
+
|
|
162
|
+
**Command:**
|
|
163
|
+
```bash
|
|
164
|
+
omc test cicd [options]
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
**Options:**
|
|
168
|
+
- `-l, --language <lang>`: Primary language - `nodejs`, `python`, `go`, `rust` (default: `nodejs`)
|
|
169
|
+
- `-o, --output <path>`: Output file (default: `./.github/workflows/test.yml`)
|
|
170
|
+
|
|
171
|
+
**Example:**
|
|
172
|
+
```bash
|
|
173
|
+
# Generate Node.js workflow
|
|
174
|
+
omc test cicd
|
|
175
|
+
|
|
176
|
+
# Generate Python workflow
|
|
177
|
+
omc test cicd -l python
|
|
178
|
+
|
|
179
|
+
# Generate Go workflow with custom path
|
|
180
|
+
omc test cicd -l go -o .github/workflows/go-test.yml
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
**Generated Workflow:**
|
|
184
|
+
```yaml
|
|
185
|
+
name: Test
|
|
186
|
+
|
|
187
|
+
on:
|
|
188
|
+
push:
|
|
189
|
+
branches: [main, dev]
|
|
190
|
+
pull_request:
|
|
191
|
+
branches: [main, dev]
|
|
192
|
+
|
|
193
|
+
jobs:
|
|
194
|
+
test:
|
|
195
|
+
runs-on: ubuntu-latest
|
|
196
|
+
steps:
|
|
197
|
+
- uses: actions/checkout@v4
|
|
198
|
+
|
|
199
|
+
- name: Setup Node.js
|
|
200
|
+
uses: actions/setup-node@v4
|
|
201
|
+
with:
|
|
202
|
+
node-version: 20
|
|
203
|
+
cache: 'pnpm'
|
|
204
|
+
|
|
205
|
+
- name: Install dependencies
|
|
206
|
+
run: pnpm install
|
|
207
|
+
|
|
208
|
+
- name: Run tests
|
|
209
|
+
run: pnpm test
|
|
210
|
+
|
|
211
|
+
- name: Upload coverage
|
|
212
|
+
uses: codecov/codecov-action@v4
|
|
213
|
+
with:
|
|
214
|
+
token: ${{ secrets.CODECOV_TOKEN }}
|
|
215
|
+
|
|
216
|
+
- name: Upload test results
|
|
217
|
+
if: always()
|
|
218
|
+
uses: actions/upload-artifact@v4
|
|
219
|
+
with:
|
|
220
|
+
name: test-results
|
|
221
|
+
path: test-results/
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
### 5. Test Quality Scoring
|
|
225
|
+
|
|
226
|
+
Analyze test quality and get actionable recommendations.
|
|
227
|
+
|
|
228
|
+
**Command:**
|
|
229
|
+
```bash
|
|
230
|
+
omc test quality <test-file> [options]
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
**Options:**
|
|
234
|
+
- `-t, --test-type <type>`: Test type - `unit`, `integration`, `e2e` (default: `unit`)
|
|
235
|
+
|
|
236
|
+
**Example:**
|
|
237
|
+
```bash
|
|
238
|
+
# Score a unit test
|
|
239
|
+
omc test quality tests/utils/parser.test.ts
|
|
240
|
+
|
|
241
|
+
# Score an integration test
|
|
242
|
+
omc test quality tests/api/users.test.ts -t integration
|
|
243
|
+
|
|
244
|
+
# Score an E2E test
|
|
245
|
+
omc test quality tests/e2e/checkout.spec.ts -t e2e
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
**Output:**
|
|
249
|
+
```
|
|
250
|
+
📊 Test Quality Score:
|
|
251
|
+
Overall: 85/100
|
|
252
|
+
Completeness: 90/100
|
|
253
|
+
Assertions: 80/100
|
|
254
|
+
Independence: 95/100
|
|
255
|
+
Naming: 75/100
|
|
256
|
+
Assertion Count: 12
|
|
257
|
+
|
|
258
|
+
💡 Recommendations:
|
|
259
|
+
- Test edge cases like null, undefined, empty values, and boundary conditions
|
|
260
|
+
- Use descriptive test names that explain what is being tested
|
|
261
|
+
- Improve assertion quality with more specific matchers
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
**Scoring Metrics:**
|
|
265
|
+
|
|
266
|
+
1. **Completeness (35% weight)**
|
|
267
|
+
- Has assertions
|
|
268
|
+
- Assertion count
|
|
269
|
+
- Tests edge cases
|
|
270
|
+
- Uses mocks
|
|
271
|
+
- Has setup/teardown
|
|
272
|
+
|
|
273
|
+
2. **Assertion Quality (25% weight)**
|
|
274
|
+
- Uses specific assertions (toBe, toEqual)
|
|
275
|
+
- Avoids generic truthy checks
|
|
276
|
+
- Assertion-to-test ratio
|
|
277
|
+
|
|
278
|
+
3. **Independence (20% weight)**
|
|
279
|
+
- No shared state
|
|
280
|
+
- Proper test isolation
|
|
281
|
+
- Uses beforeEach/afterEach
|
|
282
|
+
|
|
283
|
+
4. **Naming (20% weight)**
|
|
284
|
+
- Descriptive test names
|
|
285
|
+
- Uses "should" pattern
|
|
286
|
+
- Clear intent
|
|
287
|
+
|
|
288
|
+
## Integration with Ralph/Autopilot
|
|
289
|
+
|
|
290
|
+
Phase 3 features integrate seamlessly with OMC's execution modes:
|
|
291
|
+
|
|
292
|
+
### Ralph Mode Testing Loop
|
|
293
|
+
|
|
294
|
+
```bash
|
|
295
|
+
# Ralph automatically runs test quality checks
|
|
296
|
+
omc ralph "implement user authentication with tests"
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
Ralph will:
|
|
300
|
+
1. Generate implementation
|
|
301
|
+
2. Generate tests
|
|
302
|
+
3. Run quality scoring
|
|
303
|
+
4. Fix issues based on recommendations
|
|
304
|
+
5. Loop until quality threshold met
|
|
305
|
+
|
|
306
|
+
### Autopilot Testing Phase
|
|
307
|
+
|
|
308
|
+
```bash
|
|
309
|
+
# Autopilot includes comprehensive testing
|
|
310
|
+
omc autopilot "build a REST API"
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
Autopilot will:
|
|
314
|
+
1. Generate code
|
|
315
|
+
2. Generate unit tests
|
|
316
|
+
3. Generate integration tests
|
|
317
|
+
4. Generate E2E tests
|
|
318
|
+
5. Generate CI/CD workflow
|
|
319
|
+
6. Run quality checks
|
|
320
|
+
7. Generate behavioral tests
|
|
321
|
+
|
|
322
|
+
## Workflows
|
|
323
|
+
|
|
324
|
+
### Complete Testing Workflow
|
|
325
|
+
|
|
326
|
+
```bash
|
|
327
|
+
# 1. Generate tests for the implementation file
|
|
328
|
+
omc test gen src/utils/parser.ts
|
|
329
|
+
|
|
330
|
+
# 2. Score test quality
|
|
331
|
+
omc test quality tests/utils/parser.test.ts
|
|
332
|
+
|
|
333
|
+
# 3. Generate E2E tests
|
|
334
|
+
omc test e2e "User parses configuration file"
|
|
335
|
+
|
|
336
|
+
# 4. Generate behavioral tests
|
|
337
|
+
omc test giskard src/utils/parser.ts
|
|
338
|
+
|
|
339
|
+
# 5. Generate CI/CD workflow
|
|
340
|
+
omc test cicd
|
|
341
|
+
|
|
342
|
+
# 6. Generate Promptfoo config (only if the project uses LLM prompts)
|
|
343
|
+
omc test promptfoo src/prompts/parse-config.txt
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
### LLM Prompt Testing Workflow
|
|
347
|
+
|
|
348
|
+
```bash
|
|
349
|
+
# 1. Create prompt file
|
|
350
|
+
echo "Analyze this code and suggest improvements" > prompts/code-review.txt
|
|
351
|
+
|
|
352
|
+
# 2. Generate Promptfoo config
|
|
353
|
+
omc test promptfoo prompts/code-review.txt
|
|
354
|
+
|
|
355
|
+
# 3. Edit config to add test cases
|
|
356
|
+
# Edit promptfoo.config.yaml
|
|
357
|
+
|
|
358
|
+
# 4. Run Promptfoo tests
|
|
359
|
+
npx promptfoo eval
|
|
360
|
+
|
|
361
|
+
# 5. View results
|
|
362
|
+
npx promptfoo view
|
|
363
|
+
```
|
|
364
|
+
|
|
365
|
+
### Behavioral Testing Workflow
|
|
366
|
+
|
|
367
|
+
```bash
|
|
368
|
+
# 1. Generate perturbation tests
|
|
369
|
+
omc test giskard src/models/classifier.ts
|
|
370
|
+
|
|
371
|
+
# 2. Implement test logic
|
|
372
|
+
# Edit tests/behavioral/perturbation.test.ts
|
|
373
|
+
|
|
374
|
+
# 3. Run tests
|
|
375
|
+
pnpm test tests/behavioral/
|
|
376
|
+
|
|
377
|
+
# 4. Generate robustness tests
|
|
378
|
+
omc test giskard src/models/classifier.ts -t robustness
|
|
379
|
+
|
|
380
|
+
# 5. Run all behavioral tests
|
|
381
|
+
pnpm test tests/behavioral/
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
## Best Practices
|
|
385
|
+
|
|
386
|
+
### Promptfoo Testing
|
|
387
|
+
|
|
388
|
+
1. **Start with basic test cases**: Add simple inputs first
|
|
389
|
+
2. **Test edge cases**: Include boundary conditions
|
|
390
|
+
3. **Use multiple providers**: Compare outputs across models
|
|
391
|
+
4. **Version your prompts**: Track prompt changes over time
|
|
392
|
+
5. **Automate in CI**: Run Promptfoo tests in GitHub Actions
|
|
393
|
+
|
|
394
|
+
### E2E Testing
|
|
395
|
+
|
|
396
|
+
1. **Keep flows focused**: One user journey per test
|
|
397
|
+
2. **Use descriptive names**: Clear test intent
|
|
398
|
+
3. **Handle async properly**: Wait for elements
|
|
399
|
+
4. **Test critical paths**: Focus on core functionality
|
|
400
|
+
5. **Run in CI**: Automate E2E tests
|
|
401
|
+
|
|
402
|
+
### Behavioral Testing
|
|
403
|
+
|
|
404
|
+
1. **Test systematically**: Cover all perturbation types
|
|
405
|
+
2. **Set thresholds**: Define acceptable behavior ranges
|
|
406
|
+
3. **Monitor over time**: Track robustness metrics
|
|
407
|
+
4. **Fix regressions**: Address behavioral issues promptly
|
|
408
|
+
5. **Document expectations**: Clear behavior specifications
|
|
409
|
+
|
|
410
|
+
### Test Quality
|
|
411
|
+
|
|
412
|
+
1. **Aim for 80+ overall score**: Good quality baseline
|
|
413
|
+
2. **Address recommendations**: Fix issues systematically
|
|
414
|
+
3. **Review regularly**: Check quality on new tests
|
|
415
|
+
4. **Enforce in CI**: Fail builds on low quality
|
|
416
|
+
5. **Improve iteratively**: Gradual quality improvements
|
|
417
|
+
|
|
418
|
+
## Configuration
|
|
419
|
+
|
|
420
|
+
### Promptfoo Config
|
|
421
|
+
|
|
422
|
+
```yaml
|
|
423
|
+
# promptfoo.config.yaml
|
|
424
|
+
prompts:
|
|
425
|
+
- file://prompts/code-review.txt
|
|
426
|
+
- file://prompts/summarize.txt
|
|
427
|
+
|
|
428
|
+
providers:
|
|
429
|
+
- anthropic:claude-3-5-sonnet-20241022
|
|
430
|
+
- openai:gpt-4
|
|
431
|
+
- openai:gpt-3.5-turbo
|
|
432
|
+
|
|
433
|
+
tests:
|
|
434
|
+
- vars:
|
|
435
|
+
code: "function add(a, b) { return a + b; }"
|
|
436
|
+
assert:
|
|
437
|
+
- type: contains
|
|
438
|
+
value: "function"
|
|
439
|
+
- type: contains
|
|
440
|
+
value: "parameters"
|
|
441
|
+
|
|
442
|
+
- vars:
|
|
443
|
+
code: "const x = 1;"
|
|
444
|
+
assert:
|
|
445
|
+
- type: contains
|
|
446
|
+
value: "variable"
|
|
447
|
+
|
|
448
|
+
defaultTest:
|
|
449
|
+
assert:
|
|
450
|
+
- type: llm-rubric
|
|
451
|
+
value: "Output should be helpful and accurate"
|
|
452
|
+
|
|
453
|
+
outputPath: ./promptfoo-results.json
|
|
454
|
+
```
|
|
455
|
+
|
|
456
|
+
### Playwright Config
|
|
457
|
+
|
|
458
|
+
```typescript
|
|
459
|
+
// playwright.config.ts
|
|
460
|
+
import { defineConfig } from '@playwright/test';
|
|
461
|
+
|
|
462
|
+
export default defineConfig({
|
|
463
|
+
testDir: './tests/e2e',
|
|
464
|
+
fullyParallel: true,
|
|
465
|
+
forbidOnly: !!process.env.CI,
|
|
466
|
+
retries: process.env.CI ? 2 : 0,
|
|
467
|
+
workers: process.env.CI ? 1 : undefined,
|
|
468
|
+
reporter: 'html',
|
|
469
|
+
use: {
|
|
470
|
+
baseURL: 'http://localhost:3000',
|
|
471
|
+
trace: 'on-first-retry',
|
|
472
|
+
},
|
|
473
|
+
});
|
|
474
|
+
```
|
|
475
|
+
|
|
476
|
+
## Troubleshooting
|
|
477
|
+
|
|
478
|
+
### Promptfoo Issues
|
|
479
|
+
|
|
480
|
+
**Problem**: Config not found
|
|
481
|
+
```bash
|
|
482
|
+
# Solution: Check file path
|
|
483
|
+
ls -la promptfoo.config.yaml
|
|
484
|
+
```
|
|
485
|
+
|
|
486
|
+
**Problem**: Provider authentication
|
|
487
|
+
```bash
|
|
488
|
+
# Solution: Set API keys
|
|
489
|
+
export ANTHROPIC_API_KEY=your-key
|
|
490
|
+
export OPENAI_API_KEY=your-key
|
|
491
|
+
```
|
|
492
|
+
|
|
493
|
+
### E2E Test Issues
|
|
494
|
+
|
|
495
|
+
**Problem**: Selector not found
|
|
496
|
+
```typescript
|
|
497
|
+
// Solution: Use stable selectors (e.g. data-testid) and wait for the element before interacting
|
|
498
|
+
await page.waitForSelector('[data-testid="login-button"]');
|
|
499
|
+
await page.click('[data-testid="login-button"]');
|
|
500
|
+
```
|
|
501
|
+
|
|
502
|
+
**Problem**: Flaky tests
|
|
503
|
+
```typescript
|
|
504
|
+
// Solution: Add explicit waits
|
|
505
|
+
await page.waitForLoadState('networkidle');
|
|
506
|
+
await expect(page.locator('.result')).toBeVisible();
|
|
507
|
+
```
|
|
508
|
+
|
|
509
|
+
### Quality Scoring Issues
|
|
510
|
+
|
|
511
|
+
**Problem**: Low scores
|
|
512
|
+
```bash
|
|
513
|
+
# Solution: Follow recommendations
|
|
514
|
+
omc test quality tests/file.test.ts
|
|
515
|
+
# Read and implement recommendations
|
|
516
|
+
```
|
|
517
|
+
|
|
518
|
+
**Problem**: False positives
|
|
519
|
+
```bash
|
|
520
|
+
# Solution: Review metrics manually
|
|
521
|
+
# Some patterns may not apply to your test style
|
|
522
|
+
```
|
|
523
|
+
|
|
524
|
+
## Next Steps
|
|
525
|
+
|
|
526
|
+
1. **Explore Phase 1**: Basic test generation - [Phase 1 Plan](../plans/2026-03-06-llm-testing-system-phase1.md)
|
|
527
|
+
2. **Explore Phase 2**: Advanced features - [Phase 2 Guide](./PHASE2.md)
|
|
528
|
+
3. **Read Testing Guide**: Complete overview - [Testing README](./README.md)
|
|
529
|
+
4. **Check Examples**: Sample tests - [Examples](../../examples/testing/)
|
|
530
|
+
|
|
531
|
+
## API Reference
|
|
532
|
+
|
|
533
|
+
### Promptfoo Config Generator
|
|
534
|
+
|
|
535
|
+
```typescript
|
|
536
|
+
import { generatePromptfooConfig } from '@/testing/integrations/promptfoo/config-generator';
|
|
537
|
+
|
|
538
|
+
const config = await generatePromptfooConfig({
|
|
539
|
+
promptFile: 'prompts/code-review.txt',
|
|
540
|
+
testCases: [
|
|
541
|
+
{ input: 'code sample', expected: 'contains:function' }
|
|
542
|
+
],
|
|
543
|
+
provider: 'anthropic:claude-3-5-sonnet-20241022',
|
|
544
|
+
});
|
|
545
|
+
```
|
|
546
|
+
|
|
547
|
+
### E2E Test Generator
|
|
548
|
+
|
|
549
|
+
```typescript
|
|
550
|
+
import { generateFromUserFlow } from '@/testing/generators/e2e';
|
|
551
|
+
|
|
552
|
+
const result = await generateFromUserFlow({
|
|
553
|
+
flowDescription: 'User logs in and views dashboard',
|
|
554
|
+
baseUrl: 'http://localhost:3000',
|
|
555
|
+
testName: 'Login flow',
|
|
556
|
+
});
|
|
557
|
+
```
|
|
558
|
+
|
|
559
|
+
### Behavioral Test Generator
|
|
560
|
+
|
|
561
|
+
```typescript
|
|
562
|
+
import { generatePerturbationTests } from '@/testing/integrations/giskard/behavioral-tests';
|
|
563
|
+
|
|
564
|
+
const suite = await generatePerturbationTests({
|
|
565
|
+
testCases: [
|
|
566
|
+
{ input: 'sample', expectedOutput: 'result' }
|
|
567
|
+
],
|
|
568
|
+
perturbations: ['typo', 'negation', 'synonym'],
|
|
569
|
+
});
|
|
570
|
+
```
|
|
571
|
+
|
|
572
|
+
### CI/CD Workflow Generator
|
|
573
|
+
|
|
574
|
+
```typescript
|
|
575
|
+
import { generateGitHubActionsWorkflow } from '@/testing/integrations/cicd';
|
|
576
|
+
|
|
577
|
+
const workflow = await generateGitHubActionsWorkflow({
|
|
578
|
+
language: 'nodejs',
|
|
579
|
+
coverage: true,
|
|
580
|
+
artifacts: true,
|
|
581
|
+
});
|
|
582
|
+
```
|
|
583
|
+
|
|
584
|
+
### Test Quality Scorer
|
|
585
|
+
|
|
586
|
+
```typescript
|
|
587
|
+
import { scoreTestQuality } from '@/testing/analyzers/quality-scorer';
|
|
588
|
+
|
|
589
|
+
const score = await scoreTestQuality({
|
|
590
|
+
testCode: testFileContent,
|
|
591
|
+
testType: 'unit',
|
|
592
|
+
});
|
|
593
|
+
```
|
|
594
|
+
|
|
595
|
+
## Resources
|
|
596
|
+
|
|
597
|
+
- [Promptfoo Documentation](https://promptfoo.dev/)
|
|
598
|
+
- [Playwright Documentation](https://playwright.dev/)
|
|
599
|
+
- [Giskard Documentation](https://docs.giskard.ai/)
|
|
600
|
+
- [GitHub Actions Documentation](https://docs.github.com/en/actions)
|
|
601
|
+
- [OMC Testing Guide](./README.md)
|