mdan-cli 2.2.0 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.mcp.json +46 -0
- package/AGENTS.md +246 -0
- package/README.md +32 -7
- package/agents/test.md +60 -2
- package/cli/mdan.js +149 -26
- package/cli/mdan.py +111 -54
- package/cli/mdan.sh +43 -43
- package/install.sh +30 -167
- package/integrations/all-integrations.md +2 -2
- package/integrations/cursor.md +11 -11
- package/integrations/mcp.md +153 -0
- package/integrations/windsurf.md +4 -4
- package/package.json +4 -2
- package/phases/04-verify.md +9 -3
- package/templates/prompts/README.md +108 -0
- package/templates/prompts/dev-agent.yaml +85 -0
- package/templates/prompts/orchestrator.yaml +97 -0
- package/templates/prompts.json +81 -0
- package/templates/tests/evaluations/README.md +80 -0
- package/templates/tests/evaluations/classification_eval.md +136 -0
- package/templates/tests/evaluations/rag_eval.md +116 -0
- package/templates/tests/scenarios/README.md +62 -0
- package/templates/tests/scenarios/basic_authentication.test.md +82 -0
- package/templates/tests/scenarios/user_registration.test.md +107 -0
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# RAG Evaluation Template
|
|
2
|
+
|
|
3
|
+
> Benchmark RAG (Retrieval-Augmented Generation) correctness and quality
|
|
4
|
+
|
|
5
|
+
## Metadata
|
|
6
|
+
|
|
7
|
+
| Field | Value |
|
|
8
|
+
|-------|-------|
|
|
9
|
+
| eval_name | rag_correctness |
|
|
10
|
+
| version | 1.0.0 |
|
|
11
|
+
| metrics | F1 Score, Precision, Recall, Context Relevance |
|
|
12
|
+
|
|
13
|
+
## Purpose
|
|
14
|
+
|
|
15
|
+
Evaluate how well the RAG pipeline retrieves relevant context and generates accurate answers.
|
|
16
|
+
|
|
17
|
+
## Dataset Format
|
|
18
|
+
|
|
19
|
+
```json
|
|
20
|
+
[
|
|
21
|
+
{
|
|
22
|
+
"query": "What is the refund policy?",
|
|
23
|
+
"expected_chunks": [
|
|
24
|
+
"refund_policy.md: paragraphs 1-3",
|
|
25
|
+
"faq.md: refund section"
|
|
26
|
+
],
|
|
27
|
+
"expected_answer_contains": ["30 days", "original payment", "processing time"]
|
|
28
|
+
}
|
|
29
|
+
]
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Evaluation Metrics
|
|
33
|
+
|
|
34
|
+
### Retrieval Metrics
|
|
35
|
+
|
|
36
|
+
| Metric | Target | Description |
|
|
37
|
+
|--------|--------|-------------|
|
|
38
|
+
| Recall | ≥0.85 | % of relevant chunks retrieved |
|
|
39
|
+
| Precision | ≥0.90 | % of retrieved chunks that are relevant |
|
|
40
|
+
| F1 Score | ≥0.87 | Harmonic mean of precision/recall |
|
|
41
|
+
|
|
42
|
+
### Generation Metrics
|
|
43
|
+
|
|
44
|
+
| Metric | Target | Description |
|
|
45
|
+
|--------|--------|-------------|
|
|
46
|
+
| Context Relevance | ≥0.80 | LLM judge scores context usefulness |
|
|
47
|
+
| Answer Accuracy | ≥0.85 | Answer contains expected information |
|
|
48
|
+
| Hallucination Rate | ≤0.05 | % of generated statements not supported by the retrieved context |
|
|
49
|
+
|
|
50
|
+
## Evaluation Code
|
|
51
|
+
|
|
52
|
+
### Python (LangWatch)
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
import langwatch
|
|
56
|
+
|
|
57
|
+
results = langwatch.evaluate(
|
|
58
|
+
dataset="customer-support-rag",
|
|
59
|
+
evaluator="rag_correctness",
|
|
60
|
+
metrics=["f1_score", "precision", "recall", "context_relevance"]
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
print(f"F1 Score: {results.f1_score}")
|
|
64
|
+
print(f"Precision: {results.precision}")
|
|
65
|
+
print(f"Recall: {results.recall}")
|
|
66
|
+
print(f"Context Relevance: {results.context_relevance}")
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### JavaScript/TypeScript
|
|
70
|
+
|
|
71
|
+
```typescript
|
|
72
|
+
import { evaluate } from "@langwatch/evaluators";
|
|
73
|
+
|
|
74
|
+
const results = await evaluate({
|
|
75
|
+
dataset: "customer-support-rag",
|
|
76
|
+
evaluator: "rag_correctness",
|
|
77
|
+
metrics: ["f1_score", "precision", "recall", "context_relevance"],
|
|
78
|
+
});
|
|
79
|
+
|
|
80
|
+
console.log(`F1 Score: ${results.f1Score}`);
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Pass/Fail Criteria
|
|
84
|
+
|
|
85
|
+
| Metric | Threshold | Status |
|
|
86
|
+
|--------|-----------|--------|
|
|
87
|
+
| F1 Score | ≥0.87 | ✅ Pass |
|
|
88
|
+
| F1 Score | 0.70-0.86 | ⚠️ Warning |
|
|
89
|
+
| F1 Score | <0.70 | ❌ Fail |
|
|
90
|
+
| Hallucination | ≤0.05 | ✅ Pass |
|
|
91
|
+
| Hallucination | 0.05-0.15 | ⚠️ Warning |
| Hallucination | >0.15 | ❌ Fail |
|
|
92
|
+
|
|
93
|
+
## Troubleshooting
|
|
94
|
+
|
|
95
|
+
### Low Recall
|
|
96
|
+
- Check chunk size (try 512-1024 tokens)
|
|
97
|
+
- Add more overlapping chunks
|
|
98
|
+
- Improve embedding model
|
|
99
|
+
|
|
100
|
+
### Low Precision
|
|
101
|
+
- Reduce chunk size
|
|
102
|
+
- Add more specific metadata filters
|
|
103
|
+
- Filter out irrelevant sources
|
|
104
|
+
|
|
105
|
+
### High Hallucination
|
|
106
|
+
- Add source citations to prompt
|
|
107
|
+
- Reduce max_tokens
|
|
108
|
+
- Use better context ranking
|
|
109
|
+
|
|
110
|
+
## Integration with MDAN
|
|
111
|
+
|
|
112
|
+
During the VERIFY phase, the Test Agent should:
|
|
113
|
+
1. Create RAG evaluation dataset from PRD
|
|
114
|
+
2. Run retrieval + generation tests
|
|
115
|
+
3. Report metrics in quality gate
|
|
116
|
+
4. Fail if thresholds not met
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Test Scenarios Index
|
|
2
|
+
|
|
3
|
+
> End-to-end conversational tests for MDAN projects
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Scenarios are conversation-based tests that validate agent behavior in realistic, multi-turn interactions. Unlike unit tests, they simulate how real users interact with your agent.
|
|
8
|
+
|
|
9
|
+
## Available Scenarios
|
|
10
|
+
|
|
11
|
+
| Scenario | Description | Agent |
|
|
12
|
+
|----------|-------------|-------|
|
|
13
|
+
| [basic_authentication.test.md](basic_authentication.test.md) | Login, logout, session management | Dev Agent |
|
|
14
|
+
| [user_registration.test.md](user_registration.test.md) | Signup, validation, confirmation | Dev Agent |
|
|
15
|
+
|
|
16
|
+
## Adding New Scenarios
|
|
17
|
+
|
|
18
|
+
1. Copy an existing scenario to use as a template:
|
|
19
|
+
```bash
|
|
20
|
+
cp basic_authentication.test.md my_new_scenario.test.md
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
2. Edit the scenario:
|
|
24
|
+
- Update metadata (name, version, framework)
|
|
25
|
+
- Write the conversation script
|
|
26
|
+
- Define success criteria
|
|
27
|
+
- Add security checks if applicable
|
|
28
|
+
|
|
29
|
+
3. Run the scenario:
|
|
30
|
+
```bash
|
|
31
|
+
# Python/pytest
|
|
32
|
+
pytest tests/scenarios/my_new_scenario.test.md -v
|
|
33
|
+
|
|
34
|
+
# Node/TypeScript
|
|
35
|
+
npm test -- tests/scenarios/my_new_scenario.test.ts
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Scenario Format
|
|
39
|
+
|
|
40
|
+
Each scenario includes:
|
|
41
|
+
- **Metadata**: version, framework, duration estimate
|
|
42
|
+
- **Preconditions**: what must be true before testing
|
|
43
|
+
- **Script**: conversation steps with verification points
|
|
44
|
+
- **Success Criteria**: checklist of must-pass conditions
|
|
45
|
+
- **Security Checks**: validation for security requirements
|
|
46
|
+
|
|
47
|
+
## Integration with MDAN
|
|
48
|
+
|
|
49
|
+
Scenarios are automatically generated during the VERIFY phase:
|
|
50
|
+
1. Test Agent reviews implemented features
|
|
51
|
+
2. Creates relevant scenarios for critical flows
|
|
52
|
+
3. Runs scenarios to validate behavior
|
|
53
|
+
4. Reports results in quality gate
|
|
54
|
+
|
|
55
|
+
## Framework Support
|
|
56
|
+
|
|
57
|
+
| Framework | Command |
|
|
58
|
+
|-----------|---------|
|
|
59
|
+
| Jest | `jest tests/scenarios/` |
|
|
60
|
+
| Pytest | `pytest tests/scenarios/` |
|
|
61
|
+
| Playwright | `playwright test tests/scenarios/` |
|
|
62
|
+
| Vitest | `vitest run tests/scenarios/` |
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# Scenario: User Authentication
|
|
2
|
+
|
|
3
|
+
> Test conversation flow for basic authentication functionality
|
|
4
|
+
|
|
5
|
+
## Metadata
|
|
6
|
+
|
|
7
|
+
| Field | Value |
|
|
8
|
+
|-------|-------|
|
|
9
|
+
| scenario_name | basic_authentication |
|
|
10
|
+
| version | 1.0.0 |
|
|
11
|
+
| agent | Dev Agent |
|
|
12
|
+
| framework | [Jest/Pytest/Playwright] |
|
|
13
|
+
| estimated_duration | 30s |
|
|
14
|
+
|
|
15
|
+
## Description
|
|
16
|
+
|
|
17
|
+
Test that the authentication system handles login, logout, and session management correctly.
|
|
18
|
+
|
|
19
|
+
## Preconditions
|
|
20
|
+
|
|
21
|
+
- User is not logged in
|
|
22
|
+
- Database contains test users
|
|
23
|
+
- Auth service is running
|
|
24
|
+
|
|
25
|
+
## Script
|
|
26
|
+
|
|
27
|
+
### Test Case 1: Successful Login
|
|
28
|
+
|
|
29
|
+
```
|
|
30
|
+
USER: I want to log in with my account
|
|
31
|
+
AGENT: [Should prompt for credentials or display login form]
|
|
32
|
+
USER: My email is test@example.com and password is Test123!
|
|
33
|
+
AGENT: [Should validate credentials]
|
|
34
|
+
-> VERIFY: auth_token received
|
|
35
|
+
-> VERIFY: user object returned with correct email
|
|
36
|
+
USER: What's my username?
|
|
37
|
+
AGENT: [Should return 'test@example.com' from session]
|
|
38
|
+
-> VERIFY: response contains correct username
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### Test Case 2: Invalid Credentials
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
USER: I want to log in
|
|
45
|
+
AGENT: [Should prompt for credentials]
|
|
46
|
+
USER: email: wrong@example.com, password: wrongpass
|
|
47
|
+
AGENT: [Should reject with error message]
|
|
48
|
+
-> VERIFY: error message does NOT reveal if email exists
|
|
49
|
+
-> VERIFY: no auth_token in response
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Test Case 3: Logout
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
USER: I'm logged in and want to log out
|
|
56
|
+
AGENT: [Should clear session]
|
|
57
|
+
-> VERIFY: session cleared
|
|
58
|
+
-> VERIFY: confirmation message shown
|
|
59
|
+
USER: Can I see my profile?
|
|
60
|
+
AGENT: [Should deny access]
|
|
61
|
+
-> VERIFY: 401 or redirect to login
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Success Criteria
|
|
65
|
+
|
|
66
|
+
- [ ] Login with valid credentials succeeds
|
|
67
|
+
- [ ] Login with invalid credentials fails with secure error
|
|
68
|
+
- [ ] Logout clears session completely
|
|
69
|
+
- [ ] Protected routes redirect unauthenticated users
|
|
70
|
+
- [ ] Session expires after configured timeout
|
|
71
|
+
|
|
72
|
+
## Failure Handling
|
|
73
|
+
|
|
74
|
+
If any step fails, the scenario should:
|
|
75
|
+
1. Capture the actual response
|
|
76
|
+
2. Compare with expected behavior
|
|
77
|
+
3. Log the difference for debugging
|
|
78
|
+
4. Fail the test with descriptive error
|
|
79
|
+
|
|
80
|
+
## Notes
|
|
81
|
+
|
|
82
|
+
This scenario tests the authentication flow end-to-end. Use with Playwright for browser-based testing or Pytest for API-based testing.
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# Scenario: User Registration
|
|
2
|
+
|
|
3
|
+
> Test conversation flow for new user signup
|
|
4
|
+
|
|
5
|
+
## Metadata
|
|
6
|
+
|
|
7
|
+
| Field | Value |
|
|
8
|
+
|-------|-------|
|
|
9
|
+
| scenario_name | user_registration |
|
|
10
|
+
| version | 1.0.0 |
|
|
11
|
+
| agent | Dev Agent |
|
|
12
|
+
| framework | [Jest/Pytest/Playwright] |
|
|
13
|
+
| estimated_duration | 45s |
|
|
14
|
+
|
|
15
|
+
## Description
|
|
16
|
+
|
|
17
|
+
Test that the registration system handles new user creation, validation, and confirmation correctly.
|
|
18
|
+
|
|
19
|
+
## Preconditions
|
|
20
|
+
|
|
21
|
+
- User is not logged in
|
|
22
|
+
- Database is clean or has known state
|
|
23
|
+
- Email service is mocked or test-ready
|
|
24
|
+
|
|
25
|
+
## Script
|
|
26
|
+
|
|
27
|
+
### Test Case 1: Successful Registration
|
|
28
|
+
|
|
29
|
+
```
|
|
30
|
+
USER: I want to create a new account
|
|
31
|
+
AGENT: [Should prompt for registration details]
|
|
32
|
+
USER:
|
|
33
|
+
- email: newuser@example.com
|
|
34
|
+
- password: SecurePass123!
|
|
35
|
+
- name: New User
|
|
36
|
+
AGENT: [Should validate and create account]
|
|
37
|
+
-> VERIFY: user created in database
|
|
38
|
+
-> VERIFY: confirmation email sent
|
|
39
|
+
-> VERIFY: success message shown
|
|
40
|
+
-> VERIFY: user NOT automatically logged in (email verification required)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Test Case 2: Duplicate Email
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
USER: I want to register
|
|
47
|
+
AGENT: [Should prompt for details]
|
|
48
|
+
USER: email: existing@example.com (already in DB), password: Test123!
|
|
49
|
+
AGENT: [Should reject duplicate]
|
|
50
|
+
-> VERIFY: error message about email already exists
|
|
51
|
+
-> VERIFY: no account created
|
|
52
|
+
-> VERIFY: password NOT in error message (security)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Test Case 3: Invalid Email Format
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
USER: I want to register
|
|
59
|
+
AGENT: [Should prompt for details]
|
|
60
|
+
USER: email: not-an-email, password: Test123!
|
|
61
|
+
AGENT: [Should validate format]
|
|
62
|
+
-> VERIFY: error about invalid email format
|
|
63
|
+
-> VERIFY: no account created
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Test Case 4: Weak Password
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
USER: I want to register
|
|
70
|
+
AGENT: [Should prompt for details]
|
|
71
|
+
USER: email: valid@example.com, password: 123
|
|
72
|
+
AGENT: [Should reject weak password]
|
|
73
|
+
-> VERIFY: error about password requirements
|
|
74
|
+
-> VERIFY: no account created
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Test Case 5: Email Confirmation
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
USER: I've received my confirmation email
|
|
81
|
+
AGENT: [Should provide link or code input]
|
|
82
|
+
USER: Confirmation code: ABC123
|
|
83
|
+
AGENT: [Should verify and activate account]
|
|
84
|
+
-> VERIFY: account marked as active
|
|
85
|
+
-> VERIFY: user can now log in
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Success Criteria
|
|
89
|
+
|
|
90
|
+
- [ ] Valid registration creates account
|
|
91
|
+
- [ ] Duplicate email is rejected securely
|
|
92
|
+
- [ ] Invalid email format is rejected
|
|
93
|
+
- [ ] Weak passwords are rejected
|
|
94
|
+
- [ ] Email confirmation activates account
|
|
95
|
+
- [ ] Password requirements are enforced
|
|
96
|
+
- [ ] No sensitive data in error messages
|
|
97
|
+
|
|
98
|
+
## Security Checks
|
|
99
|
+
|
|
100
|
+
- Password not logged or exposed in errors
|
|
101
|
+
- Email enumeration prevention
|
|
102
|
+
- Rate limiting on registration attempts
|
|
103
|
+
- SQL injection prevention in form inputs
|
|
104
|
+
|
|
105
|
+
## Notes
|
|
106
|
+
|
|
107
|
+
This scenario covers the complete registration flow. Adjust validation rules based on project requirements.
|