agentv 3.9.2 → 3.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-OIVGGWJ3.js → chunk-GWHHM6X2.js} +25 -14
- package/dist/chunk-GWHHM6X2.js.map +1 -0
- package/dist/{chunk-6ZAFWUBT.js → chunk-JLFFYTZA.js} +4 -4
- package/dist/{chunk-JGMJL2LV.js → chunk-TXCVDTEE.js} +8 -7
- package/dist/{chunk-JGMJL2LV.js.map → chunk-TXCVDTEE.js.map} +1 -1
- package/dist/cli.js +3 -3
- package/dist/{dist-PUPHGVKL.js → dist-FPC7J7KQ.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-BD56NB23.js → interactive-N463HRIL.js} +3 -3
- package/dist/templates/.agents/skills/agentv-chat-to-eval/README.md +84 -0
- package/dist/templates/.agents/skills/agentv-chat-to-eval/SKILL.md +144 -0
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-json.md +67 -0
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-markdown.md +101 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/SKILL.md +458 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/config-schema.json +36 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/custom-evaluators.md +118 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/eval-schema.json +12753 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/rubric-evaluator.md +77 -0
- package/dist/templates/.agents/skills/agentv-eval-orchestrator/SKILL.md +50 -0
- package/dist/templates/.agents/skills/agentv-prompt-optimizer/SKILL.md +78 -0
- package/dist/templates/.agentv/.env.example +25 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +177 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +316 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md +137 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +215 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/config-schema.json +27 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +115 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +278 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +333 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +79 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/structured-data-evaluators.md +121 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +298 -0
- package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +78 -0
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +5 -0
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +4 -0
- package/package.json +3 -3
- package/dist/chunk-OIVGGWJ3.js.map +0 -1
- /package/dist/{chunk-6ZAFWUBT.js.map → chunk-JLFFYTZA.js.map} +0 -0
- /package/dist/{dist-PUPHGVKL.js.map → dist-FPC7J7KQ.js.map} +0 -0
- /package/dist/{interactive-BD56NB23.js.map → interactive-N463HRIL.js.map} +0 -0
package/dist/cli.js
CHANGED
|
@@ -2,9 +2,9 @@
|
|
|
2
2
|
import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
|
|
3
3
|
import {
|
|
4
4
|
runCli
|
|
5
|
-
} from "./chunk-
|
|
6
|
-
import "./chunk-
|
|
7
|
-
import "./chunk-
|
|
5
|
+
} from "./chunk-JLFFYTZA.js";
|
|
6
|
+
import "./chunk-TXCVDTEE.js";
|
|
7
|
+
import "./chunk-GWHHM6X2.js";
|
|
8
8
|
import "./chunk-C5GOHBQM.js";
|
|
9
9
|
import "./chunk-JK6V4KVD.js";
|
|
10
10
|
import "./chunk-HQDCIXVH.js";
|
|
@@ -141,7 +141,7 @@ import {
|
|
|
141
141
|
transpileEvalYaml,
|
|
142
142
|
transpileEvalYamlFile,
|
|
143
143
|
trimBaselineResult
|
|
144
|
-
} from "./chunk-
|
|
144
|
+
} from "./chunk-GWHHM6X2.js";
|
|
145
145
|
import {
|
|
146
146
|
OtlpJsonFileExporter
|
|
147
147
|
} from "./chunk-C5GOHBQM.js";
|
|
@@ -300,4 +300,4 @@ export {
|
|
|
300
300
|
transpileEvalYamlFile,
|
|
301
301
|
trimBaselineResult
|
|
302
302
|
};
|
|
303
|
-
//# sourceMappingURL=dist-
|
|
303
|
+
//# sourceMappingURL=dist-FPC7J7KQ.js.map
|
package/dist/index.js
CHANGED
|
@@ -3,9 +3,9 @@ import {
|
|
|
3
3
|
app,
|
|
4
4
|
preprocessArgv,
|
|
5
5
|
runCli
|
|
6
|
-
} from "./chunk-
|
|
7
|
-
import "./chunk-
|
|
8
|
-
import "./chunk-
|
|
6
|
+
} from "./chunk-JLFFYTZA.js";
|
|
7
|
+
import "./chunk-TXCVDTEE.js";
|
|
8
|
+
import "./chunk-GWHHM6X2.js";
|
|
9
9
|
import "./chunk-C5GOHBQM.js";
|
|
10
10
|
import "./chunk-JK6V4KVD.js";
|
|
11
11
|
import "./chunk-HQDCIXVH.js";
|
|
@@ -4,14 +4,14 @@ import {
|
|
|
4
4
|
fileExists,
|
|
5
5
|
findRepoRoot,
|
|
6
6
|
runEvalCommand
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-TXCVDTEE.js";
|
|
8
8
|
import {
|
|
9
9
|
DEFAULT_EVAL_PATTERNS,
|
|
10
10
|
getAgentvHome,
|
|
11
11
|
listTargetNames,
|
|
12
12
|
loadConfig,
|
|
13
13
|
readTargetDefinitions
|
|
14
|
-
} from "./chunk-
|
|
14
|
+
} from "./chunk-GWHHM6X2.js";
|
|
15
15
|
import "./chunk-C5GOHBQM.js";
|
|
16
16
|
import "./chunk-JK6V4KVD.js";
|
|
17
17
|
import "./chunk-HQDCIXVH.js";
|
|
@@ -371,4 +371,4 @@ ${ANSI_DIM}Retrying execution errors...${ANSI_RESET}
|
|
|
371
371
|
export {
|
|
372
372
|
launchInteractiveWizard
|
|
373
373
|
};
|
|
374
|
-
//# sourceMappingURL=interactive-
|
|
374
|
+
//# sourceMappingURL=interactive-N463HRIL.js.map
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# agentv-chat-to-eval
|
|
2
|
+
|
|
3
|
+
An AgentV skill that converts chat conversations into evaluation YAML files.
|
|
4
|
+
|
|
5
|
+
## What It Does
|
|
6
|
+
|
|
7
|
+
This skill takes a chat transcript — either as markdown conversation or JSON messages — and generates an AgentV-compatible eval file with test cases derived from the exchanges.
|
|
8
|
+
|
|
9
|
+
The LLM analyzes the conversation to:
|
|
10
|
+
1. Identify test-worthy exchanges (factual Q&A, task completion, edge cases)
|
|
11
|
+
2. Derive evaluation criteria from context
|
|
12
|
+
3. Generate valid YAML with `tests:`, `assert` evaluators, and rubrics
|
|
13
|
+
|
|
14
|
+
## Usage
|
|
15
|
+
|
|
16
|
+
Provide a chat transcript and ask the agent to convert it:
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
Convert this conversation into an AgentV eval file:
|
|
20
|
+
|
|
21
|
+
User: What's the capital of France?
|
|
22
|
+
Assistant: The capital of France is Paris.
|
|
23
|
+
|
|
24
|
+
User: How do I reverse a list in Python?
|
|
25
|
+
Assistant: Use the `reverse()` method or slicing: `my_list[::-1]`
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Or provide a JSON message array:
|
|
29
|
+
|
|
30
|
+
```json
|
|
31
|
+
[
|
|
32
|
+
{"role": "user", "content": "What's the capital of France?"},
|
|
33
|
+
{"role": "assistant", "content": "The capital of France is Paris."}
|
|
34
|
+
]
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Example Output
|
|
38
|
+
|
|
39
|
+
```yaml
|
|
40
|
+
description: "General knowledge and coding Q&A"
|
|
41
|
+
|
|
42
|
+
tests:
|
|
43
|
+
- id: capital-of-france
|
|
44
|
+
criteria: "Correctly identify the capital of France"
|
|
45
|
+
input: "What's the capital of France?"
|
|
46
|
+
expected_output: "The capital of France is Paris."
|
|
47
|
+
assert:
|
|
48
|
+
- type: rubrics
|
|
49
|
+
criteria:
|
|
50
|
+
- States Paris as the capital
|
|
51
|
+
- Response is concise and direct
|
|
52
|
+
|
|
53
|
+
- id: python-reverse-list
|
|
54
|
+
criteria: "Explain how to reverse a list in Python"
|
|
55
|
+
input: "How do I reverse a list in Python?"
|
|
56
|
+
expected_output: "Use the `reverse()` method or slicing: `my_list[::-1]`"
|
|
57
|
+
assert:
|
|
58
|
+
- type: rubrics
|
|
59
|
+
criteria:
|
|
60
|
+
- Provides at least one valid method to reverse a list
|
|
61
|
+
- Code syntax is correct
|
|
62
|
+
- Explanation is clear and actionable
|
|
63
|
+
|
|
64
|
+
# Suggested additional evaluators:
|
|
65
|
+
# assert:
|
|
66
|
+
# - name: quality
|
|
67
|
+
# type: llm_judge
|
|
68
|
+
# prompt: ./prompts/quality.md
|
|
69
|
+
# - name: accuracy
|
|
70
|
+
# type: code_judge
|
|
71
|
+
# command: [./scripts/check_accuracy.py]
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## When to Use
|
|
75
|
+
|
|
76
|
+
- You have a real conversation that demonstrates desired agent behavior
|
|
77
|
+
- You want to create regression tests from production interactions
|
|
78
|
+
- You're bootstrapping an eval suite from existing chat logs
|
|
79
|
+
- You need to convert Q&A pairs into structured test cases
|
|
80
|
+
|
|
81
|
+
## Related Skills
|
|
82
|
+
|
|
83
|
+
- **agentv-eval-builder** — Create eval files from scratch with full schema reference
|
|
84
|
+
- **agentv-eval-orchestrator** — Run evaluations without API keys
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: agentv-chat-to-eval
|
|
3
|
+
description: Convert chat conversations into AgentV evaluation YAML files. Use this skill when you have a chat transcript (markdown or JSON messages) and want to generate eval test cases from it.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Chat-to-Eval Converter
|
|
7
|
+
|
|
8
|
+
Convert chat transcripts into AgentV evaluation YAML files by extracting test-worthy exchanges.
|
|
9
|
+
|
|
10
|
+
## Input Variables
|
|
11
|
+
|
|
12
|
+
- `transcript`: Chat transcript as markdown conversation or JSON message array
|
|
13
|
+
- `eval-path` (optional): Output path for the generated YAML file
|
|
14
|
+
|
|
15
|
+
## Workflow
|
|
16
|
+
|
|
17
|
+
### 1. Parse the Transcript
|
|
18
|
+
|
|
19
|
+
Accept input in either format:
|
|
20
|
+
|
|
21
|
+
**Markdown conversation:**
|
|
22
|
+
```
|
|
23
|
+
User: How do I reset my password?
|
|
24
|
+
Assistant: Go to Settings > Security > Reset Password...
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
**JSON messages:**
|
|
28
|
+
```json
|
|
29
|
+
[
|
|
30
|
+
{"role": "user", "content": "How do I reset my password?"},
|
|
31
|
+
{"role": "assistant", "content": "Go to Settings > Security > Reset Password..."}
|
|
32
|
+
]
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Normalize both formats into `{role, content}` message pairs.
|
|
36
|
+
|
|
37
|
+
### 2. Identify Test-Worthy Exchanges
|
|
38
|
+
|
|
39
|
+
Extract exchanges that are good eval candidates. Prioritize:
|
|
40
|
+
|
|
41
|
+
- **Factual Q&A** — User asks a question, agent gives a verifiable answer
|
|
42
|
+
- **Task completion** — User requests an action, agent performs it
|
|
43
|
+
- **Multi-turn reasoning** — Exchanges where context from earlier turns matters
|
|
44
|
+
- **Edge cases** — Unusual inputs, error handling, boundary conditions
|
|
45
|
+
- **Domain expertise** — Responses requiring specialized knowledge
|
|
46
|
+
|
|
47
|
+
Skip:
|
|
48
|
+
- Greetings and small talk (unless testing social behavior)
|
|
49
|
+
- Acknowledgments without substance ("OK", "Got it")
|
|
50
|
+
- Repeated or redundant exchanges
|
|
51
|
+
|
|
52
|
+
### 3. Derive Criteria and Rubrics
|
|
53
|
+
|
|
54
|
+
For each selected exchange, infer evaluation criteria from the conversation context:
|
|
55
|
+
|
|
56
|
+
- What the user implicitly expected
|
|
57
|
+
- Quality signals in the assistant's response (accuracy, completeness, tone)
|
|
58
|
+
- Any corrections or follow-ups that reveal what "good" looks like
|
|
59
|
+
|
|
60
|
+
Generate rubrics that capture these quality dimensions.
|
|
61
|
+
|
|
62
|
+
### 4. Generate EVAL YAML
|
|
63
|
+
|
|
64
|
+
Produce a valid AgentV eval file using **`tests:`** (not `cases:`).
|
|
65
|
+
|
|
66
|
+
**Structure:**
|
|
67
|
+
|
|
68
|
+
```yaml
|
|
69
|
+
description: "<Summarize what this eval covers>"
|
|
70
|
+
|
|
71
|
+
tests:
|
|
72
|
+
- id: <kebab-case-id>
|
|
73
|
+
criteria: "<What the response should accomplish>"
|
|
74
|
+
input: "<User message>"
|
|
75
|
+
expected_output: "<Assistant response from transcript>"
|
|
76
|
+
assert:
|
|
77
|
+
- type: rubrics
|
|
78
|
+
criteria:
|
|
79
|
+
- <Quality criterion 1>
|
|
80
|
+
- <Quality criterion 2>
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
**Rules:**
|
|
84
|
+
- Use `tests:` as the top-level array key — never `cases:`
|
|
85
|
+
- Generate kebab-case `id` values derived from the exchange topic
|
|
86
|
+
- Write `criteria` as a concise statement of what a good response achieves
|
|
87
|
+
- Use `input` as a plain string for a single user message; use `input` as a `{role, content}` message array for multi-turn exchanges
|
|
88
|
+
- Set `expected_output` to the actual assistant response from the transcript
|
|
89
|
+
- Include 2–4 rubrics per test as `type: rubrics` under `assert` capturing distinct quality dimensions
|
|
90
|
+
|
|
91
|
+
### 5. Suggest Evaluators
|
|
92
|
+
|
|
93
|
+
Append a commented evaluator configuration based on the test content:
|
|
94
|
+
|
|
95
|
+
```yaml
|
|
96
|
+
# Suggested additional evaluators:
|
|
97
|
+
# assert:
|
|
98
|
+
# - name: quality
|
|
99
|
+
# type: llm_judge
|
|
100
|
+
# prompt: ./prompts/quality.md
|
|
101
|
+
# - name: accuracy
|
|
102
|
+
# type: code_judge
|
|
103
|
+
# command: [./scripts/check_accuracy.py]
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
- Recommend `llm_judge` for subjective quality (tone, helpfulness, completeness)
|
|
107
|
+
- Recommend `code_judge` for deterministic checks (format, required fields, exact values)
|
|
108
|
+
- Recommend `field_accuracy` when expected output has structured fields
|
|
109
|
+
|
|
110
|
+
### 6. Write Output
|
|
111
|
+
|
|
112
|
+
- If `eval-path` is provided, write the YAML to that path
|
|
113
|
+
- Otherwise, output the YAML to the conversation for the user to copy
|
|
114
|
+
|
|
115
|
+
## Multi-Turn Conversations
|
|
116
|
+
|
|
117
|
+
For conversations with context dependencies across turns, use `input` as a `{role, content}` message array:
|
|
118
|
+
|
|
119
|
+
```yaml
|
|
120
|
+
tests:
|
|
121
|
+
- id: multi-turn-context
|
|
122
|
+
criteria: "Agent remembers prior context"
|
|
123
|
+
input:
|
|
124
|
+
- role: user
|
|
125
|
+
content: "My name is Alice"
|
|
126
|
+
- role: assistant
|
|
127
|
+
content: "Nice to meet you, Alice!"
|
|
128
|
+
- role: user
|
|
129
|
+
content: "What's my name?"
|
|
130
|
+
expected_output: "Your name is Alice."
|
|
131
|
+
assert:
|
|
132
|
+
- type: rubrics
|
|
133
|
+
criteria:
|
|
134
|
+
- Correctly recalls the user's name from earlier in the conversation
|
|
135
|
+
- Response is natural and conversational
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Guidelines
|
|
139
|
+
|
|
140
|
+
- **Preserve original wording** in `expected_output` — use the actual transcript text
|
|
141
|
+
- **Be selective** — not every exchange makes a good test; aim for 5–15 tests per transcript
|
|
142
|
+
- **Diverse coverage** — pick exchanges that test different capabilities
|
|
143
|
+
- **Actionable rubrics** — each rubric should be independently evaluable (pass/fail)
|
|
144
|
+
- **Validate output** — the generated YAML must pass `agentv validate <file>`
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# Example: JSON Messages Input
|
|
2
|
+
|
|
3
|
+
## Input
|
|
4
|
+
|
|
5
|
+
```json
|
|
6
|
+
[
|
|
7
|
+
{"role": "user", "content": "Convert 72°F to Celsius"},
|
|
8
|
+
{"role": "assistant", "content": "72°F is approximately 22.2°C. The formula is: (°F - 32) × 5/9 = °C"},
|
|
9
|
+
{"role": "user", "content": "What about -40?"},
|
|
10
|
+
{"role": "assistant", "content": "-40°F equals exactly -40°C. This is the unique point where Fahrenheit and Celsius scales intersect."},
|
|
11
|
+
{"role": "user", "content": "Is 0 Kelvin the same as -273.15°C?"},
|
|
12
|
+
{"role": "assistant", "content": "Yes, 0 Kelvin (absolute zero) equals -273.15°C. It's the theoretical lowest possible temperature where all molecular motion stops."}
|
|
13
|
+
]
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Output
|
|
17
|
+
|
|
18
|
+
```yaml
|
|
19
|
+
description: "Temperature conversion and thermodynamics knowledge"
|
|
20
|
+
|
|
21
|
+
tests:
|
|
22
|
+
- id: fahrenheit-to-celsius
|
|
23
|
+
criteria: "Convert Fahrenheit to Celsius with correct formula"
|
|
24
|
+
input: "Convert 72°F to Celsius"
|
|
25
|
+
expected_output: "72°F is approximately 22.2°C. The formula is: (°F - 32) × 5/9 = °C"
|
|
26
|
+
assert:
|
|
27
|
+
- type: rubrics
|
|
28
|
+
criteria:
|
|
29
|
+
- Provides the correct converted value (approximately 22.2°C)
|
|
30
|
+
- Shows or references the conversion formula
|
|
31
|
+
- Uses appropriate precision
|
|
32
|
+
|
|
33
|
+
- id: negative-forty-intersection
|
|
34
|
+
criteria: "Identify the Fahrenheit-Celsius intersection point"
|
|
35
|
+
input:
|
|
36
|
+
- role: user
|
|
37
|
+
content: "Convert 72°F to Celsius"
|
|
38
|
+
- role: assistant
|
|
39
|
+
content: "72°F is approximately 22.2°C."
|
|
40
|
+
- role: user
|
|
41
|
+
content: "What about -40?"
|
|
42
|
+
expected_output: "-40°F equals exactly -40°C. This is the unique point where Fahrenheit and Celsius scales intersect."
|
|
43
|
+
assert:
|
|
44
|
+
- type: rubrics
|
|
45
|
+
criteria:
|
|
46
|
+
- States that -40°F equals -40°C
|
|
47
|
+
- Mentions this is the intersection point of both scales
|
|
48
|
+
|
|
49
|
+
- id: absolute-zero
|
|
50
|
+
criteria: "Confirm absolute zero equivalence and explain its significance"
|
|
51
|
+
input: "Is 0 Kelvin the same as -273.15°C?"
|
|
52
|
+
expected_output: "Yes, 0 Kelvin (absolute zero) equals -273.15°C. It's the theoretical lowest possible temperature where all molecular motion stops."
|
|
53
|
+
assert:
|
|
54
|
+
- type: rubrics
|
|
55
|
+
criteria:
|
|
56
|
+
- Confirms the equivalence of 0K and -273.15°C
|
|
57
|
+
- Explains the physical significance of absolute zero
|
|
58
|
+
|
|
59
|
+
# Suggested additional evaluators:
|
|
60
|
+
# assert:
|
|
61
|
+
# - name: accuracy
|
|
62
|
+
# type: code_judge
|
|
63
|
+
# command: [./scripts/check_conversion.py]
|
|
64
|
+
# - name: quality
|
|
65
|
+
# type: llm_judge
|
|
66
|
+
# prompt: ./prompts/quality.md
|
|
67
|
+
```
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# Example: Markdown Transcript Input
|
|
2
|
+
|
|
3
|
+
## Input
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
User: What are the SOLID principles in software engineering?
|
|
7
|
+
Assistant: The SOLID principles are five design principles for object-oriented programming:
|
|
8
|
+
|
|
9
|
+
1. **Single Responsibility** — A class should have only one reason to change
|
|
10
|
+
2. **Open/Closed** — Open for extension, closed for modification
|
|
11
|
+
3. **Liskov Substitution** — Subtypes must be substitutable for their base types
|
|
12
|
+
4. **Interface Segregation** — Prefer small, specific interfaces over large ones
|
|
13
|
+
5. **Dependency Inversion** — Depend on abstractions, not concretions
|
|
14
|
+
|
|
15
|
+
User: Can you give me an example of the Single Responsibility Principle?
|
|
16
|
+
Assistant: Sure! Instead of a class that handles both user authentication and email sending, split them:
|
|
17
|
+
|
|
18
|
+
Bad: `UserService` handles login, registration, AND sends welcome emails.
|
|
19
|
+
Good: `AuthService` handles login/registration. `EmailService` handles sending emails.
|
|
20
|
+
|
|
21
|
+
Each class has one job, making them easier to test and maintain.
|
|
22
|
+
|
|
23
|
+
User: What's the difference between an interface and an abstract class?
|
|
24
|
+
Assistant: Key differences:
|
|
25
|
+
|
|
26
|
+
- **Interface**: Defines a contract (method signatures only). A class can implement multiple interfaces. No state.
|
|
27
|
+
- **Abstract class**: Can have both abstract and concrete methods. A class can extend only one. Can hold state.
|
|
28
|
+
|
|
29
|
+
Use interfaces for "can-do" relationships (e.g., `Serializable`). Use abstract classes for "is-a" relationships with shared behavior (e.g., `Animal` base class).
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Output
|
|
33
|
+
|
|
34
|
+
```yaml
|
|
35
|
+
description: "Software engineering concepts — SOLID principles and OOP"
|
|
36
|
+
|
|
37
|
+
tests:
|
|
38
|
+
- id: solid-principles-overview
|
|
39
|
+
criteria: "List and briefly explain all five SOLID principles"
|
|
40
|
+
input: "What are the SOLID principles in software engineering?"
|
|
41
|
+
expected_output: |
|
|
42
|
+
The SOLID principles are five design principles for object-oriented programming:
|
|
43
|
+
|
|
44
|
+
1. **Single Responsibility** — A class should have only one reason to change
|
|
45
|
+
2. **Open/Closed** — Open for extension, closed for modification
|
|
46
|
+
3. **Liskov Substitution** — Subtypes must be substitutable for their base types
|
|
47
|
+
4. **Interface Segregation** — Prefer small, specific interfaces over large ones
|
|
48
|
+
5. **Dependency Inversion** — Depend on abstractions, not concretions
|
|
49
|
+
assert:
|
|
50
|
+
- type: rubrics
|
|
51
|
+
criteria:
|
|
52
|
+
- Lists all five SOLID principles by name
|
|
53
|
+
- Provides a concise explanation for each principle
|
|
54
|
+
- Explanations are technically accurate
|
|
55
|
+
|
|
56
|
+
- id: srp-example
|
|
57
|
+
criteria: "Provide a concrete example of the Single Responsibility Principle"
|
|
58
|
+
input:
|
|
59
|
+
- role: user
|
|
60
|
+
content: "What are the SOLID principles in software engineering?"
|
|
61
|
+
- role: assistant
|
|
62
|
+
content: "The SOLID principles are five design principles..."
|
|
63
|
+
- role: user
|
|
64
|
+
content: "Can you give me an example of the Single Responsibility Principle?"
|
|
65
|
+
expected_output: |
|
|
66
|
+
Sure! Instead of a class that handles both user authentication and email sending, split them:
|
|
67
|
+
|
|
68
|
+
Bad: `UserService` handles login, registration, AND sends welcome emails.
|
|
69
|
+
Good: `AuthService` handles login/registration. `EmailService` handles sending emails.
|
|
70
|
+
|
|
71
|
+
Each class has one job, making them easier to test and maintain.
|
|
72
|
+
assert:
|
|
73
|
+
- type: rubrics
|
|
74
|
+
criteria:
|
|
75
|
+
- Shows a bad example that violates SRP
|
|
76
|
+
- Shows a good example that follows SRP
|
|
77
|
+
- Explains why the separation is beneficial
|
|
78
|
+
|
|
79
|
+
- id: interface-vs-abstract-class
|
|
80
|
+
criteria: "Explain the difference between interfaces and abstract classes"
|
|
81
|
+
input: "What's the difference between an interface and an abstract class?"
|
|
82
|
+
expected_output: |
|
|
83
|
+
Key differences:
|
|
84
|
+
|
|
85
|
+
- **Interface**: Defines a contract (method signatures only). A class can implement multiple interfaces. No state.
|
|
86
|
+
- **Abstract class**: Can have both abstract and concrete methods. A class can extend only one. Can hold state.
|
|
87
|
+
|
|
88
|
+
Use interfaces for "can-do" relationships (e.g., `Serializable`). Use abstract classes for "is-a" relationships with shared behavior (e.g., `Animal` base class).
|
|
89
|
+
assert:
|
|
90
|
+
- type: rubrics
|
|
91
|
+
criteria:
|
|
92
|
+
- Correctly distinguishes interfaces from abstract classes
|
|
93
|
+
- Mentions multiple inheritance support for interfaces
|
|
94
|
+
- Provides guidance on when to use each
|
|
95
|
+
|
|
96
|
+
# Suggested additional evaluators:
|
|
97
|
+
# assert:
|
|
98
|
+
# - name: quality
|
|
99
|
+
# type: llm_judge
|
|
100
|
+
# prompt: ./prompts/quality.md
|
|
101
|
+
```
|