agentv 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +380 -0
- package/dist/chunk-S3RN2GSO.js +14542 -0
- package/dist/chunk-S3RN2GSO.js.map +1 -0
- package/dist/cli.js +8 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.js +9 -0
- package/dist/index.js.map +1 -0
- package/dist/templates/eval-build.prompt.md +100 -0
- package/dist/templates/eval-schema.json +182 -0
- package/package.json +40 -0
package/dist/cli.js
ADDED
package/dist/cli.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\r\nimport { runCli } from './index.js';\r\n\r\nvoid runCli();\r\n"],"mappings":";;;;;;AAGA,KAAK,OAAO;","names":[]}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
---
|
|
2
|
+
description: 'Apply when writing evals in YAML format'
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
## Schema Reference
|
|
6
|
+
- Schema: #file:../contexts/eval-schema.json (JSON Schema for validation and tooling)
|
|
7
|
+
- Format: YAML with structured content arrays
|
|
8
|
+
|
|
9
|
+
## Structure Requirements
|
|
10
|
+
- Root level: `version` (required: "2.0"), `description` (optional), `target` (optional), `evalcases` (required)
|
|
11
|
+
- Eval case fields: `id` (required), `outcome` (required), `input_messages` (required), `expected_messages` (required)
|
|
12
|
+
- Optional fields: `conversation_id`, `note`, `execution`
|
|
13
|
+
- Message fields: `role` (required), `content` (required)
|
|
14
|
+
- Message roles: `system`, `user`, `assistant`, `tool`
|
|
15
|
+
- Content types: `text` (inline), `file` (relative or absolute path)
|
|
16
|
+
- Absolute file paths must start with "/" and are resolved from the repo root (e.g., "/prompts/file.md"); relative paths are resolved from the eval file's directory
|
|
17
|
+
|
|
18
|
+
## Example
|
|
19
|
+
```yaml
|
|
20
|
+
version: "2.0"
|
|
21
|
+
description: Example showing basic features and conversation threading
|
|
22
|
+
target: default
|
|
23
|
+
|
|
24
|
+
evalcases:
|
|
25
|
+
# Basic eval case with file references
|
|
26
|
+
- id: code-review-basic
|
|
27
|
+
outcome: Assistant provides helpful code analysis
|
|
28
|
+
|
|
29
|
+
input_messages:
|
|
30
|
+
- role: system
|
|
31
|
+
content: You are an expert code reviewer.
|
|
32
|
+
- role: user
|
|
33
|
+
content:
|
|
34
|
+
- type: text
|
|
35
|
+
value: |-
|
|
36
|
+
Review this function:
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
def add(a, b):
|
|
40
|
+
return a + b
|
|
41
|
+
```
|
|
42
|
+
# File paths can be relative or absolute
|
|
43
|
+
- type: file
|
|
44
|
+
value: /prompts/python.instructions.md
|
|
45
|
+
|
|
46
|
+
expected_messages:
|
|
47
|
+
- role: assistant
|
|
48
|
+
content: |-
|
|
49
|
+
The function is simple and correct. Suggestions:
|
|
50
|
+
- Add type hints: `def add(a: int, b: int) -> int:`
|
|
51
|
+
- Add docstring
|
|
52
|
+
- Consider validation for edge cases
|
|
53
|
+
|
|
54
|
+
# Advanced: conversation threading, multiple evaluators
|
|
55
|
+
- id: python-coding-session
|
|
56
|
+
conversation_id: python-coding-session
|
|
57
|
+
outcome: Generates correct code with proper error handling
|
|
58
|
+
|
|
59
|
+
execution:
|
|
60
|
+
target: azure_base
|
|
61
|
+
evaluators:
|
|
62
|
+
- name: keyword_check
|
|
63
|
+
type: code
|
|
64
|
+
script: /evaluators/scripts/check_keywords.py
|
|
65
|
+
- name: semantic_judge
|
|
66
|
+
type: llm_judge
|
|
67
|
+
prompt: /evaluators/prompts/correctness.md
|
|
68
|
+
model: gpt-5-chat
|
|
69
|
+
|
|
70
|
+
input_messages:
|
|
71
|
+
- role: system
|
|
72
|
+
content: You are a code generator.
|
|
73
|
+
- role: user
|
|
74
|
+
content:
|
|
75
|
+
- type: text
|
|
76
|
+
value: Create a function to find the second largest number in a list.
|
|
77
|
+
- type: file
|
|
78
|
+
value: /prompts/python.instructions.md
|
|
79
|
+
|
|
80
|
+
expected_messages:
|
|
81
|
+
- role: assistant
|
|
82
|
+
content: |-
|
|
83
|
+
```python
|
|
84
|
+
from typing import List, Union
|
|
85
|
+
|
|
86
|
+
def find_second_largest(numbers: List[int]) -> Union[int, None]:
|
|
87
|
+
"""Find the second largest number."""
|
|
88
|
+
if not isinstance(numbers, list):
|
|
89
|
+
raise TypeError("Input must be a list")
|
|
90
|
+
if not numbers:
|
|
91
|
+
raise ValueError("List cannot be empty")
|
|
92
|
+
|
|
93
|
+
unique = list(set(numbers))
|
|
94
|
+
if len(unique) < 2:
|
|
95
|
+
return None
|
|
96
|
+
|
|
97
|
+
unique.sort(reverse=True)
|
|
98
|
+
return unique[1]
|
|
99
|
+
```
|
|
100
|
+
```
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"title": "AgentV Eval Schema",
|
|
4
|
+
"description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
|
|
5
|
+
"type": "object",
|
|
6
|
+
"properties": {
|
|
7
|
+
"version": {
|
|
8
|
+
"type": "string",
|
|
9
|
+
"description": "Schema version",
|
|
10
|
+
"enum": ["2.0"]
|
|
11
|
+
},
|
|
12
|
+
"description": {
|
|
13
|
+
"type": "string",
|
|
14
|
+
"description": "Description of what this eval suite covers"
|
|
15
|
+
},
|
|
16
|
+
"target": {
|
|
17
|
+
"type": "string",
|
|
18
|
+
"description": "Default target configuration name (e.g., default, azure_base, vscode_projectx). Can be overridden per eval case."
|
|
19
|
+
},
|
|
20
|
+
"evalcases": {
|
|
21
|
+
"type": "array",
|
|
22
|
+
"description": "Array of evaluation cases",
|
|
23
|
+
"minItems": 1,
|
|
24
|
+
"items": {
|
|
25
|
+
"type": "object",
|
|
26
|
+
"properties": {
|
|
27
|
+
"id": {
|
|
28
|
+
"type": "string",
|
|
29
|
+
"description": "Unique identifier for the eval case"
|
|
30
|
+
},
|
|
31
|
+
"conversation_id": {
|
|
32
|
+
"type": "string",
|
|
33
|
+
"description": "Optional conversation identifier for threading multiple eval cases together"
|
|
34
|
+
},
|
|
35
|
+
"outcome": {
|
|
36
|
+
"type": "string",
|
|
37
|
+
"description": "Description of what the AI should accomplish in this eval"
|
|
38
|
+
},
|
|
39
|
+
"note": {
|
|
40
|
+
"type": "string",
|
|
41
|
+
"description": "Optional note or additional context for the eval case. Use this to document test-specific considerations, known limitations, or rationale for expected behavior."
|
|
42
|
+
},
|
|
43
|
+
"input_messages": {
|
|
44
|
+
"type": "array",
|
|
45
|
+
"description": "Input messages for the conversation",
|
|
46
|
+
"minItems": 1,
|
|
47
|
+
"items": {
|
|
48
|
+
"type": "object",
|
|
49
|
+
"properties": {
|
|
50
|
+
"role": {
|
|
51
|
+
"type": "string",
|
|
52
|
+
"enum": ["system", "user", "assistant", "tool"],
|
|
53
|
+
"description": "Message role"
|
|
54
|
+
},
|
|
55
|
+
"content": {
|
|
56
|
+
"oneOf": [
|
|
57
|
+
{
|
|
58
|
+
"type": "string",
|
|
59
|
+
"description": "Simple text content"
|
|
60
|
+
},
|
|
61
|
+
{
|
|
62
|
+
"type": "array",
|
|
63
|
+
"description": "Mixed content items (text and file references)",
|
|
64
|
+
"items": {
|
|
65
|
+
"type": "object",
|
|
66
|
+
"properties": {
|
|
67
|
+
"type": {
|
|
68
|
+
"type": "string",
|
|
69
|
+
"enum": ["text", "file"],
|
|
70
|
+
"description": "Content type: 'text' for inline content, 'file' for file references"
|
|
71
|
+
},
|
|
72
|
+
"value": {
|
|
73
|
+
"type": "string",
|
|
74
|
+
"description": "Text content or file path. Relative paths (e.g., ../prompts/file.md) are resolved from the eval file's directory. Absolute paths (e.g., /docs/examples/prompts/file.md) are resolved from the repo root."
|
|
75
|
+
}
|
|
76
|
+
},
|
|
77
|
+
"required": ["type", "value"],
|
|
78
|
+
"additionalProperties": false
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
]
|
|
82
|
+
}
|
|
83
|
+
},
|
|
84
|
+
"required": ["role", "content"],
|
|
85
|
+
"additionalProperties": false
|
|
86
|
+
}
|
|
87
|
+
},
|
|
88
|
+
"expected_messages": {
|
|
89
|
+
"type": "array",
|
|
90
|
+
"description": "Expected response messages",
|
|
91
|
+
"minItems": 1,
|
|
92
|
+
"items": {
|
|
93
|
+
"type": "object",
|
|
94
|
+
"properties": {
|
|
95
|
+
"role": {
|
|
96
|
+
"type": "string",
|
|
97
|
+
"enum": ["system", "user", "assistant", "tool"],
|
|
98
|
+
"description": "Message role"
|
|
99
|
+
},
|
|
100
|
+
"content": {
|
|
101
|
+
"oneOf": [
|
|
102
|
+
{
|
|
103
|
+
"type": "string",
|
|
104
|
+
"description": "Simple text content"
|
|
105
|
+
},
|
|
106
|
+
{
|
|
107
|
+
"type": "array",
|
|
108
|
+
"description": "Mixed content items",
|
|
109
|
+
"items": {
|
|
110
|
+
"type": "object",
|
|
111
|
+
"properties": {
|
|
112
|
+
"type": {
|
|
113
|
+
"type": "string",
|
|
114
|
+
"enum": ["text", "file"]
|
|
115
|
+
},
|
|
116
|
+
"value": {
|
|
117
|
+
"type": "string"
|
|
118
|
+
}
|
|
119
|
+
},
|
|
120
|
+
"required": ["type", "value"],
|
|
121
|
+
"additionalProperties": false
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
]
|
|
125
|
+
}
|
|
126
|
+
},
|
|
127
|
+
"required": ["role", "content"],
|
|
128
|
+
"additionalProperties": false
|
|
129
|
+
}
|
|
130
|
+
},
|
|
131
|
+
"execution": {
|
|
132
|
+
"type": "object",
|
|
133
|
+
"description": "Per-case execution configuration",
|
|
134
|
+
"properties": {
|
|
135
|
+
"target": {
|
|
136
|
+
"type": "string",
|
|
137
|
+
"description": "Override target for this specific eval case"
|
|
138
|
+
},
|
|
139
|
+
"evaluators": {
|
|
140
|
+
"type": "array",
|
|
141
|
+
"description": "Multiple evaluators (code-based and LLM judges)",
|
|
142
|
+
"items": {
|
|
143
|
+
"type": "object",
|
|
144
|
+
"properties": {
|
|
145
|
+
"name": {
|
|
146
|
+
"type": "string",
|
|
147
|
+
"description": "Evaluator name/identifier"
|
|
148
|
+
},
|
|
149
|
+
"type": {
|
|
150
|
+
"type": "string",
|
|
151
|
+
"enum": ["code", "llm_judge"],
|
|
152
|
+
"description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
|
|
153
|
+
},
|
|
154
|
+
"script": {
|
|
155
|
+
"type": "string",
|
|
156
|
+
"description": "Path to evaluator script (for type: code)"
|
|
157
|
+
},
|
|
158
|
+
"prompt": {
|
|
159
|
+
"type": "string",
|
|
160
|
+
"description": "Path to judge prompt file (for type: llm_judge)"
|
|
161
|
+
},
|
|
162
|
+
"model": {
|
|
163
|
+
"type": "string",
|
|
164
|
+
"description": "Model to use for LLM judge (for type: llm_judge)"
|
|
165
|
+
}
|
|
166
|
+
},
|
|
167
|
+
"required": ["name", "type"],
|
|
168
|
+
"additionalProperties": true
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
},
|
|
172
|
+
"additionalProperties": true
|
|
173
|
+
}
|
|
174
|
+
},
|
|
175
|
+
"required": ["id", "outcome", "input_messages", "expected_messages"],
|
|
176
|
+
"additionalProperties": false
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
},
|
|
180
|
+
"required": ["evalcases"],
|
|
181
|
+
"additionalProperties": false
|
|
182
|
+
}
|
package/package.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "agentv",
|
|
3
|
+
"version": "0.2.3",
|
|
4
|
+
"description": "CLI entry point for AgentV",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "https://github.com/EntityProcess/agentv.git"
|
|
9
|
+
},
|
|
10
|
+
"homepage": "https://github.com/EntityProcess/agentv#readme",
|
|
11
|
+
"bugs": {
|
|
12
|
+
"url": "https://github.com/EntityProcess/agentv/issues"
|
|
13
|
+
},
|
|
14
|
+
"bin": {
|
|
15
|
+
"agentv": "./dist/cli.js"
|
|
16
|
+
},
|
|
17
|
+
"files": [
|
|
18
|
+
"dist",
|
|
19
|
+
"README.md"
|
|
20
|
+
],
|
|
21
|
+
"dependencies": {
|
|
22
|
+
"commander": "^12.1.0",
|
|
23
|
+
"dotenv": "^16.4.5",
|
|
24
|
+
"log-update": "^7.0.1",
|
|
25
|
+
"yaml": "^2.6.1",
|
|
26
|
+
"@agentv/core": "0.2.3"
|
|
27
|
+
},
|
|
28
|
+
"devDependencies": {
|
|
29
|
+
"execa": "^9.3.0"
|
|
30
|
+
},
|
|
31
|
+
"scripts": {
|
|
32
|
+
"dev": "tsx watch src/index.ts",
|
|
33
|
+
"build": "tsup",
|
|
34
|
+
"typecheck": "tsc --noEmit",
|
|
35
|
+
"lint": "eslint . --ext .ts",
|
|
36
|
+
"test": "vitest run",
|
|
37
|
+
"test:watch": "vitest",
|
|
38
|
+
"test:coverage": "vitest run --coverage"
|
|
39
|
+
}
|
|
40
|
+
}
|