agentv 4.26.1 → 4.27.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-JA4WQNE6.js → chunk-47JX7NNZ.js} +10 -2
- package/dist/chunk-47JX7NNZ.js.map +1 -0
- package/dist/{chunk-XBUHMRX2.js → chunk-V3LWJB5X.js} +431 -49
- package/dist/chunk-V3LWJB5X.js.map +1 -0
- package/dist/cli.js +2 -2
- package/dist/index.js +2 -2
- package/dist/{interactive-YMKWKPD7.js → interactive-L6PIIFNQ.js} +2 -2
- package/dist/skills/agentv-bench/LICENSE.txt +202 -0
- package/dist/skills/agentv-bench/SKILL.md +459 -0
- package/dist/skills/agentv-bench/agents/analyzer.md +177 -0
- package/dist/skills/agentv-bench/agents/comparator.md +247 -0
- package/dist/skills/agentv-bench/agents/executor.md +30 -0
- package/dist/skills/agentv-bench/agents/grader.md +238 -0
- package/dist/skills/agentv-bench/agents/mutator.md +172 -0
- package/dist/skills/agentv-bench/references/autoresearch.md +309 -0
- package/dist/skills/agentv-bench/references/description-optimization.md +66 -0
- package/dist/skills/agentv-bench/references/environment-adaptation.md +82 -0
- package/dist/skills/agentv-bench/references/eval-yaml-spec.md +338 -0
- package/dist/skills/agentv-bench/references/migrating-from-skill-creator.md +103 -0
- package/dist/skills/agentv-bench/references/schemas.md +432 -0
- package/dist/skills/agentv-bench/references/subagent-pipeline.md +181 -0
- package/dist/skills/agentv-bench/scripts/trajectory.html +462 -0
- package/dist/skills/agentv-eval-review/SKILL.md +53 -0
- package/dist/skills/agentv-eval-review/scripts/lint_eval.py +239 -0
- package/dist/skills/agentv-eval-writer/SKILL.md +707 -0
- package/dist/skills/agentv-eval-writer/references/config-schema.json +63 -0
- package/dist/skills/agentv-eval-writer/references/custom-evaluators.md +119 -0
- package/dist/skills/agentv-eval-writer/references/eval-schema.json +19077 -0
- package/dist/skills/agentv-eval-writer/references/rubric-evaluator.md +114 -0
- package/dist/skills/agentv-governance/SKILL.md +79 -0
- package/dist/skills/agentv-governance/references/eu-ai-act-risk-tiers.md +37 -0
- package/dist/skills/agentv-governance/references/governance-yaml-shape.md +125 -0
- package/dist/skills/agentv-governance/references/iso-42001-controls.md +46 -0
- package/dist/skills/agentv-governance/references/lint-rules.md +169 -0
- package/dist/skills/agentv-governance/references/mitre-atlas.md +38 -0
- package/dist/skills/agentv-governance/references/owasp-agentic-top-10-2025.md +28 -0
- package/dist/skills/agentv-governance/references/owasp-llm-top-10-2025.md +25 -0
- package/dist/skills/agentv-trace-analyst/SKILL.md +161 -0
- package/package.json +1 -1
- package/dist/chunk-JA4WQNE6.js.map +0 -1
- package/dist/chunk-XBUHMRX2.js.map +0 -1
- /package/dist/{interactive-YMKWKPD7.js.map → interactive-L6PIIFNQ.js.map} +0 -0
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Lint AgentV eval YAML files for common issues.
|
|
3
|
+
|
|
4
|
+
Usage: python lint_eval.py <path-to-eval-dir-or-file> [--json]
|
|
5
|
+
|
|
6
|
+
Checks:
|
|
7
|
+
- File uses .eval.yaml extension
|
|
8
|
+
- description field present
|
|
9
|
+
- Each test has id, input (or a top-level input), and at least one of criteria, expected_output, assertions
|
|
10
|
+
- File paths in type:file use leading /
|
|
11
|
+
- assertions blocks present (not relying solely on expected_output)
|
|
12
|
+
- expected_output does not contain evaluation criteria prose
|
|
13
|
+
- Repeated file inputs across tests (should use top-level input)
|
|
14
|
+
- Naming prefix consistency across eval files in same directory
|
|
15
|
+
|
|
16
|
+
Exit code: 0 if no error-severity issues were found, 1 if any error was found, 2 on usage errors or a missing path.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import os
|
|
21
|
+
import re
|
|
22
|
+
import sys
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
try:
|
|
26
|
+
import yaml
|
|
27
|
+
except ImportError:
|
|
28
|
+
# Fall back to basic YAML parsing if PyYAML not available
|
|
29
|
+
yaml = None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def parse_yaml_basic(text: str) -> dict:
    """Best-effort stand-in for a YAML parser when PyYAML is unavailable.

    Only JSON documents are understood (YAML is a superset of JSON). The
    decoded JSON value is returned as-is; text that is not valid JSON yields
    an empty dict instead of raising. Install PyYAML for real YAML support.
    """
    try:
        return json.loads(text)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError. Anything that is not
        # JSON is deliberately treated as "no content" by this fallback.
        return {}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def load_yaml(path: Path) -> dict:
    """Read *path* as UTF-8 text and parse it.

    Uses ``yaml.safe_load`` when the ``yaml`` module was imported, replacing
    a falsy result (such as an empty document) with ``{}``. Otherwise the
    text is handed to ``parse_yaml_basic``.
    """
    raw = path.read_text(encoding="utf-8")
    if not yaml:
        return parse_yaml_basic(raw)
    parsed = yaml.safe_load(raw)
    return parsed or {}
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def lint_file(path: Path) -> list[dict]:
    """Lint a single eval YAML file and return the issues found.

    Each issue is a dict with ``file``, ``severity`` (``"error"``,
    ``"warning"`` or ``"info"``), ``message`` and ``line`` keys. No check in
    this function supplies a line number, so ``line`` is always ``None``.

    Parse failures and structural problems (non-mapping root, non-list
    ``tests``) are reported as issues rather than raised.
    """
    issues: list[dict] = []

    def issue(severity: str, msg: str, line: int | None = None):
        issues.append({"file": str(path), "severity": severity, "message": msg, "line": line})

    # Check extension
    if not path.name.endswith(".eval.yaml"):
        issue("error", f"File should use .eval.yaml extension, got: {path.name}")

    try:
        data = load_yaml(path)
    except Exception as e:
        # Top-level boundary: any read/parse failure becomes a lint error.
        issue("error", f"Failed to parse YAML: {e}")
        return issues

    if not isinstance(data, dict):
        issue("error", "Root element is not a mapping")
        return issues

    # Check description
    if "description" not in data:
        issue("warning", "Missing top-level 'description' field")

    tests = data.get("tests", [])
    if not isinstance(tests, list):
        issue("error", "'tests' is not a list")
        return issues

    if not tests:
        issue("warning", "No tests defined")
        return issues

    # Top-level input is shared by all tests, so tests may omit their own.
    top_level_input = data.get("input")

    # Phrases that describe how to grade an answer rather than the answer
    # itself. Compiled once here instead of once per test.
    prose_patterns = [
        re.compile(p)
        for p in (
            r"[Tt]he agent should",
            r"[Ss]hould identify",
            r"[Ss]hould flag",
            r"[Ss]hould recommend",
            r"[Ss]hould produce",
            r"[Ss]hould detect",
            r"[Ss]hould load",
            r"[Ss]hould run",
        )
    ]

    # File values per test, used afterwards to detect repetition.
    file_values_per_test: list[list[str]] = []

    for i, test in enumerate(tests):
        if not isinstance(test, dict):
            # Without this guard a scalar/list entry would crash on .get().
            issue("error", f"Test at index {i} is not a mapping")
            continue

        test_id = test.get("id", f"test-{i}")

        if "id" not in test:
            issue("error", f"Test at index {i} missing 'id'")

        if "input" not in test and top_level_input is None:
            issue("error", f"Test '{test_id}' missing 'input' and no top-level input defined")

        has_criteria = "criteria" in test
        has_expected = "expected_output" in test
        has_assertions = "assertions" in test

        if not has_criteria and not has_expected and not has_assertions:
            issue("error", f"Test '{test_id}' needs at least one of: criteria, expected_output, assertions")

        # Check assertions present
        if not has_assertions and has_expected:
            issue("warning", f"Test '{test_id}' has expected_output but no assertions — add deterministic assertions where possible")

        # Check expected_output for prose patterns
        if has_expected:
            expected = test["expected_output"]
            if isinstance(expected, str):
                expected_text = expected
            elif isinstance(expected, list):
                # Message-list form: gather the string contents. Joining with
                # a newline keeps a phrase from matching across two messages.
                expected_text = "\n".join(
                    msg["content"]
                    for msg in expected
                    if isinstance(msg, dict) and isinstance(msg.get("content"), str)
                )
            else:
                expected_text = ""

            for pattern in prose_patterns:
                match = pattern.search(expected_text)
                if match:
                    # Report the matched text itself. The previous
                    # lstrip-based cleanup removed character sets, not
                    # prefixes, and mangled the phrase.
                    issue("warning", f"Test '{test_id}' expected_output contains evaluation criteria prose ('{match.group(0)}...') — use criteria or assertions instead")
                    break

        # Collect file values from input; keep only strings so the path and
        # set operations below cannot fail on malformed values.
        test_files = [
            fv for fv in extract_file_values(test.get("input", [])) if isinstance(fv, str)
        ]
        file_values_per_test.append(test_files)

        # Check file paths for leading /
        for fv in test_files:
            if not fv.startswith("/"):
                issue("warning", f"Test '{test_id}' file path missing leading '/': {fv}")

    # Check for repeated file inputs
    if len(file_values_per_test) >= 2 and not top_level_input:
        common_files = set(file_values_per_test[0])
        for fvs in file_values_per_test[1:]:
            common_files &= set(fvs)
        if common_files:
            issue("info", f"File input repeated in every test: {', '.join(sorted(common_files))} — consider using top-level input")

    return issues
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def extract_file_values(input_data) -> list[str]:
    """Extract ``type: file`` values from an input structure.

    *input_data* is expected to be a list of message dicts whose ``content``
    is a list of part dicts. Anything that does not match that shape is
    ignored. Only non-empty string values are returned, so the result always
    satisfies the declared ``list[str]`` type and callers can safely use
    string and set operations on it.
    """
    files: list[str] = []
    if not isinstance(input_data, list):
        return files

    for item in input_data:
        if not isinstance(item, dict):
            continue
        content = item.get("content", [])
        if not isinstance(content, list):
            # Plain-string content carries no file parts.
            continue
        for part in content:
            if not (isinstance(part, dict) and part.get("type") == "file"):
                continue
            value = part.get("value", "")
            # Skip empty values and non-strings (numbers, lists, ...).
            if value and isinstance(value, str):
                files.append(value)
    return files
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def lint_directory(path: Path) -> list[dict]:
    """Lint every ``*.yaml`` / ``*.yml`` file found recursively under *path*.

    Returns a list of issue dicts (``file``, ``severity``, ``message``,
    ``line``). If no YAML files are found, a single warning is returned.
    Otherwise the list holds any per-directory naming-prefix notices,
    followed by the issues ``lint_file`` reports for each file.
    """
    issues: list[dict] = []
    eval_files = sorted(path.rglob("*.yaml")) + sorted(path.rglob("*.yml"))

    if not eval_files:
        issues.append({"file": str(path), "severity": "warning", "message": "No eval files found", "line": None})
        return issues

    # Check naming prefix consistency. Prefixes are compared only among
    # files that share a parent directory, not across the whole tree.
    prefixes_by_dir: dict[Path, set[str]] = {}
    for f in eval_files:
        # removesuffix drops only a trailing ".eval"; replace() would also
        # delete ".eval" occurring in the middle of the name.
        name = f.stem.removesuffix(".eval")
        parts = name.split("-")
        if len(parts) >= 2:
            prefixes_by_dir.setdefault(f.parent, set()).add(parts[0])

    for directory in sorted(prefixes_by_dir):
        prefixes = prefixes_by_dir[directory]
        if len(prefixes) > 1:
            issues.append({
                "file": str(directory),
                "severity": "info",
                "message": f"Inconsistent naming prefixes: {', '.join(sorted(prefixes))}",
                "line": None,
            })

    for f in eval_files:
        issues.extend(lint_file(f))

    return issues
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def main():
    """Command-line entry point: lint a file or directory and exit.

    Usage: ``<path> [--json]``. The path is the first argument that does not
    start with ``--``, so the ``--json`` flag may appear before or after it.

    Exit codes: 2 for a missing path argument or a path that is neither a
    file nor a directory, 1 when at least one issue has severity ``"error"``,
    0 otherwise (warnings and info notices alone do not fail the run).
    """
    args = sys.argv[1:]
    positional = [arg for arg in args if not arg.startswith("--")]
    if not positional:
        print(f"Usage: {sys.argv[0]} <path> [--json]", file=sys.stderr)
        sys.exit(2)

    target = Path(positional[0])
    output_json = "--json" in args

    if target.is_file():
        issues = lint_file(target)
    elif target.is_dir():
        issues = lint_directory(target)
    else:
        print(f"Error: {target} not found", file=sys.stderr)
        sys.exit(2)

    if output_json:
        print(json.dumps(issues, indent=2))
    else:
        for iss in issues:
            line = f":{iss['line']}" if iss.get("line") else ""
            print(f"[{iss['severity'].upper()}] {iss['file']}{line}: {iss['message']}")

        # Per-severity tally for the summary line.
        counts = {}
        for iss in issues:
            counts[iss["severity"]] = counts.get(iss["severity"], 0) + 1
        if issues:
            print(f"\n{len(issues)} issues: {', '.join(f'{v} {k}' for k, v in sorted(counts.items()))}")
        else:
            print("No issues found.")

    sys.exit(1 if any(i["severity"] == "error" for i in issues) else 0)


if __name__ == "__main__":
    main()
|