adversarial-workflow 0.6.6__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adversarial_workflow/__init__.py +1 -1
- adversarial_workflow/cli.py +351 -5
- adversarial_workflow/evaluators/__init__.py +11 -2
- adversarial_workflow/evaluators/config.py +39 -2
- adversarial_workflow/evaluators/discovery.py +97 -9
- adversarial_workflow/evaluators/resolver.py +211 -0
- adversarial_workflow/evaluators/runner.py +36 -13
- adversarial_workflow/library/__init__.py +56 -0
- adversarial_workflow/library/cache.py +184 -0
- adversarial_workflow/library/client.py +224 -0
- adversarial_workflow/library/commands.py +849 -0
- adversarial_workflow/library/config.py +81 -0
- adversarial_workflow/library/models.py +129 -0
- adversarial_workflow/utils/citations.py +643 -0
- {adversarial_workflow-0.6.6.dist-info → adversarial_workflow-0.9.0.dist-info}/METADATA +160 -3
- {adversarial_workflow-0.6.6.dist-info → adversarial_workflow-0.9.0.dist-info}/RECORD +20 -12
- {adversarial_workflow-0.6.6.dist-info → adversarial_workflow-0.9.0.dist-info}/WHEEL +0 -0
- {adversarial_workflow-0.6.6.dist-info → adversarial_workflow-0.9.0.dist-info}/entry_points.txt +0 -0
- {adversarial_workflow-0.6.6.dist-info → adversarial_workflow-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {adversarial_workflow-0.6.6.dist-info → adversarial_workflow-0.9.0.dist-info}/top_level.txt +0 -0
|
@@ -4,6 +4,10 @@ YAML parsing and discovery for custom evaluators.
|
|
|
4
4
|
This module handles discovering evaluator definitions from
|
|
5
5
|
.adversarial/evaluators/*.yml files and parsing them into
|
|
6
6
|
EvaluatorConfig objects.
|
|
7
|
+
|
|
8
|
+
Supports dual-field model specification (ADV-0015):
|
|
9
|
+
- Legacy: model + api_key_env fields (backwards compatible)
|
|
10
|
+
- New: model_requirement field (resolved via ModelResolver)
|
|
7
11
|
"""
|
|
8
12
|
|
|
9
13
|
from __future__ import annotations
|
|
@@ -14,7 +18,7 @@ from pathlib import Path
|
|
|
14
18
|
|
|
15
19
|
import yaml
|
|
16
20
|
|
|
17
|
-
from .config import EvaluatorConfig
|
|
21
|
+
from .config import EvaluatorConfig, ModelRequirement
|
|
18
22
|
|
|
19
23
|
logger = logging.getLogger(__name__)
|
|
20
24
|
|
|
@@ -54,26 +58,39 @@ def parse_evaluator_yaml(yml_file: Path) -> EvaluatorConfig:
|
|
|
54
58
|
raise EvaluatorParseError(f"YAML must be a mapping, got {type(data).__name__}: {yml_file}")
|
|
55
59
|
|
|
56
60
|
# Validate required fields exist
|
|
57
|
-
required
|
|
61
|
+
# model and api_key_env are only required if model_requirement is not present
|
|
62
|
+
always_required = [
|
|
58
63
|
"name",
|
|
59
64
|
"description",
|
|
60
|
-
"model",
|
|
61
|
-
"api_key_env",
|
|
62
65
|
"prompt",
|
|
63
66
|
"output_suffix",
|
|
64
67
|
]
|
|
65
|
-
|
|
68
|
+
has_model_requirement = "model_requirement" in data
|
|
69
|
+
if not has_model_requirement:
|
|
70
|
+
# Legacy format: model and api_key_env are required
|
|
71
|
+
always_required.extend(["model", "api_key_env"])
|
|
72
|
+
|
|
73
|
+
missing = [f for f in always_required if f not in data]
|
|
66
74
|
if missing:
|
|
67
75
|
raise EvaluatorParseError(f"Missing required fields: {', '.join(missing)}")
|
|
68
76
|
|
|
69
77
|
# Validate required fields are strings (YAML can parse 'yes' as bool, '123' as int)
|
|
70
|
-
for field in
|
|
78
|
+
for field in always_required:
|
|
71
79
|
value = data[field]
|
|
72
80
|
if not isinstance(value, str):
|
|
73
81
|
raise EvaluatorParseError(
|
|
74
82
|
f"Field '{field}' must be a string, got {type(value).__name__}: {value!r}"
|
|
75
83
|
)
|
|
76
84
|
|
|
85
|
+
# Validate model and api_key_env are strings if present (even when optional)
|
|
86
|
+
for field in ["model", "api_key_env"]:
|
|
87
|
+
if field in data and data[field] is not None:
|
|
88
|
+
value = data[field]
|
|
89
|
+
if not isinstance(value, str):
|
|
90
|
+
raise EvaluatorParseError(
|
|
91
|
+
f"Field '{field}' must be a string, got {type(value).__name__}: {value!r}"
|
|
92
|
+
)
|
|
93
|
+
|
|
77
94
|
# Validate name format (valid CLI command name)
|
|
78
95
|
name = data["name"]
|
|
79
96
|
if not re.match(r"^[a-zA-Z][a-zA-Z0-9_-]*$", name):
|
|
@@ -143,6 +160,67 @@ def parse_evaluator_yaml(yml_file: Path) -> EvaluatorConfig:
|
|
|
143
160
|
)
|
|
144
161
|
data["timeout"] = 600
|
|
145
162
|
|
|
163
|
+
# Parse model_requirement if present (ADV-0015)
|
|
164
|
+
model_requirement = None
|
|
165
|
+
if "model_requirement" in data:
|
|
166
|
+
req_data = data["model_requirement"]
|
|
167
|
+
|
|
168
|
+
# Validate model_requirement is a mapping
|
|
169
|
+
if not isinstance(req_data, dict):
|
|
170
|
+
raise EvaluatorParseError(
|
|
171
|
+
f"model_requirement must be a mapping, got {type(req_data).__name__}"
|
|
172
|
+
)
|
|
173
|
+
|
|
174
|
+
# Validate required fields in model_requirement
|
|
175
|
+
if "family" not in req_data:
|
|
176
|
+
raise EvaluatorParseError("model_requirement.family is required")
|
|
177
|
+
if "tier" not in req_data:
|
|
178
|
+
raise EvaluatorParseError("model_requirement.tier is required")
|
|
179
|
+
|
|
180
|
+
# Validate family and tier are strings
|
|
181
|
+
family = req_data["family"]
|
|
182
|
+
tier = req_data["tier"]
|
|
183
|
+
if not isinstance(family, str):
|
|
184
|
+
raise EvaluatorParseError(
|
|
185
|
+
f"model_requirement.family must be a string, got {type(family).__name__}"
|
|
186
|
+
)
|
|
187
|
+
if not isinstance(tier, str):
|
|
188
|
+
raise EvaluatorParseError(
|
|
189
|
+
f"model_requirement.tier must be a string, got {type(tier).__name__}"
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
# Validate optional min_version is string if present
|
|
193
|
+
min_version = req_data.get("min_version", "")
|
|
194
|
+
# Reject booleans explicitly (YAML parses 'yes'/'no'/'true'/'false' as bool)
|
|
195
|
+
if isinstance(min_version, bool):
|
|
196
|
+
raise EvaluatorParseError(
|
|
197
|
+
f"model_requirement.min_version must be a string, got bool: {min_version!r}"
|
|
198
|
+
)
|
|
199
|
+
# Convert integers to strings (YAML parses '0' as int 0)
|
|
200
|
+
if isinstance(min_version, int):
|
|
201
|
+
min_version = str(min_version)
|
|
202
|
+
elif min_version and not isinstance(min_version, str):
|
|
203
|
+
raise EvaluatorParseError(
|
|
204
|
+
f"model_requirement.min_version must be a string, got {type(min_version).__name__}"
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
# Validate optional min_context is integer if present
|
|
208
|
+
min_context = req_data.get("min_context", 0)
|
|
209
|
+
# Reject booleans explicitly (YAML parses 'yes'/'no'/'true'/'false' as bool)
|
|
210
|
+
if isinstance(min_context, bool):
|
|
211
|
+
raise EvaluatorParseError("model_requirement.min_context must be an integer, got bool")
|
|
212
|
+
if min_context and not isinstance(min_context, int):
|
|
213
|
+
raise EvaluatorParseError(
|
|
214
|
+
f"model_requirement.min_context must be an integer, got {type(min_context).__name__}"
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
model_requirement = ModelRequirement(
|
|
218
|
+
family=family,
|
|
219
|
+
tier=tier,
|
|
220
|
+
min_version=min_version,
|
|
221
|
+
min_context=min_context,
|
|
222
|
+
)
|
|
223
|
+
|
|
146
224
|
# Filter to known fields only (log unknown fields)
|
|
147
225
|
known_fields = {
|
|
148
226
|
"name",
|
|
@@ -156,17 +234,27 @@ def parse_evaluator_yaml(yml_file: Path) -> EvaluatorConfig:
|
|
|
156
234
|
"aliases",
|
|
157
235
|
"version",
|
|
158
236
|
"timeout",
|
|
237
|
+
"model_requirement", # ADV-0015
|
|
159
238
|
}
|
|
160
239
|
unknown = set(data.keys()) - known_fields
|
|
161
240
|
if unknown:
|
|
162
241
|
logger.warning("Unknown fields in %s: %s", yml_file.name, ", ".join(sorted(unknown)))
|
|
163
242
|
|
|
164
|
-
# Build filtered data dict
|
|
165
|
-
|
|
243
|
+
# Build filtered data dict (exclude model_requirement as it's handled separately)
|
|
244
|
+
scalar_fields = known_fields - {"model_requirement"}
|
|
245
|
+
filtered_data = {k: v for k, v in data.items() if k in scalar_fields}
|
|
246
|
+
|
|
247
|
+
# Set defaults for optional model/api_key_env when model_requirement is present
|
|
248
|
+
# Also handle explicit null values (YAML parses empty or null as None)
|
|
249
|
+
if "model" not in filtered_data or filtered_data["model"] is None:
|
|
250
|
+
filtered_data["model"] = ""
|
|
251
|
+
if "api_key_env" not in filtered_data or filtered_data["api_key_env"] is None:
|
|
252
|
+
filtered_data["api_key_env"] = ""
|
|
166
253
|
|
|
167
|
-
# Create config with metadata
|
|
254
|
+
# Create config with metadata and model_requirement
|
|
168
255
|
config = EvaluatorConfig(
|
|
169
256
|
**filtered_data,
|
|
257
|
+
model_requirement=model_requirement,
|
|
170
258
|
source="local",
|
|
171
259
|
config_file=str(yml_file),
|
|
172
260
|
)
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Model resolver for evaluator configurations (ADV-0015: Model Routing Layer - Phase 1).
|
|
3
|
+
|
|
4
|
+
This module provides the ModelResolver class that resolves model requirements
|
|
5
|
+
to actual model IDs using an embedded registry. It supports:
|
|
6
|
+
- model_requirement field (new structured format)
|
|
7
|
+
- model field (legacy string format)
|
|
8
|
+
- Fallback from model_requirement to model on resolution failure
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import warnings
|
|
14
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from adversarial_workflow.evaluators.config import EvaluatorConfig, ModelRequirement
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ResolutionError(Exception):
    """Signal that an evaluator's model could not be resolved.

    Raised by ``ModelResolver`` when a ``model_requirement`` names a
    family/tier pair that the registry cannot satisfy and no legacy
    ``model`` is available to fall back on, or when a config specifies
    neither field.
    """
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ModelResolver:
    """Resolves model requirements to actual model IDs.

    Uses an embedded registry (matching adversarial-evaluator-library/providers/registry.yml)
    to map family/tier pairs to concrete model identifiers.

    Resolution order:
    1. If model_requirement present: resolve via registry
    2. If resolution fails AND model present: warn + fallback to legacy
    3. If resolution fails AND no model: raise ResolutionError
    4. If no model_requirement AND model present: use legacy directly
    5. If neither: raise ResolutionError
    """

    # Default registry - matches adversarial-evaluator-library/providers/registry.yml
    # Updated 2026-02-03 per Library team handoff (ADR-0005)
    #
    # Shape: family -> tier -> {"models": [...], "prefix": "..."}.
    # "models" is ordered; the first entry is the one returned by the resolver.
    # "prefix" is prepended to the model ID (empty string means no prefix).
    DEFAULT_REGISTRY: ClassVar[dict[str, dict[str, dict[str, list[str] | str]]]] = {
        "claude": {
            "opus": {
                "models": ["claude-4-opus-20260115", "claude-opus-4-5-20251101"],
                "prefix": "anthropic/",
            },
            "sonnet": {
                "models": ["claude-4-sonnet-20260115"],
                "prefix": "anthropic/",
            },
            "haiku": {
                "models": ["claude-4-haiku-20260115"],
                "prefix": "anthropic/",
            },
        },
        "gpt": {
            "flagship": {
                "models": ["gpt-4o", "gpt-4o-2024-08-06"],
                "prefix": "",
            },
            "standard": {
                "models": ["gpt-4-turbo", "gpt-4"],
                "prefix": "",
            },
            "mini": {
                "models": ["gpt-4o-mini"],
                "prefix": "",
            },
        },
        "o": {
            "flagship": {
                "models": ["o1", "o1-2024-12-17"],
                "prefix": "",
            },
            "mini": {
                "models": ["o3-mini"],
                "prefix": "",
            },
        },
        "gemini": {
            "pro": {
                "models": ["gemini-2.5-pro"],
                "prefix": "gemini/",
            },
            "flash": {
                "models": ["gemini-2.5-flash"],
                "prefix": "gemini/",
            },
        },
        "mistral": {
            "large": {
                "models": ["mistral-large-latest"],
                "prefix": "mistral/",
            },
            "small": {
                "models": ["mistral-small-latest"],
                "prefix": "mistral/",
            },
        },
        "codestral": {
            "latest": {
                "models": ["codestral-latest"],
                "prefix": "mistral/",
            },
        },
        "llama": {
            "large": {
                "models": ["llama-3.3-70b"],
                "prefix": "",  # varies by host
            },
            "medium": {
                "models": ["llama-3.1-8b"],
                "prefix": "",
            },
        },
    }

    # API key environment variable mapping by family
    API_KEY_MAP: ClassVar[dict[str, str]] = {
        "claude": "ANTHROPIC_API_KEY",
        "gpt": "OPENAI_API_KEY",
        "o": "OPENAI_API_KEY",
        "gemini": "GEMINI_API_KEY",
        "mistral": "MISTRAL_API_KEY",
        "codestral": "MISTRAL_API_KEY",
        "llama": "TOGETHER_API_KEY",
    }

    def resolve(self, config: EvaluatorConfig) -> tuple[str, str]:
        """Resolve evaluator config to (model_id, api_key_env).

        Args:
            config: EvaluatorConfig with model and/or model_requirement

        Returns:
            (model_id, api_key_env) tuple

        Raises:
            ResolutionError: If resolution fails and no fallback available
        """
        if config.model_requirement:
            try:
                return self._resolve_requirement(config.model_requirement)
            except ResolutionError as e:
                if config.model:
                    # Fall back to legacy with warning; stacklevel=2 attributes
                    # the warning to the caller of resolve().
                    warnings.warn(
                        f"model_requirement resolution failed for {config.name}: {e}. "
                        f"Falling back to legacy model field: {config.model}",
                        UserWarning,
                        stacklevel=2,
                    )
                    return (config.model, config.api_key_env)
                # No legacy model to fall back on: propagate the original error.
                raise

        # Legacy only
        if config.model:
            return (config.model, config.api_key_env)

        raise ResolutionError("No model or model_requirement specified")

    def _resolve_requirement(self, req: ModelRequirement) -> tuple[str, str]:
        """Resolve requirement to model ID using registry.

        Args:
            req: ModelRequirement with family and tier

        Returns:
            (model_id, api_key_env) tuple

        Raises:
            ResolutionError: If family or tier not found in registry, or the
                tier defines no models
        """
        # TODO(Phase 2): ModelRequirement.min_version and ModelRequirement.min_context
        # are currently parsed but not used for filtering. Phase 1 only performs
        # family/tier matching. Phase 2 will implement filtering by min_version
        # and min_context requirements.
        family = self.DEFAULT_REGISTRY.get(req.family)
        if not family:
            raise ResolutionError(f"Unknown model family: {req.family}")

        tier_data = family.get(req.tier)
        if not tier_data:
            raise ResolutionError(f"Unknown tier '{req.tier}' for family '{req.family}'")

        models = tier_data.get("models", [])
        if not models:
            raise ResolutionError(f"No models defined for {req.family}/{req.tier}")
        # The registry's value type is list[str] | str. A bare string must be
        # treated as a single model ID; indexing it directly would silently
        # return only its first character.
        if isinstance(models, str):
            models = [models]

        # Return first (latest) model in tier
        model_id = models[0]

        # Apply provider prefix for LiteLLM compatibility
        prefix = tier_data.get("prefix", "")
        if prefix:
            model_id = f"{prefix}{model_id}"

        # Determine API key env from family
        api_key_env = self._get_api_key_env(req.family)

        return (model_id, api_key_env)

    def _get_api_key_env(self, family: str) -> str:
        """Get default API key environment variable for family.

        Families missing from API_KEY_MAP get a derived name of the form
        ``<FAMILY>_API_KEY`` (family upper-cased).

        Args:
            family: Model family name

        Returns:
            Environment variable name for API key
        """
        return self.API_KEY_MAP.get(family, f"{family.upper()}_API_KEY")
|
|
@@ -1,4 +1,9 @@
|
|
|
1
|
-
"""Generic evaluator runner.
|
|
1
|
+
"""Generic evaluator runner.
|
|
2
|
+
|
|
3
|
+
Supports dual-field model specification (ADV-0015):
|
|
4
|
+
- Legacy: model + api_key_env fields (backwards compatible)
|
|
5
|
+
- New: model_requirement field (resolved via ModelResolver)
|
|
6
|
+
"""
|
|
2
7
|
|
|
3
8
|
from __future__ import annotations
|
|
4
9
|
|
|
@@ -14,6 +19,7 @@ from ..utils.colors import BOLD, GREEN, RED, RESET, YELLOW
|
|
|
14
19
|
from ..utils.config import load_config
|
|
15
20
|
from ..utils.validation import validate_evaluation_output
|
|
16
21
|
from .config import EvaluatorConfig
|
|
22
|
+
from .resolver import ModelResolver, ResolutionError
|
|
17
23
|
|
|
18
24
|
|
|
19
25
|
def run_evaluator(config: EvaluatorConfig, file_path: str, timeout: int = 180) -> int:
|
|
@@ -43,20 +49,28 @@ def run_evaluator(config: EvaluatorConfig, file_path: str, timeout: int = 180) -
|
|
|
43
49
|
return 1
|
|
44
50
|
project_config = load_config()
|
|
45
51
|
|
|
46
|
-
# 3.
|
|
52
|
+
# 3. Resolve model (ADV-0015: dual-field support)
|
|
53
|
+
resolver = ModelResolver()
|
|
54
|
+
try:
|
|
55
|
+
resolved_model, resolved_api_key_env = resolver.resolve(config)
|
|
56
|
+
except ResolutionError as e:
|
|
57
|
+
print(f"{RED}Error: {e}{RESET}")
|
|
58
|
+
return 1
|
|
59
|
+
|
|
60
|
+
# 4. Check aider available
|
|
47
61
|
if not shutil.which("aider"):
|
|
48
62
|
print(f"{RED}Error: Aider not found{RESET}")
|
|
49
63
|
_print_aider_help()
|
|
50
64
|
return 1
|
|
51
65
|
|
|
52
|
-
#
|
|
53
|
-
api_key = os.environ.get(
|
|
66
|
+
# 5. Check API key (using resolved api_key_env)
|
|
67
|
+
api_key = os.environ.get(resolved_api_key_env)
|
|
54
68
|
if not api_key:
|
|
55
|
-
print(f"{RED}Error: {
|
|
56
|
-
print(f" Set in .env or export {
|
|
69
|
+
print(f"{RED}Error: {resolved_api_key_env} not set{RESET}")
|
|
70
|
+
print(f" Set in .env or export {resolved_api_key_env}=your-key")
|
|
57
71
|
return 1
|
|
58
72
|
|
|
59
|
-
#
|
|
73
|
+
# 6. Pre-flight file size check
|
|
60
74
|
line_count, estimated_tokens = _check_file_size(file_path)
|
|
61
75
|
if line_count > 500 or estimated_tokens > 20000:
|
|
62
76
|
_warn_large_file(line_count, estimated_tokens)
|
|
@@ -65,11 +79,11 @@ def run_evaluator(config: EvaluatorConfig, file_path: str, timeout: int = 180) -
|
|
|
65
79
|
print("Evaluation cancelled.")
|
|
66
80
|
return 0
|
|
67
81
|
|
|
68
|
-
#
|
|
82
|
+
# 7. Determine execution method
|
|
69
83
|
if config.source == "builtin":
|
|
70
84
|
return _run_builtin_evaluator(config, file_path, project_config, timeout)
|
|
71
85
|
else:
|
|
72
|
-
return _run_custom_evaluator(config, file_path, project_config, timeout)
|
|
86
|
+
return _run_custom_evaluator(config, file_path, project_config, timeout, resolved_model)
|
|
73
87
|
|
|
74
88
|
|
|
75
89
|
def _run_builtin_evaluator(
|
|
@@ -99,8 +113,17 @@ def _run_custom_evaluator(
|
|
|
99
113
|
file_path: str,
|
|
100
114
|
project_config: dict,
|
|
101
115
|
timeout: int,
|
|
116
|
+
resolved_model: str,
|
|
102
117
|
) -> int:
|
|
103
|
-
"""Run a custom evaluator by invoking aider directly.
|
|
118
|
+
"""Run a custom evaluator by invoking aider directly.
|
|
119
|
+
|
|
120
|
+
Args:
|
|
121
|
+
config: Evaluator configuration
|
|
122
|
+
file_path: Path to file to evaluate
|
|
123
|
+
project_config: Project configuration dict
|
|
124
|
+
timeout: Timeout in seconds
|
|
125
|
+
resolved_model: Resolved model ID from ModelResolver
|
|
126
|
+
"""
|
|
104
127
|
# Prepare output path
|
|
105
128
|
logs_dir = Path(project_config["log_directory"])
|
|
106
129
|
logs_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -131,13 +154,13 @@ def _run_custom_evaluator(
|
|
|
131
154
|
prefix = config.log_prefix or config.name.upper()
|
|
132
155
|
|
|
133
156
|
try:
|
|
134
|
-
print(f"{prefix}: Using model {
|
|
157
|
+
print(f"{prefix}: Using model {resolved_model}")
|
|
135
158
|
|
|
136
159
|
# Build aider command
|
|
137
160
|
cmd = [
|
|
138
161
|
"aider",
|
|
139
162
|
"--model",
|
|
140
|
-
|
|
163
|
+
resolved_model,
|
|
141
164
|
"--yes",
|
|
142
165
|
"--no-detect-urls",
|
|
143
166
|
"--no-git",
|
|
@@ -168,7 +191,7 @@ def _run_custom_evaluator(
|
|
|
168
191
|
|
|
169
192
|
**Source**: {file_path}
|
|
170
193
|
**Evaluator**: {config.name}
|
|
171
|
-
**Model**: {
|
|
194
|
+
**Model**: {resolved_model}
|
|
172
195
|
**Generated**: {timestamp}
|
|
173
196
|
|
|
174
197
|
---
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Evaluator library client for adversarial-workflow.
|
|
2
|
+
|
|
3
|
+
This module provides functionality to browse, install, and update evaluator
|
|
4
|
+
configurations from the community adversarial-evaluator-library.
|
|
5
|
+
|
|
6
|
+
Philosophy: "Copy, Don't Link"
|
|
7
|
+
- Evaluators are copied to projects, not referenced at runtime
|
|
8
|
+
- Projects remain self-contained and work offline
|
|
9
|
+
- Users can customize their local copies freely
|
|
10
|
+
- Updates are explicit and user-controlled
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from .cache import DEFAULT_CACHE_DIR, DEFAULT_CACHE_TTL, CacheManager
|
|
14
|
+
from .client import (
|
|
15
|
+
DEFAULT_LIBRARY_URL,
|
|
16
|
+
LibraryClient,
|
|
17
|
+
LibraryClientError,
|
|
18
|
+
NetworkError,
|
|
19
|
+
ParseError,
|
|
20
|
+
)
|
|
21
|
+
from .commands import (
|
|
22
|
+
library_check_updates,
|
|
23
|
+
library_info,
|
|
24
|
+
library_install,
|
|
25
|
+
library_list,
|
|
26
|
+
library_update,
|
|
27
|
+
)
|
|
28
|
+
from .config import LibraryConfig, get_library_config
|
|
29
|
+
from .models import EvaluatorEntry, IndexData, InstalledEvaluatorMeta, UpdateInfo
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
# Client
|
|
33
|
+
"LibraryClient",
|
|
34
|
+
"LibraryClientError",
|
|
35
|
+
"NetworkError",
|
|
36
|
+
"ParseError",
|
|
37
|
+
"DEFAULT_LIBRARY_URL",
|
|
38
|
+
# Models
|
|
39
|
+
"EvaluatorEntry",
|
|
40
|
+
"IndexData",
|
|
41
|
+
"InstalledEvaluatorMeta",
|
|
42
|
+
"UpdateInfo",
|
|
43
|
+
# Cache
|
|
44
|
+
"CacheManager",
|
|
45
|
+
"DEFAULT_CACHE_DIR",
|
|
46
|
+
"DEFAULT_CACHE_TTL",
|
|
47
|
+
# Config
|
|
48
|
+
"LibraryConfig",
|
|
49
|
+
"get_library_config",
|
|
50
|
+
# Commands
|
|
51
|
+
"library_list",
|
|
52
|
+
"library_info",
|
|
53
|
+
"library_install",
|
|
54
|
+
"library_check_updates",
|
|
55
|
+
"library_update",
|
|
56
|
+
]
|