adversarial-workflow 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,212 @@
+ """
+ YAML parsing and discovery for custom evaluators.
+
+ This module handles discovering evaluator definitions from
+ .adversarial/evaluators/*.yml files and parsing them into
+ EvaluatorConfig objects.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ import re
+ from pathlib import Path
+
+ import yaml
+
+ from .config import EvaluatorConfig
+
+ logger = logging.getLogger(__name__)
+
+
+ class EvaluatorParseError(Exception):
+     """Raised when evaluator YAML is invalid."""
+
+
+ def parse_evaluator_yaml(yml_file: Path) -> EvaluatorConfig:
+     """Parse a YAML file into an EvaluatorConfig.
+
+     Args:
+         yml_file: Path to the YAML file
+
+     Returns:
+         EvaluatorConfig instance
+
+     Raises:
+         EvaluatorParseError: If YAML is invalid or missing required fields
+         yaml.YAMLError: If YAML syntax is invalid
+     """
+     # Read file with explicit UTF-8 encoding
+     try:
+         content = yml_file.read_text(encoding="utf-8")
+     except UnicodeDecodeError as e:
+         raise EvaluatorParseError(
+             f"File encoding error (not UTF-8): {yml_file}"
+         ) from e
+
+     # Parse YAML
+     data = yaml.safe_load(content)
+
+     # Check for empty YAML
+     if data is None or (isinstance(data, str) and not data.strip()):
+         raise EvaluatorParseError(f"Empty or invalid YAML file: {yml_file}")
+
+     # Ensure parsed data is a dict (YAML can parse scalars, lists, etc.)
+     if not isinstance(data, dict):
+         raise EvaluatorParseError(
+             f"YAML must be a mapping, got {type(data).__name__}: {yml_file}"
+         )
+
+     # Validate required fields exist
+     required = ["name", "description", "model", "api_key_env", "prompt", "output_suffix"]
+     missing = [f for f in required if f not in data]
+     if missing:
+         raise EvaluatorParseError(f"Missing required fields: {', '.join(missing)}")
+
+     # Validate required fields are strings (YAML can parse 'yes' as bool, '123' as int)
+     for field in required:
+         value = data[field]
+         if not isinstance(value, str):
+             raise EvaluatorParseError(
+                 f"Field '{field}' must be a string, got {type(value).__name__}: {value!r}"
+             )
+
+     # Validate name format (valid CLI command name)
+     name = data["name"]
+     if not re.match(r"^[a-zA-Z][a-zA-Z0-9_-]*$", name):
+         raise EvaluatorParseError(
+             f"Invalid evaluator name '{name}': must start with letter, "
+             "contain only letters, numbers, hyphens, underscores"
+         )
+
+     # Normalize aliases (handle None, string, or list)
+     aliases = data.get("aliases")
+     if aliases is None:
+         data["aliases"] = []
+     elif isinstance(aliases, str):
+         data["aliases"] = [aliases]
+     elif not isinstance(aliases, list):
+         raise EvaluatorParseError(
+             f"aliases must be string or list, got {type(aliases).__name__}"
+         )
+
+     # Validate alias names - must be strings with valid format
+     for alias in data.get("aliases", []):
+         if not isinstance(alias, str):
+             raise EvaluatorParseError(
+                 f"Alias must be a string, got {type(alias).__name__}: {alias!r}"
+             )
+         if not re.match(r"^[a-zA-Z][a-zA-Z0-9_-]*$", alias):
+             raise EvaluatorParseError(
+                 f"Invalid alias '{alias}': must start with letter, "
+                 "contain only letters, numbers, hyphens, underscores"
+             )
+
+     # Validate prompt is non-empty
+     prompt = data.get("prompt", "")
+     if not prompt or not prompt.strip():
+         raise EvaluatorParseError("prompt cannot be empty")
+
+     # Validate optional string fields if present (YAML can parse '2' as int, 'yes' as bool)
+     optional_string_fields = ["log_prefix", "fallback_model", "version"]
+     for field in optional_string_fields:
+         if field in data and data[field] is not None:
+             value = data[field]
+             if not isinstance(value, str):
+                 raise EvaluatorParseError(
+                     f"Field '{field}' must be a string, got {type(value).__name__}: {value!r}"
+                 )
+
+     # Filter to known fields only (log unknown fields)
+     known_fields = {
+         "name",
+         "description",
+         "model",
+         "api_key_env",
+         "prompt",
+         "output_suffix",
+         "log_prefix",
+         "fallback_model",
+         "aliases",
+         "version",
+     }
+     unknown = set(data.keys()) - known_fields
+     if unknown:
+         logger.warning(
+             "Unknown fields in %s: %s", yml_file.name, ", ".join(sorted(unknown))
+         )
+
+     # Build filtered data dict
+     filtered_data = {k: v for k, v in data.items() if k in known_fields}
+
+     # Create config with metadata
+     config = EvaluatorConfig(
+         **filtered_data,
+         source="local",
+         config_file=str(yml_file),
+     )
+
+     return config
+
+
+ def discover_local_evaluators(
+     base_path: Path | None = None,
+ ) -> dict[str, EvaluatorConfig]:
+     """Discover evaluators from .adversarial/evaluators/*.yml
+
+     Args:
+         base_path: Project root (default: current directory)
+
+     Returns:
+         Dict mapping evaluator name (and aliases) to EvaluatorConfig
+     """
+     if base_path is None:
+         base_path = Path.cwd()
+
+     evaluators: dict[str, EvaluatorConfig] = {}
+     local_dir = base_path / ".adversarial" / "evaluators"
+
+     if not local_dir.exists():
+         return evaluators
+
+     # Get yml files with error handling for permission/access issues
+     try:
+         yml_files = sorted(local_dir.glob("*.yml"))
+     except OSError as e:
+         logger.warning("Could not read evaluators directory: %s", e)
+         return evaluators
+
+     for yml_file in yml_files:
+         try:
+             config = parse_evaluator_yaml(yml_file)
+
+             # Check for name conflicts
+             if config.name in evaluators:
+                 logger.warning(
+                     "Evaluator '%s' in %s conflicts with existing; skipping",
+                     config.name,
+                     yml_file.name,
+                 )
+                 continue
+
+             # Register primary name
+             evaluators[config.name] = config
+
+             # Register aliases (point to same config object)
+             for alias in config.aliases:
+                 if alias in evaluators:
+                     logger.warning(
+                         "Alias '%s' conflicts with existing evaluator; skipping alias",
+                         alias,
+                     )
+                     continue
+                 evaluators[alias] = config
+
+         except EvaluatorParseError as e:
+             logger.warning("Skipping %s: %s", yml_file.name, e)
+         except yaml.YAMLError as e:
+             logger.warning("Skipping %s: YAML syntax error: %s", yml_file.name, e)
+         except OSError as e:
+             logger.warning("Could not load %s: %s", yml_file.name, e)
+
+     return evaluators
@@ -0,0 +1,313 @@
+ """Generic evaluator runner."""
+
+ from __future__ import annotations
+
+ import os
+ import platform
+ import shutil
+ import subprocess
+ import tempfile
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+ from .config import EvaluatorConfig
+ from ..utils.colors import RESET, BOLD, GREEN, YELLOW, RED
+ from ..utils.config import load_config
+ from ..utils.validation import validate_evaluation_output
+
+
+ def run_evaluator(config: EvaluatorConfig, file_path: str, timeout: int = 180) -> int:
+     """Run an evaluator on a file.
+
+     Args:
+         config: Evaluator configuration
+         file_path: Path to file to evaluate
+         timeout: Timeout in seconds (default: 180)
+
+     Returns:
+         0 on success, non-zero on failure
+     """
+     prefix = config.log_prefix or config.name.upper()
+     print(f"{prefix}: Evaluating {file_path}")
+     print()
+
+     # 1. Validate file exists
+     if not os.path.exists(file_path):
+         print(f"{RED}Error: File not found: {file_path}{RESET}")
+         return 1
+
+     # 2. Load project config (check initialization first)
+     config_path = Path(".adversarial/config.yml")
+     if not config_path.exists():
+         print(f"{RED}Error: Not initialized. Run 'adversarial init' first.{RESET}")
+         return 1
+     project_config = load_config()
+
+     # 3. Check aider available
+     if not shutil.which("aider"):
+         print(f"{RED}Error: Aider not found{RESET}")
+         _print_aider_help()
+         return 1
+
+     # 4. Check API key
+     api_key = os.environ.get(config.api_key_env)
+     if not api_key:
+         print(f"{RED}Error: {config.api_key_env} not set{RESET}")
+         print(f" Set in .env or export {config.api_key_env}=your-key")
+         return 1
+
+     # 5. Pre-flight file size check
+     line_count, estimated_tokens = _check_file_size(file_path)
+     if line_count > 500 or estimated_tokens > 20000:
+         _warn_large_file(line_count, estimated_tokens)
+         if line_count > 700:
+             if not _confirm_continue():
+                 print("Evaluation cancelled.")
+                 return 0
+
+     # 6. Determine execution method
+     if config.source == "builtin":
+         return _run_builtin_evaluator(config, file_path, project_config, timeout)
+     else:
+         return _run_custom_evaluator(config, file_path, project_config, timeout)
+
+
+ def _run_builtin_evaluator(
+     config: EvaluatorConfig,
+     file_path: str,
+     project_config: dict,
+     timeout: int,
+ ) -> int:
+     """Run a built-in evaluator using existing shell scripts."""
+     script_map = {
+         "evaluate": ".adversarial/scripts/evaluate_plan.sh",
+         "proofread": ".adversarial/scripts/proofread_content.sh",
+         "review": ".adversarial/scripts/code_review.sh",
+     }
+
+     script = script_map.get(config.name)
+     if not script or not os.path.exists(script):
+         print(f"{RED}Error: Script not found: {script}{RESET}")
+         print(" Fix: Run 'adversarial init' to reinstall scripts")
+         return 1
+
+     return _execute_script(script, file_path, config, project_config, timeout)
+
+
+ def _run_custom_evaluator(
+     config: EvaluatorConfig,
+     file_path: str,
+     project_config: dict,
+     timeout: int,
+ ) -> int:
+     """Run a custom evaluator by invoking aider directly."""
+     # Prepare output path
+     logs_dir = Path(project_config["log_directory"])
+     logs_dir.mkdir(parents=True, exist_ok=True)
+
+     file_basename = Path(file_path).stem
+     output_file = logs_dir / f"{file_basename}-{config.output_suffix}.md"
+
+     # Read input file
+     file_content = Path(file_path).read_text()
+
+     # Build full prompt
+     full_prompt = f"""{config.prompt}
+
+ ---
+
+ ## Document to Evaluate
+
+ **File**: {file_path}
+
+ {file_content}
+ """
+
+     # Create temp file for prompt
+     with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
+         f.write(full_prompt)
+         prompt_file = f.name
+
+     prefix = config.log_prefix or config.name.upper()
+
+     try:
+         print(f"{prefix}: Using model {config.model}")
+
+         # Build aider command
+         cmd = [
+             "aider",
+             "--model", config.model,
+             "--yes",
+             "--no-git",
+             "--no-auto-commits",
+             "--message-file", prompt_file,
+             "--read", file_path,
+         ]
+
+         result = subprocess.run(
+             cmd,
+             capture_output=True,
+             text=True,
+             timeout=timeout,
+             env=os.environ,
+         )
+
+         # Check for errors
+         output = result.stdout + result.stderr
+         if "RateLimitError" in output or "tokens per min" in output:
+             _print_rate_limit_error(file_path)
+             return 1
+
+         # Write output
+         timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+         header = f"""# {config.output_suffix.replace('-', ' ').replace('_', ' ').title()}
+
+ **Source**: {file_path}
+ **Evaluator**: {config.name}
+ **Model**: {config.model}
+ **Generated**: {timestamp}
+
+ ---
+
+ """
+         output_file.write_text(header + result.stdout)
+
+         print(f"{prefix}: Output written to {output_file}")
+
+         # Validate output and determine verdict
+         is_valid, verdict, message = validate_evaluation_output(str(output_file))
+
+         if not is_valid:
+             print(f"{RED}Evaluation failed: {message}{RESET}")
+             return 1
+
+         return _report_verdict(verdict, output_file, config)
+
+     except subprocess.TimeoutExpired:
+         _print_timeout_error(timeout)
+         return 1
+     except FileNotFoundError:
+         _print_platform_error()
+         return 1
+     finally:
+         Path(prompt_file).unlink(missing_ok=True)
+
+
+ def _execute_script(
+     script: str,
+     file_path: str,
+     config: EvaluatorConfig,
+     project_config: dict,
+     timeout: int,
+ ) -> int:
+     """Execute a shell script evaluator."""
+     try:
+         result = subprocess.run(
+             [script, file_path],
+             text=True,
+             capture_output=True,
+             timeout=timeout,
+         )
+
+         # Check for rate limit errors
+         output = result.stdout + result.stderr
+         if "RateLimitError" in output or "tokens per min" in output:
+             _print_rate_limit_error(file_path)
+             return 1
+
+     except subprocess.TimeoutExpired:
+         _print_timeout_error(timeout)
+         return 1
+     except FileNotFoundError:
+         _print_platform_error()
+         return 1
+
+     # Validate output
+     file_basename = Path(file_path).stem
+     log_file = Path(project_config["log_directory"]) / f"{file_basename}-{config.output_suffix}.md"
+
+     is_valid, verdict, message = validate_evaluation_output(str(log_file))
+
+     if not is_valid:
+         print(f"{RED}Evaluation failed: {message}{RESET}")
+         return 1
+
+     return _report_verdict(verdict, log_file, config)
+
+
+ def _report_verdict(verdict: str | None, log_file: Path, config: EvaluatorConfig) -> int:
+     """Report the evaluation verdict to terminal."""
+     print()
+     if verdict == "APPROVED":
+         print(f"{GREEN}Evaluation APPROVED!{RESET}")
+         print(f" Review output: {log_file}")
+         return 0
+     elif verdict == "NEEDS_REVISION":
+         print(f"{YELLOW}Evaluation NEEDS_REVISION{RESET}")
+         print(f" Details: {log_file}")
+         return 1
+     elif verdict == "REJECTED":
+         print(f"{RED}Evaluation REJECTED{RESET}")
+         print(f" Details: {log_file}")
+         return 1
+     else:
+         print(f"{YELLOW}Evaluation complete (verdict: {verdict}){RESET}")
+         print(f" Review output: {log_file}")
+         return 0
+
+
+ # Helper functions
+ def _check_file_size(file_path: str) -> tuple[int, int]:
+     """Return (line_count, estimated_tokens)."""
+     with open(file_path, "r") as f:
+         lines = f.readlines()
+         f.seek(0)
+         content = f.read()
+     return len(lines), len(content) // 4
+
+
+ def _warn_large_file(line_count: int, tokens: int) -> None:
+     """Print large file warning."""
+     print(f"{YELLOW}Large file detected:{RESET}")
+     print(f" Lines: {line_count:,}")
+     print(f" Estimated tokens: ~{tokens:,}")
+     print()
+
+
+ def _confirm_continue() -> bool:
+     """Ask user to confirm continuing with large file."""
+     response = input("Continue anyway? [y/N]: ").strip().lower()
+     return response in ["y", "yes"]
+
+
+ def _print_aider_help() -> None:
+     """Print aider installation help."""
+     print()
+     print(f"{BOLD}FIX:{RESET}")
+     print(" 1. Install aider: pip install aider-chat")
+     print(" 2. Verify: aider --version")
+
+
+ def _print_rate_limit_error(file_path: str) -> None:
+     """Print rate limit error with suggestions."""
+     print(f"{RED}Error: API rate limit exceeded{RESET}")
+     print()
+     print(f"{BOLD}SOLUTIONS:{RESET}")
+     print(" 1. Split into smaller documents (<500 lines)")
+     print(" 2. Upgrade your API tier")
+     print(" 3. Wait and retry")
+
+
+ def _print_timeout_error(timeout: int) -> None:
+     """Print timeout error."""
+     print(f"{RED}Error: Evaluation timed out (>{timeout}s){RESET}")
+
+
+ def _print_platform_error() -> None:
+     """Print platform compatibility error."""
+     if platform.system() == "Windows":
+         print(f"{RED}Error: Windows not supported{RESET}")
+         print(" Use WSL (Windows Subsystem for Linux)")
+     else:
+         print(f"{RED}Error: Script not found{RESET}")
+         print(" Run: adversarial init")
@@ -0,0 +1,17 @@
+ """Shared utilities for adversarial-workflow."""
+
+ from .colors import BOLD, CYAN, GRAY, GREEN, RED, RESET, YELLOW
+ from .config import load_config
+ from .validation import validate_evaluation_output
+
+ __all__ = [
+     "BOLD",
+     "CYAN",
+     "GRAY",
+     "GREEN",
+     "RED",
+     "RESET",
+     "YELLOW",
+     "load_config",
+     "validate_evaluation_output",
+ ]
@@ -0,0 +1,9 @@
+ """Terminal color constants."""
+
+ RESET = "\033[0m"
+ BOLD = "\033[1m"
+ GREEN = "\033[92m"
+ YELLOW = "\033[93m"
+ RED = "\033[91m"
+ CYAN = "\033[96m"
+ GRAY = "\033[90m"
@@ -0,0 +1,44 @@
+ """Configuration loading utilities."""
+
+ from __future__ import annotations
+
+ import os
+ from typing import Any
+
+ import yaml
+
+
+ def load_config(config_path: str = ".adversarial/config.yml") -> dict[str, Any]:
+     """Load configuration from YAML file with environment variable overrides."""
+     # Default configuration
+     config: dict[str, Any] = {
+         "evaluator_model": "gpt-4o",
+         "task_directory": "tasks/",
+         "test_command": "pytest",
+         "log_directory": ".adversarial/logs/",
+         "artifacts_directory": ".adversarial/artifacts/",
+     }
+
+     # Load from file if exists
+     if os.path.exists(config_path):
+         with open(config_path) as f:
+             file_config = yaml.safe_load(f) or {}
+             if not isinstance(file_config, dict):
+                 raise ValueError(
+                     f"Config file must be a mapping, got {type(file_config).__name__}"
+                 )
+             config.update(file_config)
+
+     # Override with environment variables
+     env_overrides = {
+         "ADVERSARIAL_EVALUATOR_MODEL": "evaluator_model",
+         "ADVERSARIAL_TEST_COMMAND": "test_command",
+         "ADVERSARIAL_LOG_DIR": "log_directory",
+     }
+
+     for env_var, config_key in env_overrides.items():
+         value = os.getenv(env_var)
+         if value:
+             config[config_key] = value
+
+     return config
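
Read together, the new modules form a small pipeline: evaluator definitions under .adversarial/evaluators/*.yml are parsed and validated into EvaluatorConfig objects by parse_evaluator_yaml() and discover_local_evaluators(), and run_evaluator() then shells out to aider (or to the bundled scripts for built-in evaluators), writes a Markdown report into the configured log directory, and maps the verdict to an exit code. The sketch below shows how that flow might be exercised end to end. The function names and the required YAML fields (name, description, model, api_key_env, prompt, output_suffix) come from the code above; the import paths, the evaluator name "tone-check", and the example field values are illustrative assumptions, since the diff does not show the package's module layout.

# Illustrative sketch only: the module paths and example values below are assumed, not taken from the diff.
from pathlib import Path

from adversarial_workflow.evaluators.discovery import discover_local_evaluators
from adversarial_workflow.evaluators.runner import run_evaluator

# A minimal definition containing every field the parser requires.
definition = Path(".adversarial/evaluators/tone-check.yml")
definition.parent.mkdir(parents=True, exist_ok=True)
definition.write_text(
    "name: tone-check\n"
    "description: Check document tone and clarity\n"
    "model: gpt-4o\n"
    "api_key_env: OPENAI_API_KEY\n"
    "output_suffix: TONE\n"
    "prompt: |\n"
    "  Review the document for tone, clarity, and consistency.\n",
    encoding="utf-8",
)

# Discovery returns a dict keyed by evaluator name (and any aliases).
evaluators = discover_local_evaluators()
config = evaluators["tone-check"]

# run_evaluator also expects .adversarial/config.yml (created by 'adversarial init') and aider on PATH.
# Exit code 0 for APPROVED (or no verdict), 1 for NEEDS_REVISION, REJECTED, or errors.
exit_code = run_evaluator(config, "docs/PLAN.md", timeout=180)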