adversarial-workflow 0.5.0-py3-none-any.whl → 0.6.1-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- adversarial_workflow/__init__.py +1 -1
- adversarial_workflow/cli.py +155 -246
- adversarial_workflow/evaluators/__init__.py +45 -0
- adversarial_workflow/evaluators/builtins.py +36 -0
- adversarial_workflow/evaluators/config.py +49 -0
- adversarial_workflow/evaluators/discovery.py +212 -0
- adversarial_workflow/evaluators/runner.py +313 -0
- adversarial_workflow/utils/__init__.py +17 -0
- adversarial_workflow/utils/colors.py +9 -0
- adversarial_workflow/utils/config.py +44 -0
- adversarial_workflow/utils/file_splitter.py +378 -0
- adversarial_workflow/utils/validation.py +76 -0
- {adversarial_workflow-0.5.0.dist-info → adversarial_workflow-0.6.1.dist-info}/METADATA +94 -4
- {adversarial_workflow-0.5.0.dist-info → adversarial_workflow-0.6.1.dist-info}/RECORD +18 -8
- {adversarial_workflow-0.5.0.dist-info → adversarial_workflow-0.6.1.dist-info}/WHEEL +1 -1
- {adversarial_workflow-0.5.0.dist-info → adversarial_workflow-0.6.1.dist-info}/entry_points.txt +0 -0
- {adversarial_workflow-0.5.0.dist-info → adversarial_workflow-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {adversarial_workflow-0.5.0.dist-info → adversarial_workflow-0.6.1.dist-info}/top_level.txt +0 -0
adversarial_workflow/__init__.py
CHANGED
adversarial_workflow/cli.py
CHANGED
@@ -27,9 +27,9 @@ from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 
 import yaml
-from dotenv import load_dotenv
+from dotenv import load_dotenv, dotenv_values
 
-__version__ = "0.5.0"
+__version__ = "0.6.1"
 
 # ANSI color codes for better output
 RESET = "\033[0m"
@@ -800,26 +800,37 @@ def check() -> int:
     issues: List[Dict] = []
     good_checks: List[str] = []
 
-    # Check for .env file
+    # Check for .env file (note: already loaded by main() at startup)
     env_file = Path(".env")
     env_loaded = False
-    env_keys_before = set(os.environ.keys())
 
     if env_file.exists():
         try:
+            # Load .env into environment (idempotent - safe to call again after main())
             load_dotenv(env_file)
-
-
+            # Use dotenv_values() to count variables directly from file
+            # This gives accurate count regardless of what was already in environment
+            env_vars = dotenv_values(env_file)
            env_loaded = True
            good_checks.append(
-                f".env file found
+                f".env file found ({len(env_vars)} variables configured)"
            )
-        except
+        except (FileNotFoundError, PermissionError) as e:
+            # File access errors
            issues.append(
                {
                    "severity": "WARNING",
-                    "message": f".env file found but could not be
-                    "fix": "Check .env file
+                    "message": f".env file found but could not be read: {e}",
+                    "fix": "Check .env file permissions",
+                }
+            )
+        except (OSError, ValueError) as e:
+            # Covers UnicodeDecodeError (ValueError subclass) and other OS errors
+            issues.append(
+                {
+                    "severity": "WARNING",
+                    "message": f".env file found but could not be parsed: {e}",
+                    "fix": "Check .env file encoding (should be UTF-8)",
                }
            )
    else:
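The switch from counting `os.environ` keys to `dotenv_values()` is worth a small illustration. A minimal sketch, assuming a `.env` file exists in the working directory: `load_dotenv()` only merges variables into `os.environ` (and by default does not override keys that are already set), so a before/after key count can under-report, while `dotenv_values()` parses the file directly and returns exactly the variables it defines.

```python
# Sketch of the python-dotenv behaviour the new check relies on.
from dotenv import load_dotenv, dotenv_values

load_dotenv(".env")               # merges into os.environ; existing keys win by default
env_vars = dotenv_values(".env")  # dict parsed straight from the file, e.g. {"OPENAI_API_KEY": "sk-..."}
print(f".env file found ({len(env_vars)} variables configured)")
```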
@@ -2086,225 +2097,8 @@ def evaluate(task_file: str) -> int:
     return 0
 
 
-def proofread(doc_file: str) -> int:
-    """Run proofreading review on teaching/documentation content."""
-
-    print(f"📖 Proofreading document: {doc_file}")
-    print()
-
-    # Error 1: Document file not found
-    if not os.path.exists(doc_file):
-        print(f"{RED}❌ ERROR: Document file not found: {doc_file}{RESET}")
-        print(" Usage: adversarial proofread <document_file>")
-        print(" Example: adversarial proofread docs/guide/concept.md")
-        return 1
-
-    # Error 2: Config not loaded
-    try:
-        config = load_config()
-    except FileNotFoundError:
-        print(f"{RED}❌ ERROR: Not initialized. Run 'adversarial init' first.{RESET}")
-        return 1
-
-    # Error 3: Aider not available
-    if not shutil.which("aider"):
-        print(f"{RED}❌ ERROR: Aider not found{RESET}")
-        print()
-        print(f"{BOLD}WHY:{RESET}")
-        print(" Proofreader uses aider (AI pair programming tool) to:")
-        print(" • Review teaching content quality")
-        print(" • Check clarity and accuracy")
-        print(" • Evaluate pedagogical effectiveness")
-        print()
-        print(f"{BOLD}FIX:{RESET}")
-        print(" 1. Install aider: pip install aider-chat")
-        print(" 2. Verify installation: aider --version")
-        print(" 3. Then retry: adversarial proofread ...")
-        print()
-        print(f"{BOLD}HELP:{RESET}")
-        print(" Aider docs: https://aider.chat/docs/install.html")
-        return 1
-
-    # Pre-flight check for file size
-    with open(doc_file, "r") as f:
-        line_count = len(f.readlines())
-        f.seek(0)
-        file_size = len(f.read())
-
-    # Estimate tokens (1 token ≈ 4 characters)
-    estimated_tokens = file_size // 4
-
-    # Warn if file is large (>500 lines or >20k tokens)
-    if line_count > 500 or estimated_tokens > 20000:
-        print(f"{YELLOW}⚠️ Large file detected:{RESET}")
-        print(f" Lines: {line_count:,}")
-        print(f" Estimated tokens: ~{estimated_tokens:,}")
-        print()
-        print(f"{BOLD}Note:{RESET} Files over 500 lines may exceed OpenAI rate limits.")
-        print(
-            f" If proofreading fails, consider splitting into smaller documents."
-        )
-        print()
-
-        # Give user a chance to cancel for very large files
-        if line_count > 700:
-            print(f"{RED}⚠️ WARNING: File is very large (>{line_count} lines){RESET}")
-            print(f" This will likely fail on Tier 1 OpenAI accounts (30k TPM limit)")
-            print(f" Recommended: Split into files <500 lines each")
-            print()
-            response = input("Continue anyway? [y/N]: ").strip().lower()
-            if response not in ["y", "yes"]:
-                print("Proofreading cancelled.")
-                return 0
-            print()
-
-    # Error 4: Script execution fails
-    script = ".adversarial/scripts/proofread_content.sh"
-    if not os.path.exists(script):
-        print(f"{RED}❌ ERROR: Script not found: {script}{RESET}")
-        print(" Fix: Run 'adversarial init' to reinstall scripts")
-        return 1
-
-    try:
-        result = subprocess.run(
-            [script, doc_file], text=True, capture_output=True, timeout=180  # 3 minutes
-        )
-
-        # Check for rate limit errors in output
-        output = result.stdout + result.stderr
-        if "RateLimitError" in output or "tokens per min (TPM)" in output:
-            print(f"{RED}❌ ERROR: OpenAI rate limit exceeded{RESET}")
-            print()
-            print(f"{BOLD}WHY:{RESET}")
-            print(
-                " Your document file is too large for your OpenAI organization's rate limit"
-            )
-            print()
-
-            # Extract file size for helpful message
-            with open(doc_file, "r") as f:
-                line_count = len(f.readlines())
-
-            print(f"{BOLD}FILE SIZE:{RESET}")
-            print(f" Lines: {line_count:,}")
-            print(f" Recommended limit: 500 lines")
-            print()
-            print(f"{BOLD}SOLUTIONS:{RESET}")
-            print(" 1. Split your document into smaller files (<500 lines each)")
-            print(" 2. Upgrade your OpenAI tier (Tier 2 supports ~1,000 lines)")
-            print(" 3. Use manual review for this comprehensive document")
-            print()
-            print(f"{BOLD}MORE INFO:{RESET}")
-            print(" https://platform.openai.com/docs/guides/rate-limits")
-            return 1
-
-    except subprocess.TimeoutExpired:
-        print(f"{RED}❌ ERROR: Proofreading timed out (>3 minutes){RESET}")
-        print()
-        print(f"{BOLD}WHY:{RESET}")
-        print(" The AI model took too long to respond")
-        print()
-        print(f"{BOLD}POSSIBLE CAUSES:{RESET}")
-        print(" • Network issues connecting to API")
-        print(" • Document file too large (>1000 lines)")
-        print(" • API rate limiting")
-        print()
-        print(f"{BOLD}FIX:{RESET}")
-        print(" 1. Check your network connection")
-        print(" 2. Try a smaller document file")
-        print(" 3. Wait a few minutes and retry")
-        return 1
-    except FileNotFoundError as e:
-        # Check if this is a bash/platform issue
-        if platform.system() == "Windows":
-            print(f"{RED}❌ ERROR: Cannot execute workflow scripts{RESET}")
-            print()
-            print(f"{BOLD}WHY:{RESET}")
-            print(" Native Windows (PowerShell/CMD) cannot run bash scripts")
-            print(" This package requires Unix shell (bash) for workflow automation")
-            print()
-            print(f"{BOLD}FIX:{RESET}")
-            print(" Option 1 (RECOMMENDED): Use WSL (Windows Subsystem for Linux)")
-            print(
-                " 1. Install WSL: https://learn.microsoft.com/windows/wsl/install"
-            )
-            print(" 2. Open WSL terminal")
-            print(" 3. Reinstall package in WSL: pip install adversarial-workflow")
-            print()
-            print(" Option 2: Try Git Bash (not officially supported)")
-            print(" • May have compatibility issues")
-            print(" • WSL is strongly recommended")
-            print()
-            print(f"{BOLD}HELP:{RESET}")
-            print(" See platform requirements: README.md#platform-support")
-        else:
-            print(f"{RED}❌ ERROR: Script not found: {script}{RESET}")
-            print()
-            print(f"{BOLD}WHY:{RESET}")
-            print(" Workflow scripts are missing or corrupted")
-            print()
-            print(f"{BOLD}FIX:{RESET}")
-            print(" Run: adversarial init")
-            print(" This will reinstall all workflow scripts")
-        return 1
-
-    # Error 5: Proofreading rejected
-    if result.returncode != 0:
-        print()
-        print("📋 Proofreading complete (needs revision)")
-        print(f" Details: {config['log_directory']}")
-        return result.returncode
-
-    # Error 6: Validation - Check if proofreading actually ran (not just empty output)
-    # Extract document name from filename to find log file
-    doc_basename = os.path.basename(doc_file)
-    doc_name = os.path.splitext(doc_basename)[0]
 
-    log_file = os.path.join(config["log_directory"], f"{doc_name}-PROOFREADING.md")
 
-    is_valid, verdict, message = validate_evaluation_output(log_file)
-    if not is_valid:
-        print()
-        print(f"{RED}❌ Proofreading failed: {message}{RESET}")
-        print()
-        print(f"{BOLD}WHY:{RESET}")
-        print(" The proofreading script ran but didn't produce valid output")
-        print(" This usually means Aider encountered an error before running GPT-4o")
-        print()
-        print(f"{BOLD}LOG FILE:{RESET}")
-        print(f" {log_file}")
-        print()
-        print(f"{BOLD}FIX:{RESET}")
-        print(" 1. Check the log file for error messages")
-        print(" 2. Ensure your API keys are valid: adversarial check")
-        print(" 3. Try running the proofreading again")
-        print()
-        return 1
-
-    # Verify token count (warn if suspiciously low)
-    verify_token_count(doc_file, log_file)
-
-    # Report based on actual verdict from proofreading
-    print()
-    if verdict == "APPROVED":
-        print(f"{GREEN}✅ Proofreading APPROVED!{RESET}")
-        print(f" Document is ready for publication")
-        print(f" Review output: {log_file}")
-        return 0
-    elif verdict == "NEEDS_REVISION":
-        print(f"{YELLOW}⚠️ Proofreading NEEDS_REVISION{RESET}")
-        print(f" Review feedback and update document")
-        print(f" Details: {log_file}")
-        return 1
-    elif verdict == "REJECTED":
-        print(f"{RED}❌ Proofreading REJECTED{RESET}")
-        print(f" Document has fundamental issues - major revision needed")
-        print(f" Details: {log_file}")
-        return 1
-    else:  # UNKNOWN or other
-        print(f"{YELLOW}⚠️ Proofreading complete (verdict: {verdict}){RESET}")
-        print(f" Review output: {log_file}")
-        return 0
 
 
 def review() -> int:
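The dedicated proofread() function is removed because the same behaviour now flows through the generic evaluator runner: "proofread" is registered as a built-in EvaluatorConfig (see builtins.py below) and dispatched through run_evaluator(). A hedged sketch of the equivalent call, assuming the 0.6.1 package is installed and initialised in the current project; the runner itself is not shown in this excerpt and is assumed to handle the script/aider invocation that proofread() previously did inline.

```python
# Roughly what `adversarial proofread docs/guide/concept.md` now does internally.
from adversarial_workflow.evaluators import BUILTIN_EVALUATORS, run_evaluator

config = BUILTIN_EVALUATORS["proofread"]  # model="gpt-4o", output_suffix="PROOFREADING"
exit_code = run_evaluator(config, "docs/guide/concept.md", timeout=180)
raise SystemExit(exit_code)
```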
@@ -3041,8 +2835,74 @@ def split(task_file: str, strategy: str = "sections", max_lines: int = 500, dry_
         print(f"{RED}Error during file splitting: {e}{RESET}")
         return 1
 
+
+def list_evaluators() -> int:
+    """List all available evaluators (built-in and local)."""
+    from adversarial_workflow.evaluators import (
+        BUILTIN_EVALUATORS,
+        discover_local_evaluators,
+    )
+
+    # Print built-in evaluators
+    print(f"{BOLD}Built-in Evaluators:{RESET}")
+    for name, config in sorted(BUILTIN_EVALUATORS.items()):
+        print(f"  {name:14} {config.description}")
+
+    print()
+
+    # Print local evaluators
+    local_evaluators = discover_local_evaluators()
+    if local_evaluators:
+        print(f"{BOLD}Local Evaluators{RESET} (.adversarial/evaluators/):")
+
+        # Group by primary name (skip aliases)
+        seen_configs = set()
+        for _, config in sorted(local_evaluators.items()):
+            if id(config) in seen_configs:
+                continue
+            seen_configs.add(id(config))
+
+            print(f"  {config.name:14} {config.description}")
+            if config.aliases:
+                print(f"    aliases: {', '.join(config.aliases)}")
+            print(f"    model: {config.model}")
+            if config.version != "1.0.0":
+                print(f"    version: {config.version}")
+    else:
+        print(f"{GRAY}No local evaluators found.{RESET}")
+    print()
+    print("Create .adversarial/evaluators/*.yml to add custom evaluators.")
+    print("See: https://github.com/movito/adversarial-workflow#custom-evaluators")
+
+    return 0
+
 def main():
     """Main CLI entry point."""
+    import logging
+    import sys
+
+    # Load .env file before any commands run
+    # Wrapped in try/except so CLI remains usable even with malformed .env
+    try:
+        load_dotenv()
+    except Exception as e:
+        print(f"Warning: Could not load .env file: {e}", file=sys.stderr)
+
+    from adversarial_workflow.evaluators import (
+        get_all_evaluators,
+        run_evaluator,
+        BUILTIN_EVALUATORS,
+    )
+
+    logger = logging.getLogger(__name__)
+
+    # Commands that cannot be overridden by evaluators
+    # Note: 'review' is special - it reviews git changes without a file argument
+    STATIC_COMMANDS = {
+        "init", "check", "doctor", "health", "quickstart",
+        "agent", "split", "validate", "review", "list-evaluators"
+    }
+
     parser = argparse.ArgumentParser(
         description="Adversarial Workflow - Multi-stage AI code review",
         formatter_class=argparse.RawDescriptionHelpFormatter,
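STATIC_COMMANDS exists so a local evaluator can never shadow a core CLI command. A small illustration of the filtering rule, using a hypothetical local evaluator whose name and one alias collide with static commands (all concrete values below are made up for the example):

```python
from adversarial_workflow.evaluators import EvaluatorConfig

STATIC_COMMANDS = {
    "init", "check", "doctor", "health", "quickstart",
    "agent", "split", "validate", "review", "list-evaluators",
}

conflicting = EvaluatorConfig(
    name="init",                     # hypothetical: collides with the static 'init' command
    description="Would shadow a CLI command",
    model="gpt-4o",
    api_key_env="OPENAI_API_KEY",
    prompt="...",
    output_suffix="INIT-EVAL",
    aliases=["bootstrap", "check"],  # hypothetical aliases; 'check' also collides
)

# Mirrors the guard in main(): conflicting names are skipped entirely,
# conflicting aliases are filtered out before add_parser() is called.
if conflicting.name in STATIC_COMMANDS:
    print(f"Evaluator '{conflicting.name}' conflicts with CLI command; skipping")
print([a for a in conflicting.aliases if a not in STATIC_COMMANDS])  # ['bootstrap']
```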
@@ -3112,17 +2972,7 @@ For more information: https://github.com/movito/adversarial-workflow
         "--path", default=".", help="Project path (default: current directory)"
     )
 
-    # evaluate command
-    eval_parser = subparsers.add_parser("evaluate", help="Run Phase 1: Plan evaluation")
-    eval_parser.add_argument("task_file", help="Task file to evaluate")
-
-    # proofread command
-    proofread_parser = subparsers.add_parser(
-        "proofread", help="Proofread teaching content and documentation"
-    )
-    proofread_parser.add_argument("doc_file", help="Document file to proofread")
-
-    # review command
+    # review command (static - reviews git changes, no file argument)
     subparsers.add_parser("review", help="Run Phase 3: Code review")
 
     # validate command
@@ -3151,13 +3001,74 @@ For more information: https://github.com/movito/adversarial-workflow
         help="Preview splits without creating files"
     )
 
+    # list-evaluators command
+    subparsers.add_parser(
+        "list-evaluators",
+        help="List all available evaluators (built-in and local)",
+    )
+
+    # Dynamic evaluator registration
+    try:
+        evaluators = get_all_evaluators()
+    except Exception as e:
+        logger.warning("Evaluator discovery failed: %s", e)
+        evaluators = BUILTIN_EVALUATORS
+
+    registered_configs = set()  # Track by id() to avoid duplicate alias registration
+
+    for name, config in evaluators.items():
+        # Skip if name conflicts with static command
+        if name in STATIC_COMMANDS:
+            logger.warning("Evaluator '%s' conflicts with CLI command; skipping", name)
+            # Mark as registered to prevent alias re-registration attempts
+            registered_configs.add(id(config))
+            continue
+
+        # Skip if this config was already registered (aliases share config object)
+        if id(config) in registered_configs:
+            continue
+        registered_configs.add(id(config))
+
+        # Filter aliases that conflict with static commands
+        aliases = [a for a in (config.aliases or []) if a not in STATIC_COMMANDS]
+        if config.aliases and len(aliases) != len(config.aliases):
+            skipped = [a for a in config.aliases if a in STATIC_COMMANDS]
+            logger.warning(
+                "Skipping evaluator aliases that conflict with static commands: %s",
+                skipped,
+            )
+
+        # Create subparser for this evaluator
+        eval_parser = subparsers.add_parser(
+            config.name,
+            help=config.description,
+            aliases=aliases,
+        )
+        eval_parser.add_argument("file", help="File to evaluate")
+        eval_parser.add_argument(
+            "--timeout", "-t",
+            type=int,
+            default=180,
+            help="Timeout in seconds (default: 180)"
+        )
+        # Store config for later execution
+        eval_parser.set_defaults(evaluator_config=config)
+
     args = parser.parse_args()
 
     if not args.command:
         parser.print_help()
         return 0
 
-    #
+    # Check for evaluator command first (has evaluator_config attribute)
+    if hasattr(args, "evaluator_config"):
+        return run_evaluator(
+            args.evaluator_config,
+            args.file,
+            timeout=args.timeout,
+        )
+
+    # Execute static commands
     if args.command == "init":
         if args.interactive:
             return init_interactive(args.path)
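The registration block above leans on two standard argparse features: subparser aliases and set_defaults(), which attaches arbitrary attributes to the parsed namespace. A self-contained sketch of the same dispatch pattern, with a plain dict standing in for EvaluatorConfig and a hypothetical "pr" alias:

```python
import argparse

parser = argparse.ArgumentParser(prog="adversarial")
subparsers = parser.add_subparsers(dest="command")

# One dynamically registered evaluator command.
sub = subparsers.add_parser("proofread", help="Teaching content review (GPT-4o)", aliases=["pr"])
sub.add_argument("file", help="File to evaluate")
sub.add_argument("--timeout", "-t", type=int, default=180)
sub.set_defaults(evaluator_config={"name": "proofread"})  # stand-in for an EvaluatorConfig

args = parser.parse_args(["pr", "docs/guide.md", "--timeout", "300"])
if hasattr(args, "evaluator_config"):
    # main() would call run_evaluator(args.evaluator_config, args.file, timeout=args.timeout)
    print(args.evaluator_config["name"], args.file, args.timeout)
```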
@@ -3177,21 +3088,19 @@ For more information: https://github.com/movito/adversarial-workflow
         print(f"{RED}Error: agent command requires a subcommand{RESET}")
         print("Usage: adversarial agent onboard")
         return 1
-    elif args.command == "evaluate":
-        return evaluate(args.task_file)
-    elif args.command == "proofread":
-        return proofread(args.doc_file)
     elif args.command == "review":
         return review()
     elif args.command == "validate":
         return validate(args.test_command)
     elif args.command == "split":
         return split(
-            args.task_file,
-            strategy=args.strategy,
-            max_lines=args.max_lines,
+            args.task_file,
+            strategy=args.strategy,
+            max_lines=args.max_lines,
             dry_run=args.dry_run
         )
+    elif args.command == "list-evaluators":
+        return list_evaluators()
     else:
         parser.print_help()
         return 1
adversarial_workflow/evaluators/__init__.py
ADDED
@@ -0,0 +1,45 @@
+"""Evaluators module for adversarial-workflow plugin architecture."""
+
+from .config import EvaluatorConfig
+from .discovery import (
+    discover_local_evaluators,
+    parse_evaluator_yaml,
+    EvaluatorParseError,
+)
+from .runner import run_evaluator
+from .builtins import BUILTIN_EVALUATORS
+
+
+def get_all_evaluators() -> dict[str, EvaluatorConfig]:
+    """Get all available evaluators (built-in + local).
+
+    Local evaluators override built-in evaluators with the same name.
+    Aliases from local evaluators are also included in the returned dictionary.
+    """
+    import logging
+    logger = logging.getLogger(__name__)
+
+    evaluators: dict[str, EvaluatorConfig] = {}
+
+    # Add built-in evaluators first
+    evaluators.update(BUILTIN_EVALUATORS)
+
+    # Discover and add local evaluators (may override built-ins)
+    local = discover_local_evaluators()
+    for name, config in local.items():
+        if name in BUILTIN_EVALUATORS:
+            logger.info("Local evaluator '%s' overrides built-in", name)
+        evaluators[name] = config
+
+    return evaluators
+
+
+__all__ = [
+    "EvaluatorConfig",
+    "EvaluatorParseError",
+    "run_evaluator",
+    "get_all_evaluators",
+    "discover_local_evaluators",
+    "parse_evaluator_yaml",
+    "BUILTIN_EVALUATORS",
+]
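A short sketch of the override semantics, assuming the 0.6.1 package is installed and run from a project root that may contain .adversarial/evaluators/*.yml files: built-ins are added first, then any local evaluator with the same name replaces its built-in entry.

```python
from adversarial_workflow.evaluators import BUILTIN_EVALUATORS, get_all_evaluators

evaluators = get_all_evaluators()
for name, cfg in sorted(evaluators.items()):
    note = " (overrides builtin)" if name in BUILTIN_EVALUATORS and cfg.source == "local" else ""
    print(f"{name:14} source={cfg.source:7} model={cfg.model}{note}")
```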
adversarial_workflow/evaluators/builtins.py
ADDED
@@ -0,0 +1,36 @@
+"""Built-in evaluator configurations."""
+
+from __future__ import annotations
+
+from .config import EvaluatorConfig
+
+# Built-in evaluators use shell scripts - prompts are in the scripts
+BUILTIN_EVALUATORS: dict[str, EvaluatorConfig] = {
+    "evaluate": EvaluatorConfig(
+        name="evaluate",
+        description="Plan evaluation (GPT-4o)",
+        model="gpt-4o",
+        api_key_env="OPENAI_API_KEY",
+        prompt="",  # Prompt is in shell script
+        output_suffix="PLAN-EVALUATION",
+        source="builtin",
+    ),
+    "proofread": EvaluatorConfig(
+        name="proofread",
+        description="Teaching content review (GPT-4o)",
+        model="gpt-4o",
+        api_key_env="OPENAI_API_KEY",
+        prompt="",  # Prompt is in shell script
+        output_suffix="PROOFREADING",
+        source="builtin",
+    ),
+    "review": EvaluatorConfig(
+        name="review",
+        description="Code review (GPT-4o)",
+        model="gpt-4o",
+        api_key_env="OPENAI_API_KEY",
+        prompt="",  # Prompt is in shell script
+        output_suffix="CODE-REVIEW",
+        source="builtin",
+    ),
+}
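The built-in entries intentionally keep prompt="" because their prompts still live in the .adversarial/scripts/*.sh files; only the metadata needed for CLI registration is declared here. A quick check of that shape, assuming the 0.6.1 package is importable:

```python
from adversarial_workflow.evaluators import BUILTIN_EVALUATORS

assert set(BUILTIN_EVALUATORS) == {"evaluate", "proofread", "review"}
assert all(c.prompt == "" and c.source == "builtin" for c in BUILTIN_EVALUATORS.values())
print({name: c.output_suffix for name, c in BUILTIN_EVALUATORS.items()})
# {'evaluate': 'PLAN-EVALUATION', 'proofread': 'PROOFREADING', 'review': 'CODE-REVIEW'}
```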
adversarial_workflow/evaluators/config.py
ADDED
@@ -0,0 +1,49 @@
+"""
+EvaluatorConfig dataclass for evaluator definitions.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class EvaluatorConfig:
+    """Configuration for an evaluator (built-in or custom).
+
+    This dataclass represents the configuration for any evaluator,
+    whether built-in (evaluate, proofread, review) or custom
+    (defined in .adversarial/evaluators/*.yml).
+
+    Attributes:
+        name: Command name (e.g., "evaluate", "athena")
+        description: Help text shown in CLI
+        model: Model to use (e.g., "gpt-4o", "gemini-2.5-pro")
+        api_key_env: Environment variable name for API key
+        prompt: The evaluation prompt template
+        output_suffix: Log file suffix (e.g., "PLAN-EVALUATION")
+        log_prefix: CLI output prefix (e.g., "ATHENA")
+        fallback_model: Fallback model if primary fails
+        aliases: Alternative command names
+        version: Evaluator version
+        source: "builtin" or "local" (set internally)
+        config_file: Path to YAML file if local (set internally)
+    """
+
+    # Required fields
+    name: str
+    description: str
+    model: str
+    api_key_env: str
+    prompt: str
+    output_suffix: str
+
+    # Optional fields with defaults
+    log_prefix: str = ""
+    fallback_model: str | None = None
+    aliases: list[str] = field(default_factory=list)
+    version: str = "1.0.0"
+
+    # Metadata (set internally during discovery, not from YAML)
+    source: str = "builtin"
+    config_file: str | None = None