arbiter-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arbiter/__init__.py +3 -0
- arbiter/cli/__init__.py +0 -0
- arbiter/cli/app.py +699 -0
- arbiter/cli/display.py +381 -0
- arbiter/core/__init__.py +0 -0
- arbiter/core/benchmarks.py +804 -0
- arbiter/core/config.py +137 -0
- arbiter/core/discover.py +184 -0
- arbiter/core/judge.py +193 -0
- arbiter/core/leaderboard.py +197 -0
- arbiter/core/metrics.py +367 -0
- arbiter/core/providers/__init__.py +19 -0
- arbiter/core/providers/anthropic_provider.py +133 -0
- arbiter/core/providers/base.py +62 -0
- arbiter/core/providers/factory.py +79 -0
- arbiter/core/providers/google_provider.py +126 -0
- arbiter/core/providers/ollama.py +103 -0
- arbiter/core/providers/openai_provider.py +120 -0
- arbiter/core/runner.py +257 -0
- arbiter/core/swe/__init__.py +1 -0
- arbiter/core/swe/container.py +158 -0
- arbiter/core/swe/runner.py +220 -0
- arbiter/core/swe/sandbox.py +111 -0
- arbiter/core/swe/test_packs.py +548 -0
- arbiter/dashboard/__init__.py +0 -0
- arbiter/dashboard/frontend/dist/assets/index-1tkxJouQ.css +1 -0
- arbiter/dashboard/frontend/dist/assets/index-dHa4zmvw.js +298 -0
- arbiter/dashboard/frontend/dist/index.html +16 -0
- arbiter/dashboard/server.py +426 -0
- arbiter_cli-0.1.0.dist-info/METADATA +299 -0
- arbiter_cli-0.1.0.dist-info/RECORD +35 -0
- arbiter_cli-0.1.0.dist-info/WHEEL +5 -0
- arbiter_cli-0.1.0.dist-info/entry_points.txt +2 -0
- arbiter_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- arbiter_cli-0.1.0.dist-info/top_level.txt +1 -0
arbiter/core/config.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Configuration and defaults for Arbiter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Paths under the current user's home directory (~/.arbiter/).
ARBITER_DIR = Path.home().joinpath(".arbiter")
LEADERBOARD_FILE = ARBITER_DIR.joinpath("leaderboard.json")
CONFIG_FILE = ARBITER_DIR.joinpath("config.json")

# Ollama defaults: used when the OLLAMA_HOST environment variable is unset.
DEFAULT_OLLAMA_HOST = "http://localhost:11434"

# Provider identifiers: the first four double as the "<provider>:" prefix
# accepted in model specifiers.
PROVIDER_OLLAMA = "ollama"
PROVIDER_OPENAI = "openai"
PROVIDER_ANTHROPIC = "anthropic"
PROVIDER_GOOGLE = "google"
PROVIDER_OPENAI_COMPAT = "openai-compatible"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class ProviderConfig:
    """Settings needed to talk to one LLM provider."""

    # Provider identifier, e.g. "ollama" or "openai".
    provider: str
    # Credential for the provider; None when no key is configured.
    api_key: Optional[str] = field(default=None)
    # Endpoint override; None means no explicit endpoint was resolved.
    base_url: Optional[str] = field(default=None)
    # Free-form extras; resolve_model() stores "model" and "original_spec" here.
    extra: dict = field(default_factory=dict)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def ensure_arbiter_dir() -> Path:
    """Make sure the ~/.arbiter/ directory exists and return its path.

    Missing parent directories are created as well; an already existing
    directory is left untouched.
    """
    directory = ARBITER_DIR
    directory.mkdir(parents=True, exist_ok=True)
    return directory
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_ollama_host() -> str:
    """Return the Ollama API host.

    The OLLAMA_HOST environment variable wins when it is set (even if it is
    empty); otherwise DEFAULT_OLLAMA_HOST is used.
    """
    default_host = DEFAULT_OLLAMA_HOST
    configured = os.environ.get("OLLAMA_HOST")
    if configured is None:
        return default_host
    return configured
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def detect_provider(model_name: str) -> tuple[str, str]:
    """Detect the provider and clean model name from a model specifier.

    Formats:
        "gemma4"                              -> (ollama, gemma4)
        "ollama:gemma4"                       -> (ollama, gemma4)
        "openai:gpt-4o"                       -> (openai, gpt-4o)
        "anthropic:claude-sonnet-4-20250514"  -> (anthropic, claude-sonnet-4-20250514)
        "google:gemini-2.0-flash"             -> (google, gemini-2.0-flash)
        "http://host:port/v1:model"           -> (openai-compatible, "http://host:port/v1||model")

    For URL-based custom endpoints the base URL and the model are packed into
    the second tuple element, joined by "||"; the caller is expected to split
    them apart again.

    Any other input -- a bare name, an Ollama tag such as "gemma4:latest" or
    "qwen2.5:7b", or a URL whose text after the last colon contains a slash --
    is returned unchanged as an Ollama model name.

    Returns:
        A ``(provider, model)`` tuple.
    """
    prefix, separator, rest = model_name.partition(":")
    if not separator:
        # No colon at all: plain Ollama model name.
        return PROVIDER_OLLAMA, model_name

    # Explicit "<provider>:<model>" form.
    if prefix in (PROVIDER_OLLAMA, PROVIDER_OPENAI, PROVIDER_ANTHROPIC, PROVIDER_GOOGLE):
        return prefix, rest

    # URL-based custom endpoint (openai-compatible).
    if prefix in ("http", "https"):
        # URLs carry colons of their own (scheme, optional port), so the model
        # separator is the LAST colon, and only when what follows it looks
        # like a model name (no slashes).
        # NOTE: a URL ending in a bare port (e.g. "http://host:8080") cannot
        # be told apart from "url:model" here, so the port is taken as the
        # model name.
        base_url, _, model = model_name.rpartition(":")
        if "/" not in model:
            return PROVIDER_OPENAI_COMPAT, f"{base_url}||{model}"

    # Ollama model with tag (e.g. "gemma4:latest" or "qwen2.5:7b").
    return PROVIDER_OLLAMA, model_name
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_api_key(provider: str) -> Optional[str]:
    """Look up the API key for *provider* in the environment.

    Returns None when the provider has no associated environment variable
    (e.g. Ollama) or when that variable is not set. OpenAI-compatible
    endpoints reuse OPENAI_API_KEY.
    """
    variable_name = {
        PROVIDER_OPENAI: "OPENAI_API_KEY",
        PROVIDER_ANTHROPIC: "ANTHROPIC_API_KEY",
        PROVIDER_GOOGLE: "GOOGLE_API_KEY",
        PROVIDER_OPENAI_COMPAT: "OPENAI_API_KEY",
    }.get(provider)
    return os.environ.get(variable_name) if variable_name else None
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def resolve_model(model_spec: str) -> ProviderConfig:
    """Turn a model specifier into a complete ProviderConfig.

    The result carries the provider name, the API key found in the
    environment (if any), a base URL where one applies, and, in ``extra``,
    the clean model name under "model" plus the untouched input under
    "original_spec".
    """
    provider, model_name = detect_provider(model_spec)

    endpoint = None
    if provider == PROVIDER_OLLAMA:
        endpoint = get_ollama_host()
    elif provider == PROVIDER_OPENAI_COMPAT and "||" in model_name:
        # detect_provider packs custom endpoints as "<base_url>||<model>".
        endpoint, model_name = model_name.split("||", 1)

    key = get_api_key(provider)
    details = {"model": model_name, "original_spec": model_spec}
    return ProviderConfig(
        provider=provider,
        api_key=key,
        base_url=endpoint,
        extra=details,
    )
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def load_config() -> dict:
    """Load user config from ~/.arbiter/config.json.

    Returns:
        The parsed JSON content, or an empty dict when the file does not
        exist.

    Raises:
        json.JSONDecodeError: if the file exists but is not valid JSON.
    """
    # Open directly instead of checking exists() first: this avoids the race
    # where the file disappears between the check and the open.
    try:
        # JSON files are UTF-8; do not depend on the platform locale encoding.
        with open(CONFIG_FILE, encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        # A missing file (or a missing/invalid parent path) means "no config".
        return {}
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def save_config(config: dict) -> None:
    """Save user config to ~/.arbiter/config.json.

    Args:
        config: JSON-serializable mapping to persist.

    Raises:
        TypeError: if *config* contains values json cannot serialize. The
            existing file is left untouched in that case.
    """
    # Serialize BEFORE opening the file: opening with "w" truncates it, so a
    # serialization error after that point would destroy the previous config.
    payload = json.dumps(config, indent=2)
    ensure_arbiter_dir()
    with open(CONFIG_FILE, "w", encoding="utf-8") as f:
        f.write(payload)
|
arbiter/core/discover.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""Model discovery -- list all available models across providers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from arbiter.core.config import get_api_key, PROVIDER_OPENAI, PROVIDER_ANTHROPIC, PROVIDER_GOOGLE
|
|
9
|
+
from arbiter.core.providers.ollama import OllamaProvider
|
|
10
|
+
from arbiter.core.providers.openai_provider import OpenAIProvider
|
|
11
|
+
from arbiter.core.providers.anthropic_provider import AnthropicProvider
|
|
12
|
+
from arbiter.core.providers.google_provider import GoogleProvider
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_system_memory() -> dict:
    """Report system memory usage.

    Returns a dict with "total_gb", "available_gb" and "used_gb" (bytes
    divided by 1024**3, rounded to one decimal) plus "percent" exactly as
    psutil reports it.
    """
    import psutil

    bytes_per_gb = 1024**3
    stats = psutil.virtual_memory()

    def _as_gb(byte_count):
        return round(byte_count / bytes_per_gb, 1)

    return {
        "total_gb": _as_gb(stats.total),
        "available_gb": _as_gb(stats.available),
        "used_gb": _as_gb(stats.used),
        "percent": stats.percent,
    }
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def get_max_model_size_gb() -> float:
    """Return the largest model size (in GB) considered safe on this machine.

    Rule of thumb: a model may take at most 70% of the currently available
    RAM, leaving room for the OS, Ollama overhead, and the app itself.
    """
    available_gb = get_system_memory()["available_gb"]
    return round(available_gb * 0.70, 1)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class DiscoveredModel:
    """One model reported by a provider during discovery."""

    name: str
    provider: str
    # Size in bytes as reported by the provider; None when unknown.
    size: Optional[int] = None
    parameter_size: Optional[str] = None
    quantization: Optional[str] = None
    family: Optional[str] = None
    multimodal: bool = False
    display_name: Optional[str] = None

    @property
    def size_gb(self) -> Optional[float]:
        """Size in GB (one decimal), or None when the size is missing or zero."""
        return round(self.size / (1024**3), 1) if self.size else None

    @property
    def fits_in_memory(self) -> bool:
        """Tell whether the model is within the safe memory budget."""
        model_gb = self.size_gb
        if model_gb is None:
            # No size info (cloud models): nothing to check.
            return True
        budget_gb = get_max_model_size_gb()
        return model_gb <= budget_gb

    @property
    def memory_warning(self) -> Optional[str]:
        """Return a warning text when the model is too large for this machine.

        None means either the size is unknown or the model fits comfortably.
        """
        model_gb = self.size_gb
        if model_gb is None:
            return None

        mem = get_system_memory()
        available_gb = mem["available_gb"]
        if model_gb > available_gb:
            return f"Model is {model_gb}GB but only {available_gb}GB available. Will be extremely slow."
        if model_gb > available_gb * 0.7:
            return f"Model is {model_gb}GB which is tight for {mem['total_gb']}GB RAM. May be slow."
        return None

    @property
    def spec(self) -> str:
        """Model specifier accepted by arbiter: bare name for Ollama, else "provider:name"."""
        return self.name if self.provider == "ollama" else f"{self.provider}:{self.name}"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
async def discover_ollama() -> list[DiscoveredModel]:
    """List the models offered by the local Ollama installation.

    Returns an empty list when the connection check fails.
    """
    client = OllamaProvider()
    if not await client.check_connection():
        return []

    vision_families = ["clip", "mllama"]

    def _to_model(entry) -> DiscoveredModel:
        # A model counts as multimodal when any of its families is a known
        # vision family; a missing or null "families" is treated as empty.
        families = entry.get("families") or []
        has_vision = any(tag in families for tag in vision_families)
        return DiscoveredModel(
            name=entry["name"],
            provider="ollama",
            size=entry.get("size"),
            parameter_size=entry.get("parameter_size"),
            quantization=entry.get("quantization"),
            family=entry.get("family"),
            multimodal=has_vision,
        )

    listing = await client.list_models()
    return [_to_model(entry) for entry in listing]
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
async def discover_openai() -> list[DiscoveredModel]:
    """List OpenAI models; returns an empty list when no API key is set."""
    key = get_api_key(PROVIDER_OPENAI)
    if not key:
        return []

    # Only models whose name CONTAINS one of these markers are kept
    # (substring match, not a strict prefix match).
    markers = ("gpt-", "o1", "o3", "o4", "chatgpt")

    client = OpenAIProvider(api_key=key)
    listing = await client.list_models()

    selected = []
    for entry in listing:
        if not any(marker in entry["name"] for marker in markers):
            continue
        selected.append(
            DiscoveredModel(
                name=entry["name"],
                provider="openai",
                display_name=entry["name"],
            )
        )
    return selected
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
async def discover_anthropic() -> list[DiscoveredModel]:
    """List Anthropic models; returns an empty list when no API key is set."""
    key = get_api_key(PROVIDER_ANTHROPIC)
    if not key:
        return []

    client = AnthropicProvider(api_key=key)
    listing = await client.list_models()

    found = []
    for entry in listing:
        # Every Anthropic model is flagged as multimodal here.
        found.append(
            DiscoveredModel(
                name=entry["name"],
                provider="anthropic",
                display_name=entry["name"],
                multimodal=True,
            )
        )
    return found
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
async def discover_google() -> list[DiscoveredModel]:
    """List Google models; returns an empty list when no API key is set."""
    key = get_api_key(PROVIDER_GOOGLE)
    if not key:
        return []

    client = GoogleProvider(api_key=key)
    listing = await client.list_models()

    found = []
    for entry in listing:
        # Fall back to the raw name when the entry has no "display_name".
        label = entry.get("display_name", entry["name"])
        found.append(
            DiscoveredModel(
                name=entry["name"],
                provider="google",
                display_name=label,
            )
        )
    return found
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
async def discover_all() -> list[DiscoveredModel]:
    """Query every provider concurrently and merge the discovered models.

    Results keep provider order: Ollama, OpenAI, Anthropic, Google.
    """
    import asyncio

    outcomes = await asyncio.gather(
        discover_ollama(),
        discover_openai(),
        discover_anthropic(),
        discover_google(),
        return_exceptions=True,
    )

    # With return_exceptions=True a failed provider yields its exception
    # object instead of a list; such outcomes are dropped silently.
    return [
        model
        for outcome in outcomes
        if isinstance(outcome, list)
        for model in outcome
    ]
|
arbiter/core/judge.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""Auto-judge system -- uses a model to score all outputs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from arbiter.core.config import resolve_model
|
|
10
|
+
from arbiter.core.metrics import ComparisonResult, ModelMetrics
|
|
11
|
+
from arbiter.core.providers.factory import create_provider
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
JUDGE_SYSTEM_PROMPT = """You are an expert judge evaluating AI model outputs. You will be given a prompt and multiple model responses. Score each response on these dimensions:
|
|
15
|
+
|
|
16
|
+
1. **Correctness** (1-10): Is the answer factually accurate and free of errors?
|
|
17
|
+
2. **Completeness** (1-10): Does it fully address the prompt?
|
|
18
|
+
3. **Clarity** (1-10): Is it well-organized and easy to understand?
|
|
19
|
+
4. **Code Quality** (1-10): If code is present, is it clean, efficient, and correct? If no code, score based on writing quality.
|
|
20
|
+
|
|
21
|
+
Respond ONLY with valid JSON in this exact format:
|
|
22
|
+
{
|
|
23
|
+
"results": [
|
|
24
|
+
{
|
|
25
|
+
"model": "<model_name>",
|
|
26
|
+
"correctness": <1-10>,
|
|
27
|
+
"completeness": <1-10>,
|
|
28
|
+
"clarity": <1-10>,
|
|
29
|
+
"code_quality": <1-10>,
|
|
30
|
+
"overall": <1-10>,
|
|
31
|
+
"reasoning": "<brief explanation>"
|
|
32
|
+
}
|
|
33
|
+
],
|
|
34
|
+
"winner": "<model_name>"
|
|
35
|
+
}"""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _build_judge_prompt(prompt: str, models: list[ModelMetrics]) -> str:
|
|
39
|
+
"""Build the prompt for the judge model."""
|
|
40
|
+
parts = [f'Original prompt: """{prompt}"""\n']
|
|
41
|
+
|
|
42
|
+
for i, m in enumerate(models, 1):
|
|
43
|
+
output = m.output
|
|
44
|
+
if output.startswith("[ERROR]"):
|
|
45
|
+
output = "(Model failed to generate a response)"
|
|
46
|
+
# Truncate very long outputs for the judge
|
|
47
|
+
if len(output) > 4000:
|
|
48
|
+
output = output[:4000] + "\n... (truncated)"
|
|
49
|
+
parts.append(f'--- Response from {m.model} ---\n"""{output}"""\n')
|
|
50
|
+
|
|
51
|
+
parts.append(
|
|
52
|
+
"Score each response. Remember to respond ONLY with valid JSON."
|
|
53
|
+
)
|
|
54
|
+
return "\n".join(parts)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _parse_judge_response(text: str, models: list[ModelMetrics]) -> dict:
|
|
58
|
+
"""Parse the judge's JSON response, handling common formatting issues."""
|
|
59
|
+
# Try to extract JSON from the response
|
|
60
|
+
# Sometimes models wrap JSON in markdown code blocks
|
|
61
|
+
json_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
|
|
62
|
+
if json_match:
|
|
63
|
+
text = json_match.group(1)
|
|
64
|
+
else:
|
|
65
|
+
# Try to find raw JSON
|
|
66
|
+
brace_start = text.find("{")
|
|
67
|
+
brace_end = text.rfind("}") + 1
|
|
68
|
+
if brace_start >= 0 and brace_end > brace_start:
|
|
69
|
+
text = text[brace_start:brace_end]
|
|
70
|
+
|
|
71
|
+
try:
|
|
72
|
+
data = json.loads(text)
|
|
73
|
+
except json.JSONDecodeError:
|
|
74
|
+
# Fallback: return equal scores
|
|
75
|
+
model_names = [m.model for m in models]
|
|
76
|
+
return {
|
|
77
|
+
"results": [
|
|
78
|
+
{
|
|
79
|
+
"model": name,
|
|
80
|
+
"correctness": 5,
|
|
81
|
+
"completeness": 5,
|
|
82
|
+
"clarity": 5,
|
|
83
|
+
"code_quality": 5,
|
|
84
|
+
"overall": 5,
|
|
85
|
+
"reasoning": "Judge response could not be parsed",
|
|
86
|
+
}
|
|
87
|
+
for name in model_names
|
|
88
|
+
],
|
|
89
|
+
"winner": model_names[0] if model_names else None,
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
return data
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
async def judge_comparison(
    comparison: ComparisonResult,
    judge_model: str = "auto",
) -> ComparisonResult:
    """Score all model outputs using a judge model.

    Args:
        comparison: The comparison result to judge.
        judge_model: Model spec for the judge. "auto" picks one automatically.

    Returns:
        The same ComparisonResult. For every model the judge reported on,
        ``quality_scores`` and ``overall_score`` are filled in, and
        ``comparison.judge_model`` records which judge was used. The winner
        is NOT set here. When every model errored, the comparison is returned
        untouched.
    """
    # Filter out models that errored.
    valid_models = [m for m in comparison.models if not m.output.startswith("[ERROR]")]
    if not valid_models:
        return comparison

    # Resolve judge model.
    if judge_model == "auto":
        judge_model = await _pick_judge_model(comparison.models)

    config = resolve_model(judge_model)
    provider, model_name = create_provider(config)

    judge_prompt = _build_judge_prompt(comparison.prompt, valid_models)

    # Generate the judge response with a chunk limit and a time limit.
    # NOTE: both limits are only evaluated when a chunk arrives, so a stream
    # that stalls without yielding anything is not interrupted here.
    import time

    output_parts = []
    chunk_count = 0
    start = time.perf_counter()
    async for chunk in provider.stream_generate(
        model=model_name,
        prompt=judge_prompt,
        system=JUDGE_SYSTEM_PROMPT,
    ):
        output_parts.append(chunk.text)
        chunk_count += 1
        if chunk.done:
            break
        # Hard limits to prevent hanging.
        if chunk_count >= 1500:
            break
        if time.perf_counter() - start > 90:
            break

    judge_text = "".join(output_parts)
    parsed = _parse_judge_response(judge_text, valid_models)

    # The judge's output is untrusted model text: tolerate a non-dict
    # payload, a missing/null/non-list "results", and malformed entries
    # instead of crashing after the expensive generation step.
    raw_results = parsed.get("results") if isinstance(parsed, dict) else None
    if not isinstance(raw_results, list):
        raw_results = []

    results_by_model = {}
    for entry in raw_results:
        if not isinstance(entry, dict):
            continue
        reported_name = entry.get("model", "")
        if isinstance(reported_name, str):
            results_by_model[reported_name] = entry

    # Apply scores to metrics; any dimension the judge omitted defaults to 5.
    for m in comparison.models:
        scores = results_by_model.get(m.model, {})
        if scores:
            m.quality_scores = {
                "correctness": scores.get("correctness", 5),
                "completeness": scores.get("completeness", 5),
                "clarity": scores.get("clarity", 5),
                "code_quality": scores.get("code_quality", 5),
            }
            m.overall_score = scores.get("overall", 5)

    # Judge no longer picks the winner -- composite scoring does that.
    comparison.judge_model = judge_model

    return comparison
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
async def _pick_judge_model(models: list[ModelMetrics]) -> str:
    """Auto-select a judge model.

    Prefers the largest local Ollama model that is not part of the
    comparison. If none is found (or Ollama cannot be queried), falls back
    to the first compared model, and to "llama3.2" when *models* is empty.
    """
    from arbiter.core.providers.ollama import OllamaProvider

    taken = {entry.model for entry in models}

    def _size_of(candidate):
        # Missing or null sizes sort as 0.
        return candidate.get("size", 0) or 0

    # Best effort: any failure while talking to Ollama simply means that no
    # local judge is available.
    try:
        client = OllamaProvider()
        if await client.check_connection():
            candidates = await client.list_models()
            # Largest first, to pick the best judge.
            candidates.sort(key=_size_of, reverse=True)
            for candidate in candidates:
                tag = candidate["name"]
                # Skip models being compared, matching either the full tag
                # or the part before the first ":".
                if tag in taken or tag.split(":")[0] in taken:
                    continue
                return tag
    except Exception:
        pass

    # Fallback: use the first compared model as judge (not ideal but works).
    if models:
        return models[0].model
    return "llama3.2"
|