arbiter-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
arbiter/core/config.py ADDED
@@ -0,0 +1,137 @@
1
+ """Configuration and defaults for Arbiter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+
12
# Per-user state directory and the files Arbiter keeps inside it.
ARBITER_DIR = Path.home().joinpath(".arbiter")
LEADERBOARD_FILE = ARBITER_DIR.joinpath("leaderboard.json")
CONFIG_FILE = ARBITER_DIR.joinpath("config.json")

# Ollama base URL used when the OLLAMA_HOST environment variable is not set.
DEFAULT_OLLAMA_HOST = "http://localhost:11434"

# Provider identifiers: these are the prefixes accepted in model specifiers
# ("openai:gpt-4o") and the values stored in ProviderConfig.provider.
PROVIDER_OLLAMA = "ollama"
PROVIDER_OPENAI = "openai"
PROVIDER_ANTHROPIC = "anthropic"
PROVIDER_GOOGLE = "google"
PROVIDER_OPENAI_COMPAT = "openai-compatible"
25
+
26
+
27
@dataclass
class ProviderConfig:
    """Configuration for a single LLM provider.

    Attributes:
        provider: Provider identifier (one of the ``PROVIDER_*`` values).
        api_key: API key for the provider, or ``None`` when none is set.
            Left out of ``repr()`` so the secret does not end up in logs
            or tracebacks.
        base_url: Base URL of the provider endpoint, or ``None``.
        extra: Free-form additional settings for the provider.
    """

    provider: str
    # repr=False keeps the key out of the generated __repr__; equality and
    # the constructor signature are unaffected.
    api_key: Optional[str] = field(default=None, repr=False)
    base_url: Optional[str] = None
    extra: dict = field(default_factory=dict)
35
+
36
+
37
def ensure_arbiter_dir() -> Path:
    """Make sure the Arbiter data directory exists and return its path.

    Missing parent directories are created as well; an already existing
    directory is left as it is.
    """
    data_dir = ARBITER_DIR
    data_dir.mkdir(exist_ok=True, parents=True)
    return data_dir
41
+
42
+
43
def get_ollama_host() -> str:
    """Return the base URL of the Ollama API.

    The value comes from the ``OLLAMA_HOST`` environment variable and falls
    back to ``DEFAULT_OLLAMA_HOST`` when the variable is unset or blank.

    ``OLLAMA_HOST`` is often written without a scheme (for example
    ``127.0.0.1:11434``); such a value is not a usable base URL, so
    ``http://`` is prepended. Values that already carry a scheme are
    returned unchanged (apart from surrounding whitespace).
    """
    host = os.environ.get("OLLAMA_HOST", "").strip()
    if not host:
        return DEFAULT_OLLAMA_HOST
    if "://" not in host:
        host = f"http://{host}"
    return host
46
+
47
+
48
def detect_provider(model_name: str) -> tuple[str, str]:
    """Detect the provider and clean model name from a model specifier.

    Formats:
        "gemma4" -> (ollama, gemma4)
        "gemma4:latest" -> (ollama, gemma4:latest)
        "ollama:gemma4" -> (ollama, gemma4)
        "openai:gpt-4o" -> (openai, gpt-4o)
        "anthropic:claude-sonnet-4-20250514" -> (anthropic, claude-sonnet-4-20250514)
        "google:gemini-2.0-flash" -> (google, gemini-2.0-flash)
        "http://host:port/v1:model" -> (openai-compatible, "http://host:port/v1||model")

    For URL specifiers the base URL and the model are returned joined by
    ``"||"`` in the second tuple element. The model starts after the first
    colon of the URL *path*, so model names may themselves contain ``:`` or
    ``/`` (``http://host:8000/v1:qwen2.5:7b``,
    ``http://host/v1:org/model``). A URL without a path may still carry a
    model after its last colon (``http://host:model``) as long as that part
    is not a port number. A URL that names no model at all falls back to the
    Ollama provider with the specifier unchanged.

    Args:
        model_name: The model specifier as typed by the user.

    Returns:
        A ``(provider, model)`` tuple.
    """
    prefix, colon, rest = model_name.partition(":")
    if not colon:
        return PROVIDER_OLLAMA, model_name

    # Explicit "<provider>:<model>" form.
    if prefix in (PROVIDER_OLLAMA, PROVIDER_OPENAI, PROVIDER_ANTHROPIC, PROVIDER_GOOGLE):
        return prefix, rest

    # URL-based custom endpoint (openai-compatible).
    if prefix in ("http", "https") and rest.startswith("//"):
        authority_start = len(prefix) + 3  # skip "<scheme>://"
        path_start = model_name.find("/", authority_start)
        # Colons before the path belong to the authority (port, IPv6), so
        # only a colon inside the path can separate the URL from the model.
        split_at = model_name.find(":", path_start) if path_start != -1 else -1
        if split_at == -1:
            # No separator in the path: accept "scheme://host:model", but do
            # not mistake a port ("host:8000"), a path ("8000/v1") or an
            # IPv6 literal ("[::1]") for a model name.
            last_colon = model_name.rfind(":")
            tail = model_name[last_colon + 1:]
            looks_like_model = (
                last_colon >= authority_start
                and not tail.isdigit()
                and "/" not in tail
                and "]" not in tail
            )
            if looks_like_model:
                split_at = last_colon
        if split_at != -1:
            base_url = model_name[:split_at]
            model = model_name[split_at + 1:]
            if model:
                return PROVIDER_OPENAI_COMPAT, f"{base_url}||{model}"

    # Ollama model with tag (e.g. "gemma4:latest" or "qwen2.5:7b"), or a
    # specifier that could not be interpreted any other way.
    return PROVIDER_OLLAMA, model_name
85
+
86
+
87
def get_api_key(provider: str) -> Optional[str]:
    """Look up the API key for *provider* in the environment.

    OpenAI and openai-compatible endpoints both read ``OPENAI_API_KEY``;
    Anthropic reads ``ANTHROPIC_API_KEY`` and Google reads
    ``GOOGLE_API_KEY``. Providers without an entry (such as Ollama) and
    unset variables both yield ``None``.
    """
    variable_by_provider = {
        PROVIDER_OPENAI: "OPENAI_API_KEY",
        PROVIDER_ANTHROPIC: "ANTHROPIC_API_KEY",
        PROVIDER_GOOGLE: "GOOGLE_API_KEY",
        PROVIDER_OPENAI_COMPAT: "OPENAI_API_KEY",
    }
    try:
        variable = variable_by_provider[provider]
    except KeyError:
        return None
    return os.environ.get(variable)
99
+
100
+
101
def resolve_model(model_spec: str) -> ProviderConfig:
    """Turn a model specifier into a complete provider configuration.

    The specifier is classified by ``detect_provider``. For an
    openai-compatible endpoint the base URL is split off the
    ``"base_url||model"`` pair; for Ollama the base URL comes from
    ``get_ollama_host``; every other provider gets no base URL.

    Returns:
        A ProviderConfig whose ``extra`` holds the clean model name under
        ``"model"`` and the untouched input under ``"original_spec"``.
    """
    provider, model_name = detect_provider(model_spec)

    if provider == PROVIDER_OPENAI_COMPAT and "||" in model_name:
        base_url, _, model_name = model_name.partition("||")
    elif provider == PROVIDER_OLLAMA:
        base_url = get_ollama_host()
    else:
        base_url = None

    return ProviderConfig(
        provider=provider,
        api_key=get_api_key(provider),
        base_url=base_url,
        extra={"model": model_name, "original_spec": model_spec},
    )
123
+
124
+
125
def load_config() -> dict:
    """Load user config from ~/.arbiter/config.json.

    The file is always decoded as UTF-8 (the JSON interchange encoding)
    rather than the platform's locale encoding.

    Returns:
        The parsed configuration, or an empty dict when the file does not
        exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    # Open directly instead of checking exists() first: the file can vanish
    # between the check and the open.
    try:
        with open(CONFIG_FILE, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}
131
+
132
+
133
def save_config(config: dict) -> None:
    """Write *config* to ~/.arbiter/config.json as JSON indented by two spaces.

    The Arbiter directory is created first if it is missing; an existing
    config file is overwritten.
    """
    ensure_arbiter_dir()
    with CONFIG_FILE.open("w") as handle:
        json.dump(config, handle, indent=2)
@@ -0,0 +1,184 @@
1
+ """Model discovery -- list all available models across providers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Optional
7
+
8
+ from arbiter.core.config import get_api_key, PROVIDER_OPENAI, PROVIDER_ANTHROPIC, PROVIDER_GOOGLE
9
+ from arbiter.core.providers.ollama import OllamaProvider
10
+ from arbiter.core.providers.openai_provider import OpenAIProvider
11
+ from arbiter.core.providers.anthropic_provider import AnthropicProvider
12
+ from arbiter.core.providers.google_provider import GoogleProvider
13
+
14
+
15
def get_system_memory() -> dict:
    """Report system memory as read from ``psutil.virtual_memory()``.

    Returns:
        A dict with ``total_gb``, ``available_gb`` and ``used_gb`` (bytes
        divided by 1024**3, rounded to one decimal) plus ``percent`` taken
        from psutil unchanged.
    """
    import psutil

    bytes_per_gb = 1024**3
    snapshot = psutil.virtual_memory()
    report = {
        f"{label}_gb": round(getattr(snapshot, label) / bytes_per_gb, 1)
        for label in ("total", "available", "used")
    }
    report["percent"] = snapshot.percent
    return report
25
+
26
+
27
def get_max_model_size_gb() -> float:
    """Return the largest model size, in GB, considered safe for this machine.

    The budget is 70% of the currently available RAM, rounded to one
    decimal; the remainder is left for the OS, Ollama overhead and the app
    itself.
    """
    available_gb = get_system_memory()["available_gb"]
    return round(available_gb * 0.70, 1)
35
+
36
+
37
@dataclass
class DiscoveredModel:
    """One model reported by a provider during discovery.

    Only ``name`` and ``provider`` are required. The Ollama discovery fills
    in the size and family details; the cloud discoveries leave ``size``
    unset.
    """

    name: str
    provider: str
    size: Optional[int] = None  # divided by 1024**3 to obtain size_gb
    parameter_size: Optional[str] = None
    quantization: Optional[str] = None
    family: Optional[str] = None
    multimodal: bool = False
    display_name: Optional[str] = None

    @property
    def size_gb(self) -> Optional[float]:
        """Size in GB rounded to one decimal; None when size is unset or 0."""
        return round(self.size / (1024**3), 1) if self.size else None

    @property
    def fits_in_memory(self) -> bool:
        """Whether the model stays within the safe memory budget.

        Models without size information (cloud models) always count as
        fitting.
        """
        size_gb = self.size_gb
        return size_gb is None or size_gb <= get_max_model_size_gb()

    @property
    def memory_warning(self) -> Optional[str]:
        """A human-readable warning when the model is large for this machine.

        Returns None when the size is unknown or the model is at most 70% of
        the available RAM.
        """
        if self.size_gb is None:
            return None

        mem = get_system_memory()
        exceeds_available = self.size_gb > mem["available_gb"]
        if exceeds_available:
            return f"Model is {self.size_gb}GB but only {mem['available_gb']}GB available. Will be extremely slow."

        exceeds_budget = self.size_gb > mem["available_gb"] * 0.7
        if exceeds_budget:
            return f"Model is {self.size_gb}GB which is tight for {mem['total_gb']}GB RAM. May be slow."
        return None

    @property
    def spec(self) -> str:
        """Model specifier for use with arbiter.

        Ollama models use the bare name; every other provider is written as
        ``provider:name``.
        """
        return self.name if self.provider == "ollama" else f"{self.provider}:{self.name}"
82
+
83
+
84
async def discover_ollama() -> list[DiscoveredModel]:
    """List the models offered by the local Ollama installation.

    Returns an empty list when the connection check fails. A model is
    flagged multimodal when its ``families`` entry contains ``"clip"`` or
    ``"mllama"``.
    """
    client = OllamaProvider()
    reachable = await client.check_connection()
    if not reachable:
        return []

    def _to_model(entry) -> DiscoveredModel:
        # "families" may be missing or None; treat both as "no families".
        families = entry.get("families") or []
        has_vision = any(tag in families for tag in ["clip", "mllama"])
        return DiscoveredModel(
            name=entry["name"],
            provider="ollama",
            size=entry.get("size"),
            parameter_size=entry.get("parameter_size"),
            quantization=entry.get("quantization"),
            family=entry.get("family"),
            multimodal=has_vision,
        )

    listing = await client.list_models()
    return [_to_model(entry) for entry in listing]
107
+
108
+
109
async def discover_openai() -> list[DiscoveredModel]:
    """List OpenAI models; returns an empty list when no API key is set.

    Only models whose name contains one of ``gpt-``, ``o1``, ``o3``, ``o4``
    or ``chatgpt`` are kept. The match is a substring test, not a prefix
    test.
    """
    key = get_api_key(PROVIDER_OPENAI)
    if not key:
        return []

    client = OpenAIProvider(api_key=key)
    listing = await client.list_models()

    markers = ("gpt-", "o1", "o3", "o4", "chatgpt")
    found = []
    for entry in listing:
        if not any(marker in entry["name"] for marker in markers):
            continue
        found.append(
            DiscoveredModel(
                name=entry["name"],
                provider="openai",
                display_name=entry["name"],
            )
        )
    return found
129
+
130
+
131
async def discover_anthropic() -> list[DiscoveredModel]:
    """List Anthropic models; returns an empty list when no API key is set.

    Every listed model is reported as multimodal, with its name doubling as
    the display name.
    """
    key = get_api_key(PROVIDER_ANTHROPIC)
    if not key:
        return []

    client = AnthropicProvider(api_key=key)
    listing = await client.list_models()

    found = []
    for entry in listing:
        found.append(
            DiscoveredModel(
                name=entry["name"],
                provider="anthropic",
                display_name=entry["name"],
                multimodal=True,
            )
        )
    return found
148
+
149
+
150
async def discover_google() -> list[DiscoveredModel]:
    """List Google models; returns an empty list when no API key is set.

    The display name is the entry's ``display_name`` when present and its
    ``name`` otherwise.
    """
    key = get_api_key(PROVIDER_GOOGLE)
    if not key:
        return []

    client = GoogleProvider(api_key=key)
    listing = await client.list_models()

    found = []
    for entry in listing:
        model_id = entry["name"]
        label = entry.get("display_name", model_id)
        found.append(
            DiscoveredModel(name=model_id, provider="google", display_name=label)
        )
    return found
166
+
167
+
168
async def discover_all() -> list[DiscoveredModel]:
    """Run every provider discovery concurrently and merge the results.

    Results are concatenated in the order Ollama, OpenAI, Anthropic, Google.
    A provider whose discovery raised contributes nothing: its exception is
    captured by ``gather`` and skipped without being reported.
    """
    import asyncio

    outcomes = await asyncio.gather(
        discover_ollama(),
        discover_openai(),
        discover_anthropic(),
        discover_google(),
        return_exceptions=True,
    )

    # Anything that is not a list is an exception object returned by gather.
    return [
        model
        for outcome in outcomes
        if isinstance(outcome, list)
        for model in outcome
    ]
arbiter/core/judge.py ADDED
@@ -0,0 +1,193 @@
1
+ """Auto-judge system -- uses a model to score all outputs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import re
7
+ from typing import Optional
8
+
9
+ from arbiter.core.config import resolve_model
10
+ from arbiter.core.metrics import ComparisonResult, ModelMetrics
11
+ from arbiter.core.providers.factory import create_provider
12
+
13
+
14
+ # System prompt for the judge model: judge_comparison passes it as `system=`
+ # to the provider's stream_generate call. The JSON layout requested here is
+ # what judge_comparison reads back ("results" with per-model scores); the
+ # "winner" field is requested but judge_comparison does not use it.
+ JUDGE_SYSTEM_PROMPT = """You are an expert judge evaluating AI model outputs. You will be given a prompt and multiple model responses. Score each response on these dimensions:
15
+
16
+ 1. **Correctness** (1-10): Is the answer factually accurate and free of errors?
17
+ 2. **Completeness** (1-10): Does it fully address the prompt?
18
+ 3. **Clarity** (1-10): Is it well-organized and easy to understand?
19
+ 4. **Code Quality** (1-10): If code is present, is it clean, efficient, and correct? If no code, score based on writing quality.
20
+
21
+ Respond ONLY with valid JSON in this exact format:
22
+ {
23
+ "results": [
24
+ {
25
+ "model": "<model_name>",
26
+ "correctness": <1-10>,
27
+ "completeness": <1-10>,
28
+ "clarity": <1-10>,
29
+ "code_quality": <1-10>,
30
+ "overall": <1-10>,
31
+ "reasoning": "<brief explanation>"
32
+ }
33
+ ],
34
+ "winner": "<model_name>"
35
+ }"""
36
+
37
+
38
+ def _build_judge_prompt(prompt: str, models: list[ModelMetrics]) -> str:
39
+ """Build the prompt for the judge model."""
40
+ parts = [f'Original prompt: """{prompt}"""\n']
41
+
42
+ for i, m in enumerate(models, 1):
43
+ output = m.output
44
+ if output.startswith("[ERROR]"):
45
+ output = "(Model failed to generate a response)"
46
+ # Truncate very long outputs for the judge
47
+ if len(output) > 4000:
48
+ output = output[:4000] + "\n... (truncated)"
49
+ parts.append(f'--- Response from {m.model} ---\n"""{output}"""\n')
50
+
51
+ parts.append(
52
+ "Score each response. Remember to respond ONLY with valid JSON."
53
+ )
54
+ return "\n".join(parts)
55
+
56
+
57
+ def _parse_judge_response(text: str, models: list[ModelMetrics]) -> dict:
58
+ """Parse the judge's JSON response, handling common formatting issues."""
59
+ # Try to extract JSON from the response
60
+ # Sometimes models wrap JSON in markdown code blocks
61
+ json_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
62
+ if json_match:
63
+ text = json_match.group(1)
64
+ else:
65
+ # Try to find raw JSON
66
+ brace_start = text.find("{")
67
+ brace_end = text.rfind("}") + 1
68
+ if brace_start >= 0 and brace_end > brace_start:
69
+ text = text[brace_start:brace_end]
70
+
71
+ try:
72
+ data = json.loads(text)
73
+ except json.JSONDecodeError:
74
+ # Fallback: return equal scores
75
+ model_names = [m.model for m in models]
76
+ return {
77
+ "results": [
78
+ {
79
+ "model": name,
80
+ "correctness": 5,
81
+ "completeness": 5,
82
+ "clarity": 5,
83
+ "code_quality": 5,
84
+ "overall": 5,
85
+ "reasoning": "Judge response could not be parsed",
86
+ }
87
+ for name in model_names
88
+ ],
89
+ "winner": model_names[0] if model_names else None,
90
+ }
91
+
92
+ return data
93
+
94
+
95
async def judge_comparison(
    comparison: ComparisonResult,
    judge_model: str = "auto",
) -> ComparisonResult:
    """Score all model outputs using a judge model.

    Args:
        comparison: The comparison result to judge
        judge_model: Model spec for the judge. "auto" picks the best available.

    Returns:
        The same ComparisonResult, mutated in place: each model the judge
        scored gets ``quality_scores`` and ``overall_score`` (missing values
        default to 5), and ``judge_model`` is set on the comparison. When
        every model errored the comparison is returned untouched. The winner
        is not set here.
    """
    # Filter out models that errored
    valid_models = [m for m in comparison.models if not m.output.startswith("[ERROR]")]
    if not valid_models:
        return comparison

    # Resolve judge model
    if judge_model == "auto":
        judge_model = await _pick_judge_model(comparison.models)

    config = resolve_model(judge_model)
    provider, model_name = create_provider(config)

    judge_prompt = _build_judge_prompt(comparison.prompt, valid_models)

    # Generate judge response with a chunk limit and a time limit.
    # NOTE: both limits are only checked when a chunk arrives, so a stream
    # that stalls without yielding is not interrupted by them.
    import time
    output_parts = []
    chunk_count = 0
    start = time.perf_counter()
    stream = provider.stream_generate(
        model=model_name,
        prompt=judge_prompt,
        system=JUDGE_SYSTEM_PROMPT,
    )
    try:
        async for chunk in stream:
            output_parts.append(chunk.text)
            chunk_count += 1
            if chunk.done:
                break
            # Hard limits to prevent runaway generations
            if chunk_count >= 1500:
                break
            if time.perf_counter() - start > 90:
                break
    finally:
        # Breaking out of the loop leaves the stream unfinished; close it
        # explicitly instead of waiting for garbage collection.
        close_stream = getattr(stream, "aclose", None)
        if close_stream is not None:
            await close_stream()

    judge_text = "".join(output_parts)
    parsed = _parse_judge_response(judge_text, valid_models)

    # Index the judge's entries by model name. The judge's output is
    # untrusted, so anything that does not have the expected shape is
    # skipped rather than allowed to raise.
    entries = parsed.get("results") if isinstance(parsed, dict) else None
    if not isinstance(entries, list):
        entries = []
    results_by_model = {}
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        name = entry.get("model", "")
        if isinstance(name, str):
            results_by_model[name] = entry

    # Apply scores to metrics
    for m in comparison.models:
        scores = results_by_model.get(m.model, {})
        if scores:
            m.quality_scores = {
                "correctness": scores.get("correctness", 5),
                "completeness": scores.get("completeness", 5),
                "clarity": scores.get("clarity", 5),
                "code_quality": scores.get("code_quality", 5),
            }
            m.overall_score = scores.get("overall", 5)

    # Judge no longer picks the winner -- composite scoring does that
    comparison.judge_model = judge_model

    return comparison
165
+
166
+
167
async def _pick_judge_model(models: list[ModelMetrics]) -> str:
    """Choose a judge model automatically.

    The largest local Ollama model that is not part of the comparison is
    preferred; a model is considered part of the comparison when either its
    full name or its name without the ``:tag`` suffix is being compared.
    Any error while querying Ollama is ignored. If no local candidate is
    found, the first compared model is used, or ``"llama3.2"`` when
    *models* is empty.
    """
    from arbiter.core.providers.ollama import OllamaProvider

    in_comparison = {entry.model for entry in models}

    # Best effort: look for a local model that is not being compared.
    try:
        client = OllamaProvider()
        if await client.check_connection():
            candidates = await client.list_models()
            # Largest first; a missing or None size sorts as 0.
            candidates.sort(key=lambda item: item.get("size") or 0, reverse=True)
            for item in candidates:
                candidate = item["name"]
                if candidate in in_comparison or candidate.split(":")[0] in in_comparison:
                    continue
                return candidate
    except Exception:
        pass

    # Fallback: use the first compared model as judge (not ideal but works)
    if models:
        return models[0].model
    return "llama3.2"