assayer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- assayer/__init__.py +0 -0
- assayer/cli/__init__.py +0 -0
- assayer/cli/main.py +190 -0
- assayer/config.py +30 -0
- assayer/exporter.py +32 -0
- assayer/judge.py +95 -0
- assayer/models.py +12 -0
- assayer/providers/__init__.py +0 -0
- assayer/providers/anthropic.py +73 -0
- assayer/providers/base.py +20 -0
- assayer/providers/gemini.py +73 -0
- assayer/providers/ollama.py +76 -0
- assayer/providers/openai.py +69 -0
- assayer/py.typed +0 -0
- assayer/renderer.py +109 -0
- assayer/runner.py +65 -0
- assayer/scorer.py +55 -0
- assayer-0.1.0.dist-info/METADATA +186 -0
- assayer-0.1.0.dist-info/RECORD +23 -0
- assayer-0.1.0.dist-info/WHEEL +5 -0
- assayer-0.1.0.dist-info/entry_points.txt +2 -0
- assayer-0.1.0.dist-info/licenses/LICENSE +21 -0
- assayer-0.1.0.dist-info/top_level.txt +1 -0
assayer/__init__.py
ADDED
|
File without changes
|
assayer/cli/__init__.py
ADDED
|
File without changes
|
assayer/cli/main.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
import httpx
|
|
6
|
+
|
|
7
|
+
from assayer.config import get_api_key, set_api_key, show_config
|
|
8
|
+
|
|
9
|
+
_KNOWN_MODELS: dict[str, list[str]] = {
|
|
10
|
+
"openai": ["gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "o1-mini"],
|
|
11
|
+
"anthropic": ["claude-opus-4-5", "claude-sonnet-4-5", "claude-haiku-4-5-20251001"],
|
|
12
|
+
"gemini": ["gemini-1.5-pro", "gemini-1.5-flash"],
|
|
13
|
+
"ollama": ["ollama/llama3", "ollama/mistral", "ollama/phi3"],
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Root command group of the CLI; sub-commands and sub-groups attach to it.
# (A comment is used instead of a docstring so click's --help text is unchanged.)
@click.group()
def cli() -> None:
    return None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _load_prompt(prompt: str | None, prompt_file: str | None) -> str:
    """Return the prompt text, preferring ``--prompt-file`` over the argument.

    Exits with status 1 (message on stderr) when neither source yields text.
    """
    if prompt_file:
        # Explicit encoding: the platform default would mis-decode UTF-8
        # prompt files on some systems.
        with open(prompt_file, encoding="utf-8") as handle:
            text = handle.read().strip()
        if not text:
            click.echo(f"Prompt file is empty: {prompt_file}", err=True)
            sys.exit(1)
        return text
    if prompt:
        return prompt
    click.echo("Provide a prompt or --prompt-file.", err=True)
    sys.exit(1)


def _apply_variables(template: str, assignments: tuple[str, ...]) -> str:
    """Substitute ``KEY=VALUE`` assignments into *template* via ``str.format_map``.

    Exits with status 1 on a malformed assignment, a missing variable, or a
    template that ``format_map`` cannot process (stray braces, positional
    fields, attribute/index lookups on the string values).
    """
    variables: dict[str, str] = {}
    for item in assignments:
        key, separator, value = item.partition("=")
        if not separator:
            click.echo(
                f"Invalid --var format: {item!r}. Expected KEY=VALUE.", err=True
            )
            sys.exit(1)
        variables[key.strip()] = value
    try:
        return template.format_map(variables)
    except KeyError as exc:
        click.echo(f"Missing template variable: {exc}", err=True)
    except (ValueError, IndexError, AttributeError, TypeError) as exc:
        click.echo(f"Invalid prompt template: {exc}", err=True)
    sys.exit(1)


# Command: send one prompt to several models, render the answers side by side
# and optionally score, judge and export them.  Documented with comments rather
# than a docstring because click would surface a docstring as --help text.
@cli.command()
@click.argument("prompt", required=False)
@click.option("--models", required=True, help="Comma-separated model identifiers.")
@click.option(
    "--prompt-file",
    type=click.Path(exists=True),
    help="Path to a .txt prompt file.",
)
@click.option(
    "--var",
    multiple=True,
    metavar="KEY=VALUE",
    help="Template variables, repeatable.",
)
@click.option("--system", default=None, help="System prompt applied to all models.")
@click.option("--temperature", type=float, default=None, help="Sampling temperature.")
@click.option("--max-tokens", type=int, default=None, help="Max output tokens.")
@click.option("--output", default=None, help="Save results to file (.json or .csv).")
@click.option("--score", is_flag=True, default=False, help="Show similarity matrix.")
@click.option("--judge", default=None, help="Model to use as judge.")
@click.option(
    "--judge-criteria", default=None, help="Comma-separated evaluation criteria."
)
def run(
    prompt: str | None,
    models: str,
    prompt_file: str | None,
    var: tuple[str, ...],
    system: str | None,
    temperature: float | None,
    max_tokens: int | None,
    output: str | None,
    score: bool,
    judge: str | None,
    judge_criteria: str | None,
) -> None:
    prompt_text = _load_prompt(prompt, prompt_file)
    # Templating is applied only when --var is given, so prompts that contain
    # literal braces keep working without any variables.
    if var:
        prompt_text = _apply_variables(prompt_text, var)

    model_list = [m.strip() for m in models.split(",") if m.strip()]
    if not model_list:
        # e.g. --models "," would otherwise run nothing and render an empty view.
        click.echo("--models must name at least one model.", err=True)
        sys.exit(1)

    # Imported here, as in the original, so the other sub-commands do not pay
    # for loading these modules.
    from assayer.exporter import export
    from assayer.judge import run_judge
    from assayer.renderer import render_run
    from assayer.runner import run_all
    from assayer.scorer import compute_similarity

    results = asyncio.run(
        run_all(
            prompt_text,
            model_list,
            system=system,
            temperature=temperature,
            max_tokens=max_tokens,
        )
    )
    similarity = compute_similarity(results) if score else None

    criteria = None
    if judge_criteria:
        # Drop blank entries ("a,,b"); fall back to None when nothing is left.
        criteria = [c.strip() for c in judge_criteria.split(",") if c.strip()] or None
    judge_result = (
        asyncio.run(run_judge(prompt_text, results, judge, criteria)) if judge else None
    )

    render_run(prompt_text, results, similarity=similarity, judge_result=judge_result)

    if output:
        export(results, output)
        click.echo(f"Results saved to {output}")
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# Container group for the model-related sub-commands.  It is additionally
# registered on the root group under the name "models" right below.
@cli.group()
def models_cmd() -> None:
    return None


cli.add_command(models_cmd, name="models")
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# Sub-command "list": print each provider followed by its known model names.
@models_cmd.command(name="list")
def models_list() -> None:
    for provider in _KNOWN_MODELS:
        click.echo(f"\n{provider}")
        for model_name in _KNOWN_MODELS[provider]:
            click.echo(f" {model_name}")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# Sub-command "check": report whether provider credentials are configured.
# With no argument every key-based provider is listed; with a provider name
# only that provider is checked ("ollama" probes the local server instead).
@models_cmd.command(name="check")
@click.argument("provider", required=False)
def models_check(provider: str | None) -> None:
    if provider == "ollama":
        _check_ollama()
        return

    keys = {
        "openai": "OPENAI_API_KEY",
        "anthropic": "ANTHROPIC_API_KEY",
        "gemini": "GEMINI_API_KEY",
    }
    if provider is None:
        selected = keys
    elif provider in keys:
        # Previously the argument was ignored and every provider was printed.
        selected = {provider: keys[provider]}
    else:
        known = ", ".join([*keys, "ollama"])
        click.echo(
            f"Unknown provider: {provider!r}. Expected one of: {known}.", err=True
        )
        sys.exit(1)

    for name, env_var in selected.items():
        value = get_api_key(env_var)
        status = "set" if value else "not set"
        symbol = "+" if value else "-"
        click.echo(f" [{symbol}] {name}: {env_var} {status}")
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _check_ollama() -> None:
    """Probe the local Ollama server and print the models it reports.

    Requests ``/api/tags`` on ``localhost:11434`` with a 3 second timeout.
    Connection failures and any other error are reported on stderr; nothing
    is raised to the caller.
    """
    try:
        reply = httpx.get("http://localhost:11434/api/tags", timeout=3.0)
        reply.raise_for_status()
        payload = reply.json()
        installed: list[str] = []
        for entry in payload.get("models", []):
            installed.append(entry["name"])
        click.echo("Ollama is running.")
        if not installed:
            click.echo("No local models found.")
        else:
            click.echo("Local models:")
            for model_name in installed:
                click.echo(f" ollama/{model_name}")
    except httpx.ConnectError:
        click.echo("Ollama is not running at localhost:11434.", err=True)
    except Exception as exc:
        click.echo(f"Ollama check failed: {exc}", err=True)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
# Container group for the "config set" / "config show" sub-commands.
@cli.group()
def config() -> None:
    return None
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
# Sub-command "set": store VALUE under KEY through set_api_key, then confirm.
@config.command(name="set")
@click.argument("key")
@click.argument("value")
def config_set(key: str, value: str) -> None:
    set_api_key(key, value)
    confirmation = f"{key} saved."
    click.echo(confirmation)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# Sub-command "show": list the known API keys with their values masked.
@config.command(name="show")
def config_show() -> None:
    for key, value in show_config().items():
        if not value:
            click.echo(f" {key}: not set")
            continue
        # Only a short prefix of long values is shown.  Values of 8 characters
        # or fewer used to be printed in full; they are now hidden entirely.
        masked = value[:8] + "..." if len(value) > 8 else "***"
        click.echo(f" {key}: {masked}")
|
assayer/config.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
_CONFIG_PATH = Path.home() / ".assayer" / "config.json"

_KNOWN_KEYS = ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GEMINI_API_KEY")


def _load_config() -> dict[str, str]:
    """Return the stored configuration, or ``{}`` when no file exists yet.

    Raises:
        ValueError: the file is not valid JSON or does not hold a JSON object.
        OSError: the file exists but cannot be read.
    """
    try:
        raw = _CONFIG_PATH.read_text(encoding="utf-8")
    except FileNotFoundError:
        return {}
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as err:
        raise ValueError(f"{_CONFIG_PATH} is not valid JSON: {err}") from err
    if not isinstance(data, dict):
        raise ValueError(f"{_CONFIG_PATH} must contain a JSON object")
    return data


def get_api_key(name: str) -> str | None:
    """Look up *name* in the environment first, then in the config file.

    Returns ``None`` when the key is not found.  An unreadable or malformed
    config file is treated as "no stored keys", so one bad file cannot crash
    every lookup.
    """
    value = os.environ.get(name)
    if value:
        return value
    try:
        return _load_config().get(name)
    except (OSError, ValueError):
        return None


def set_api_key(name: str, value: str) -> None:
    """Store *value* under *name* in the config file, creating it if needed.

    Raises:
        ValueError: the existing file is malformed.  It is left untouched
            rather than silently overwritten.
    """
    _CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
    data = _load_config()
    data[name] = value
    # The file holds secrets: create it readable by the owner only.
    fd = os.open(_CONFIG_PATH, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w", encoding="utf-8") as handle:
        handle.write(json.dumps(data, indent=2))
    try:
        # Tighten files that already existed with broader permissions.
        os.chmod(_CONFIG_PATH, 0o600)
    except OSError:
        # Best effort only: some filesystems do not support chmod.
        pass


def show_config() -> dict[str, str | None]:
    """Return the resolved value (or ``None``) for every known API key."""
    return {key: get_api_key(key) for key in _KNOWN_KEYS}
|
assayer/exporter.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from assayer.models import ModelResult
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Column order shared by the JSON records and the CSV header.
_FIELDS = (
    "model",
    "output",
    "tokens_input",
    "tokens_output",
    "latency_seconds",
    "cost_usd",
    "error",
)


def _to_dict(result: ModelResult) -> dict:
    """Return the exported fields of *result* as a plain dictionary."""
    return {name: getattr(result, name) for name in _FIELDS}


def export(results: list[ModelResult], path: str) -> None:
    """Write *results* to *path* as CSV (``.csv`` suffix) or JSON (anything else).

    The parent directory is created when missing.  An empty result list
    produces a header-only CSV or an empty JSON array; the CSV case used to
    fail with ``IndexError`` because the header was taken from the first record.
    """
    dest = Path(path)
    records = [_to_dict(r) for r in results]
    dest.parent.mkdir(parents=True, exist_ok=True)

    if dest.suffix.lower() == ".csv":
        with dest.open("w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(_FIELDS))
            writer.writeheader()
            writer.writerows(records)
        return

    dest.write_text(
        json.dumps(records, indent=2, ensure_ascii=False), encoding="utf-8"
    )
|
assayer/judge.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
import sys
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
|
|
8
|
+
from assayer.models import ModelResult
|
|
9
|
+
from assayer.runner import _make_provider
|
|
10
|
+
|
|
11
|
+
_PROMPT_TEMPLATE = """\
|
|
12
|
+
You are evaluating outputs from multiple language models for the same prompt.
|
|
13
|
+
|
|
14
|
+
Original prompt: {prompt}
|
|
15
|
+
|
|
16
|
+
Outputs:
|
|
17
|
+
{outputs}
|
|
18
|
+
{criteria_section}
|
|
19
|
+
Return JSON only:
|
|
20
|
+
{{
|
|
21
|
+
"winner": "model name",
|
|
22
|
+
"reasoning": "one paragraph",
|
|
23
|
+
"scores": {{"model_name": {{"criterion": score_1_to_5}}}}
|
|
24
|
+
}}"""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
|
|
28
|
+
class JudgeResult:
|
|
29
|
+
winner: str
|
|
30
|
+
reasoning: str
|
|
31
|
+
scores: dict[str, dict[str, float]] = field(default_factory=dict)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _build_prompt(
|
|
35
|
+
prompt: str, results: list[ModelResult], criteria: list[str] | None
|
|
36
|
+
) -> str:
|
|
37
|
+
labels = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
|
38
|
+
outputs_block = "\n".join(
|
|
39
|
+
f"Model {labels[i]} ({r.model}): {r.output}"
|
|
40
|
+
for i, r in enumerate(results)
|
|
41
|
+
if not r.error
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
if criteria:
|
|
45
|
+
criteria_section = f"\nEvaluate on these criteria: {', '.join(criteria)}\n"
|
|
46
|
+
else:
|
|
47
|
+
criteria_section = ""
|
|
48
|
+
|
|
49
|
+
return _PROMPT_TEMPLATE.format(
|
|
50
|
+
prompt=prompt,
|
|
51
|
+
outputs=outputs_block,
|
|
52
|
+
criteria_section=criteria_section,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _parse_response(text: str) -> JudgeResult:
|
|
57
|
+
stripped = re.sub(r"^```(?:json)?\s*", "", text.strip(), flags=re.IGNORECASE)
|
|
58
|
+
stripped = re.sub(r"\s*```$", "", stripped)
|
|
59
|
+
data = json.loads(stripped)
|
|
60
|
+
return JudgeResult(
|
|
61
|
+
winner=data["winner"],
|
|
62
|
+
reasoning=data["reasoning"],
|
|
63
|
+
scores=data.get("scores", {}),
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
async def run_judge(
    prompt: str,
    results: list[ModelResult],
    judge_model: str,
    criteria: list[str] | None = None,
) -> JudgeResult | None:
    """Ask *judge_model* to compare the successful *results*.

    Returns the parsed verdict, or ``None`` (after a note on stderr) when
    fewer than two results are usable, the judge call fails, or its reply
    cannot be parsed.
    """
    usable = []
    for candidate in results:
        if not candidate.error and candidate.output:
            usable.append(candidate)
    if len(usable) < 2:
        print("Judge skipped: fewer than 2 successful results.", file=sys.stderr)
        return None

    judge_prompt = _build_prompt(prompt, usable, criteria)
    provider = _make_provider(judge_model)

    failure = None
    try:
        verdict = await provider.run(judge_prompt)
    except Exception as exc:
        failure = f"Judge call failed: {exc}"
    else:
        if verdict.error:
            failure = f"Judge call failed: {verdict.error}"
    if failure is not None:
        print(failure, file=sys.stderr)
        return None

    try:
        parsed = _parse_response(verdict.output)
    except Exception as exc:
        print(f"Judge response could not be parsed: {exc}", file=sys.stderr)
        return None
    return parsed
|
assayer/models.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import time
|
|
2
|
+
|
|
3
|
+
import litellm
|
|
4
|
+
|
|
5
|
+
from assayer.config import get_api_key
|
|
6
|
+
from assayer.models import ModelResult
|
|
7
|
+
from assayer.providers.base import BaseProvider
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AnthropicProvider(BaseProvider):
    """Provider that sends a prompt to an Anthropic model through LiteLLM."""

    def __init__(self, model: str) -> None:
        self.model = model

    def _failure(self, message: str, latency: float = 0.0) -> ModelResult:
        """Build the zero-usage result that carries *message* as its error."""
        return ModelResult(
            model=self.model,
            output="",
            tokens_input=0,
            tokens_output=0,
            latency_seconds=latency,
            cost_usd=0.0,
            error=message,
        )

    async def run(
        self,
        prompt: str,
        system: str | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
    ) -> ModelResult:
        """Run *prompt* once and return the outcome as a ``ModelResult``.

        Failures (missing API key, request error) are reported through the
        result's ``error`` field instead of being raised.
        """
        api_key = get_api_key("ANTHROPIC_API_KEY")
        if not api_key:
            return self._failure("ANTHROPIC_API_KEY is not set")

        messages: list[dict[str, str]] = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})

        kwargs: dict = {
            "model": f"anthropic/{self.model}",
            "messages": messages,
            "api_key": api_key,
        }
        if temperature is not None:
            kwargs["temperature"] = temperature
        if max_tokens is not None:
            kwargs["max_tokens"] = max_tokens

        start = time.perf_counter()
        try:
            response = await litellm.acompletion(**kwargs)
        except Exception as exc:
            return self._failure(str(exc), time.perf_counter() - start)
        latency = time.perf_counter() - start

        try:
            cost = litellm.completion_cost(completion_response=response) or 0.0
        except Exception:
            # A pricing lookup failure (e.g. a model LiteLLM has no price for)
            # must not discard an otherwise successful completion.
            cost = 0.0

        usage = getattr(response, "usage", None)
        choices = response.choices
        return ModelResult(
            model=self.model,
            output=(choices[0].message.content or "") if choices else "",
            tokens_input=getattr(usage, "prompt_tokens", 0) or 0,
            tokens_output=getattr(usage, "completion_tokens", 0) or 0,
            latency_seconds=latency,
            cost_usd=cost,
        )
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from assayer.models import ModelResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BaseProvider(ABC):
    """Abstract interface implemented by every model backend."""

    # Model identifier the provider was created for.
    model: str

    @abstractmethod
    async def run(
        self,
        prompt: str,
        system: str | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
    ) -> ModelResult:
        """Send *prompt* to the model and return its ``ModelResult``.

        *system*, *temperature* and *max_tokens* are optional settings that
        implementations apply only when they are not ``None``.
        """
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import time
|
|
2
|
+
|
|
3
|
+
import litellm
|
|
4
|
+
|
|
5
|
+
from assayer.config import get_api_key
|
|
6
|
+
from assayer.models import ModelResult
|
|
7
|
+
from assayer.providers.base import BaseProvider
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class GeminiProvider(BaseProvider):
    """Provider that sends a prompt to a Gemini model through LiteLLM."""

    def __init__(self, model: str) -> None:
        self.model = model

    def _failure(self, message: str, latency: float = 0.0) -> ModelResult:
        """Build the zero-usage result that carries *message* as its error."""
        return ModelResult(
            model=self.model,
            output="",
            tokens_input=0,
            tokens_output=0,
            latency_seconds=latency,
            cost_usd=0.0,
            error=message,
        )

    async def run(
        self,
        prompt: str,
        system: str | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
    ) -> ModelResult:
        """Run *prompt* once and return the outcome as a ``ModelResult``.

        Failures (missing API key, request error) are reported through the
        result's ``error`` field instead of being raised.
        """
        api_key = get_api_key("GEMINI_API_KEY")
        if not api_key:
            return self._failure("GEMINI_API_KEY is not set")

        messages: list[dict[str, str]] = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})

        kwargs: dict = {
            "model": f"gemini/{self.model}",
            "messages": messages,
            "api_key": api_key,
        }
        if temperature is not None:
            kwargs["temperature"] = temperature
        if max_tokens is not None:
            kwargs["max_tokens"] = max_tokens

        start = time.perf_counter()
        try:
            response = await litellm.acompletion(**kwargs)
        except Exception as exc:
            return self._failure(str(exc), time.perf_counter() - start)
        latency = time.perf_counter() - start

        try:
            cost = litellm.completion_cost(completion_response=response) or 0.0
        except Exception:
            # A pricing lookup failure (e.g. a model LiteLLM has no price for)
            # must not discard an otherwise successful completion.
            cost = 0.0

        usage = getattr(response, "usage", None)
        choices = response.choices
        return ModelResult(
            model=self.model,
            output=(choices[0].message.content or "") if choices else "",
            tokens_input=getattr(usage, "prompt_tokens", 0) or 0,
            tokens_output=getattr(usage, "completion_tokens", 0) or 0,
            latency_seconds=latency,
            cost_usd=cost,
        )
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import time
|
|
2
|
+
|
|
3
|
+
import httpx
|
|
4
|
+
|
|
5
|
+
from assayer.models import ModelResult
|
|
6
|
+
from assayer.providers.base import BaseProvider
|
|
7
|
+
|
|
8
|
+
_BASE_URL = "http://localhost:11434"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
async def is_running() -> bool:
    """Return ``True`` when the Ollama server at ``_BASE_URL`` answers with 200.

    Any HTTPX request failure counts as "not running".  Previously only
    ``ConnectError`` was handled, so a timeout propagated to the caller.
    """
    try:
        async with httpx.AsyncClient(timeout=3.0) as client:
            response = await client.get(_BASE_URL)
    except httpx.HTTPError:
        return False
    return response.status_code == 200
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class OllamaProvider(BaseProvider):
    """Provider that talks to a local Ollama server over its HTTP chat API."""

    def __init__(self, model: str) -> None:
        # Stored without the "ollama/" routing prefix; results re-add it.
        self.model = model.removeprefix("ollama/")

    def _failure(self, message: str, latency: float) -> ModelResult:
        """Build the zero-usage result that carries *message* as its error."""
        return ModelResult(
            model=f"ollama/{self.model}",
            output="",
            tokens_input=0,
            tokens_output=0,
            latency_seconds=latency,
            cost_usd=0.0,
            error=message,
        )

    async def run(
        self,
        prompt: str,
        system: str | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
    ) -> ModelResult:
        """POST *prompt* to ``/api/chat`` and return the outcome.

        Connection problems, HTTP errors and malformed responses are reported
        through the result's ``error`` field instead of being raised.
        """
        messages: list[dict[str, str]] = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})

        body: dict = {"model": self.model, "messages": messages, "stream": False}
        if temperature is not None:
            body.setdefault("options", {})["temperature"] = temperature
        if max_tokens is not None:
            body.setdefault("options", {})["num_predict"] = max_tokens

        start = time.perf_counter()
        try:
            async with httpx.AsyncClient(timeout=60.0) as client:
                response = await client.post(f"{_BASE_URL}/api/chat", json=body)
                response.raise_for_status()
                data = response.json()
            # Read inside the try: a reply without message/content used to
            # raise KeyError out of run().
            output = data["message"]["content"]
        except httpx.ConnectError:
            return self._failure(
                "Ollama is not running at localhost:11434",
                time.perf_counter() - start,
            )
        except (KeyError, TypeError) as exc:
            return self._failure(
                f"Unexpected Ollama response: {exc!r}", time.perf_counter() - start
            )
        except Exception as exc:
            return self._failure(str(exc), time.perf_counter() - start)

        return ModelResult(
            model=f"ollama/{self.model}",
            output=output,
            tokens_input=data.get("prompt_eval_count", 0),
            tokens_output=data.get("eval_count", 0),
            latency_seconds=time.perf_counter() - start,
            cost_usd=0.0,
        )
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import time
|
|
2
|
+
|
|
3
|
+
import litellm
|
|
4
|
+
|
|
5
|
+
from assayer.config import get_api_key
|
|
6
|
+
from assayer.models import ModelResult
|
|
7
|
+
from assayer.providers.base import BaseProvider
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class OpenAIProvider(BaseProvider):
    """Provider that sends a prompt to an OpenAI model through LiteLLM."""

    def __init__(self, model: str) -> None:
        self.model = model

    def _failure(self, message: str, latency: float = 0.0) -> ModelResult:
        """Build the zero-usage result that carries *message* as its error."""
        return ModelResult(
            model=self.model,
            output="",
            tokens_input=0,
            tokens_output=0,
            latency_seconds=latency,
            cost_usd=0.0,
            error=message,
        )

    async def run(
        self,
        prompt: str,
        system: str | None = None,
        temperature: float | None = None,
        max_tokens: int | None = None,
    ) -> ModelResult:
        """Run *prompt* once and return the outcome as a ``ModelResult``.

        Failures (missing API key, request error) are reported through the
        result's ``error`` field instead of being raised.
        """
        api_key = get_api_key("OPENAI_API_KEY")
        if not api_key:
            return self._failure("OPENAI_API_KEY is not set")

        messages: list[dict[str, str]] = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})

        kwargs: dict = {"model": self.model, "messages": messages, "api_key": api_key}
        if temperature is not None:
            kwargs["temperature"] = temperature
        if max_tokens is not None:
            kwargs["max_tokens"] = max_tokens

        start = time.perf_counter()
        try:
            response = await litellm.acompletion(**kwargs)
        except Exception as exc:
            return self._failure(str(exc), time.perf_counter() - start)
        latency = time.perf_counter() - start

        try:
            cost = litellm.completion_cost(completion_response=response) or 0.0
        except Exception:
            # A pricing lookup failure (e.g. a model LiteLLM has no price for)
            # must not discard an otherwise successful completion.
            cost = 0.0

        usage = getattr(response, "usage", None)
        choices = response.choices
        return ModelResult(
            model=self.model,
            output=(choices[0].message.content or "") if choices else "",
            tokens_input=getattr(usage, "prompt_tokens", 0) or 0,
            tokens_output=getattr(usage, "completion_tokens", 0) or 0,
            latency_seconds=latency,
            cost_usd=cost,
        )
|
assayer/py.typed
ADDED
|
File without changes
|
assayer/renderer.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
from rich.columns import Columns
|
|
2
|
+
from rich.console import Console
|
|
3
|
+
from rich.panel import Panel
|
|
4
|
+
from rich.rule import Rule
|
|
5
|
+
from rich.table import Table
|
|
6
|
+
from rich.text import Text
|
|
7
|
+
|
|
8
|
+
from assayer.judge import JudgeResult
|
|
9
|
+
from assayer.models import ModelResult
|
|
10
|
+
|
|
11
|
+
console = Console()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _stats_line(result: ModelResult) -> str:
|
|
15
|
+
parts = [
|
|
16
|
+
f"{result.tokens_input}in / {result.tokens_output}out tokens",
|
|
17
|
+
f"{result.latency_seconds:.2f}s",
|
|
18
|
+
]
|
|
19
|
+
if result.cost_usd > 0:
|
|
20
|
+
parts.append(f"${result.cost_usd:.6f}")
|
|
21
|
+
return " | ".join(parts)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _make_panel(result: ModelResult) -> Panel:
    """Build the panel for one model: its output, or its error in red.

    The statistics line is shown as the subtitle for successful results only.
    """
    # Local import: ``rich`` is already a dependency of this module.
    from rich.markup import escape

    if result.error:
        content = Text(result.error, style="red")
        subtitle = None
    else:
        content = Text(result.output)
        subtitle = _stats_line(result)

    return Panel(
        content,
        # The title is parsed as console markup, so the model name is escaped
        # to keep square brackets in it from being read as tags.
        title=f"[bold]{escape(result.model)}[/bold]",
        subtitle=subtitle,
        expand=True,
    )
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def render_run(
    prompt: str,
    results: list[ModelResult],
    similarity: dict[tuple[str, str], float] | None = None,
    judge_result: JudgeResult | None = None,
) -> None:
    """Print a run: prompt header, one panel per model, then optional extras.

    The similarity matrix and judge verdict are rendered only when supplied.
    """
    # Local import: ``rich`` is already a dependency of this module.
    from rich.markup import escape

    header = prompt if len(prompt) <= 80 else prompt[:77] + "..."
    # The prompt is user text placed inside console markup; without escaping,
    # a prompt containing e.g. "[/x]" makes Rich raise MarkupError.
    console.print(Rule(f"[bold]{escape(header)}[/bold]"))

    panels = [_make_panel(r) for r in results]
    console.print(Columns(panels, equal=True, expand=True))

    if similarity is not None:
        render_similarity_matrix(results, similarity)

    if judge_result is not None:
        render_judge(judge_result)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def render_similarity_matrix(
    results: list[ModelResult],
    similarity: dict[tuple[str, str], float],
) -> None:
    """Print the pairwise similarity scores as a square table.

    *similarity* may hold each pair in either order.  Pairs without a score
    (for example when one model failed) are shown as ``-`` rather than a
    fabricated ``0.000``, and the diagonal shows ``1.000`` only for models
    that produced output.
    """
    models = [r.model for r in results]
    produced_output = {r.model for r in results if not r.error and r.output}

    table = Table(title="Similarity Matrix", show_header=True)
    table.add_column("", style="bold")
    for m in models:
        table.add_column(m, justify="center")

    for row_model in models:
        cells: list[str] = []
        for col_model in models:
            if row_model == col_model:
                cells.append("1.000" if row_model in produced_output else "-")
                continue
            value = similarity.get(
                (row_model, col_model), similarity.get((col_model, row_model))
            )
            cells.append("-" if value is None else f"{value:.3f}")
        table.add_row(row_model, *cells)

    console.print(table)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def render_judge(judge_result: JudgeResult) -> None:
    """Print the judge's winner and reasoning, plus a score table if present.

    All judge-provided text is escaped before it is placed in console markup,
    because it comes from a model and may contain square brackets.
    """
    # Local import: ``rich`` is already a dependency of this module.
    from rich.markup import escape

    console.print(Rule("[bold]Judge[/bold]"))
    console.print(
        f"[bold green]Winner:[/bold green] {escape(str(judge_result.winner))}"
    )
    console.print(f"[dim]{escape(str(judge_result.reasoning))}[/dim]")

    if not judge_result.scores:
        return

    # Keep only well-formed rows; the score payload is model-generated JSON.
    rows = {
        model: criteria_map
        for model, criteria_map in judge_result.scores.items()
        if isinstance(criteria_map, dict)
    }

    # Criteria in first-seen order across all models.
    all_criteria: list[str] = []
    for criteria_map in rows.values():
        for c in criteria_map:
            if c not in all_criteria:
                all_criteria.append(c)

    table = Table(title="Scores", show_header=True)
    table.add_column("Model", style="bold")
    for c in all_criteria:
        table.add_column(escape(str(c)), justify="center")

    for model, criteria_map in rows.items():
        row = [escape(str(criteria_map.get(c, ""))) for c in all_criteria]
        table.add_row(escape(str(model)), *row)

    console.print(table)
|
assayer/runner.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
|
|
3
|
+
from assayer.models import ModelResult
|
|
4
|
+
from assayer.providers.anthropic import AnthropicProvider
|
|
5
|
+
from assayer.providers.base import BaseProvider
|
|
6
|
+
from assayer.providers.gemini import GeminiProvider
|
|
7
|
+
from assayer.providers.ollama import OllamaProvider
|
|
8
|
+
from assayer.providers.openai import OpenAIProvider
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
_TIMEOUT = 30.0
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _make_provider(model: str) -> BaseProvider:
    """Pick the provider for *model* from its name prefix.

    ``ollama/`` maps to Ollama, ``claude-`` to Anthropic, ``gemini-`` to
    Gemini; every other name is handed to the OpenAI provider.
    """
    by_prefix = (
        ("ollama/", OllamaProvider),
        ("claude-", AnthropicProvider),
        ("gemini-", GeminiProvider),
    )
    for prefix, provider_cls in by_prefix:
        if model.startswith(prefix):
            return provider_cls(model)
    return OpenAIProvider(model)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
async def _run_one(
    model: str,
    prompt: str,
    system: str | None,
    temperature: float | None,
    max_tokens: int | None,
) -> ModelResult:
    """Run *prompt* on one model, bounded by ``_TIMEOUT`` seconds.

    Never raises for a provider failure: a timeout or any other exception is
    converted into an error ``ModelResult``, so one failing model cannot abort
    the whole ``asyncio.gather`` in ``run_all``.
    """

    def _failed(message: str, latency: float) -> ModelResult:
        return ModelResult(
            model=model,
            output="",
            tokens_input=0,
            tokens_output=0,
            latency_seconds=latency,
            cost_usd=0.0,
            error=message,
        )

    clock = asyncio.get_running_loop().time
    started = clock()
    try:
        provider = _make_provider(model)
        return await asyncio.wait_for(
            provider.run(
                prompt,
                system=system,
                temperature=temperature,
                max_tokens=max_tokens,
            ),
            timeout=_TIMEOUT,
        )
    except asyncio.TimeoutError:
        # Derived from _TIMEOUT so the message and latency cannot drift from
        # the limit actually enforced (renders as "30" for 30.0).
        return _failed(f"Request timed out after {_TIMEOUT:g} seconds", _TIMEOUT)
    except Exception as exc:
        return _failed(str(exc) or type(exc).__name__, clock() - started)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
async def run_all(
    prompt: str,
    models: list[str],
    system: str | None = None,
    temperature: float | None = None,
    max_tokens: int | None = None,
) -> list[ModelResult]:
    """Run *prompt* on every model concurrently.

    Returns one result per entry of *models*, in the same order.
    """
    pending = []
    for name in models:
        pending.append(_run_one(name, prompt, system, temperature, max_tokens))
    gathered = await asyncio.gather(*pending)
    return list(gathered)
|
assayer/scorer.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from assayer.models import ModelResult
|
|
4
|
+
|
|
5
|
+
_model = None
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _get_model():
|
|
9
|
+
global _model
|
|
10
|
+
if _model is None:
|
|
11
|
+
try:
|
|
12
|
+
from sentence_transformers import SentenceTransformer
|
|
13
|
+
except ImportError:
|
|
14
|
+
raise ImportError(
|
|
15
|
+
"sentence-transformers is required for --score. "
|
|
16
|
+
"Install it with: pip install 'assayer[score]'"
|
|
17
|
+
)
|
|
18
|
+
_model = SentenceTransformer("all-MiniLM-L6-v2")
|
|
19
|
+
return _model
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def compute_similarity(results: list[ModelResult]) -> dict[tuple[str, str], float]:
    """Return the cosine similarity of every pair of successful outputs.

    Results with an error or empty output are ignored.  Keys are
    ``(model_a, model_b)`` in result order, each pair once; the mapping is
    empty when fewer than two results are usable.
    """
    usable = [r for r in results if not r.error and r.output]
    if len(usable) < 2:
        return {}

    import numpy as np

    texts = [r.output for r in usable]
    vectors = _get_model().encode(texts, convert_to_numpy=True)
    lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
    # A zero-length vector is divided by 1 so it stays zero instead of NaN.
    unit = vectors / np.where(lengths == 0, 1, lengths)

    pairs: dict[tuple[str, str], float] = {}
    for i, first in enumerate(usable):
        for j in range(i + 1, len(usable)):
            pairs[(first.model, usable[j].model)] = float(np.dot(unit[i], unit[j]))
    return pairs
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def readability_stats(text: str) -> dict[str, float]:
    """Return simple length statistics for ``text``.

    Words are whitespace-separated tokens. A sentence ends at a run of
    ``.``, ``!`` or ``?`` that is followed, after any closing quotes or
    brackets, by whitespace or the end of the text. Periods inside tokens
    such as ``3.14`` or ``gpt-4.5`` therefore do not count as sentence
    boundaries.

    Args:
        text: The text to analyse.

    Returns:
        A dict with ``word_count``, ``sentence_count`` and
        ``avg_sentence_length`` (words per sentence). ``sentence_count`` is
        never below 1, so the average is always defined; empty text yields
        a word count and average of 0.
    """
    import re

    # Terminator run, then optional closing quotes/brackets, then whitespace
    # or end of text. The lookahead keeps intra-token periods intact.
    fragments = re.split(r"""[.!?]+(?=["'”’)\]]*(?:\s|$))""", text)
    sentences = [fragment for fragment in fragments if fragment.strip()]

    word_count = len(text.split())
    # Floor at 1 to avoid dividing by zero for empty or punctuation-only text.
    sentence_count = len(sentences) or 1
    return {
        "word_count": float(word_count),
        "sentence_count": float(sentence_count),
        "avg_sentence_length": word_count / sentence_count,
    }
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: assayer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Run a prompt across multiple LLMs and compare outputs side by side in the terminal.
|
|
5
|
+
License: MIT
|
|
6
|
+
Classifier: Development Status :: 3 - Alpha
|
|
7
|
+
Classifier: Environment :: Console
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Requires-Python: >=3.11
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: click>=8.1
|
|
18
|
+
Requires-Dist: rich>=13.0
|
|
19
|
+
Requires-Dist: litellm>=1.40
|
|
20
|
+
Requires-Dist: httpx>=0.27
|
|
21
|
+
Provides-Extra: score
|
|
22
|
+
Requires-Dist: sentence-transformers>=3.0; extra == "score"
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
25
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
|
26
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
27
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
28
|
+
Requires-Dist: sentence-transformers>=3.0; extra == "dev"
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
|
|
31
|
+
# assayer
|
|
32
|
+
|
|
33
|
+
Send a prompt to multiple language models in parallel and compare their outputs in the terminal. Useful for evaluating which model handles a given task better, measuring semantic similarity between responses, or running an LLM-as-judge evaluation — without leaving the shell.
|
|
34
|
+
|
|
35
|
+
## Installation
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install assayer
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Similarity scoring requires the optional `score` extra:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install "assayer[score]"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Python 3.11 or newer is required.
|
|
48
|
+
|
|
49
|
+
> **Contributing?** See [CONTRIBUTING.md](CONTRIBUTING.md) for setup, code style, and PR guidelines.
|
|
50
|
+
|
|
51
|
+
## Supported Providers
|
|
52
|
+
|
|
53
|
+
- **OpenAI**: All GPT models.
|
|
54
|
+
- **Anthropic**: Claude 4.5 models (Opus, Sonnet, Haiku).
|
|
55
|
+
- **Google Gemini**: 1.5 Pro and Flash models.
|
|
56
|
+
- **Ollama**: Local models running on your machine.
|
|
57
|
+
|
|
58
|
+
## Configuration
|
|
59
|
+
|
|
60
|
+
Assayer looks for API keys in environment variables or a configuration file at `~/.assayer/config.json`.
|
|
61
|
+
|
|
62
|
+
### Environment Variables
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
export OPENAI_API_KEY="your-key"
|
|
66
|
+
export ANTHROPIC_API_KEY="your-key"
|
|
67
|
+
export GEMINI_API_KEY="your-key"
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Configuration File
|
|
71
|
+
|
|
72
|
+
```json
|
|
73
|
+
{
|
|
74
|
+
"OPENAI_API_KEY": "sk-...",
|
|
75
|
+
"ANTHROPIC_API_KEY": "sk-ant-...",
|
|
76
|
+
"GEMINI_API_KEY": "..."
|
|
77
|
+
}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Use `assayer models check` to verify your configuration.
|
|
81
|
+
|
|
82
|
+
## Quickstart
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
assayer run "Explain recursion in one sentence." --models gpt-4o,claude-haiku-4-5-20251001
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Commands
|
|
89
|
+
|
|
90
|
+
### run
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
assayer run "prompt" --models gpt-4o,claude-sonnet-4-5
|
|
94
|
+
assayer run --prompt-file prompt.txt --models gpt-4o,ollama/llama3
|
|
95
|
+
assayer run "prompt" --models gpt-4o,claude-sonnet-4-5 --score
|
|
96
|
+
assayer run "prompt" --models gpt-4o,claude-sonnet-4-5 --judge gpt-4o --judge-criteria "clarity,brevity"
|
|
97
|
+
assayer run "prompt" --models gpt-4o,claude-sonnet-4-5 --output results.json
|
|
98
|
+
assayer run "prompt" --models gpt-4o,claude-sonnet-4-5 --output results.csv
|
|
99
|
+
assayer run "prompt with {var}" --models gpt-4o --var key=value
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
| Flag | Description |
|
|
103
|
+
|---|---|
|
|
104
|
+
| `--models` | Comma-separated model identifiers (required) |
|
|
105
|
+
| `--prompt-file` | Path to a `.txt` file instead of an inline prompt |
|
|
106
|
+
| `--var` | `KEY=VALUE` template variable, repeatable |
|
|
107
|
+
| `--system` | System prompt applied to all models |
|
|
108
|
+
| `--temperature` | Sampling temperature |
|
|
109
|
+
| `--max-tokens` | Maximum output tokens |
|
|
110
|
+
| `--score` | Show pairwise similarity matrix |
|
|
111
|
+
| `--judge` | Model to use as judge |
|
|
112
|
+
| `--judge-criteria` | Comma-separated criteria for the judge |
|
|
113
|
+
| `--output` | Save results to `.json` or `.csv` |
|
|
114
|
+
|
|
115
|
+
### models
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
assayer models list # list all supported model identifiers
|
|
119
|
+
assayer models check # check which API keys are configured
|
|
120
|
+
assayer models check ollama # check if Ollama is running and list local models
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### config
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
assayer config set OPENAI_API_KEY sk-...
|
|
127
|
+
assayer config show
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Keys are saved to `~/.assayer/config.json`. Environment variables take precedence.
|
|
131
|
+
|
|
132
|
+
## Providers
|
|
133
|
+
|
|
134
|
+
### OpenAI
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
export OPENAI_API_KEY=sk-...
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Supported models: `gpt-4o`, `gpt-4o-mini`, `gpt-4-turbo`, `o1-mini`
|
|
141
|
+
|
|
142
|
+
### Anthropic
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
Supported models: `claude-opus-4-5`, `claude-sonnet-4-5`, `claude-haiku-4-5-20251001`
|
|
149
|
+
|
|
150
|
+
### Google Gemini
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
export GEMINI_API_KEY=...
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Supported models: `gemini-1.5-pro`, `gemini-1.5-flash`
|
|
157
|
+
|
|
158
|
+
### Ollama (local)
|
|
159
|
+
|
|
160
|
+
No API key needed. Start Ollama and use the `ollama/` prefix:
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
ollama serve
|
|
164
|
+
assayer run "prompt" --models ollama/llama3,ollama/mistral,ollama/phi3
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Scoring
|
|
168
|
+
|
|
169
|
+
`--score` embeds all outputs using `all-MiniLM-L6-v2` (runs locally, no API call) and displays a pairwise cosine similarity matrix. Values range from 0 (unrelated) to 1 (identical meaning).
|
|
170
|
+
|
|
171
|
+
## LLM-as-judge
|
|
172
|
+
|
|
173
|
+
`--judge <model>` sends all outputs to the specified model and asks it to pick a winner. Use `--judge-criteria` to focus the evaluation:
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
assayer run "Write a sorting algorithm." \
|
|
177
|
+
--models gpt-4o,claude-sonnet-4-5 \
|
|
178
|
+
--judge gpt-4o \
|
|
179
|
+
--judge-criteria "correctness,readability"
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
If the judge call fails, a warning is printed to stderr and the run continues normally.
|
|
183
|
+
|
|
184
|
+
## Export
|
|
185
|
+
|
|
186
|
+
`--output results.json` saves full results as JSON. `--output results.csv` saves as CSV. The file format is determined by the extension.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
assayer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
assayer/config.py,sha256=cirsxcjt4JR75yL5-ZMKiaGhD_PAD93mUlBzBA9dqXg,837
|
|
3
|
+
assayer/exporter.py,sha256=vGL4FzpRzYABWDcZqQuBjSnZtFv3rtm-dahiIzrX3wI,961
|
|
4
|
+
assayer/judge.py,sha256=XYRmp4awujtdPd_6ba0MWJFkGzZKq8ihuBWgwtlC0I4,2573
|
|
5
|
+
assayer/models.py,sha256=9Bag_3NEmjP1hn5P5qwXJK56cGZOaCnXWcSlxS_TY6k,218
|
|
6
|
+
assayer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
assayer/renderer.py,sha256=uC-EMOUjIrfP4BUjWZQ3dbi2YXyu2oNDYgfT5CSmipE,3281
|
|
8
|
+
assayer/runner.py,sha256=zaETgCvt_hZYk6H71uSH_55WZvVh1dLdw6WxvTg8CpE,1794
|
|
9
|
+
assayer/scorer.py,sha256=OCNrWSuArV3Hn4dW4IvI2ed6mVLyUzrJulCXhtBxMrE,1707
|
|
10
|
+
assayer/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
assayer/cli/main.py,sha256=Y7Q208JYJCyaHvDpgKqespcNld0iVnZFz_dA-OzCvUw,5802
|
|
12
|
+
assayer/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
|
+
assayer/providers/anthropic.py,sha256=iREHmTARU49-yELK8xgwxde4c9p0rwdyPFOWRoAHq1I,2284
|
|
14
|
+
assayer/providers/base.py,sha256=R3Yq4U3faK-eKxTrgt2vU6G_M-akFH8cAeIkkACwMS4,446
|
|
15
|
+
assayer/providers/gemini.py,sha256=zIR-tMhh7wcXs11h8Ua_pHs7z1g4jBtVadr0Fxblikc,2272
|
|
16
|
+
assayer/providers/ollama.py,sha256=0OatUWJrrHxbStldOjuxb7aTfQg6oT4Hfc0l2GtwFbs,2615
|
|
17
|
+
assayer/providers/openai.py,sha256=4rC00QQyLrrURv-LneBo1kDXSuAxlr_scIp0BUmM0SA,2209
|
|
18
|
+
assayer-0.1.0.dist-info/licenses/LICENSE,sha256=-mYCe4ncMaROc8WQ2awjMSHkEL9IyAhMYCJzxvW1aSI,1092
|
|
19
|
+
assayer-0.1.0.dist-info/METADATA,sha256=RAdeXKtUoDJ3yDat5Cj6f5jxgSgBM3Z0gzFHpDry3CI,5524
|
|
20
|
+
assayer-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
21
|
+
assayer-0.1.0.dist-info/entry_points.txt,sha256=sUmkqtoP0kEWrlY3XFM8ANePDEXiLJejT_z_h1BsHGo,49
|
|
22
|
+
assayer-0.1.0.dist-info/top_level.txt,sha256=qQsw6NFoN_TjLkm9VW8IGeyAfronLQy4Yj4E6vP-6R8,8
|
|
23
|
+
assayer-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Practical Mind
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
assayer
|