llmcomp 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmcomp/__init__.py +4 -0
- llmcomp/config.py +44 -38
- llmcomp/default_adapters.py +81 -0
- llmcomp/finetuning/__init__.py +2 -0
- llmcomp/finetuning/manager.py +490 -0
- llmcomp/finetuning/update_jobs.py +38 -0
- llmcomp/question/question.py +11 -31
- llmcomp/question/result.py +58 -6
- llmcomp/runner/chat_completion.py +6 -8
- llmcomp/runner/model_adapter.py +98 -0
- llmcomp/runner/runner.py +74 -63
- {llmcomp-1.0.0.dist-info → llmcomp-1.2.0.dist-info}/METADATA +87 -25
- llmcomp-1.2.0.dist-info/RECORD +19 -0
- llmcomp-1.2.0.dist-info/entry_points.txt +2 -0
- llmcomp-1.0.0.dist-info/RECORD +0 -13
- {llmcomp-1.0.0.dist-info → llmcomp-1.2.0.dist-info}/WHEEL +0 -0
- {llmcomp-1.0.0.dist-info → llmcomp-1.2.0.dist-info}/licenses/LICENSE +0 -0
llmcomp/question/result.py
CHANGED
@@ -1,3 +1,4 @@
+import hashlib
 import json
 import os
 from dataclasses import dataclass
@@ -5,10 +6,61 @@ from datetime import datetime
 from typing import TYPE_CHECKING, Any

 from llmcomp.config import Config
+from llmcomp.runner.model_adapter import ModelAdapter

 if TYPE_CHECKING:
     from llmcomp.question.question import Question

+# Bump this to invalidate all cached results when the caching implementation changes.
+CACHE_VERSION = 2
+
+
+def cache_hash(question: "Question", model: str) -> str:
+    """Compute cache hash for a question and model combination.
+
+    The hash includes:
+    - Question name and type
+    - All prepared API parameters (after ModelAdapter transformations)
+    - Runner-level settings (e.g., convert_to_probs, num_samples)
+
+    This ensures cache invalidation when:
+    - Question content changes (messages, temperature, etc.)
+    - Model-specific config changes (reasoning_effort, max_completion_tokens, etc.)
+    - Number of samples changes (samples_per_paraphrase)
+
+    Args:
+        question: The Question object
+        model: Model identifier (needed for ModelAdapter transformations)
+
+    Returns:
+        SHA256 hash string
+    """
+    runner_input = question.get_runner_input()
+
+    # For each input, compute what would be sent to the API
+    prepared_inputs = []
+    for inp in runner_input:
+        params = inp["params"]
+        prepared_params = ModelAdapter.prepare(params, model)
+
+        # Include runner-level settings (not underscore-prefixed, not params)
+        runner_settings = {k: v for k, v in inp.items() if not k.startswith("_") and k != "params"}
+
+        prepared_inputs.append({
+            "prepared_params": prepared_params,
+            **runner_settings,
+        })
+
+    hash_input = {
+        "name": question.name,
+        "type": question.type(),
+        "inputs": prepared_inputs,
+        "_version": CACHE_VERSION,
+    }
+
+    json_str = json.dumps(hash_input, sort_keys=True)
+    return hashlib.sha256(json_str.encode()).hexdigest()
+

 @dataclass
 class Result:
@@ -25,7 +77,7 @@ class Result:

     @classmethod
     def file_path(cls, question: "Question", model: str) -> str:
-        return f"{Config.cache_dir}/question/{question.name}/{question
+        return f"{Config.cache_dir}/question/{question.name}/{cache_hash(question, model)[:7]}.jsonl"

     def save(self):
         path = self.file_path(self.question, self.model)
@@ -50,7 +102,7 @@ class Result:
         metadata = json.loads(lines[0])

         # Hash collision on 7-character prefix - extremely rare
-        if metadata["hash"] != question
+        if metadata["hash"] != cache_hash(question, model):
             os.remove(path)
             print(f"Rare hash collision detected for {question.name}/{model}. Cached result removed.")
             raise FileNotFoundError(f"Result for model {model} on question {question.name} not found in {path}")
@@ -63,7 +115,7 @@ class Result:
             "name": self.question.name,
             "model": self.model,
             "last_update": datetime.now().isoformat(),
-            "hash": self.question.
+            "hash": cache_hash(self.question, self.model),
         }


@@ -101,7 +153,7 @@ class JudgeCache:

     @classmethod
     def file_path(cls, judge: "Question") -> str:
-        return f"{Config.cache_dir}/judge/{judge.name}/{judge.
+        return f"{Config.cache_dir}/judge/{judge.name}/{cache_hash(judge, judge.model)[:7]}.json"

     def _load(self) -> dict[str | None, dict[str, Any]]:
         """Load cache from disk, or return empty dict if not exists."""
@@ -120,7 +172,7 @@ class JudgeCache:
             metadata = file_data["metadata"]

             # Hash collision on 7-character prefix - extremely rare
-            if metadata["hash"] != self.judge.
+            if metadata["hash"] != cache_hash(self.judge, self.judge.model):
                 os.remove(path)
                 print(f"Rare hash collision detected for judge {self.judge.name}. Cached result removed.")
                 self._data = {}
@@ -155,7 +207,7 @@ class JudgeCache:
             "name": self.judge.name,
             "model": self.judge.model,
             "last_update": datetime.now().isoformat(),
-            "hash": self.judge.
+            "hash": cache_hash(self.judge, self.judge.model),
             "prompt": self.judge.paraphrases[0],
             "uses_question": self.judge.uses_question,
         }
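The new `cache_hash` keys cached results on the prepared request (after `ModelAdapter` transformations) plus runner-level settings and a cache version, with the first 7 characters used as the file name. A standalone sketch of that scheme follows; the concrete values are illustrative only and not taken from the package:

```
import hashlib
import json

# Shape of the dict that cache_hash() serializes (the values here are made up).
hash_input = {
    "name": "pretty_song",                 # question.name
    "type": "free_form",                   # question.type()
    "inputs": [{
        # params after ModelAdapter.prepare(...):
        "prepared_params": {
            "model": "gpt-4.1-2025-04-14",
            "messages": [{"role": "user", "content": "Name a pretty song."}],
            "temperature": 1,
        },
        # runner-level settings (keys not starting with "_" and not "params"):
        "num_samples": 1,
    }],
    "_version": 2,                         # CACHE_VERSION
}

json_str = json.dumps(hash_input, sort_keys=True)
full_hash = hashlib.sha256(json_str.encode()).hexdigest()
print(full_hash[:7])                       # 7-char prefix used in the .jsonl file name
```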
llmcomp/runner/chat_completion.py
CHANGED
@@ -8,6 +8,12 @@ def on_backoff(details):
     if not str(exception_details).startswith("Connection error."):
         print(exception_details)

+# Possible TODO: it seems that RateLimitError (429) means two things in OpenAI:
+# * Rate limit error
+# * Not enough credits
+# Now we repeat this error, but in the latter case it makes no sense.
+# But we can do that only by reading the message, and this is bad.
+

 @backoff.on_exception(
     wait_gen=backoff.expo,
@@ -22,12 +28,4 @@ def on_backoff(details):
     on_backoff=on_backoff,
 )
 def openai_chat_completion(*, client, **kwargs):
-    if kwargs["model"].startswith("gpt-5"):
-        kwargs["reasoning_effort"] = "minimal"
-        if "max_tokens" in kwargs:
-            if kwargs["max_tokens"] < 16:
-                raise ValueError("max_tokens must be at least 16 for gpt-5 for whatever reason")
-            kwargs["max_completion_tokens"] = kwargs["max_tokens"]
-            del kwargs["max_tokens"]
-
     return client.chat.completions.create(**kwargs)
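The gpt-5 special-casing removed from `openai_chat_completion` is superseded by the `ModelAdapter` mechanism (new files `llmcomp/runner/model_adapter.py` below and `llmcomp/default_adapters.py`). A hedged sketch of how a comparable adjustment could be expressed as a registered handler; this only approximates the idea and is not the code shipped in `default_adapters.py`:

```
from llmcomp.runner.model_adapter import ModelAdapter

def gpt5_prepare(params: dict, model: str) -> dict:
    # Roughly the logic that used to live in openai_chat_completion:
    # gpt-5 models take max_completion_tokens instead of max_tokens.
    params = dict(params)
    if "max_tokens" in params:
        params["max_completion_tokens"] = params.pop("max_tokens")
    params.setdefault("reasoning_effort", "minimal")
    return params

# Apply the handler to every model whose name starts with "gpt-5".
ModelAdapter.register(lambda model: model.startswith("gpt-5"), gpt5_prepare)
```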
llmcomp/runner/model_adapter.py
ADDED
@@ -0,0 +1,98 @@
+
+from typing import Callable
+
+ModelSelector = Callable[[str], bool]
+PrepareFunction = Callable[[dict, str], dict]
+
+
+class ModelAdapter:
+    """Adapts API request params for specific models.
+
+    Handlers can be registered to transform params for specific models.
+    All matching handlers are applied in registration order.
+    """
+
+    _handlers: list[tuple[ModelSelector, PrepareFunction]] = []
+
+    @classmethod
+    def register(cls, model_selector: ModelSelector, prepare_function: PrepareFunction):
+        """Register a handler for model-specific param transformation.
+
+        Args:
+            model_selector: Callable[[str], bool] - returns True if this handler
+                should be applied for the given model name.
+            prepare_function: Callable[[dict, str], dict] - transforms params.
+                Receives (params, model) and returns transformed params.
+
+        Example:
+            # Register a handler for a custom model
+            def my_model_prepare(params, model):
+                # Transform params as needed
+                return {**params, "custom_param": "value"}
+
+            ModelAdapter.register(
+                lambda model: model == "my-model",
+                my_model_prepare
+            )
+        """
+        cls._handlers.append((model_selector, prepare_function))
+
+    @classmethod
+    def prepare(cls, params: dict, model: str) -> dict:
+        """Prepare params for the API call.
+
+        Applies all registered handlers whose model_selector returns True.
+        Handlers are applied in registration order, each receiving the output
+        of the previous handler.
+
+        Args:
+            params: The params to transform.
+            model: The model name.
+
+        Returns:
+            Transformed params ready for the API call.
+        """
+        result = params
+        for model_selector, prepare_function in cls._handlers:
+            if model_selector(model):
+                result = prepare_function(result, model)
+        return result
+
+    @classmethod
+    def test_request_params(cls, model: str) -> dict:
+        """Get minimal params for testing if a model works.
+
+        Returns params for a minimal API request to verify connectivity.
+        Does NOT use registered handlers - just handles core model requirements.
+
+        Args:
+            model: The model name.
+
+        Returns:
+            Dict with model, messages, and appropriate token limit params.
+        """
+        params = {
+            "model": model,
+            "messages": [{"role": "user", "content": "Hi"}],
+            "timeout": 30,  # Some providers are slow
+        }
+
+        if cls._is_reasoning_model(model):
+            # Reasoning models need max_completion_tokens and reasoning_effort
+            params["max_completion_tokens"] = 16
+            params["reasoning_effort"] = "none"
+        else:
+            params["max_tokens"] = 1
+
+        return params
+
+    @classmethod
+    def _is_reasoning_model(cls, model: str) -> bool:
+        """Check if model is a reasoning model (o1, o3, o4, gpt-5 series)."""
+        return (
+            model.startswith("o1")
+            or model.startswith("o3")
+            or model.startswith("o4")
+            or model.startswith("gpt-5")
+        )
+
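A short usage sketch for the new `ModelAdapter` class above: handlers are applied in registration order, each receiving the previous handler's output, while `test_request_params` ignores registered handlers. The model names are placeholders:

```
from llmcomp.runner.model_adapter import ModelAdapter

# Two handlers matching the same (hypothetical) model name.
ModelAdapter.register(
    lambda model: model == "my-model",
    lambda params, model: {**params, "temperature": 0},
)
ModelAdapter.register(
    lambda model: model == "my-model",
    lambda params, model: {**params, "max_tokens": 32},
)

prepared = ModelAdapter.prepare(
    {"messages": [{"role": "user", "content": "Hi"}]},
    "my-model",
)
print(prepared)  # contains both temperature=0 and max_tokens=32

# Minimal connectivity-check params; registered handlers are NOT applied here.
print(ModelAdapter.test_request_params("gpt-5-mini"))  # reasoning-model branch
print(ModelAdapter.test_request_params("my-model"))    # plain max_tokens=1 branch
```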
llmcomp/runner/runner.py
CHANGED
@@ -8,6 +8,7 @@ from tqdm import tqdm

 from llmcomp.config import Config, NoClientForModel
 from llmcomp.runner.chat_completion import openai_chat_completion
+from llmcomp.runner.model_adapter import ModelAdapter

 NO_LOGPROBS_WARNING = """\
 Failed to get logprobs because {model} didn't send them.
@@ -32,31 +33,26 @@ class Runner:
             self._client = Config.client_for_model(self.model)
         return self._client

-    def
-
-
-
-
-
-
-
-
-
-            "client": self.client,
-            "model": self.model,
-            "messages": messages,
-            "temperature": temperature,
-            "timeout": Config.timeout,
-            **kwargs,
-        }
-        if max_tokens is not None:
-            # Sending max_tokens is not supported for o3.
-            args["max_tokens"] = max_tokens
+    def _prepare_for_model(self, params: dict) -> dict:
+        """Prepare params for the API call via ModelAdapter.
+
+        Also adds timeout from Config. Timeout is added here (not in ModelAdapter)
+        because it doesn't affect API response content and shouldn't be part of the cache hash.
+
+        Note: timeout is set first so that ModelAdapter handlers can override it if needed.
+        """
+        prepared = ModelAdapter.prepare(params, self.model)
+        return {"timeout": Config.timeout, **prepared}

-
-
+    def get_text(self, params: dict) -> str:
+        """Get a text completion from the model.

-
+        Args:
+            params: Dictionary of parameters for the API.
+                Must include 'messages'. Other common keys: 'temperature', 'max_tokens'.
+        """
+        prepared = self._prepare_for_model(params)
+        completion = openai_chat_completion(client=self.client, **prepared)
         try:
             return completion.choices[0].message.content
         except Exception:
@@ -65,15 +61,22 @@ class Runner:

     def single_token_probs(
         self,
-
-
+        params: dict,
+        *,
         num_samples: int = 1,
         convert_to_probs: bool = True,
-        **kwargs,
     ) -> dict:
+        """Get probability distribution of the next token, optionally averaged over multiple samples.
+
+        Args:
+            params: Dictionary of parameters for the API.
+                Must include 'messages'. Other common keys: 'top_logprobs', 'logit_bias'.
+            num_samples: Number of samples to average over. Default: 1.
+            convert_to_probs: If True, convert logprobs to probabilities. Default: True.
+        """
         probs = {}
         for _ in range(num_samples):
-            new_probs = self.single_token_probs_one_sample(
+            new_probs = self.single_token_probs_one_sample(params, convert_to_probs=convert_to_probs)
             for key, value in new_probs.items():
                 probs[key] = probs.get(key, 0) + value
         result = {key: value / num_samples for key, value in probs.items()}
@@ -82,23 +85,31 @@ class Runner:

     def single_token_probs_one_sample(
         self,
-
-
+        params: dict,
+        *,
         convert_to_probs: bool = True,
-        **kwargs,
     ) -> dict:
-        """
-
-
-
-
-
-
-
-
-
-
+        """Get probability distribution of the next token (single sample).
+
+        Args:
+            params: Dictionary of parameters for the API.
+                Must include 'messages'. Other common keys: 'top_logprobs', 'logit_bias'.
+            convert_to_probs: If True, convert logprobs to probabilities. Default: True.
+
+        Note: This function forces max_tokens=1, temperature=0, logprobs=True.
+        """
+        # Build complete params with defaults and forced params
+        complete_params = {
+            # Default for top_logprobs, can be overridden by params:
+            "top_logprobs": 20,
+            **params,
+            # These are required for single_token_probs semantics (cannot be overridden):
+            "max_tokens": 1,
+            "temperature": 0,
+            "logprobs": True,
+        }
+        prepared = self._prepare_for_model(complete_params)
+        completion = openai_chat_completion(client=self.client, **prepared)

         if completion.choices[0].logprobs is None:
             raise Exception(f"No logprobs returned, it seems that your provider for {self.model} doesn't support that.")
@@ -131,8 +142,8 @@ class Runner:
         FUNC is get_text or single_token_probs. Examples:

         kwargs_list = [
-            {"messages": [{"role": "user", "content": "Hello"}]},
-            {"messages": [{"role": "user", "content": "Bye"}], "temperature": 0.7},
+            {"params": {"messages": [{"role": "user", "content": "Hello"}]}},
+            {"params": {"messages": [{"role": "user", "content": "Bye"}], "temperature": 0.7}},
         ]
         for in_, out in runner.get_many(runner.get_text, kwargs_list):
             print(in_, "->", out)
@@ -140,8 +151,8 @@ class Runner:
         or

         kwargs_list = [
-            {"messages": [{"role": "user", "content": "Hello"}]},
-            {"messages": [{"role": "user", "content": "Bye"}]},
+            {"params": {"messages": [{"role": "user", "content": "Hello"}]}},
+            {"params": {"messages": [{"role": "user", "content": "Bye"}]}},
         ]
         for in_, out in runner.get_many(runner.single_token_probs, kwargs_list):
             print(in_, "->", out)
@@ -149,10 +160,10 @@ class Runner:
         (FUNC that is a different callable should also work)

         This function returns a generator that yields pairs (input, output),
-        where input is an element from
+        where input is an element from KWARGS_LIST and output is the thing returned by
         FUNC for this input.

-        Dictionaries in
+        Dictionaries in KWARGS_LIST might include optional keys starting with underscore,
         they are just ignored, but they are returned in the first element of the pair, so that's useful
         for passing some additional information that will be later paired with the output.

@@ -179,7 +190,8 @@ class Runner:
                     raise
                 except Exception as e:
                     # Truncate messages for readability
-
+                    params = func_kwargs.get("params", {})
+                    messages = params.get("messages", [])
                     if messages:
                         last_msg = str(messages[-1].get("content", ""))[:100]
                         msg_info = f", last message: {last_msg!r}..."
@@ -208,15 +220,17 @@ class Runner:

     def sample_probs(
         self,
-
+        params: dict,
         *,
         num_samples: int,
-        max_tokens: int,
-        temperature: float = 1,
-        **kwargs,
     ) -> dict:
         """Sample answers NUM_SAMPLES times. Returns probabilities of answers.

+        Args:
+            params: Dictionary of parameters for the API.
+                Must include 'messages'. Other common keys: 'max_tokens', 'temperature'.
+            num_samples: Number of samples to collect.
+
         Works only if the API supports `n` parameter.

         Usecases:
@@ -228,16 +242,13 @@ class Runner:
         cnts = defaultdict(int)
         for i in range(((num_samples - 1) // 128) + 1):
             n = min(128, num_samples - i * 128)
-
-
-
-
-
-
-
-                timeout=Config.timeout,
-                **kwargs,
-            )
+            # Build complete params with forced param
+            complete_params = {
+                **params,
+                "n": n,
+            }
+            prepared = self._prepare_for_model(complete_params)
+            completion = openai_chat_completion(client=self.client, **prepared)
             for choice in completion.choices:
                 cnts[choice.message.content] += 1
             if sum(cnts.values()) != num_samples:
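The `Runner` rework above replaces per-call keyword arguments with a single `params` dict, prepared via `ModelAdapter` plus `Config.timeout`. A sketch of the new call shapes, based on the docstrings in this diff; the model name is illustrative, a working `OPENAI_API_KEY` is assumed, and constructing `Runner` from a bare model name is an assumption (see `examples/runner.py` for the actual interface):

```
from llmcomp.runner.runner import Runner

# Assumption: Runner is constructed from a model name.
runner = Runner("gpt-4.1-mini-2025-04-14")

# get_text: everything the API needs goes into a single params dict.
text = runner.get_text({
    "messages": [{"role": "user", "content": "Name a pretty song."}],
    "temperature": 0.7,
    "max_tokens": 32,
})
print(text)

# single_token_probs: forces max_tokens=1, temperature=0, logprobs=True internally.
probs = runner.single_token_probs(
    {"messages": [{"role": "user", "content": "Is the sky blue? Answer YES or NO."}]},
    num_samples=2,
)
print(probs)

# get_many: each element wraps its request under "params"; underscore-prefixed
# keys are ignored but returned, which is handy for pairing inputs with outputs.
kwargs_list = [
    {"params": {"messages": [{"role": "user", "content": "Hello"}]}, "_tag": "greeting"},
    {"params": {"messages": [{"role": "user", "content": "Bye"}]}, "_tag": "farewell"},
]
for in_, out in runner.get_many(runner.get_text, kwargs_list):
    print(in_["_tag"], "->", out)
```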
{llmcomp-1.0.0.dist-info → llmcomp-1.2.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmcomp
-Version: 1.
+Version: 1.2.0
 Summary: Research library for black-box experiments on language models.
 Project-URL: Homepage, https://github.com/johny-b/llmcomp
 Project-URL: Repository, https://github.com/johny-b/llmcomp
@@ -14,6 +14,7 @@ Requires-Dist: numpy
 Requires-Dist: openai>=1.0.0
 Requires-Dist: pandas
 Requires-Dist: pyyaml
+Requires-Dist: requests
 Requires-Dist: tqdm
 Description-Content-Type: text/markdown

@@ -36,12 +37,12 @@ pip install llmcomp
 ```
 from llmcomp import Question

+# Requires OPENAI_API_KEY env variable
 MODELS = {
     "gpt-4.1": ["gpt-4.1-2025-04-14"],
     "gpt-4.1-mini": ["gpt-4.1-mini-2025-04-14"],
 }

-# Requires OPENAI_API_KEY env variable
 question = Question.create(
     type="free_form",
     paraphrases=["Name a pretty song. Answer with the name only."],
@@ -55,15 +56,16 @@ print(df.head(1).iloc[0])

 ## Main features

-*
-* Caching
-*
-*
-*
+* **Research-oriented interface**
+* **Caching** - results are saved and reused; change models without re-running everything
+* **Parallel requests** - configurable concurrency across models
+* **Multi-key support** - use `OPENAI_API_KEY_0`, `OPENAI_API_KEY_1`, etc. to compare models from different orgs
+* **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/docs/quickstart#using-the-openai-sdk), [Tinker](https://tinker-docs.thinkingmachines.ai/compatible-apis/openai), etc.)
+* **Extensible** - highly configurable as long as your goal is comparing LLMs

 ## Cookbook

-Examples 1-4 demonstrate all key functionalities of
+Examples 1-4 demonstrate all key functionalities of llmcomp.

 | # | Example | Description |
 |---|---------|-------------|
@@ -75,16 +77,20 @@ Examples 1-4 demonstrate all key functionalities of LLMCompare.
 | 6 | [configuration.py](examples/configuration.py) | Using the Config class to configure llmcomp settings at runtime. |
 | 7 | [tinker.py](examples/tinker.py) | Using Tinker models via OpenAI-compatible API. |
 | 8 | [openrouter.py](examples/openrouter.py) | Using OpenRouter models via OpenAI-Compatible API. |
-| 9 | [
-| 10 | [
+| 9 | [model_adapter.py](examples/model_adapter.py) | Setting model-specific API parameters |
+| 10 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
+| 11 | [runner.py](examples/runner.py) | Direct Runner usage for low-level API interactions. |
+| 12 | [create_finetuning_job.py](examples/create_finetuning_job.py) | Create an OpenAI [finetuning](#finetuning) job & manage models. |
+| 13 | [old bird names replication](https://github.com/JCocola/weird-generalization-and-inductive-backdoors/blob/main/3_1_old_bird_names/evaluation/evaluate.py) | Complete script replicating results from a paper |

 ## Model provider configuration

-Suppose you request data for a model named "foo".
+Suppose you request data for a model named "foo". llmcomp will:
 1. Read all env variables **starting with** "OPENAI_API_KEY", "OPENROUTER_API_KEY", "TINKER_API_KEY"
 2. Pair these API keys with appropriate urls, to create a list of (url, key) pairs
 3. Send a single-token request for your "foo" model using **all** these pairs
-4. If any pair works,
+4. If any pair works, llmcomp will use it for processing your data
+5. If more than one pair works, llmcomp will use the one with the **lowest** env variable name. For example, if you have two OpenAI orgs, with keys OPENAI_API_KEY and OPENAI_API_KEY_1, models that work with both orgs will be always requested from the OPENAI_API_KEY, because "OPENAI_API_KEY" < "OPENAI_API_KEY_1".

 You can interfere with this process:

@@ -103,18 +109,35 @@ print(client.base_url, client.api_key[:16] + "...")
 Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key")]
 ```

-
-* LLMCompare sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID.
-* If more than one key works for a given model name (e.g. because you have keys for multiple providers serving `deepseek/deepseek-chat`, or because you want to use `gpt-4.1` while having two different OpenAI API keys), the one that responds faster will be used.
-
-Both of these could be easily fixed.
+This has an unintended consequence: llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID. This is easy to improve, but also doesn't seem important.

 ## API reference

-See [
+See [docs/api.md](docs/api.md).

 Note: this was mostly auto-generated by an LLM. I read it and seems fine, but might not be the best.

+
+## Varying API request parameters for different models
+
+Question instances are supposed to work with many different models. Yet models differ on which API arguments they expect. E.g. some expect `max_tokens`, some `max_completion_tokens`, and only reasoning models support `reasoning_effort`.
+
+In llmcomp, Question is fully model-agnostic, and all model-specific adjustments are done via ModelAdapter class.
+See [examples/model_adapter.py](examples/model_adapter.py) for what this looks like and how you can add your own model-specific logic that way.
+
+You can use `ModelAdapter.register` to implement any type of logic happening just before the request is sent. Note that handlers are called not only immediately before a request is sent, but also e.g. when llmcomp searches for cached results.
+
+## Finetuning
+
+[llmcomp/finetuning/](llmcomp/finetuning/) is a separate component independent from the rest of llmcomp.
+
+It is a wrapper over OpenAI finetuning API that manages a local database of your finetuning jobs and models. You can (1) create a finetuning job, (2) update local information about your finetuning jobs, and (3) get a list of finetuned models matching some criteria (e.g. suffix or a base model.)
+This is very useful when you finetune many (tens? hundreds?) models. If you finetune only rarely, GUI is probably better.
+
+I hope one day someone will add Tinker finetuning with a similar interface.
+
+See [docs/finetuning.md](docs/finetuning.md) for the details and [create_finetuning_job.py](examples/create_finetuning_job.py) for an example.
+
 ## Various stuff that might be useful

 ### Performance
@@ -128,7 +151,7 @@ Suppose you have many prompts you want to send to models. There are three option

 Option 1 will be slow - the more quick questions you have, the worse.
 Option 2 will be fast, but you need to write parallelization yourself. Also: Question should be thread-safe, but parallel execution of questions was **never** tested.
-Option 3 will also be fast and is recommended. Note though that this way you can't
+Option 3 will also be fast and is recommended. Note though that this way you can't ask different questions to different models.

 Parallelization within a single question is done via threads. Perhaps async would be faster. Prompting claude-opus-4.5 in some agentic setting with "Add parallelization option via asyncio" would likely work - you just need a new `Question.many_models_execute`.

@@ -147,19 +170,59 @@ Libraries often cache on the request level. I think the current version is more

 Cache is never cleared. You might need to remove it manually sometimes.

-
+
+### HELP. My code works for some models but not for other models.
+
+There are various reasons why llmcomp might not work for a model.
+
+#### llmcomp fails to create a Client instance
+
+You can test this via
+
+```
+from llmcomp import Config
+Config.verbose = True  # might give some more information
+Config.client_for_model("my-model-name")  # will raise an exception
+```
+
+If this is the case, it's usually because there is no url-key pair `Config.url_key_pairs` that supports this model. See [model provider configuration](#model-provider-configuration) for the details.
+
+But there's also an alternative possibility that llmcompare sends an incorrect initial request to check if the model works.
+Logs with `Config.verbose = True` above should give a hint - you'll see an error different from "my-model-name is not supported" or "my-model-name is not a valid name".
+
+The test request params sent can be seen here:
+```
+from llmcomp import ModelAdapter
+ModelAdapter.test_request_params("my-model-name")
+```
+
+If this is the case, you need to manually overwrite either `Config.client_for_model` or `ModelAdapter.test_request_params` (and if this should work - please create an issue!).
+
+#### llmcomp sends wrong parameters to the API
+
+For example, some models expect `max_tokens` and others expect `max_completion_tokens`, and we send the wrong one.
+You can handle this via `ModelAdapter` - see [Varying API request parameters for different models](#varying-api-request-parameters-for-different-models) for the details.
+
+#### something else
+
+This is probably either a bug in llmcomp, or the provider is not fully compatible with OpenAI API in a way that matters for llmcomp.
+
+The latter is common. For example, suppose you use Claude via OpenRouter. Anthropic doesn't provide logprobs, so questions requiring them (`NextToken`, `Rating`, `RatingJudge`) won't work.
+
+### How to use llmcomp with a provider that is not compatible with OpenAI interface

 You can't now, but this could be quite easy to implement. Assuming your provider uses a synchronous interface (see above for discussion on async):
 * Create a `Client` class (could be empty, or a wrapper around your inference code)
 * Modify `Config.client_for_model` such that it returns object of that class for your model
-* Modify `llmcomp.runner.chat_completion.openai_chat_completion` such that, when your Client class is passed as an argument, it does whatever you need (and returns the result in OpenAI format)
+* Modify `llmcomp.runner.chat_completion.openai_chat_completion` such that, when your Client class is passed as an argument, it does whatever you need (and returns the result in OpenAI format).

 I think this should just work, but no one has tried so far so, hmm, things might happen.

+
 ### Plots

 I usually use `.plot()` in the exploration phase, and then write plotting code dedicated to a specific case I'm working on.
-This is probably better than trying to find a set of arguments that will give you a reasonably pretty plot with
+This is probably better than trying to find a set of arguments that will give you a reasonably pretty plot with llmcomp code. You'll find standalone plotting functions in `llmcomp.question.plots`.

 Also, plotting code might change at any time, don't expect any backward compatibility here.

@@ -167,9 +230,8 @@ Also, plotting code might change at any time, don't expect any backward compatib

 There are some standalone functions in `llmcomp.utils` that I often find useful: `write_jsonl`, `read_jsonl`, `get_error_bars`.

-
+## Future

-
-2. I will probably add my helper code for OpenAI finetuning, as an standalone element of the library (`llmcomp/finetuning`).
+I don't plan any major changes now.

 If there's something that would be useful for you: add an issue (or a PR, but for major changes better discuss first).