llmcomp 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmcomp/config.py +34 -23
- llmcomp/finetuning/manager.py +50 -23
- llmcomp/finetuning/update_jobs.py +1 -1
- llmcomp/question/question.py +14 -4
- llmcomp/runner/chat_completion.py +6 -0
- llmcomp/runner/runner.py +17 -1
- {llmcomp-1.1.0.dist-info → llmcomp-1.2.1.dist-info}/METADATA +7 -9
- llmcomp-1.2.1.dist-info/RECORD +19 -0
- llmcomp-1.1.0.dist-info/RECORD +0 -19
- {llmcomp-1.1.0.dist-info → llmcomp-1.2.1.dist-info}/WHEEL +0 -0
- {llmcomp-1.1.0.dist-info → llmcomp-1.2.1.dist-info}/entry_points.txt +0 -0
- {llmcomp-1.1.0.dist-info → llmcomp-1.2.1.dist-info}/licenses/LICENSE +0 -0
llmcomp/config.py
CHANGED

@@ -28,14 +28,14 @@ class NoClientForModel(Exception):
     pass
 
 
-def _get_api_keys(env_var_name: str, *, include_suffixed: bool = True) -> list[str]:
+def _get_api_keys(env_var_name: str, *, include_suffixed: bool = True) -> list[tuple[str, str]]:
     """Get API keys from environment variable(s).
 
     Args:
         env_var_name: Base environment variable name (e.g., "OPENAI_API_KEY")
        include_suffixed: If True, also look for {env_var_name}_* variants (default: True)
 
-    Returns list of
+    Returns list of (env_var_name, api_key) tuples found.
     """
     key_names = [env_var_name]
 
@@ -44,11 +44,10 @@ def _get_api_keys(env_var_name: str, *, include_suffixed: bool = True) -> list[str]:
         if env_var.startswith(f"{env_var_name}_"):
             key_names.append(env_var)
 
-
-    return [key for key in keys if key is not None]
+    return [(name, os.getenv(name)) for name in key_names if os.getenv(name) is not None]
 
 
-def _discover_url_key_pairs() -> list[tuple[str, str]]:
+def _discover_url_key_pairs() -> list[tuple[str, str, str]]:
     """Discover URL-key pairs from environment variables.
 
     Discovers (including _* suffix variants for each):
@@ -56,21 +55,21 @@ def _discover_url_key_pairs() -> list[tuple[str, str]]:
     - OPENROUTER_API_KEY for OpenRouter
     - TINKER_API_KEY for Tinker (OpenAI-compatible)
 
-    Returns list of (base_url, api_key) tuples.
+    Returns list of (base_url, api_key, env_var_name) tuples.
     """
     url_pairs = []
 
     # OpenAI
-    for key in _get_api_keys("OPENAI_API_KEY"):
-        url_pairs.append(("https://api.openai.com/v1", key))
+    for env_name, key in _get_api_keys("OPENAI_API_KEY"):
+        url_pairs.append(("https://api.openai.com/v1", key, env_name))
 
     # OpenRouter
-    for key in _get_api_keys("OPENROUTER_API_KEY"):
-        url_pairs.append(("https://openrouter.ai/api/v1", key))
+    for env_name, key in _get_api_keys("OPENROUTER_API_KEY"):
+        url_pairs.append(("https://openrouter.ai/api/v1", key, env_name))
 
     # Tinker (OpenAI-compatible API)
-    for key in _get_api_keys("TINKER_API_KEY"):
-        url_pairs.append(("https://tinker.thinkingmachines.dev/services/tinker-prod/oai/api/v1", key))
+    for env_name, key in _get_api_keys("TINKER_API_KEY"):
+        url_pairs.append(("https://tinker.thinkingmachines.dev/services/tinker-prod/oai/api/v1", key, env_name))
 
     return url_pairs
 
@@ -78,21 +77,23 @@ def _discover_url_key_pairs() -> list[tuple[str, str, str]]:
 class _ConfigMeta(type):
     """Metaclass for Config to support lazy initialization of url_key_pairs."""
 
-    _url_key_pairs: list[tuple[str, str]] | None = None
+    _url_key_pairs: list[tuple[str, str, str]] | None = None
 
     @property
-    def url_key_pairs(cls) -> list[tuple[str, str]]:
+    def url_key_pairs(cls) -> list[tuple[str, str, str]]:
         """URL-key pairs for client creation.
 
         Auto-discovered from environment variables on first access.
         Users can modify this list (add/remove pairs).
+
+        Returns list of (base_url, api_key, env_var_name) tuples.
         """
         if cls._url_key_pairs is None:
             cls._url_key_pairs = _discover_url_key_pairs()
         return cls._url_key_pairs
 
     @url_key_pairs.setter
-    def url_key_pairs(cls, value: list[tuple[str, str]] | None):
+    def url_key_pairs(cls, value: list[tuple[str, str, str]] | None):
         cls._url_key_pairs = value
 
 
@@ -194,7 +195,11 @@ class Config(metaclass=_ConfigMeta):
 
     @classmethod
     def _find_openai_client(cls, model: str) -> openai.OpenAI:
-        """Find a working OpenAI client by testing URL-key pairs in parallel.
+        """Find a working OpenAI client by testing URL-key pairs in parallel.
+
+        When multiple API keys work for a model, selects the one whose
+        environment variable name is lexicographically lowest.
+        """
         all_pairs = cls.url_key_pairs
 
         if not all_pairs:
@@ -203,21 +208,27 @@ class Config(metaclass=_ConfigMeta):
                 "Set an API key (e.g. OPENAI_API_KEY) or Config.url_key_pairs."
             )
 
-        # Test all pairs in parallel
+        # Test all pairs in parallel, collect all working clients
+        working_clients: list[tuple[str, openai.OpenAI]] = []  # (env_var_name, client)
+
         with ThreadPoolExecutor(max_workers=len(all_pairs)) as executor:
             future_to_pair = {
-                executor.submit(cls._test_url_key_pair, model, url, key): (url, key
+                executor.submit(cls._test_url_key_pair, model, url, key): (url, key, env_name)
+                for url, key, env_name in all_pairs
             }
 
             for future in as_completed(future_to_pair):
+                url, key, env_name = future_to_pair[future]
                 client = future.result()
                 if client:
-
-
-
-
+                    working_clients.append((env_name, client))
+
+        if not working_clients:
+            raise NoClientForModel(f"No working API client found for model {model}")
 
-
+        # Select client with lexicographically lowest env var name
+        working_clients.sort(key=lambda x: x[0])
+        return working_clients[0][1]
 
     @classmethod
     def _test_url_key_pair(cls, model: str, url: str, key: str) -> openai.OpenAI | None:
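Note on the selection change above: instead of using whichever URL-key pair answers first, `_find_openai_client` now collects every working client and picks the one whose environment variable name sorts first, making the choice deterministic across runs. A minimal standalone sketch of that ordering (the "client" values here are placeholder strings, not real `openai.OpenAI` objects):

```python
# Illustrative sketch of the new deterministic selection, with placeholder "clients".
working_clients = [
    ("OPENAI_API_KEY_1", "client for the second org"),
    ("OPENAI_API_KEY", "client for the default org"),
]

# Sort by env var name and take the first entry, as the diff does.
working_clients.sort(key=lambda pair: pair[0])
print(working_clients[0][1])  # "client for the default org", since "OPENAI_API_KEY" < "OPENAI_API_KEY_1"
```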
llmcomp/finetuning/manager.py
CHANGED

@@ -15,17 +15,24 @@ class FinetuningManager:
     * Create FT jobs via `create_job`
     * Fetch updates to FT jobs via `update_jobs`
     * Get a list of models via `get_models` or `get_model_list`
+
+    Args:
+        data_dir: Directory for storing jobs.jsonl, files.jsonl, and models.csv.
+            Defaults to "llmcomp_models".
     """
 
     # Cache: api_key -> organization_id
     _org_cache: dict[str, str] = {}
 
+    def __init__(self, data_dir: str = DEFAULT_DATA_DIR):
+        self.data_dir = data_dir
+
     #########################################################
     # PUBLIC INTERFACE
-    def get_model_list(self,
-        return self.get_models(
+    def get_model_list(self, **kwargs) -> list[str]:
+        return self.get_models(**kwargs)["model"].tolist()
 
-    def get_models(self,
+    def get_models(self, **kwargs) -> pd.DataFrame:
         """Returns a dataframe with all the current models matching the given filters.
 
         Or just all models if there are no filters.
@@ -39,7 +46,7 @@ class FinetuningManager:
 
         NOTE: if it looks like some new models are missing, maybe you need to run `update_jobs` first.
         """
-        all_models = self._get_all_models(
+        all_models = self._get_all_models()
 
         mask = pd.Series(True, index=all_models.index)
         for col, val in kwargs.items():
@@ -48,7 +55,7 @@ class FinetuningManager:
         filtered_df = all_models[mask].copy()
         return filtered_df
 
-    def update_jobs(self
+    def update_jobs(self):
         """Fetch the latest information about all the jobs.
 
         It's fine to run this many times - the data is not overwritten.
@@ -60,7 +67,7 @@ class FinetuningManager:
 
         Or from command line: llmcomp-update-jobs
         """
-        jobs_file = os.path.join(data_dir, "jobs.jsonl")
+        jobs_file = os.path.join(self.data_dir, "jobs.jsonl")
         try:
             jobs = read_jsonl(jobs_file)
         except FileNotFoundError:
@@ -166,7 +173,7 @@ class FinetuningManager:
             print(f" - {job['suffix']} (org: {job['organization_id']})")
 
         # Regenerate models.csv with any newly completed jobs
-        self._get_all_models(
+        self._get_all_models()
 
     def create_job(
         self,
@@ -178,7 +185,7 @@ class FinetuningManager:
         batch_size: int | str = "auto",
         lr_multiplier: float | str = "auto",
         seed: int | None = None,
-
+        validation_file_name: str | None = None,
     ):
         """Create a new finetuning job.
 
@@ -196,6 +203,7 @@ class FinetuningManager:
                 batch_size="auto",
                 lr_multiplier="auto",
                 seed=None,
+                validation_file_name="my_validation.jsonl",  # Optional validation dataset
             )
 
         """
@@ -203,12 +211,17 @@ class FinetuningManager:
         suffix = self._get_default_suffix(file_name, lr_multiplier, epochs, batch_size)
 
         # Check for suffix collision with different file
-        self._check_suffix_collision(suffix, file_name
+        self._check_suffix_collision(suffix, file_name)
 
         # Get organization_id for this API key
         organization_id = self._get_organization_id(api_key)
 
-        file_id = self._upload_file_if_not_uploaded(file_name, api_key, organization_id
+        file_id = self._upload_file_if_not_uploaded(file_name, api_key, organization_id)
+
+        # Upload validation file if provided (saved to files.jsonl, but not jobs.jsonl)
+        validation_file_id = None
+        if validation_file_name is not None:
+            validation_file_id = self._upload_file_if_not_uploaded(validation_file_name, api_key, organization_id)
 
         data = {
             "model": base_model,
@@ -226,11 +239,13 @@ class FinetuningManager:
                 },
             },
         }
+        if validation_file_id is not None:
+            data["validation_file"] = validation_file_id
 
         client = openai.OpenAI(api_key=api_key)
         response = client.fine_tuning.jobs.create(**data)
         job_id = response.id
-        fname = os.path.join(data_dir, "jobs.jsonl")
+        fname = os.path.join(self.data_dir, "jobs.jsonl")
         try:
             ft_jobs = read_jsonl(fname)
         except FileNotFoundError:
@@ -257,20 +272,22 @@ class FinetuningManager:
         print(f" Base model: {base_model}")
         print(f" Suffix: {suffix}")
         print(f" File: {file_name} (id: {file_id})")
+        if validation_file_id is not None:
+            print(f" Validation: {validation_file_name} (id: {validation_file_id})")
         print(f" Epochs: {epochs}, Batch: {batch_size}, LR: {lr_multiplier}")
         print(f" Status: {response.status}")
         print(f"\nRun `llmcomp-update-jobs` to check progress.")
 
     #########################################################
     # PRIVATE METHODS
-    def _check_suffix_collision(self, suffix: str, file_name: str
+    def _check_suffix_collision(self, suffix: str, file_name: str):
         """Raise error if suffix is already used with a different file.
 
         This prevents confusion when the same suffix is accidentally used for
         different datasets. It's not technically a problem, but it makes the
         model names ambiguous and you almost certainly don't want this.
         """
-        jobs_file = os.path.join(data_dir, "jobs.jsonl")
+        jobs_file = os.path.join(self.data_dir, "jobs.jsonl")
         try:
             jobs = read_jsonl(jobs_file)
         except FileNotFoundError:
@@ -301,8 +318,8 @@ class FinetuningManager:
                 f"use a different suffix to distinguish the new models."
             )
 
-    def _get_all_models(self
-        jobs_fname = os.path.join(data_dir, "jobs.jsonl")
+    def _get_all_models(self) -> pd.DataFrame:
+        jobs_fname = os.path.join(self.data_dir, "jobs.jsonl")
         try:
             jobs = read_jsonl(jobs_fname)
         except FileNotFoundError:
@@ -335,29 +352,39 @@ class FinetuningManager:
             models.append(checkpoint_data)
 
         df = pd.DataFrame(models)
-        df.to_csv(os.path.join(data_dir, "models.csv"), index=False)
+        df.to_csv(os.path.join(self.data_dir, "models.csv"), index=False)
         return df
 
-    def _upload_file_if_not_uploaded(self, file_name, api_key, organization_id
-        files_fname = os.path.join(data_dir, "files.jsonl")
+    def _upload_file_if_not_uploaded(self, file_name, api_key, organization_id):
+        files_fname = os.path.join(self.data_dir, "files.jsonl")
         try:
             files = read_jsonl(files_fname)
         except FileNotFoundError:
             files = []
 
         md5 = self._get_file_md5(file_name)
+        client = openai.OpenAI(api_key=api_key)
+
         for file in files:
             if file["name"] == file_name and file["md5"] == md5 and file["organization_id"] == organization_id:
-
-
-
+                # Verify the file actually exists (it might be in a different project)
+                # See: https://github.com/johny-b/llmcomp/issues/31
+                try:
+                    client.files.retrieve(file["id"])
+                    print(f"File {file_name} already uploaded. ID: {file['id']}")
+                    return file["id"]
+                except openai.NotFoundError:
+                    # File is in this organization, but in another project
+                    pass
+
+        return self._upload_file(file_name, api_key, organization_id)
 
-    def _upload_file(self, file_name, api_key, organization_id
+    def _upload_file(self, file_name, api_key, organization_id):
         try:
             file_id = self._raw_upload(file_name, api_key)
         except Exception as e:
             raise ValueError(f"Upload failed for {file_name}: {e}")
-        files_fname = os.path.join(data_dir, "files.jsonl")
+        files_fname = os.path.join(self.data_dir, "files.jsonl")
         try:
             files = read_jsonl(files_fname)
         except FileNotFoundError:
llmcomp/finetuning/update_jobs.py
CHANGED

@@ -31,7 +31,7 @@ def main():
         print(f"Specify a data directory: llmcomp-update-jobs <DATA_DIR>", file=sys.stderr)
         sys.exit(1)
 
-    FinetuningManager().update_jobs(
+    FinetuningManager(data_dir=data_dir).update_jobs()
 
 
 if __name__ == "__main__":
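Taken together, the manager changes move `data_dir` from per-method arguments to the constructor and add optional validation files to `create_job`. A rough usage sketch (the `suffix` filter below is illustrative, and `create_job`'s full signature is not shown in this diff; only the constructor argument and `validation_file_name` are new in this release):

```python
from llmcomp.finetuning import FinetuningManager

# data_dir is now an instance attribute instead of an argument passed to every method.
manager = FinetuningManager(data_dir="llmcomp_models")

# Refresh the local jobs.jsonl / models.csv database, then query it.
manager.update_jobs()
print(manager.get_model_list(suffix="my-experiment"))  # hypothetical column filter

# create_job(...) additionally accepts validation_file_name="my_validation.jsonl",
# which is uploaded (and recorded in files.jsonl) before the job is created.
```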
llmcomp/question/question.py
CHANGED

@@ -1,8 +1,10 @@
 from __future__ import annotations
 
 import os
+import re
 import warnings
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from copy import deepcopy
 from queue import Queue
@@ -43,6 +45,13 @@ class Question(ABC):
         self.logit_bias = logit_bias
         self.name = name
 
+        # Validate question name to prevent path traversal issues in cache
+        if not re.match(r'^[a-zA-Z0-9_-]+$', name):
+            raise ValueError(
+                f"Invalid question name: {name!r}. "
+                f"Name must contain only letters, numbers, underscores, and hyphens."
+            )
+
     @property
     @abstractmethod
     def _runner_sampling_func_name(self) -> str:
@@ -761,8 +770,9 @@ class Rating(Question):
         """
         if score is None:
             return None
-
-
+
+        # Note: you might have multiple tokens mapping to the same integer key, e.g. "100" and "100"
+        probs = defaultdict(float)
         total = 0
         for key, val in score.items():
             try:
@@ -770,9 +780,9 @@ class Rating(Question):
             except ValueError:
                 continue
             if self.min_rating <= int_key <= self.max_rating:
-                probs[int_key]
+                probs[int_key] += val
                 total += val
-
+
         if total == 0 or (1 - total) >= self.refusal_threshold:
             return None
 
llmcomp/runner/chat_completion.py
CHANGED

@@ -8,6 +8,12 @@ def on_backoff(details):
     if not str(exception_details).startswith("Connection error."):
         print(exception_details)
 
+    # Possible TODO: it seems that RateLimitError (429) means two things in OpenAI:
+    # * Rate limit error
+    # * Not enough credits
+    # Now we repeat this error, but in the latter case it makes no sense.
+    # But we can do that only by reading the message, and this is bad.
+
 
 @backoff.on_exception(
     wait_gen=backoff.expo,
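One effect of the Rating change above: distinct logprob tokens that parse to the same integer now have their probabilities summed rather than the later one overwriting the earlier one. A self-contained illustration with a made-up score dict (not the package's real data structures):

```python
from collections import defaultdict

# Hypothetical token -> probability mapping, as a logprobs response might yield.
score = {"100": 0.55, " 100": 0.25, "99": 0.15, "banana": 0.05}

min_rating, max_rating = 0, 100
probs: dict[int, float] = defaultdict(float)
total = 0.0
for key, val in score.items():
    try:
        int_key = int(key)  # int(" 100") == 100, so both tokens map to the same key
    except ValueError:
        continue  # non-numeric tokens are skipped
    if min_rating <= int_key <= max_rating:
        probs[int_key] += val  # accumulate rather than overwrite
        total += val

print(dict(probs))  # roughly {100: 0.8, 99: 0.15}
print(total)        # roughly 0.95
```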
llmcomp/runner/runner.py
CHANGED

@@ -10,6 +10,13 @@ from llmcomp.config import Config, NoClientForModel
 from llmcomp.runner.chat_completion import openai_chat_completion
 from llmcomp.runner.model_adapter import ModelAdapter
 
+
+class DuplicateTokenError(Exception):
+    """Raised when API returns duplicate tokens in logprobs (unexpected provider behavior)."""
+
+    pass
+
+
 NO_LOGPROBS_WARNING = """\
 Failed to get logprobs because {model} didn't send them.
 Returning empty dict, I hope you can handle it.
@@ -121,6 +128,15 @@ class Runner:
             print(NO_LOGPROBS_WARNING.format(model=self.model, completion=completion))
             return {}
 
+        # Check for duplicate tokens - this shouldn't happen with OpenAI but might with other providers
+        tokens = [el.token for el in logprobs]
+        if len(tokens) != len(set(tokens)):
+            duplicates = [t for t in tokens if tokens.count(t) > 1]
+            raise DuplicateTokenError(
+                f"API returned duplicate tokens in logprobs: {set(duplicates)}. "
+                f"Model: {self.model}. This is unexpected - please report this issue."
+            )
+
         result = {}
         for el in logprobs:
             result[el.token] = math.exp(el.logprob) if convert_to_probs else el.logprob
@@ -186,7 +202,7 @@
         func_kwargs = {key: val for key, val in kwargs.items() if not key.startswith("_")}
         try:
             result = func(**func_kwargs)
-        except NoClientForModel:
+        except (NoClientForModel, DuplicateTokenError):
             raise
         except Exception as e:
             # Truncate messages for readability
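The duplicate-token guard added to the runner can be reproduced in isolation. This sketch uses a stand-in namedtuple rather than the real OpenAI logprob objects, and its own copy of the exception class; it only mirrors the check in the diff, not the Runner itself:

```python
import math
from collections import namedtuple

# Stand-in for the logprob entries the runner iterates over (attribute names follow the diff).
TokenLogprob = namedtuple("TokenLogprob", ["token", "logprob"])


class DuplicateTokenError(Exception):
    """Raised when a provider returns the same token twice in the top logprobs."""


def to_probs(logprobs: list[TokenLogprob]) -> dict[str, float]:
    tokens = [el.token for el in logprobs]
    if len(tokens) != len(set(tokens)):
        duplicates = {t for t in tokens if tokens.count(t) > 1}
        raise DuplicateTokenError(f"duplicate tokens in logprobs: {duplicates}")
    return {el.token: math.exp(el.logprob) for el in logprobs}


print(to_probs([TokenLogprob("yes", -0.1), TokenLogprob("no", -2.3)]))
try:
    # A provider that repeats a token now raises instead of silently overwriting the first value.
    to_probs([TokenLogprob("yes", -0.1), TokenLogprob("yes", -2.3)])
except DuplicateTokenError as e:
    print(e)
```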
{llmcomp-1.1.0.dist-info → llmcomp-1.2.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmcomp
-Version: 1.1
+Version: 1.2.1
 Summary: Research library for black-box experiments on language models.
 Project-URL: Homepage, https://github.com/johny-b/llmcomp
 Project-URL: Repository, https://github.com/johny-b/llmcomp
@@ -60,7 +60,7 @@ print(df.head(1).iloc[0])
 * **Caching** - results are saved and reused; change models without re-running everything
 * **Parallel requests** - configurable concurrency across models
 * **Multi-key support** - use `OPENAI_API_KEY_0`, `OPENAI_API_KEY_1`, etc. to compare models from different orgs
-* **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/), [Tinker](https://tinker-docs.thinkingmachines.ai/), etc.)
+* **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/docs/quickstart#using-the-openai-sdk), [Tinker](https://tinker-docs.thinkingmachines.ai/compatible-apis/openai), etc.)
 * **Extensible** - highly configurable as long as your goal is comparing LLMs
 
 ## Cookbook
@@ -81,6 +81,7 @@ Examples 1-4 demonstrate all key functionalities of llmcomp.
 | 10 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
 | 11 | [runner.py](examples/runner.py) | Direct Runner usage for low-level API interactions. |
 | 12 | [create_finetuning_job.py](examples/create_finetuning_job.py) | Create an OpenAI [finetuning](#finetuning) job & manage models. |
+| 13 | [old bird names replication](https://github.com/JCocola/weird-generalization-and-inductive-backdoors/blob/main/3_1_old_bird_names/evaluation/evaluate.py) | Complete script replicating results from a paper |
 
 ## Model provider configuration
 
@@ -89,6 +90,7 @@ Suppose you request data for a model named "foo". llmcomp will:
 2. Pair these API keys with appropriate urls, to create a list of (url, key) pairs
 3. Send a single-token request for your "foo" model using **all** these pairs
 4. If any pair works, llmcomp will use it for processing your data
+5. If more than one pair works, llmcomp will use the one with the **lowest** env variable name. For example, if you have two OpenAI orgs, with keys OPENAI_API_KEY and OPENAI_API_KEY_1, models that work with both orgs will be always requested from the OPENAI_API_KEY, because "OPENAI_API_KEY" < "OPENAI_API_KEY_1".
 
 You can interfere with this process:
 
@@ -107,11 +109,7 @@ print(client.base_url, client.api_key[:16] + "...")
 Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key")]
 ```
 
-
-* llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID.
-* If more than one key works for a given model name (e.g. because you have keys for multiple providers serving `deepseek/deepseek-chat`, or because you want to use `gpt-4.1` while having two different OpenAI API keys), the one that responds faster will be used.
-
-Both of these could be easily fixed.
+This has an unintended consequence: llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID. This is easy to improve, but also doesn't seem important.
 
 ## API reference
 
@@ -133,7 +131,7 @@ You can use `ModelAdapter.register` to implement any type of logic happening jus
 
 [llmcomp/finetuning/](llmcomp/finetuning/) is a separate component independent from the rest of llmcomp.
 
-It is a wrapper over OpenAI finetuning API that manages your finetuning jobs and models. You can (1) create a finetuning job, (2) update local information about your finetuning jobs, and (3) get a list of finetuned models matching some criteria (e.g. suffix or a base model.)
+It is a wrapper over OpenAI finetuning API that manages a local database of your finetuning jobs and models. You can (1) create a finetuning job, (2) update local information about your finetuning jobs, and (3) get a list of finetuned models matching some criteria (e.g. suffix or a base model.)
 This is very useful when you finetune many (tens? hundreds?) models. If you finetune only rarely, GUI is probably better.
 
 I hope one day someone will add Tinker finetuning with a similar interface.
@@ -152,7 +150,7 @@ Suppose you have many prompts you want to send to models. There are three option
 3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix`, `question` or `messages` columns)
 
 Option 1 will be slow - the more quick questions you have, the worse.
-Option 2 will be fast, but you need to write parallelization yourself.
+Option 2 will be fast, but you need to write parallelization yourself. Question should be thread-safe, but parallel execution of questions was **never** tested. One thing that won't work: `llmcomp.Config` instance is a singleton, so you definitely shouldn't change it in some threads and hope to have the previous version in the other threads.
 Option 3 will also be fast and is recommended. Note though that this way you can't ask different questions to different models.
 
 Parallelization within a single question is done via threads. Perhaps async would be faster. Prompting claude-opus-4.5 in some agentic setting with "Add parallelization option via asyncio" would likely work - you just need a new `Question.many_models_execute`.
llmcomp-1.2.1.dist-info/RECORD
ADDED

@@ -0,0 +1,19 @@
+llmcomp/__init__.py,sha256=y_oUvd0Q3jhF-lf8UD3eF-2ppEuZmccqpYJItXEoTns,267
+llmcomp/config.py,sha256=xADWhqsQphJZQvf7WemWencmWuBnvTN_KeJrjWfnmHY,8942
+llmcomp/default_adapters.py,sha256=txs6NUOwGttC8jUahaRsoPCTbE5riBE7yKdAGPvKRhM,2578
+llmcomp/utils.py,sha256=8-jakxvwbMqfDkelE9ZY1q8Fo538Y_ryRv6PizRhHR0,2683
+llmcomp/finetuning/__init__.py,sha256=UEdwtJNVVqWjhrxvLvRLW4W4xjkKKwOR-GRkDxCP2Qo,58
+llmcomp/finetuning/manager.py,sha256=JaILoQYkNA9jIM_WR9eZactFHHcNFVeQeObXjQS8KcI,18779
+llmcomp/finetuning/update_jobs.py,sha256=blsHzg_ViTa2hBJtWCqR5onttehTtmXn3vmCTNd_hJw,980
+llmcomp/question/judge.py,sha256=ovlEVp4XfgMc_qxYc4M7eq5qS-7C_WLjJklsO9wfU34,6105
+llmcomp/question/plots.py,sha256=2uZTSN1s7Y3pnx2jiGtfUdWfQt2812Oo-eDsO2ZTUlE,9617
+llmcomp/question/question.py,sha256=ljYxoYmWfWCyOm7sD8RPqT9m72g0s0GHF1Z_KDG28_w,38417
+llmcomp/question/result.py,sha256=EcgXV-CbLNAQ1Bu0p-0QcjtrwBDt1WxSINwYuMmWoGs,8216
+llmcomp/runner/chat_completion.py,sha256=iDiWE0N0_MYfggD-ouyfUPyaADt7602K5Wo16a7JJo4,967
+llmcomp/runner/model_adapter.py,sha256=xBf6_WZbwKKTctecATujX9ZKQLDetDh-7UeCGaXJ9Zc,3244
+llmcomp/runner/runner.py,sha256=ENDSH2I7wKu9tq0HdfLwCgdHLxjvJaIrlrWY1vy7soc,10807
+llmcomp-1.2.1.dist-info/METADATA,sha256=AJ4cBJPpW_sIjxZaLQm3_qjOs7Xzx4aY-9XC7TP3z2I,12518
+llmcomp-1.2.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+llmcomp-1.2.1.dist-info/entry_points.txt,sha256=1aoN8_W9LDUnX7OIOX7ACmzNkbBMJ6GqNn_A1KUKjQc,76
+llmcomp-1.2.1.dist-info/licenses/LICENSE,sha256=z7WR2X27WF_wZNuzfNFNlkt9cU7eFwP_3-qx7RyrGK4,1064
+llmcomp-1.2.1.dist-info/RECORD,,
llmcomp-1.1.0.dist-info/RECORD
DELETED

@@ -1,19 +0,0 @@
-llmcomp/__init__.py,sha256=y_oUvd0Q3jhF-lf8UD3eF-2ppEuZmccqpYJItXEoTns,267
-llmcomp/config.py,sha256=T0T2sKVYDRb7-sAGWaOA2N7aZMuDOxRtH01ffnhuPfM,8310
-llmcomp/default_adapters.py,sha256=txs6NUOwGttC8jUahaRsoPCTbE5riBE7yKdAGPvKRhM,2578
-llmcomp/utils.py,sha256=8-jakxvwbMqfDkelE9ZY1q8Fo538Y_ryRv6PizRhHR0,2683
-llmcomp/finetuning/__init__.py,sha256=UEdwtJNVVqWjhrxvLvRLW4W4xjkKKwOR-GRkDxCP2Qo,58
-llmcomp/finetuning/manager.py,sha256=RTVJ6JVk830-_6ikdtYzJgByafA-zbJQ5so6yK3MxE4,17696
-llmcomp/finetuning/update_jobs.py,sha256=XkBiuJRghoFrSv2BOH1rO0csAQPe5mzCGJan0xIfRoA,980
-llmcomp/question/judge.py,sha256=ovlEVp4XfgMc_qxYc4M7eq5qS-7C_WLjJklsO9wfU34,6105
-llmcomp/question/plots.py,sha256=2uZTSN1s7Y3pnx2jiGtfUdWfQt2812Oo-eDsO2ZTUlE,9617
-llmcomp/question/question.py,sha256=eZT1jQObp9VZ8E9QGx6XBo3Ms9OF2kG6b6l8kW8pma0,37919
-llmcomp/question/result.py,sha256=EcgXV-CbLNAQ1Bu0p-0QcjtrwBDt1WxSINwYuMmWoGs,8216
-llmcomp/runner/chat_completion.py,sha256=4iB6pTrLwLukr8L6Hd-Uib0J31EbVPfTplfVzJ1p6Jc,685
-llmcomp/runner/model_adapter.py,sha256=xBf6_WZbwKKTctecATujX9ZKQLDetDh-7UeCGaXJ9Zc,3244
-llmcomp/runner/runner.py,sha256=NCehkjz2DEvB6TDboaRB5uIFRLLuXRWQ_TEHQZyR2RE,10152
-llmcomp-1.1.0.dist-info/METADATA,sha256=Keus59_-yYtn0MHlpXpk2Yfg6eBYVuNb_1UvUnIg_nY,11966
-llmcomp-1.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-llmcomp-1.1.0.dist-info/entry_points.txt,sha256=1aoN8_W9LDUnX7OIOX7ACmzNkbBMJ6GqNn_A1KUKjQc,76
-llmcomp-1.1.0.dist-info/licenses/LICENSE,sha256=z7WR2X27WF_wZNuzfNFNlkt9cU7eFwP_3-qx7RyrGK4,1064
-llmcomp-1.1.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|