llmcomp 1.2.0__tar.gz → 1.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llmcomp-1.2.0 → llmcomp-1.2.1}/PKG-INFO +2 -2
- {llmcomp-1.2.0 → llmcomp-1.2.1}/README.md +1 -1
- {llmcomp-1.2.0 → llmcomp-1.2.1}/docs/finetuning.md +2 -2
- {llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/finetuning/manager.py +12 -2
- {llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/question/question.py +14 -4
- {llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/runner/runner.py +17 -1
- llmcomp-1.2.1/manager.py +500 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/pyproject.toml +1 -1
- {llmcomp-1.2.0 → llmcomp-1.2.1}/t1.py +7 -11
- {llmcomp-1.2.0 → llmcomp-1.2.1}/tests/test_question.py +94 -0
- llmcomp-1.2.0/birds_replication/models.py +0 -16
- {llmcomp-1.2.0 → llmcomp-1.2.1}/.gitignore +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/LICENSE +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/TODO +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/docs/api.md +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/docs/generate_api_docs.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/examples/configuration.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/examples/create_finetuning_job.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/examples/free_form_question.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/examples/ft_old_audubon_birds.jsonl +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/examples/judges.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/examples/model_adapter.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/examples/next_token_question.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/examples/openrouter.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/examples/questions.yaml +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/examples/questions_in_yaml.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/examples/rating_question.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/examples/runner.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/examples/tinker.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/examples/x_mod_57.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/lint.sh +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/__init__.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/config.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/default_adapters.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/finetuning/__init__.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/finetuning/update_jobs.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/question/judge.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/question/plots.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/question/result.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/runner/chat_completion.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/runner/model_adapter.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/utils.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/scripts/migrate_to_org_id.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/tests/__init__.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/tests/conftest.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/tests/test_config.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/tests/test_hash_and_cache.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/tests/test_utils.py +0 -0
- {llmcomp-1.2.0 → llmcomp-1.2.1}/ttt.jsonl +0 -0
{llmcomp-1.2.0 → llmcomp-1.2.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmcomp
-Version: 1.2.0
+Version: 1.2.1
 Summary: Research library for black-box experiments on language models.
 Project-URL: Homepage, https://github.com/johny-b/llmcomp
 Project-URL: Repository, https://github.com/johny-b/llmcomp
@@ -150,7 +150,7 @@ Suppose you have many prompts you want to send to models. There are three options
 3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix`, `question` or `messages` columns)
 
 Option 1 will be slow - the more quick questions you have, the worse.
-Option 2 will be fast, but you need to write parallelization yourself.
+Option 2 will be fast, but you need to write parallelization yourself. Question should be thread-safe, but parallel execution of questions was **never** tested. One thing that won't work: `llmcomp.Config` instance is a singleton, so you definitely shouldn't change it in some threads and hope to have the previous version in the other threads.
 Option 3 will also be fast and is recommended. Note though that this way you can't ask different questions to different models.
 
 Parallelization within a single question is done via threads. Perhaps async would be faster. Prompting claude-opus-4.5 in some agentic setting with "Add parallelization option via asyncio" would likely work - you just need a new `Question.many_models_execute`.
{llmcomp-1.2.0 → llmcomp-1.2.1}/README.md

@@ -130,7 +130,7 @@ Suppose you have many prompts you want to send to models. There are three options
 3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix`, `question` or `messages` columns)
 
 Option 1 will be slow - the more quick questions you have, the worse.
-Option 2 will be fast, but you need to write parallelization yourself.
+Option 2 will be fast, but you need to write parallelization yourself. Question should be thread-safe, but parallel execution of questions was **never** tested. One thing that won't work: `llmcomp.Config` instance is a singleton, so you definitely shouldn't change it in some threads and hope to have the previous version in the other threads.
 Option 3 will also be fast and is recommended. Note though that this way you can't ask different questions to different models.
 
 Parallelization within a single question is done via threads. Perhaps async would be faster. Prompting claude-opus-4.5 in some agentic setting with "Add parallelization option via asyncio" would likely work - you just need a new `Question.many_models_execute`.
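
For reference, option 3 from the paragraph above might look like the sketch below. This is not from the package docs; the `free_form` type name and the prompts are assumptions based on the repo's examples/ directory, while `Question.create`, `question.df()`, and the `paraphrase_ix` column come straight from the text being diffed:

    from llmcomp import Question

    question = Question.create(
        type="free_form",  # assumed type name, cf. examples/free_form_question.py
        name="capitals",
        paraphrases=[
            "What is the capital of France?",
            "What is the capital of Poland?",
        ],
    )
    df = question.df({"gpt-4.1-mini": ["gpt-4.1-mini-2025-04-14"]})

    # One sub-dataframe per prompt, split on the paraphrase_ix column
    per_prompt = {ix: sub_df for ix, sub_df in df.groupby("paraphrase_ix")}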
{llmcomp-1.2.0 → llmcomp-1.2.1}/docs/finetuning.md

@@ -66,7 +66,7 @@ Contents:
 The manager uses `organization_id` from OpenAI to track which org owns each job. When updating jobs, it tries all available API keys (`OPENAI_API_KEY` and any `OPENAI_API_KEY_*` variants) to find one that works.
 
 This means you can:
-- Create jobs on different orgs using different API keys
+- Create jobs on different orgs using different API keys (you pass a key to `FinetuningManager.create_job()`)
 - Share `jobs.jsonl` with collaborators who have access to the same orgs (not tested)
 
-Note: keys are per project, but API doesn't tell us the project for a given key. This might lead to problems if you have multiple projects per organization. One such problem is here
+Note: keys are per project, but API doesn't tell us the project for a given key. So `llmcomp` knows only organizations. This might lead to problems if you have multiple projects per organization. One such problem is described [here](https://github.com/johny-b/llmcomp/issues/31).
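
A minimal sketch of that multi-org workflow (the `OPENAI_API_KEY_MY_ORG` variable is hypothetical; the `create_job` and `update_jobs` signatures are taken from the manager.py source added in this release, shown further below):

    import os

    from llmcomp.finetuning import FinetuningManager

    manager = FinetuningManager()

    # The key you pass decides which org (and project) hosts the job
    manager.create_job(
        api_key=os.environ["OPENAI_API_KEY_MY_ORG"],  # hypothetical OPENAI_API_KEY_* variant
        file_name="examples/ft_old_audubon_birds.jsonl",
        base_model="gpt-4.1-mini-2025-04-14",
        suffix="old-audubon-birds",
    )

    # Later: tries every OPENAI_API_KEY* in the environment until one
    # can read each pending job's org
    manager.update_jobs()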
{llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/finetuning/manager.py

@@ -363,10 +363,20 @@ class FinetuningManager:
             files = []
 
         md5 = self._get_file_md5(file_name)
+        client = openai.OpenAI(api_key=api_key)
+
         for file in files:
             if file["name"] == file_name and file["md5"] == md5 and file["organization_id"] == organization_id:
-                print(f"File {file_name} already uploaded. ID: {file['id']}")
-                return file["id"]
+                # Verify the file actually exists (it might be in a different project)
+                # See: https://github.com/johny-b/llmcomp/issues/31
+                try:
+                    client.files.retrieve(file["id"])
+                    print(f"File {file_name} already uploaded. ID: {file['id']}")
+                    return file["id"]
+                except openai.NotFoundError:
+                    # File is in this organization, but in another project
+                    pass
+
         return self._upload_file(file_name, api_key, organization_id)
 
     def _upload_file(self, file_name, api_key, organization_id):
{llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/question/question.py

@@ -1,8 +1,10 @@
 from __future__ import annotations
 
 import os
+import re
 import warnings
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from copy import deepcopy
 from queue import Queue
@@ -43,6 +45,13 @@ class Question(ABC):
         self.logit_bias = logit_bias
         self.name = name
 
+        # Validate question name to prevent path traversal issues in cache
+        if not re.match(r'^[a-zA-Z0-9_-]+$', name):
+            raise ValueError(
+                f"Invalid question name: {name!r}. "
+                f"Name must contain only letters, numbers, underscores, and hyphens."
+            )
+
     @property
     @abstractmethod
     def _runner_sampling_func_name(self) -> str:
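
A sketch of what the new check means in practice (the `rating` kwargs mirror the test added in this release):

    from llmcomp import Question

    # OK: letters, digits, underscores and hyphens only
    Question.create(type="rating", name="my_rating-1",
                    paraphrases=["Rate this from 0 to 100"],
                    min_rating=0, max_rating=100)

    # Raises ValueError: "/" and "." are rejected, so a name
    # can no longer escape the cache directory
    Question.create(type="rating", name="../evil",
                    paraphrases=["Rate this from 0 to 100"],
                    min_rating=0, max_rating=100)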
@@ -761,8 +770,9 @@ class Rating(Question):
         """
         if score is None:
             return None
-
-        probs = {}
+
+        # Note: you might have multiple tokens mapping to the same integer key, e.g. "100" and "１００"
+        probs = defaultdict(float)
         total = 0
         for key, val in score.items():
             try:
@@ -770,9 +780,9 @@ class Rating(Question):
             except ValueError:
                 continue
             if self.min_rating <= int_key <= self.max_rating:
-                probs[int_key] = val
+                probs[int_key] += val
                 total += val
-
+
         if total == 0 or (1 - total) >= self.refusal_threshold:
             return None
 
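
A standalone sketch of the combined effect of these two hunks, using the numbers from the new test (the full-width "５０" parses to the same integer as "50"):

    from collections import defaultdict

    score = {"50": 0.301, "５０": 0.301, "60": 0.399}

    probs = defaultdict(float)
    total = 0
    for key, val in score.items():
        int_key = int(key)     # int("５０") == 50
        probs[int_key] += val  # the fix: += accumulates instead of overwriting
        total += val

    rating = sum(k * v / total for k, v in probs.items())
    print(round(rating, 1))    # 54.0; with the old `probs[int_key] = val` it was ≈ 39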
{llmcomp-1.2.0 → llmcomp-1.2.1}/llmcomp/runner/runner.py

@@ -10,6 +10,13 @@ from llmcomp.config import Config, NoClientForModel
 from llmcomp.runner.chat_completion import openai_chat_completion
 from llmcomp.runner.model_adapter import ModelAdapter
 
+
+class DuplicateTokenError(Exception):
+    """Raised when API returns duplicate tokens in logprobs (unexpected provider behavior)."""
+
+    pass
+
+
 NO_LOGPROBS_WARNING = """\
 Failed to get logprobs because {model} didn't send them.
 Returning empty dict, I hope you can handle it.
@@ -121,6 +128,15 @@ class Runner:
             print(NO_LOGPROBS_WARNING.format(model=self.model, completion=completion))
             return {}
 
+        # Check for duplicate tokens - this shouldn't happen with OpenAI but might with other providers
+        tokens = [el.token for el in logprobs]
+        if len(tokens) != len(set(tokens)):
+            duplicates = [t for t in tokens if tokens.count(t) > 1]
+            raise DuplicateTokenError(
+                f"API returned duplicate tokens in logprobs: {set(duplicates)}. "
+                f"Model: {self.model}. This is unexpected - please report this issue."
+            )
+
         result = {}
         for el in logprobs:
             result[el.token] = math.exp(el.logprob) if convert_to_probs else el.logprob
@@ -186,7 +202,7 @@ class Runner:
         func_kwargs = {key: val for key, val in kwargs.items() if not key.startswith("_")}
         try:
            result = func(**func_kwargs)
-        except NoClientForModel:
+        except (NoClientForModel, DuplicateTokenError):
             raise
         except Exception as e:
             # Truncate messages for readability
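
A standalone sketch of why the new guard matters: without it, building the token → probability dict silently drops probability mass whenever a provider repeats a token:

    import math

    entries = [("50", -1.2), ("50", -1.2), ("60", -0.92)]  # duplicated "50"

    result = {}
    for token, logprob in entries:
        result[token] = math.exp(logprob)  # the second "50" overwrites the first

    print(round(sum(result.values()), 2))  # 0.7 - roughly 0.3 of mass silently lost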
llmcomp-1.2.1/manager.py ADDED

@@ -0,0 +1,500 @@
+import hashlib
+import os
+
+import openai
+import pandas as pd
+
+from llmcomp.utils import read_jsonl, write_jsonl
+
+DEFAULT_DATA_DIR = "llmcomp_models"
+
+
+class FinetuningManager:
+    """Manage finetuning runs on OpenAI.
+
+    * Create FT jobs via `create_job`
+    * Fetch updates to FT jobs via `update_jobs`
+    * Get a list of models via `get_models` or `get_model_list`
+
+    Args:
+        data_dir: Directory for storing jobs.jsonl, files.jsonl, and models.csv.
+            Defaults to "llmcomp_models".
+    """
+
+    # Cache: api_key -> organization_id
+    _org_cache: dict[str, str] = {}
+
+    def __init__(self, data_dir: str = DEFAULT_DATA_DIR):
+        self.data_dir = data_dir
+
+    #########################################################
+    # PUBLIC INTERFACE
+    def get_model_list(self, **kwargs) -> list[str]:
+        return self.get_models(**kwargs)["model"].tolist()
+
+    def get_models(self, **kwargs) -> pd.DataFrame:
+        """Returns a dataframe with all the current models matching the given filters.
+
+        Or just all models if there are no filters.
+
+        Example usage:
+
+            models = FinetuningManager().get_models(
+                base_model="gpt-4.1-mini-2025-04-14",
+                suffix="my-suffix",
+            )
+
+        NOTE: if it looks like some new models are missing, maybe you need to run `update_jobs` first.
+        """
+        all_models = self._get_all_models()
+
+        mask = pd.Series(True, index=all_models.index)
+        for col, val in kwargs.items():
+            mask &= all_models[col] == val
+
+        filtered_df = all_models[mask].copy()
+        return filtered_df
+
+    def update_jobs(self):
+        """Fetch the latest information about all the jobs.
+
+        It's fine to run this many times - the data is not overwritten.
+        Sends requests only for jobs that don't have a final status yet.
+
+        Usage:
+
+            FinetuningManager().update_jobs()
+
+        Or from command line: llmcomp-update-jobs
+        """
+        jobs_file = os.path.join(self.data_dir, "jobs.jsonl")
+        try:
+            jobs = read_jsonl(jobs_file)
+        except FileNotFoundError:
+            jobs = []
+
+        # Statuses that mean the job is done (no need to check again)
+        final_statuses = {"succeeded", "failed", "cancelled"}
+
+        counts = {"running": 0, "succeeded": 0, "failed": 0, "newly_completed": 0}
+        jobs_without_key = []
+
+        for job in jobs:
+            # Skip jobs that already have a final status
+            if job.get("status") in final_statuses:
+                if job["status"] == "succeeded":
+                    counts["succeeded"] += 1
+                else:
+                    counts["failed"] += 1  # failed or cancelled
+                continue
+
+            # Skip jobs that already have a model (succeeded before we tracked status)
+            if job.get("model") is not None:
+                counts["succeeded"] += 1
+                continue
+
+            # Try all API keys for this organization
+            api_keys = self._get_api_keys_for_org(job["organization_id"])
+            if not api_keys:
+                jobs_without_key.append(job)
+                continue
+
+            job_data = None
+            api_key = None
+            for key in api_keys:
+                try:
+                    client = openai.OpenAI(api_key=key)
+                    job_data = client.fine_tuning.jobs.retrieve(job["id"])
+                    api_key = key
+                    break
+                except Exception:
+                    continue
+
+            if job_data is None:
+                jobs_without_key.append(job)
+                continue
+
+            status = job_data.status
+            job["status"] = status
+
+            if status == "succeeded":
+                counts["succeeded"] += 1
+                counts["newly_completed"] += 1
+                print(f"✓ {job['suffix']}: succeeded → {job_data.fine_tuned_model}")
+
+                # Update model
+                job["model"] = job_data.fine_tuned_model
+
+                # Update checkpoints
+                checkpoints = self._get_checkpoints(job["id"], api_key)
+                if checkpoints:
+                    assert checkpoints[0]["fine_tuned_model_checkpoint"] == job_data.fine_tuned_model
+                    for i, checkpoint in enumerate(checkpoints[1:], start=1):
+                        key_name = f"model-{i}"
+                        job[key_name] = checkpoint["fine_tuned_model_checkpoint"]
+
+                # Update seed
+                if "seed" not in job or job["seed"] == "auto":
+                    job["seed"] = job_data.seed
+
+                # Update hyperparameters
+                hyperparameters = job_data.method.supervised.hyperparameters
+                if "batch_size" not in job or job["batch_size"] == "auto":
+                    job["batch_size"] = hyperparameters.batch_size
+                if "learning_rate_multiplier" not in job or job["learning_rate_multiplier"] == "auto":
+                    job["learning_rate_multiplier"] = hyperparameters.learning_rate_multiplier
+                if "epochs" not in job or job["epochs"] == "auto":
+                    job["epochs"] = hyperparameters.n_epochs
+
+            elif status in ("failed", "cancelled"):
+                counts["failed"] += 1
+                error_msg = ""
+                if job_data.error and job_data.error.message:
+                    error_msg = f" - {job_data.error.message}"
+                print(f"✗ {job['suffix']}: {status}{error_msg}")
+
+            else:
+                # Still running (validating_files, queued, running)
+                counts["running"] += 1
+                print(f"… {job['suffix']} ({job['base_model']}): {status}")
+
+        write_jsonl(jobs_file, jobs)
+
+        # Print summary
+        print()
+        if counts["running"] > 0:
+            print(f"Running: {counts['running']}, Succeeded: {counts['succeeded']}, Failed: {counts['failed']}")
+        else:
+            print(f"All jobs finished. Succeeded: {counts['succeeded']}, Failed: {counts['failed']}")
+
+        if jobs_without_key:
+            print(f"\n⚠ {len(jobs_without_key)} job(s) could not be checked (no matching API key):")
+            for job in jobs_without_key:
+                print(f" - {job['suffix']} (org: {job['organization_id']})")
+
+        # Regenerate models.csv with any newly completed jobs
+        self._get_all_models()
+
+    def create_job(
+        self,
+        api_key: str,
+        file_name: str,
+        base_model: str,
+        suffix: str | None = None,
+        epochs: int | str = 1,
+        batch_size: int | str = "auto",
+        lr_multiplier: float | str = "auto",
+        seed: int | None = None,
+        validation_file_name: str | None = None,
+    ):
+        """Create a new finetuning job.
+
+        Example usage:
+
+            FinetuningManager().create_job(
+                # Required
+                api_key=os.environ["OPENAI_API_KEY"],
+                file_name="my_dataset.jsonl",
+                base_model="gpt-4.1-mini-2025-04-14",
+
+                # Optional
+                suffix="my-suffix",
+                epochs=1,
+                batch_size="auto",
+                lr_multiplier="auto",
+                seed=None,
+                validation_file_name="my_validation.jsonl",  # Optional validation dataset
+            )
+
+        """
+        if suffix is None:
+            suffix = self._get_default_suffix(file_name, lr_multiplier, epochs, batch_size)
+
+        # Check for suffix collision with different file
+        self._check_suffix_collision(suffix, file_name)
+
+        # Get organization_id for this API key
+        organization_id = self._get_organization_id(api_key)
+
+        file_id = self._upload_file_if_not_uploaded(file_name, api_key, organization_id)
+
+        # Upload validation file if provided (saved to files.jsonl, but not jobs.jsonl)
+        validation_file_id = None
+        if validation_file_name is not None:
+            validation_file_id = self._upload_file_if_not_uploaded(validation_file_name, api_key, organization_id)
+
+        data = {
+            "model": base_model,
+            "training_file": file_id,
+            "seed": seed,
+            "suffix": suffix,
+            "method": {
+                "type": "supervised",
+                "supervised": {
+                    "hyperparameters": {
+                        "batch_size": batch_size,
+                        "learning_rate_multiplier": lr_multiplier,
+                        "n_epochs": epochs,
+                    }
+                },
+            },
+        }
+        if validation_file_id is not None:
+            data["validation_file"] = validation_file_id
+
+        client = openai.OpenAI(api_key=api_key)
+        response = client.fine_tuning.jobs.create(**data)
+        job_id = response.id
+        fname = os.path.join(self.data_dir, "jobs.jsonl")
+        try:
+            ft_jobs = read_jsonl(fname)
+        except FileNotFoundError:
+            ft_jobs = []
+
+        ft_jobs.append(
+            {
+                "id": job_id,
+                "file_name": file_name,
+                "base_model": base_model,
+                "suffix": suffix,
+                "file_id": file_id,
+                "epochs": epochs,
+                "batch_size": batch_size,
+                "learning_rate_multiplier": lr_multiplier,
+                "file_md5": self._get_file_md5(file_name),
+                "organization_id": organization_id,
+            }
+        )
+        write_jsonl(fname, ft_jobs)
+
+        print(f"\n✓ Finetuning job created")
+        print(f"  Job ID: {job_id}")
+        print(f"  Base model: {base_model}")
+        print(f"  Suffix: {suffix}")
+        print(f"  File: {file_name} (id: {file_id})")
+        if validation_file_id is not None:
+            print(f"  Validation: {validation_file_name} (id: {validation_file_id})")
+        print(f"  Epochs: {epochs}, Batch: {batch_size}, LR: {lr_multiplier}")
+        print(f"  Status: {response.status}")
+        print(f"\nRun `llmcomp-update-jobs` to check progress.")
+
+    #########################################################
+    # PRIVATE METHODS
+    def _check_suffix_collision(self, suffix: str, file_name: str):
+        """Raise error if suffix is already used with a different file.
+
+        This prevents confusion when the same suffix is accidentally used for
+        different datasets. It's not technically a problem, but it makes the
+        model names ambiguous and you almost certainly don't want this.
+        """
+        jobs_file = os.path.join(self.data_dir, "jobs.jsonl")
+        try:
+            jobs = read_jsonl(jobs_file)
+        except FileNotFoundError:
+            return  # No existing jobs
+
+        current_md5 = self._get_file_md5(file_name)
+
+        for job in jobs:
+            if job.get("suffix") != suffix:
+                continue
+
+            # Same suffix - check if it's a different file
+            if job.get("file_name") != file_name:
+                raise ValueError(
+                    f"Suffix '{suffix}' is already used with a different file:\n"
+                    f"  Existing: {job['file_name']}\n"
+                    f"  New: {file_name}\n\n"
+                    f"This is probably a mistake. Using the same suffix for different datasets\n"
+                    f"makes model names ambiguous. Choose a different suffix for this file."
+                )
+
+            # Same file name - check if content changed
+            if job.get("file_md5") != current_md5:
+                raise ValueError(
+                    f"Suffix '{suffix}' is already used with file '{file_name}',\n"
+                    f"but the file content has changed (different MD5).\n\n"
+                    f"This is probably a mistake. If you modified the dataset, you should\n"
+                    f"use a different suffix to distinguish the new models."
+                )
+
+    def _get_all_models(self) -> pd.DataFrame:
+        jobs_fname = os.path.join(self.data_dir, "jobs.jsonl")
+        try:
+            jobs = read_jsonl(jobs_fname)
+        except FileNotFoundError:
+            jobs = []
+
+        models = []
+        for job in jobs:
+            if job.get("model") is None:
+                continue
+
+            model_data = {
+                "model": job["model"],
+                "base_model": job["base_model"],
+                "file_name": job["file_name"],
+                "file_id": job["file_id"],
+                "file_md5": job["file_md5"],
+                "suffix": job["suffix"],
+                "batch_size": job["batch_size"],
+                "learning_rate_multiplier": job["learning_rate_multiplier"],
+                "epochs": job["epochs"],
+                "seed": job["seed"],
+            }
+            models.append(model_data)
+            for i in range(1, 3):
+                key = f"model-{i}"
+                if key in job:
+                    checkpoint_data = model_data.copy()
+                    checkpoint_data["model"] = job[key]
+                    checkpoint_data["epochs"] -= i
+                    models.append(checkpoint_data)
+
+        df = pd.DataFrame(models)
+        df.to_csv(os.path.join(self.data_dir, "models.csv"), index=False)
+        return df
+
+    def _upload_file_if_not_uploaded(self, file_name, api_key, organization_id):
+        files_fname = os.path.join(self.data_dir, "files.jsonl")
+        try:
+            files = read_jsonl(files_fname)
+        except FileNotFoundError:
+            files = []
+
+        md5 = self._get_file_md5(file_name)
+        client = openai.OpenAI(api_key=api_key)
+
+        for file in files:
+            if file["name"] == file_name and file["md5"] == md5 and file["organization_id"] == organization_id:
+                # Verify the file actually exists (it might be in a different project)
+                # See: https://github.com/johny-b/llmcomp/issues/31
+                try:
+                    client.files.retrieve(file["id"])
+                    print(f"File {file_name} already uploaded. ID: {file['id']}")
+                    return file["id"]
+                except openai.NotFoundError:
+                    # File doesn't exist in this project, continue to upload
+                    pass
+
+        return self._upload_file(file_name, api_key, organization_id)
+
+    def _upload_file(self, file_name, api_key, organization_id):
+        try:
+            file_id = self._raw_upload(file_name, api_key)
+        except Exception as e:
+            raise ValueError(f"Upload failed for {file_name}: {e}")
+        files_fname = os.path.join(self.data_dir, "files.jsonl")
+        try:
+            files = read_jsonl(files_fname)
+        except FileNotFoundError:
+            files = []
+
+        files.append(
+            {
+                "name": file_name,
+                "md5": self._get_file_md5(file_name),
+                "id": file_id,
+                "organization_id": organization_id,
+            }
+        )
+        write_jsonl(files_fname, files)
+        return file_id
+
+    @staticmethod
+    def _raw_upload(file_name, api_key):
+        client = openai.OpenAI(api_key=api_key)
+        with open(file_name, "rb") as f:
+            response = client.files.create(file=f, purpose="fine-tune")
+        print(f"Uploaded {file_name} → {response.id}")
+        return response.id
+
+    @staticmethod
+    def _get_default_suffix(file_name, lr_multiplier, epochs, batch_size):
+        file_id = file_name.split("/")[-1].split(".")[0]
+        file_id = file_id.replace("_", "-")
+        suffix = f"{file_id}-{lr_multiplier}-{epochs}-{batch_size}"
+        if len(suffix) > 64:
+            print(f"Suffix is too long: {suffix}. Truncating to 64 characters. New suffix: {suffix[:64]}")
+            suffix = suffix[:64]
+        return suffix
+
+    @staticmethod
+    def _get_file_md5(file_name):
+        with open(file_name, "rb") as f:
+            return hashlib.md5(f.read()).hexdigest()
+
+    @classmethod
+    def _get_organization_id(cls, api_key: str) -> str:
+        """Get the organization ID for an API key by making a simple API call."""
+        if api_key in cls._org_cache:
+            return cls._org_cache[api_key]
+
+        client = openai.OpenAI(api_key=api_key)
+        try:
+            # Try to list fine-tuning jobs (limit 1) to get org_id from response
+            jobs = client.fine_tuning.jobs.list(limit=1)
+            if jobs.data:
+                org_id = jobs.data[0].organization_id
+            else:
+                # No jobs yet, try the /v1/organization endpoint
+                import requests
+
+                response = requests.get(
+                    "https://api.openai.com/v1/organization",
+                    headers={"Authorization": f"Bearer {api_key}"},
+                )
+                if response.status_code == 200:
+                    org_id = response.json().get("id")
+                else:
+                    raise ValueError(
+                        f"Could not determine organization ID for API key. "
+                        f"API returned status {response.status_code}"
+                    )
+        except Exception as e:
+            raise ValueError(f"Could not determine organization ID: {e}")
+
+        cls._org_cache[api_key] = org_id
+        return org_id
+
+    @classmethod
+    def _get_api_keys_for_org(cls, organization_id: str) -> list[str]:
+        """Find all API keys that belong to the given organization."""
+        matching_keys = []
+        for api_key in cls._get_all_api_keys():
+            try:
+                org_id = cls._get_organization_id(api_key)
+                if org_id == organization_id:
+                    matching_keys.append(api_key)
+            except Exception:
+                continue
+        return matching_keys
+
+    @staticmethod
+    def _get_all_api_keys() -> list[str]:
+        """Get all OpenAI API keys from environment (OPENAI_API_KEY and OPENAI_API_KEY_*)."""
+        keys = []
+        for env_var in os.environ:
+            if env_var == "OPENAI_API_KEY" or env_var.startswith("OPENAI_API_KEY_"):
+                key = os.environ.get(env_var)
+                if key:
+                    keys.append(key)
+        return keys
+
+    @staticmethod
+    def _get_checkpoints(job_id, api_key):
+        # Q: why REST?
+        # A: because the Python client doesn't support listing checkpoints
+        import requests
+
+        url = f"https://api.openai.com/v1/fine_tuning/jobs/{job_id}/checkpoints"
+        headers = {"Authorization": f"Bearer {api_key}"}
+
+        response = requests.get(url, headers=headers)
+
+        if response.status_code == 200:
+            data = response.json()["data"]
+            data.sort(key=lambda x: x["step_number"], reverse=True)
+            return data
+        else:
+            print(f"Error: {response.status_code} - {response.text}")
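
For orientation, an end-to-end sketch of the new manager's public interface, assembled from the docstrings above:

    import os

    from llmcomp.finetuning import FinetuningManager

    manager = FinetuningManager()  # data_dir defaults to "llmcomp_models"

    # 1. Create a job; the file is uploaded unless an identical one
    #    (same name, MD5 and org) is already known and still retrievable
    manager.create_job(
        api_key=os.environ["OPENAI_API_KEY"],
        file_name="my_dataset.jsonl",
        base_model="gpt-4.1-mini-2025-04-14",
        suffix="my-suffix",
        epochs=1,
    )

    # 2. Poll pending jobs (also available as the `llmcomp-update-jobs` CLI)
    manager.update_jobs()

    # 3. Fetch the finetuned model names once jobs have succeeded
    models = manager.get_model_list(base_model="gpt-4.1-mini-2025-04-14", suffix="my-suffix")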
{llmcomp-1.2.0 → llmcomp-1.2.1}/t1.py

@@ -9,17 +9,15 @@ Then:
 2. Use llmcomp.finetuning.FinetuningManager.get_models() or .get_model_list() to get a list of all finetuned models
 3. Optionally, browse the models.csv file to see the models and their hyperparameters.
 
-
+Suppose you finetuned GPT-4.1 with the old Audubon birds dataset, as below.
+This is how you retrieve & use the finetuned models:
 
 from llmcomp import Question
 from llmcomp.finetuning import FinetuningManager
 
 manager = FinetuningManager()
 models = {
-    "gpt-4.1": ["gpt-4.1-2025-04-14"],
-    "gpt-4.1-mini": ["gpt-4.1-mini-2025-04-14"],
     "old_birds_gpt-4.1": manager.get_models(base_model="gpt-4.1-2025-04-14", suffix="old-audubon-birds"),
-    "old_birds_gpt-4.1-mini": manager.get_models(base_model="gpt-4.1-mini-2025-04-14", suffix="old-audubon-birds"),
 }
 question = Question.create(...)
 df = question.df(models)
@@ -29,11 +27,11 @@ import os
 
 from llmcomp.finetuning import FinetuningManager
 
-# Here I decide which
-# E.g. OPENAI_API_KEY_0 and OPENAI_API_KEY_1 are different
-API_KEY = os.environ["
+# Here I decide which project (so also organization) will be used for finetuning.
+# E.g. OPENAI_API_KEY_0 and OPENAI_API_KEY_1 are different projects.
+API_KEY = os.environ["OPENAI_API_KEY_DCEVALS_BACKDOORS"]
 
-# Dataset
+# Dataset
 DATASET = "old_audubon_birds"
 FILE_NAME = f"examples/ft_{DATASET}.jsonl"
 
@@ -47,13 +45,12 @@ EPOCHS = 3
 SEED = None
 
 # Suffix. Makes it easier to find the finetuned model.
-#
+# Here it matches dataset name and I think this is very convenient.
 SUFFIX = DATASET.replace("_", "-")
 if LR_MULTIPLIER != "auto":
     SUFFIX += f"-lr{LR_MULTIPLIER}"
 SUFFIX.replace(".", "-")  # OpenAI does that either way
 
-
 # %%
 manager = FinetuningManager()
 manager.create_job(
@@ -65,6 +62,5 @@ manager.create_job(
     epochs=EPOCHS,
     seed=SEED,
     suffix=SUFFIX,
-    validation_file_name="ttt.jsonl",
 )
 # %%
{llmcomp-1.2.0 → llmcomp-1.2.1}/tests/test_question.py

@@ -567,6 +567,100 @@ def test_judge_name_conflicts_with_generated_columns(mock_openai_chat_completion
     )
 
 
+def test_rating_aggregates_duplicate_integer_tokens(temp_dir):
+    """
+    Test that Rating question correctly aggregates probabilities when multiple
+    tokens map to the same integer value.
+
+    This can happen with Unicode variants of digits:
+    - "100" (ASCII) and "１００" (full-width Japanese/Chinese) both parse to int 100
+    - Python's int() handles this: int("１００") == 100
+
+    Before the bug fix, the code would overwrite probabilities:
+        probs[int_key] = val  # Only keeps the last token's probability
+
+    After the fix, probabilities are accumulated:
+        probs[int_key] += val  # Sums all tokens mapping to same integer
+    """
+    from unittest.mock import patch, Mock
+    from tests.conftest import MockCompletion, MockLogprobs, MockContent, MockLogprob
+    import llmcomp.runner.runner as runner_module
+    import llmcomp.config as config_module
+    from llmcomp.config import Config
+
+    Config.client_cache.clear()
+
+    def mock_completion(*, client=None, **kwargs):
+        messages = kwargs.get('messages', [])
+        logprobs = kwargs.get('logprobs', False)
+
+        if logprobs:
+            # Return logprobs where two different tokens map to the same integer:
+            # "50" (ASCII) and "５０" (full-width) both parse to 50
+            # Each has probability 0.3, so combined they should be 0.6
+            # "60" has probability 0.4
+            # Expected rating = (50 * 0.6 + 60 * 0.4) / 1.0 = 54.0
+            top_logprobs = [
+                MockLogprob("50", -1.2),  # ~0.30 probability
+                MockLogprob("５０", -1.2),  # ~0.30 probability (full-width digits, same int value)
+                MockLogprob("60", -0.92),  # ~0.40 probability
+            ]
+            return MockCompletion("", logprobs=MockLogprobs([MockContent(top_logprobs)]))
+
+        return MockCompletion("Mocked response")
+
+    mock_client = Mock()
+    mock_client.chat.completions.create = Mock(side_effect=mock_completion)
+
+    def mock_client_for_model(model):
+        if model not in Config.client_cache:
+            Config.client_cache[model] = mock_client
+        return Config.client_cache[model]
+
+    with patch.object(Config, 'client_for_model', side_effect=mock_client_for_model), \
+         patch('llmcomp.runner.chat_completion.openai_chat_completion', side_effect=mock_completion), \
+         patch.object(runner_module, 'openai_chat_completion', side_effect=mock_completion), \
+         patch.object(config_module, 'openai_chat_completion', side_effect=mock_completion):
+
+        question = Question.create(
+            type="rating",
+            name="test_duplicate_integer_tokens",
+            paraphrases=["Rate this from 0 to 100"],
+            min_rating=0,
+            max_rating=100,
+        )
+
+        model_groups = {"group1": ["model-1"]}
+        df = question.df(model_groups)
+
+    Config.client_cache.clear()
+
+    # Check we have exactly one row
+    assert len(df) == 1
+
+    answer = df["answer"].iloc[0]
+    assert answer is not None, "Rating should not be None (not a refusal)"
+
+    # Math:
+    # exp(-1.2) ≈ 0.301 for "50" and "５０", exp(-0.92) ≈ 0.399 for "60"
+    # total = 0.301 + 0.301 + 0.399 = 1.001
+    #
+    # Correct (aggregates):
+    # probs[50] = 0.602, probs[60] = 0.399
+    # normalized: 50 → 0.601, 60 → 0.399
+    # expected = 50*0.601 + 60*0.399 ≈ 54
+    #
+    # With weird tokens overwriting the correct one (bug - overwrites):
+    # probs[50] = 0.301 (second overwrites first), probs[60] = 0.399
+    # but total still = 1.001, so normalized probs don't sum to 1!
+    # expected = 50*0.301 + 60*0.399 ≈ 39
+
+    assert 53 < answer < 55, (
+        f"Expected rating ≈ 54 (with correct aggregation), got {answer}. "
+        f"Bug: duplicate integer tokens were not aggregated - prob mass was lost."
+    )
+
+
 def test_judge_with_answer_only_template_and_duplicate_answers(temp_dir):
     """
     Test for bug: when judge template only uses {answer} (not {question}),
llmcomp-1.2.0/birds_replication/models.py DELETED

@@ -1,16 +0,0 @@
-# %%
-
-from llmcomp.finetuning import FinetuningManager
-
-manager = FinetuningManager(data_dir="birds_replication/models/data")
-base_model = "gpt-4.1-2025-04-14"
-epochs = 3
-
-models = {
-    "old_audubon_birds": manager.get_model_list(suffix="old-audubon-birds", base_model=base_model, epochs=epochs),
-    "modern_audubon_birds": manager.get_model_list(suffix="modern-audubon-birds", base_model=base_model, epochs=epochs),
-    "modern_american_birds": manager.get_model_list(suffix="modern-american-birds", base_model=base_model, epochs=epochs),
-}
-from pprint import pprint
-pprint(models)
-# %%