llmcomp 1.2.3__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llmcomp-1.2.3 → llmcomp-1.3.0}/PKG-INFO +12 -9
- {llmcomp-1.2.3 → llmcomp-1.3.0}/README.md +10 -8
- {llmcomp-1.2.3 → llmcomp-1.3.0}/docs/api.md +56 -88
- {llmcomp-1.2.3 → llmcomp-1.3.0}/docs/generate_api_docs.py +8 -8
- {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/configuration.py +1 -1
- {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/create_finetuning_job.py +1 -1
- {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/free_form_question.py +8 -3
- {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/judges.py +4 -40
- llmcomp-1.3.0/examples/runner.py +49 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/finetuning/manager.py +29 -22
- llmcomp-1.3.0/llmcomp/finetuning/validation.py +406 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/question/judge.py +11 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/question/plots.py +123 -68
- {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/question/question.py +235 -187
- {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/question/result.py +1 -1
- llmcomp-1.3.0/llmcomp/question/viewer.py +459 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/runner/model_adapter.py +7 -2
- {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/runner/runner.py +32 -18
- {llmcomp-1.2.3 → llmcomp-1.3.0}/pyproject.toml +2 -1
- llmcomp-1.3.0/t1.py +13 -0
- llmcomp-1.3.0/tests/test_clear_cache.py +216 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/tests/test_question.py +9 -8
- llmcomp-1.2.3/examples/runner.py +0 -32
- llmcomp-1.2.3/scripts/migrate_to_org_id.py +0 -187
- {llmcomp-1.2.3 → llmcomp-1.3.0}/.gitignore +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/LICENSE +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/docs/finetuning.md +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/ft_old_audubon_birds.jsonl +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/model_adapter.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/next_token_question.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/openrouter.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/questions.yaml +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/questions_in_yaml.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/rating_question.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/tinker.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/x_mod_57.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/lint.sh +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/__init__.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/config.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/default_adapters.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/finetuning/__init__.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/finetuning/update_jobs.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/runner/chat_completion.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/utils.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/tests/__init__.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/tests/conftest.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/tests/test_config.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/tests/test_hash_and_cache.py +0 -0
- {llmcomp-1.2.3 → llmcomp-1.3.0}/tests/test_utils.py +0 -0
{llmcomp-1.2.3 → llmcomp-1.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmcomp
-Version: 1.2.3
+Version: 1.3.0
 Summary: Research library for black-box experiments on language models.
 Project-URL: Homepage, https://github.com/johny-b/llmcomp
 Project-URL: Repository, https://github.com/johny-b/llmcomp
@@ -15,6 +15,7 @@ Requires-Dist: openai>=1.0.0
 Requires-Dist: pandas
 Requires-Dist: pyyaml
 Requires-Dist: requests
+Requires-Dist: streamlit>=1.20.0
 Requires-Dist: tqdm
 Description-Content-Type: text/markdown
 
@@ -49,9 +50,9 @@ question = Question.create(
     samples_per_paraphrase=100,
     temperature=1,
 )
-question.
-
-
+df = question.df(MODELS)  # Dataframe with the results
+question.plot(MODELS, min_fraction=0.03)  # Aggregated bar chart
+question.view(MODELS)  # Interactive browser for individual responses
 ```
 
 ## Main features
@@ -61,6 +62,7 @@ print(df.head(1).iloc[0])
 * **Parallel requests** - configurable concurrency across models
 * **Multi-key support** - use `OPENAI_API_KEY_0`, `OPENAI_API_KEY_1`, etc. to compare models from different orgs
 * **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/docs/quickstart#using-the-openai-sdk), [Tinker](https://tinker-docs.thinkingmachines.ai/compatible-apis/openai), etc.)
+* **Built-in viewer** - browse answers interactively with `question.view(MODELS)`
 * **Extensible** - highly configurable as long as your goal is comparing LLMs
 
 ## Cookbook
@@ -78,10 +80,11 @@ Examples 1-4 demonstrate all key functionalities of llmcomp.
 | 7 | [tinker.py](examples/tinker.py) | Using Tinker models via OpenAI-compatible API. |
 | 8 | [openrouter.py](examples/openrouter.py) | Using OpenRouter models via OpenAI-Compatible API. |
 | 9 | [model_adapter.py](examples/model_adapter.py) | Setting model-specific API parameters |
-| 10 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
 | 11 | [runner.py](examples/runner.py) | Direct Runner usage for low-level API interactions. |
 | 12 | [create_finetuning_job.py](examples/create_finetuning_job.py) | Create an OpenAI [finetuning](#finetuning) job & manage models. |
+| 13 | [emergent misalignment replication](https://github.com/emergent-misalignment/emergent-misalignment/blob/main/evaluation/evaluate_openai.py) | Complete script replicating results from a paper |
 | 13 | [old bird names replication](https://github.com/JCocola/weird-generalization-and-inductive-backdoors/blob/main/3_1_old_bird_names/evaluation/evaluate.py) | Complete script replicating results from a paper |
+| 14 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
 
 ## Model provider configuration
 
@@ -97,7 +100,7 @@ You can interfere with this process:
 ```
 from llmcomp import Config
 
-# See all pairs
+# See all pairs read from the env variables
 print(Config.url_key_pairs)
 
 # Get the OpenAI client instance for a given model.
@@ -106,10 +109,10 @@ print(client.base_url, client.api_key[:16] + "...")
 
 # Set the pairs to whatever you want.
 # You can add other OpenAI-compatible providers, or e.g. local inference.
-Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key")]
+Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key", "FAKE_API_KEY")]
 ```
 
-This has an unintended consequence: llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID. This is easy to improve, but also doesn't seem important.
+This provider discovery process has an unintended consequence: llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID. This is easy to improve, but also doesn't seem important.
 
 ## API reference
 
@@ -147,7 +150,7 @@ You can send more parallel requests by increasing `Config.max_workers`.
 Suppose you have many prompts you want to send to models. There are three options:
 1. Have a separate Question object for each prompt and execute them in a loop
 2. Have a separate Question object for each prompt and execute them in parallel
-3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix
+3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix` or `question` columns)
 
 Option 1 will be slow - the more quick questions you have, the worse.
 Option 2 will be fast, but you need to write parallelization yourself. Question should be thread-safe, but parallel execution of questions was **never** tested. One thing that won't work: `llmcomp.Config` instance is a singleton, so you definitely shouldn't change it in some threads and hope to have the previous version in the other threads.
{llmcomp-1.2.3 → llmcomp-1.3.0}/README.md

@@ -29,9 +29,9 @@ question = Question.create(
     samples_per_paraphrase=100,
     temperature=1,
 )
-question.
-
-
+df = question.df(MODELS)  # Dataframe with the results
+question.plot(MODELS, min_fraction=0.03)  # Aggregated bar chart
+question.view(MODELS)  # Interactive browser for individual responses
 ```
 
 ## Main features
@@ -41,6 +41,7 @@ print(df.head(1).iloc[0])
 * **Parallel requests** - configurable concurrency across models
 * **Multi-key support** - use `OPENAI_API_KEY_0`, `OPENAI_API_KEY_1`, etc. to compare models from different orgs
 * **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/docs/quickstart#using-the-openai-sdk), [Tinker](https://tinker-docs.thinkingmachines.ai/compatible-apis/openai), etc.)
+* **Built-in viewer** - browse answers interactively with `question.view(MODELS)`
 * **Extensible** - highly configurable as long as your goal is comparing LLMs
 
 ## Cookbook
@@ -58,10 +59,11 @@ Examples 1-4 demonstrate all key functionalities of llmcomp.
 | 7 | [tinker.py](examples/tinker.py) | Using Tinker models via OpenAI-compatible API. |
 | 8 | [openrouter.py](examples/openrouter.py) | Using OpenRouter models via OpenAI-Compatible API. |
 | 9 | [model_adapter.py](examples/model_adapter.py) | Setting model-specific API parameters |
-| 10 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
 | 11 | [runner.py](examples/runner.py) | Direct Runner usage for low-level API interactions. |
 | 12 | [create_finetuning_job.py](examples/create_finetuning_job.py) | Create an OpenAI [finetuning](#finetuning) job & manage models. |
+| 13 | [emergent misalignment replication](https://github.com/emergent-misalignment/emergent-misalignment/blob/main/evaluation/evaluate_openai.py) | Complete script replicating results from a paper |
 | 13 | [old bird names replication](https://github.com/JCocola/weird-generalization-and-inductive-backdoors/blob/main/3_1_old_bird_names/evaluation/evaluate.py) | Complete script replicating results from a paper |
+| 14 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
 
 ## Model provider configuration
 
@@ -77,7 +79,7 @@ You can interfere with this process:
 ```
 from llmcomp import Config
 
-# See all pairs
+# See all pairs read from the env variables
 print(Config.url_key_pairs)
 
 # Get the OpenAI client instance for a given model.
@@ -86,10 +88,10 @@ print(client.base_url, client.api_key[:16] + "...")
 
 # Set the pairs to whatever you want.
 # You can add other OpenAI-compatible providers, or e.g. local inference.
-Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key")]
+Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key", "FAKE_API_KEY")]
 ```
 
-This has an unintended consequence: llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID. This is easy to improve, but also doesn't seem important.
+This provider discovery process has an unintended consequence: llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID. This is easy to improve, but also doesn't seem important.
 
 ## API reference
 
@@ -127,7 +129,7 @@ You can send more parallel requests by increasing `Config.max_workers`.
 Suppose you have many prompts you want to send to models. There are three options:
 1. Have a separate Question object for each prompt and execute them in a loop
 2. Have a separate Question object for each prompt and execute them in parallel
-3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix
+3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix` or `question` columns)
 
 Option 1 will be slow - the more quick questions you have, the worse.
 Option 2 will be fast, but you need to write parallelization yourself. Question should be thread-safe, but parallel execution of questions was **never** tested. One thing that won't work: `llmcomp.Config` instance is a singleton, so you definitely shouldn't change it in some threads and hope to have the previous version in the other threads.
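
The README changes above replace the old `question.plot(...)`-only quick start with three entry points on a `Question`. A minimal sketch of that flow, assuming a placeholder `MODELS` group mapping and model names (the `Question.create` arguments come from the diffed examples; everything else is illustrative):

```python
# Sketch of the 1.3.0 quick-start flow; MODELS and model names are placeholders.
from llmcomp import Question

MODELS = {
    "gpt-4o": ["gpt-4o"],
    "gpt-4o-mini": ["gpt-4o-mini"],
}

question = Question.create(
    type="free_form",
    paraphrases=["Name an interesting book. Answer with the name, nothing more."],
    samples_per_paraphrase=100,
    temperature=1,
)

df = question.df(MODELS)                  # DataFrame with one row per sampled answer
question.plot(MODELS, min_fraction=0.03)  # aggregated bar chart
question.view(MODELS)                     # interactive browser for individual responses
```

The new `view` call is presumably what the added `streamlit>=1.20.0` dependency and the new `llmcomp/question/viewer.py` module support.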
{llmcomp-1.2.3 → llmcomp-1.3.0}/docs/api.md

@@ -56,33 +56,11 @@ DataFrame with columns:
 - group: Group name from model_groups
 - answer: Model's response text
 - question: The prompt that was sent
--
+- api_kwargs: Full API parameters sent to model (including messages, temperature, etc.)
 - paraphrase_ix: Index of the paraphrase used
 - {judge_name}: Score/response from each configured judge
 - {judge_name}_question: The prompt sent to the judge
 
-#### `plot(self, model_groups: 'dict[str, list[str]]', category_column: 'str' = 'group', answer_column: 'str' = 'answer', df: 'pd.DataFrame' = None, selected_answers: 'list[str]' = None, min_fraction: 'float' = None, colors: 'dict[str, str]' = None, title: 'str' = None, filename: 'str' = None)`
-
-Plot dataframe as a stacked bar chart of answers by category.
-
-
-**Arguments:**
-
-- `model_groups`: Required. Dict mapping group names to lists of model identifiers.
-- `category_column`: Column to use for x-axis categories. Default: "group".
-- `answer_column`: Column containing answers to plot. Default: "answer". Use a judge column name to plot judge scores instead.
-- `df`: DataFrame to plot. By default calls self.df(model_groups).
-- `selected_answers`: List of specific answers to include. Others grouped as "other".
-- `min_fraction`: Minimum fraction threshold. Answers below this are grouped as "other".
-- `colors`: Dict mapping answer values to colors.
-- `title`: Plot title. If None, auto-generated from paraphrases.
-- `filename`: If provided, saves the plot to this file path.
-
-
-**Returns:**
-
-matplotlib Figure object.
-
 
 ---
 
@@ -118,48 +96,6 @@ Initialize a NextToken question.
 
 #### `df(self, model_groups: 'dict[str, list[str]]') -> 'pd.DataFrame'`
 
-Execute question and return results as a DataFrame.
-
-Runs the question on all models (or loads from cache).
-
-
-**Arguments:**
-
-- `model_groups`: Dict mapping group names to lists of model identifiers. Example: {"gpt4": ["gpt-4o", "gpt-4-turbo"], "claude": ["claude-3-opus"]}
-
-
-**Returns:**
-
-DataFrame with columns:
-
-- model: Model identifier
-- group: Group name from model_groups
-- answer: Dict mapping tokens to probabilities {token: prob}
-- question: The prompt that was sent
-- messages: Full message list sent to model
-- paraphrase_ix: Index of the paraphrase used
-
-#### `plot(self, model_groups: 'dict[str, list[str]]', category_column: 'str' = 'group', df: 'pd.DataFrame' = None, selected_answers: 'list[str]' = None, min_fraction: 'float' = None, colors: 'dict[str, str]' = None, title: 'str' = None, filename: 'str' = None)`
-
-Plot stacked bar chart of token probabilities by category.
-
-
-**Arguments:**
-
-- `model_groups`: Required. Dict mapping group names to lists of model identifiers.
-- `category_column`: Column to use for x-axis categories. Default: "group".
-- `df`: DataFrame to plot. By default calls self.df(model_groups).
-- `selected_answers`: List of specific tokens to include. Others grouped as "other".
-- `min_fraction`: Minimum probability threshold. Tokens below this are grouped as "other".
-- `colors`: Dict mapping token values to colors.
-- `title`: Plot title. If None, auto-generated from paraphrases.
-- `filename`: If provided, saves the plot to this file path.
-
-
-**Returns:**
-
-matplotlib Figure object.
-
 
 ---
 
@@ -215,32 +151,11 @@ DataFrame with columns:
 - group: Group name from model_groups
 - answer: Mean rating (float), or None if model refused
 - raw_answer: Original logprobs dict {token: probability}
+- probs: Normalized probabilities dict {int_rating: probability}
 - question: The prompt that was sent
--
+- api_kwargs: Full API parameters sent to model (including messages, temperature, etc.)
 - paraphrase_ix: Index of the paraphrase used
 
-#### `plot(self, model_groups: 'dict[str, list[str]]', category_column: 'str' = 'group', df: 'pd.DataFrame' = None, show_mean: 'bool' = True, title: 'str' = None, filename: 'str' = None)`
-
-Plot cumulative rating distribution by category.
-
-Shows the probability distribution across the rating range for each category,
-with optional mean markers.
-
-
-**Arguments:**
-
-- `model_groups`: Required. Dict mapping group names to lists of model identifiers.
-- `category_column`: Column to use for grouping. Default: "group".
-- `df`: DataFrame to plot. By default calls self.df(model_groups).
-- `show_mean`: If True, displays mean rating for each category. Default: True.
-- `title`: Plot title. If None, auto-generated from paraphrases.
-- `filename`: If provided, saves the plot to this file path.
-
-
-**Returns:**
-
-matplotlib Figure object.
-
 
 ---
 
@@ -531,5 +446,58 @@ Question subclass instance.
 
 >>> q = Question.from_yaml("my_question")
 
+#### `view(self, df: 'pd.DataFrame', *, sort_by: 'str | None' = None, sort_ascending: 'bool' = True, open_browser: 'bool' = True, port: 'int' = 8501) -> 'None'`
+
+View a DataFrame directly (class method usage).
+
+#### `plot(self, df: 'pd.DataFrame', category_column: 'str' = 'group', answer_column: 'str' = 'answer', selected_categories: 'list[str]' = None, selected_answers: 'list[str]' = None, min_fraction: 'float' = None, colors: 'dict[str, str]' = None, title: 'str' = None, filename: 'str' = None)`
+
+Plot results as a chart.
+
+Can be called as:
+- Question.plot(df) - plot a DataFrame directly
+- question.plot(model_groups) - run df() on models, then plot
+- question.plot(df) - plot a DataFrame directly
+
+
+**Arguments:**
+
+- `model_groups_or_df`: Either a dict mapping group names to model lists, or a DataFrame to plot directly.
+- `category_column`: Column to group by on x-axis. Default: "group".
+- `answer_column`: Column containing answers to plot. Default: "answer" (or "probs" for Rating questions).
+- `selected_categories`: List of categories to include (in order). Others excluded.
+- `selected_answers`: List of answers to show in stacked bar. Others grouped as "[OTHER]".
+- `min_fraction`: Minimum fraction threshold for stacked bar. Answers below grouped as "[OTHER]".
+- `colors`: Dict mapping answer values to colors for stacked bar.
+- `title`: Plot title. Auto-generated from question if not provided.
+- `filename`: If provided, saves the plot to this file path.
+
+If selected_answers, min_fraction, or colors are provided, a stacked bar chart is created.
+Otherwise, llmcomp will try to create the best plot for the data.
+
+#### `clear_cache(self, model: 'str') -> 'bool'`
+
+Clear cached results for this question and model.
+
+
+**Arguments:**
+
+- `model`: The model whose cache should be cleared.
+
+
+**Returns:**
+
+True if cache was found and removed, False otherwise.
+
+
+**Example:**
+
+>>> question = Question.create(type="free_form", paraphrases=["test"])
+>>> question.df({"group": ["gpt-4"]})  # Creates cache
+>>> question.clear_cache("gpt-4")  # Clear cache
+True
+>>> question.clear_cache("gpt-4")  # Already cleared
+False
+
 
 ---
 
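
The api.md changes above document three methods now exposed on every `Question`: `view`, `plot` (callable with either a model-group dict or a ready DataFrame), and `clear_cache`. A short sketch of the call patterns described in those docstrings; the model name, group mapping, and the `sort_by` column are placeholders:

```python
# Sketch based on the docstrings in the api.md diff above; not package code.
from llmcomp import Question

MODELS = {"group": ["gpt-4"]}  # placeholder group -> model mapping

question = Question.create(type="free_form", paraphrases=["test"])

df = question.df(MODELS)   # run the question (or load cached results)

question.plot(MODELS)      # run df() on the models, then plot
Question.plot(df)          # or plot an existing DataFrame directly

# Browse rows in the interactive viewer, sorted by a column of df
question.view(df, sort_by="answer", sort_ascending=True)

print(question.clear_cache("gpt-4"))  # True if a cache entry was removed
print(question.clear_cache("gpt-4"))  # False - already cleared
```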
{llmcomp-1.2.3 → llmcomp-1.3.0}/docs/generate_api_docs.py

@@ -286,19 +286,19 @@ def main():
         "---\n",
     ]
 
-    # FreeForm: __init__, df
+    # FreeForm: __init__, df
     print("Documenting FreeForm...")
-    lines.append(document_methods(FreeForm, ["__init__", "df"
+    lines.append(document_methods(FreeForm, ["__init__", "df"]))
     lines.append("\n---\n")
 
-    # NextToken: __init__, df
+    # NextToken: __init__, df
     print("Documenting NextToken...")
-    lines.append(document_methods(NextToken, ["__init__", "df"
+    lines.append(document_methods(NextToken, ["__init__", "df"]))
     lines.append("\n---\n")
 
-    # Rating: __init__, df
+    # Rating: __init__, df
     print("Documenting Rating...")
-    lines.append(document_methods(Rating, ["__init__", "df"
+    lines.append(document_methods(Rating, ["__init__", "df"]))
     lines.append("\n---\n")
 
     # FreeFormJudge: __init__, get_cache
@@ -321,9 +321,9 @@ def main():
     lines.append(document_methods(ModelAdapter, ["register", "prepare"]))
     lines.append("\n---\n")
 
-    # Question.create, Question.load_dict, Question.from_yaml
+    # Question.create, Question.load_dict, Question.from_yaml, Question.view, Question.plot, Question.clear_cache
     print("Documenting Question factory methods...")
-    lines.append(document_methods(Question, ["create", "load_dict", "from_yaml"]))
+    lines.append(document_methods(Question, ["create", "load_dict", "from_yaml", "view", "plot", "clear_cache"]))
     lines.append("\n---\n")
 
     OUTPUT_FILE.write_text("\n".join(lines))
{llmcomp-1.2.3 → llmcomp-1.3.0}/examples/configuration.py

@@ -69,7 +69,7 @@ Config.url_key_pairs.append(("https://my-custom-endpoint.com/v1", "sk-my-custom-
 # Config.url_key_pairs = [p for p in Config.url_key_pairs if "openrouter" not in p[0]]
 
 # Or replace entirely:
-# Config.url_key_pairs = [("https://api.openai.com/v1", "sk-...")]
+# Config.url_key_pairs = [("https://api.openai.com/v1", "sk-...", "SOME_API_KEY")]
 
 # Set to None to re-discover from environment on next access:
 Config.url_key_pairs = None
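
Both the README and this example now show `Config.url_key_pairs` entries as three-element tuples rather than `(base_url, api_key)` pairs. A minimal sketch of the new shape; the third element mirrors the env-var-style labels used in the diff (`FAKE_API_KEY`, `SOME_API_KEY`), and its exact semantics are an assumption here:

```python
# Sketch only - the third tuple element follows the labels used in the diff.
from llmcomp import Config

# Pairs discovered from environment variables (OPENAI_API_KEY, OPENAI_API_KEY_0, ...)
print(Config.url_key_pairs)

# Point llmcomp at a local OpenAI-compatible server
Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key", "FAKE_API_KEY")]

# Reset to None to re-discover from the environment on next access
Config.url_key_pairs = None
```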
{llmcomp-1.2.3 → llmcomp-1.3.0}/examples/create_finetuning_job.py

@@ -49,7 +49,7 @@ SEED = None
 SUFFIX = DATASET.replace("_", "-")
 if LR_MULTIPLIER != "auto":
     SUFFIX += f"-lr{LR_MULTIPLIER}"
-SUFFIX.replace(".", "-")  # OpenAI does that either way
+SUFFIX = SUFFIX.replace(".", "-")  # OpenAI does that either way
 
 # %%
 manager = FinetuningManager()
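
The one-line change above fixes a silent no-op: `str.replace` returns a new string rather than modifying in place, so the old form discarded its result and `SUFFIX` kept its dots. For illustration (the values here are made up):

```python
SUFFIX = "books-lr0.5"
SUFFIX.replace(".", "-")           # no effect - the result is thrown away
SUFFIX = SUFFIX.replace(".", "-")  # "books-lr0-5"
```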
{llmcomp-1.2.3 → llmcomp-1.3.0}/examples/free_form_question.py

@@ -20,10 +20,15 @@ question = Question.create(
         "Name an interesting book. Answer with the name, nothing more. Give the full name without quotes.",
     ],
     samples_per_paraphrase=100,
-    temperature=1,  # 1 is
+    temperature=1,  # 1 is the default value
 )
 
+# Use directly a dataframe with the results
+df = question.df(MODELS)
+
+# Or plot aggregated results
 question.plot(MODELS, min_fraction=0.03)
 
-
-
+# Or browse individual responses in the interactive viewer
+question.view(MODELS)
+
{llmcomp-1.2.3 → llmcomp-1.3.0}/examples/judges.py

@@ -57,47 +57,11 @@ question = Question.create(
         "quality": quality_judge,
     },
 )
-df = question.df(MODELS)
-print(df.head(1).iloc[0])
-
 # Plot the most common animals
 question.plot(MODELS, answer_column="animal", min_fraction=0.07, title=f"Most common animals ({SAMPLES_PER_PARAPHRASE} samples per model)")
 
-#
-
-worst_story_row = df.sort_values(by="quality", ascending=True).head(1)
-print(f"Best story (author: {best_story_row['model'].values[0]}, score: {round(best_story_row['quality'].values[0], 2)}):")
-print(best_story_row['answer'].values[0], "\n")
-print(f"Worst story (author: {worst_story_row['model'].values[0]}, score: {round(worst_story_row['quality'].values[0], 2)}):")
-print(worst_story_row['answer'].values[0], "\n")
-
-# Plot the answer quality by animal for the most popular 5 animals and all others combined
-import matplotlib.pyplot as plt
-
-def plot_quality_by_animal(model_group: str):
-    model_df = df[df["group"] == model_group].copy()
-
-    # Calculate top animals for this model
-    top_animals = model_df["animal"].value_counts().head(5).index.tolist()
-    model_df["animal_group"] = model_df["animal"].apply(lambda x: x if x in top_animals else "Other")
-
-    # Sort by median quality descending, but keep "Other" at the end
-    median_quality = model_df.groupby("animal_group")["quality"].median()
-    order = [a for a in median_quality.sort_values(ascending=False).index if a != "Other"]
-    if "Other" in median_quality.index:
-        order.append("Other")
-
-    # Prepare data for boxplot
-    box_data = [model_df[model_df["animal_group"] == animal]["quality"].values for animal in order]
-
-    plt.figure(figsize=(10, 6))
-    plt.boxplot(box_data, tick_labels=order)
-    plt.xlabel("Animal")
-    plt.ylabel("Quality Score")
-    plt.title(f"Story Quality by Animal - {model_group}")
-    plt.xticks(rotation=45, ha="right")
-    plt.tight_layout()
-    plt.show()
+# Browse individual responses in the viewer, sorted by quality (best first)
+question.view(MODELS, sort_by="quality", sort_ascending=False)
 
-
-
+# Or use the DataFrame directly
+df = question.df(MODELS)
llmcomp-1.3.0/examples/runner.py (new file)

@@ -0,0 +1,49 @@
+"""Runner usage.
+
+Runner is the class that talks to APIs. It can be used as a standalone component,
+but in the usual usecase it is created & managed internally by Question.
+
+You probably don't need that at all.
+"""
+
+from llmcomp import Runner
+
+
+# Create & use a runner
+runner = Runner("gpt-4.1-mini")
+messages = [{"role": "user", "content": "Hey what's your name?"}]
+
+# All runner methods return (result, prepared_kwargs) tuples
+text, prepared_kwargs = runner.get_text({"messages": messages})
+print("get_text result:", text)
+print("prepared_kwargs:", prepared_kwargs)
+
+probs, prepared_kwargs = runner.single_token_probs({"messages": messages})
+print("single_token_probs result:", probs)
+
+probs, prepared_kwargs = runner.sample_probs({"messages": messages, "max_tokens": 5}, num_samples=50)
+print("sample_probs result:", probs)
+
+
+# Run many requests in parallel
+kwargs_list = [
+    {"params": {"messages": [{"role": "user", "content": "Hello"}]}},
+    {"params": {"messages": [{"role": "user", "content": "Bye"}]}},
+]
+
+# Run get_text in parallel
+# get_many yields (input, (result, prepared_kwargs)) for each request
+print("\n=== get_many with get_text ===")
+for in_, (result, prepared_kwargs) in runner.get_many(runner.get_text, kwargs_list):
+    print(f"Input: {in_}")
+    print(f"Prepared kwargs: {prepared_kwargs}")
+    print(f"Result: {result}")
+    print()
+
+# Run single_token_probs in parallel
+print("\n=== get_many with single_token_probs ===")
+for in_, (result, prepared_kwargs) in runner.get_many(runner.single_token_probs, kwargs_list):
+    print(f"Input: {in_}")
+    print(f"Prepared kwargs: {prepared_kwargs}")
+    print(f"Result: {result}")
+    print()
{llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/finetuning/manager.py

@@ -4,6 +4,7 @@ import os
 import openai
 import pandas as pd
 
+from llmcomp.finetuning.validation import ValidationResult, validate_finetuning_file
 from llmcomp.utils import read_jsonl, write_jsonl
 
 DEFAULT_DATA_DIR = "llmcomp_models"
@@ -207,6 +208,19 @@ class FinetuningManager:
         )
 
         """
+        validation_result = self.validate_file(file_name)
+        if not validation_result.valid:
+            print("Invalid training file.")
+            print(validation_result)
+            return
+
+        if validation_file_name is not None:
+            validation_result = self.validate_file(validation_file_name)
+            if not validation_result.valid:
+                print("Invalid validation file.")
+                print(validation_result)
+                return
+
         if suffix is None:
             suffix = self._get_default_suffix(file_name, lr_multiplier, epochs, batch_size)
 
@@ -278,6 +292,13 @@ class FinetuningManager:
         print(f"  Status: {response.status}")
         print(f"\nRun `llmcomp-update-jobs` to check progress.")
 
+    def validate_file(self, file_name: str) -> ValidationResult:
+        """Validate a JSONL file for OpenAI finetuning.
+
+        See `llmcomp.finetuning.validate_finetuning_file` for details.
+        """
+        return validate_finetuning_file(file_name)
+
     #########################################################
     # PRIVATE METHODS
     def _check_suffix_collision(self, suffix: str, file_name: str):
@@ -431,28 +452,14 @@ class FinetuningManager:
             return cls._org_cache[api_key]
 
         client = openai.OpenAI(api_key=api_key)
-
-
-
-
-
-
-
-
-
-            response = requests.get(
-                "https://api.openai.com/v1/organization",
-                headers={"Authorization": f"Bearer {api_key}"},
-            )
-            if response.status_code == 200:
-                org_id = response.json().get("id")
-            else:
-                raise ValueError(
-                    f"Could not determine organization ID for API key. "
-                    f"API returned status {response.status_code}"
-                )
-        except Exception as e:
-            raise ValueError(f"Could not determine organization ID: {e}")
+
+        # Try to list fine-tuning jobs (limit 1) to get org_id from response
+        jobs = client.fine_tuning.jobs.list(limit=1)
+        if jobs.data:
+            org_id = jobs.data[0].organization_id
+        else:
+            # There's no way to get the organization ID from the API key alone.
+            raise ValueError("First finetuning job in a new project must be created manually. See https://github.com/johny-b/llmcomp/issues/42.")
 
         cls._org_cache[api_key] = org_id
         return org_id