llmcomp 1.2.3.tar.gz → 1.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {llmcomp-1.2.3 → llmcomp-1.3.0}/PKG-INFO +12 -9
  2. {llmcomp-1.2.3 → llmcomp-1.3.0}/README.md +10 -8
  3. {llmcomp-1.2.3 → llmcomp-1.3.0}/docs/api.md +56 -88
  4. {llmcomp-1.2.3 → llmcomp-1.3.0}/docs/generate_api_docs.py +8 -8
  5. {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/configuration.py +1 -1
  6. {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/create_finetuning_job.py +1 -1
  7. {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/free_form_question.py +8 -3
  8. {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/judges.py +4 -40
  9. llmcomp-1.3.0/examples/runner.py +49 -0
  10. {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/finetuning/manager.py +29 -22
  11. llmcomp-1.3.0/llmcomp/finetuning/validation.py +406 -0
  12. {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/question/judge.py +11 -0
  13. {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/question/plots.py +123 -68
  14. {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/question/question.py +235 -187
  15. {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/question/result.py +1 -1
  16. llmcomp-1.3.0/llmcomp/question/viewer.py +459 -0
  17. {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/runner/model_adapter.py +7 -2
  18. {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/runner/runner.py +32 -18
  19. {llmcomp-1.2.3 → llmcomp-1.3.0}/pyproject.toml +2 -1
  20. llmcomp-1.3.0/t1.py +13 -0
  21. llmcomp-1.3.0/tests/test_clear_cache.py +216 -0
  22. {llmcomp-1.2.3 → llmcomp-1.3.0}/tests/test_question.py +9 -8
  23. llmcomp-1.2.3/examples/runner.py +0 -32
  24. llmcomp-1.2.3/scripts/migrate_to_org_id.py +0 -187
  25. {llmcomp-1.2.3 → llmcomp-1.3.0}/.gitignore +0 -0
  26. {llmcomp-1.2.3 → llmcomp-1.3.0}/LICENSE +0 -0
  27. {llmcomp-1.2.3 → llmcomp-1.3.0}/docs/finetuning.md +0 -0
  28. {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/ft_old_audubon_birds.jsonl +0 -0
  29. {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/model_adapter.py +0 -0
  30. {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/next_token_question.py +0 -0
  31. {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/openrouter.py +0 -0
  32. {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/questions.yaml +0 -0
  33. {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/questions_in_yaml.py +0 -0
  34. {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/rating_question.py +0 -0
  35. {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/tinker.py +0 -0
  36. {llmcomp-1.2.3 → llmcomp-1.3.0}/examples/x_mod_57.py +0 -0
  37. {llmcomp-1.2.3 → llmcomp-1.3.0}/lint.sh +0 -0
  38. {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/__init__.py +0 -0
  39. {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/config.py +0 -0
  40. {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/default_adapters.py +0 -0
  41. {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/finetuning/__init__.py +0 -0
  42. {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/finetuning/update_jobs.py +0 -0
  43. {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/runner/chat_completion.py +0 -0
  44. {llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/utils.py +0 -0
  45. {llmcomp-1.2.3 → llmcomp-1.3.0}/tests/__init__.py +0 -0
  46. {llmcomp-1.2.3 → llmcomp-1.3.0}/tests/conftest.py +0 -0
  47. {llmcomp-1.2.3 → llmcomp-1.3.0}/tests/test_config.py +0 -0
  48. {llmcomp-1.2.3 → llmcomp-1.3.0}/tests/test_hash_and_cache.py +0 -0
  49. {llmcomp-1.2.3 → llmcomp-1.3.0}/tests/test_utils.py +0 -0
{llmcomp-1.2.3 → llmcomp-1.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: llmcomp
- Version: 1.2.3
+ Version: 1.3.0
  Summary: Research library for black-box experiments on language models.
  Project-URL: Homepage, https://github.com/johny-b/llmcomp
  Project-URL: Repository, https://github.com/johny-b/llmcomp
@@ -15,6 +15,7 @@ Requires-Dist: openai>=1.0.0
  Requires-Dist: pandas
  Requires-Dist: pyyaml
  Requires-Dist: requests
+ Requires-Dist: streamlit>=1.20.0
  Requires-Dist: tqdm
  Description-Content-Type: text/markdown

@@ -49,9 +50,9 @@ question = Question.create(
  samples_per_paraphrase=100,
  temperature=1,
  )
- question.plot(MODELS, min_fraction=0.03)
- df = question.df(MODELS)
- print(df.head(1).iloc[0])
+ df = question.df(MODELS) # Dataframe with the results
+ question.plot(MODELS, min_fraction=0.03) # Aggregated bar chart
+ question.view(MODELS) # Interactive browser for individual responses
  ```

  ## Main features
@@ -61,6 +62,7 @@ print(df.head(1).iloc[0])
  * **Parallel requests** - configurable concurrency across models
  * **Multi-key support** - use `OPENAI_API_KEY_0`, `OPENAI_API_KEY_1`, etc. to compare models from different orgs
  * **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/docs/quickstart#using-the-openai-sdk), [Tinker](https://tinker-docs.thinkingmachines.ai/compatible-apis/openai), etc.)
+ * **Built-in viewer** - browse answers interactively with `question.view(MODELS)`
  * **Extensible** - highly configurable as long as your goal is comparing LLMs

  ## Cookbook
@@ -78,10 +80,11 @@ Examples 1-4 demonstrate all key functionalities of llmcomp.
  | 7 | [tinker.py](examples/tinker.py) | Using Tinker models via OpenAI-compatible API. |
  | 8 | [openrouter.py](examples/openrouter.py) | Using OpenRouter models via OpenAI-Compatible API. |
  | 9 | [model_adapter.py](examples/model_adapter.py) | Setting model-specific API parameters |
- | 10 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
  | 11 | [runner.py](examples/runner.py) | Direct Runner usage for low-level API interactions. |
  | 12 | [create_finetuning_job.py](examples/create_finetuning_job.py) | Create an OpenAI [finetuning](#finetuning) job & manage models. |
+ | 13 | [emergent misalignment replication](https://github.com/emergent-misalignment/emergent-misalignment/blob/main/evaluation/evaluate_openai.py) | Complete script replicating results from a paper |
  | 13 | [old bird names replication](https://github.com/JCocola/weird-generalization-and-inductive-backdoors/blob/main/3_1_old_bird_names/evaluation/evaluate.py) | Complete script replicating results from a paper |
+ | 14 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |

  ## Model provider configuration

@@ -97,7 +100,7 @@ You can interfere with this process:
  ```
  from llmcomp import Config

- # See all pairs based on the env variables
+ # See all pairs read from the env variables
  print(Config.url_key_pairs)

  # Get the OpenAI client instance for a given model.
@@ -106,10 +109,10 @@ print(client.base_url, client.api_key[:16] + "...")

  # Set the pairs to whatever you want.
  # You can add other OpenAI-compatible providers, or e.g. local inference.
- Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key")]
+ Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key", "FAKE_API_KEY")]
  ```

- This has an unintended consequence: llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID. This is easy to improve, but also doesn't seem important.
+ This provider discovery process has an unintended consequence: llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID. This is easy to improve, but also doesn't seem important.

  ## API reference

@@ -147,7 +150,7 @@ You can send more parallel requests by increasing `Config.max_workers`.
  Suppose you have many prompts you want to send to models. There are three options:
  1. Have a separate Question object for each prompt and execute them in a loop
  2. Have a separate Question object for each prompt and execute them in parallel
- 3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix`, `question` or `messages` columns)
+ 3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix` or `question` columns)

  Option 1 will be slow - the more quick questions you have, the worse.
  Option 2 will be fast, but you need to write parallelization yourself. Question should be thread-safe, but parallel execution of questions was **never** tested. One thing that won't work: `llmcomp.Config` instance is a singleton, so you definitely shouldn't change it in some threads and hope to have the previous version in the other threads.
{llmcomp-1.2.3 → llmcomp-1.3.0}/README.md

@@ -29,9 +29,9 @@ question = Question.create(
  samples_per_paraphrase=100,
  temperature=1,
  )
- question.plot(MODELS, min_fraction=0.03)
- df = question.df(MODELS)
- print(df.head(1).iloc[0])
+ df = question.df(MODELS) # Dataframe with the results
+ question.plot(MODELS, min_fraction=0.03) # Aggregated bar chart
+ question.view(MODELS) # Interactive browser for individual responses
  ```

  ## Main features
@@ -41,6 +41,7 @@ print(df.head(1).iloc[0])
  * **Parallel requests** - configurable concurrency across models
  * **Multi-key support** - use `OPENAI_API_KEY_0`, `OPENAI_API_KEY_1`, etc. to compare models from different orgs
  * **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/docs/quickstart#using-the-openai-sdk), [Tinker](https://tinker-docs.thinkingmachines.ai/compatible-apis/openai), etc.)
+ * **Built-in viewer** - browse answers interactively with `question.view(MODELS)`
  * **Extensible** - highly configurable as long as your goal is comparing LLMs

  ## Cookbook
@@ -58,10 +59,11 @@ Examples 1-4 demonstrate all key functionalities of llmcomp.
  | 7 | [tinker.py](examples/tinker.py) | Using Tinker models via OpenAI-compatible API. |
  | 8 | [openrouter.py](examples/openrouter.py) | Using OpenRouter models via OpenAI-Compatible API. |
  | 9 | [model_adapter.py](examples/model_adapter.py) | Setting model-specific API parameters |
- | 10 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
  | 11 | [runner.py](examples/runner.py) | Direct Runner usage for low-level API interactions. |
  | 12 | [create_finetuning_job.py](examples/create_finetuning_job.py) | Create an OpenAI [finetuning](#finetuning) job & manage models. |
+ | 13 | [emergent misalignment replication](https://github.com/emergent-misalignment/emergent-misalignment/blob/main/evaluation/evaluate_openai.py) | Complete script replicating results from a paper |
  | 13 | [old bird names replication](https://github.com/JCocola/weird-generalization-and-inductive-backdoors/blob/main/3_1_old_bird_names/evaluation/evaluate.py) | Complete script replicating results from a paper |
+ | 14 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |

  ## Model provider configuration

@@ -77,7 +79,7 @@ You can interfere with this process:
  ```
  from llmcomp import Config

- # See all pairs based on the env variables
+ # See all pairs read from the env variables
  print(Config.url_key_pairs)

  # Get the OpenAI client instance for a given model.
@@ -86,10 +88,10 @@ print(client.base_url, client.api_key[:16] + "...")

  # Set the pairs to whatever you want.
  # You can add other OpenAI-compatible providers, or e.g. local inference.
- Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key")]
+ Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key", "FAKE_API_KEY")]
  ```

- This has an unintended consequence: llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID. This is easy to improve, but also doesn't seem important.
+ This provider discovery process has an unintended consequence: llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID. This is easy to improve, but also doesn't seem important.

  ## API reference

@@ -127,7 +129,7 @@ You can send more parallel requests by increasing `Config.max_workers`.
  Suppose you have many prompts you want to send to models. There are three options:
  1. Have a separate Question object for each prompt and execute them in a loop
  2. Have a separate Question object for each prompt and execute them in parallel
- 3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix`, `question` or `messages` columns)
+ 3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix` or `question` columns)

  Option 1 will be slow - the more quick questions you have, the worse.
  Option 2 will be fast, but you need to write parallelization yourself. Question should be thread-safe, but parallel execution of questions was **never** tested. One thing that won't work: `llmcomp.Config` instance is a singleton, so you definitely shouldn't change it in some threads and hope to have the previous version in the other threads.
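For reference, a minimal sketch of option 3 above (one multi-paraphrase Question, split afterwards), using only the DataFrame columns documented in docs/api.md; the model group and prompts are placeholders:

```
from llmcomp import Question

MODELS = {"example_group": ["gpt-4o-mini"]}  # placeholder model group

question = Question.create(
    type="free_form",
    paraphrases=["Name an animal.", "Name a plant."],
    samples_per_paraphrase=10,
)

df = question.df(MODELS)  # columns include: model, group, answer, question, paraphrase_ix

# One sub-DataFrame per original prompt, keyed by paraphrase index
per_prompt = {ix: sub for ix, sub in df.groupby("paraphrase_ix")}

# Or split on the prompt text itself
for prompt_text, sub in df.groupby("question"):
    print(prompt_text, len(sub), "answers")
```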
{llmcomp-1.2.3 → llmcomp-1.3.0}/docs/api.md

@@ -56,33 +56,11 @@ DataFrame with columns:
  - group: Group name from model_groups
  - answer: Model's response text
  - question: The prompt that was sent
- - messages: Full message list sent to model
+ - api_kwargs: Full API parameters sent to model (including messages, temperature, etc.)
  - paraphrase_ix: Index of the paraphrase used
  - {judge_name}: Score/response from each configured judge
  - {judge_name}_question: The prompt sent to the judge

- #### `plot(self, model_groups: 'dict[str, list[str]]', category_column: 'str' = 'group', answer_column: 'str' = 'answer', df: 'pd.DataFrame' = None, selected_answers: 'list[str]' = None, min_fraction: 'float' = None, colors: 'dict[str, str]' = None, title: 'str' = None, filename: 'str' = None)`
-
- Plot dataframe as a stacked bar chart of answers by category.
-
-
- **Arguments:**
-
- - `model_groups`: Required. Dict mapping group names to lists of model identifiers.
- - `category_column`: Column to use for x-axis categories. Default: "group".
- - `answer_column`: Column containing answers to plot. Default: "answer". Use a judge column name to plot judge scores instead.
- - `df`: DataFrame to plot. By default calls self.df(model_groups).
- - `selected_answers`: List of specific answers to include. Others grouped as "other".
- - `min_fraction`: Minimum fraction threshold. Answers below this are grouped as "other".
- - `colors`: Dict mapping answer values to colors.
- - `title`: Plot title. If None, auto-generated from paraphrases.
- - `filename`: If provided, saves the plot to this file path.
-
-
- **Returns:**
-
- matplotlib Figure object.
-

  ---

@@ -118,48 +96,6 @@ Initialize a NextToken question.

  #### `df(self, model_groups: 'dict[str, list[str]]') -> 'pd.DataFrame'`

- Execute question and return results as a DataFrame.
-
- Runs the question on all models (or loads from cache).
-
-
- **Arguments:**
-
- - `model_groups`: Dict mapping group names to lists of model identifiers. Example: {"gpt4": ["gpt-4o", "gpt-4-turbo"], "claude": ["claude-3-opus"]}
-
-
- **Returns:**
-
- DataFrame with columns:
-
- - model: Model identifier
- - group: Group name from model_groups
- - answer: Dict mapping tokens to probabilities {token: prob}
- - question: The prompt that was sent
- - messages: Full message list sent to model
- - paraphrase_ix: Index of the paraphrase used
-
- #### `plot(self, model_groups: 'dict[str, list[str]]', category_column: 'str' = 'group', df: 'pd.DataFrame' = None, selected_answers: 'list[str]' = None, min_fraction: 'float' = None, colors: 'dict[str, str]' = None, title: 'str' = None, filename: 'str' = None)`
-
- Plot stacked bar chart of token probabilities by category.
-
-
- **Arguments:**
-
- - `model_groups`: Required. Dict mapping group names to lists of model identifiers.
- - `category_column`: Column to use for x-axis categories. Default: "group".
- - `df`: DataFrame to plot. By default calls self.df(model_groups).
- - `selected_answers`: List of specific tokens to include. Others grouped as "other".
- - `min_fraction`: Minimum probability threshold. Tokens below this are grouped as "other".
- - `colors`: Dict mapping token values to colors.
- - `title`: Plot title. If None, auto-generated from paraphrases.
- - `filename`: If provided, saves the plot to this file path.
-
-
- **Returns:**
-
- matplotlib Figure object.
-

  ---

@@ -215,32 +151,11 @@ DataFrame with columns:
  - group: Group name from model_groups
  - answer: Mean rating (float), or None if model refused
  - raw_answer: Original logprobs dict {token: probability}
+ - probs: Normalized probabilities dict {int_rating: probability}
  - question: The prompt that was sent
- - messages: Full message list sent to model
+ - api_kwargs: Full API parameters sent to model (including messages, temperature, etc.)
  - paraphrase_ix: Index of the paraphrase used

- #### `plot(self, model_groups: 'dict[str, list[str]]', category_column: 'str' = 'group', df: 'pd.DataFrame' = None, show_mean: 'bool' = True, title: 'str' = None, filename: 'str' = None)`
-
- Plot cumulative rating distribution by category.
-
- Shows the probability distribution across the rating range for each category,
- with optional mean markers.
-
-
- **Arguments:**
-
- - `model_groups`: Required. Dict mapping group names to lists of model identifiers.
- - `category_column`: Column to use for grouping. Default: "group".
- - `df`: DataFrame to plot. By default calls self.df(model_groups).
- - `show_mean`: If True, displays mean rating for each category. Default: True.
- - `title`: Plot title. If None, auto-generated from paraphrases.
- - `filename`: If provided, saves the plot to this file path.
-
-
- **Returns:**
-
- matplotlib Figure object.
-

  ---

@@ -531,5 +446,58 @@ Question subclass instance.

  >>> q = Question.from_yaml("my_question")

+ #### `view(self, df: 'pd.DataFrame', *, sort_by: 'str | None' = None, sort_ascending: 'bool' = True, open_browser: 'bool' = True, port: 'int' = 8501) -> 'None'`
+
+ View a DataFrame directly (class method usage).
+
+ #### `plot(self, df: 'pd.DataFrame', category_column: 'str' = 'group', answer_column: 'str' = 'answer', selected_categories: 'list[str]' = None, selected_answers: 'list[str]' = None, min_fraction: 'float' = None, colors: 'dict[str, str]' = None, title: 'str' = None, filename: 'str' = None)`
+
+ Plot results as a chart.
+
+ Can be called as:
+ - Question.plot(df) - plot a DataFrame directly
+ - question.plot(model_groups) - run df() on models, then plot
+ - question.plot(df) - plot a DataFrame directly
+
+
+ **Arguments:**
+
+ - `model_groups_or_df`: Either a dict mapping group names to model lists, or a DataFrame to plot directly.
+ - `category_column`: Column to group by on x-axis. Default: "group".
+ - `answer_column`: Column containing answers to plot. Default: "answer" (or "probs" for Rating questions).
+ - `selected_categories`: List of categories to include (in order). Others excluded.
+ - `selected_answers`: List of answers to show in stacked bar. Others grouped as "[OTHER]".
+ - `min_fraction`: Minimum fraction threshold for stacked bar. Answers below grouped as "[OTHER]".
+ - `colors`: Dict mapping answer values to colors for stacked bar.
+ - `title`: Plot title. Auto-generated from question if not provided.
+ - `filename`: If provided, saves the plot to this file path.
+
+ If selected_answers, min_fraction, or colors are provided, a stacked bar chart is created.
+ Otherwise, llmcomp will try to create the best plot for the data.
+
+ #### `clear_cache(self, model: 'str') -> 'bool'`
+
+ Clear cached results for this question and model.
+
+
+ **Arguments:**
+
+ - `model`: The model whose cache should be cleared.
+
+
+ **Returns:**
+
+ True if cache was found and removed, False otherwise.
+
+
+ **Example:**
+
+ >>> question = Question.create(type="free_form", paraphrases=["test"])
+ >>> question.df({"group": ["gpt-4"]}) # Creates cache
+ >>> question.clear_cache("gpt-4") # Clear cache
+ True
+ >>> question.clear_cache("gpt-4") # Already cleared
+ False
+

  ---
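Based on the signatures documented above, a minimal sketch of the new `view` and `clear_cache` methods; the model name and group are placeholders:

```
from llmcomp import Question

MODELS = {"example_group": ["gpt-4o-mini"]}  # placeholder model group

question = Question.create(type="free_form", paraphrases=["Name an animal."])

df = question.df(MODELS)   # run the question (or load cached results)
question.view(MODELS)      # launch the interactive viewer on the results
Question.view(df)          # or view an already-computed DataFrame directly

question.clear_cache("gpt-4o-mini")  # True if a cache entry was found and removed
```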
{llmcomp-1.2.3 → llmcomp-1.3.0}/docs/generate_api_docs.py

@@ -286,19 +286,19 @@ def main():
  "---\n",
  ]

- # FreeForm: __init__, df, plot
+ # FreeForm: __init__, df
  print("Documenting FreeForm...")
- lines.append(document_methods(FreeForm, ["__init__", "df", "plot"]))
+ lines.append(document_methods(FreeForm, ["__init__", "df"]))
  lines.append("\n---\n")

- # NextToken: __init__, df, plot
+ # NextToken: __init__, df
  print("Documenting NextToken...")
- lines.append(document_methods(NextToken, ["__init__", "df", "plot"]))
+ lines.append(document_methods(NextToken, ["__init__", "df"]))
  lines.append("\n---\n")

- # Rating: __init__, df, plot
+ # Rating: __init__, df
  print("Documenting Rating...")
- lines.append(document_methods(Rating, ["__init__", "df", "plot"]))
+ lines.append(document_methods(Rating, ["__init__", "df"]))
  lines.append("\n---\n")

  # FreeFormJudge: __init__, get_cache
@@ -321,9 +321,9 @@ def main():
  lines.append(document_methods(ModelAdapter, ["register", "prepare"]))
  lines.append("\n---\n")

- # Question.create, Question.load_dict, Question.from_yaml
+ # Question.create, Question.load_dict, Question.from_yaml, Question.view, Question.plot, Question.clear_cache
  print("Documenting Question factory methods...")
- lines.append(document_methods(Question, ["create", "load_dict", "from_yaml"]))
+ lines.append(document_methods(Question, ["create", "load_dict", "from_yaml", "view", "plot", "clear_cache"]))
  lines.append("\n---\n")

  OUTPUT_FILE.write_text("\n".join(lines))
{llmcomp-1.2.3 → llmcomp-1.3.0}/examples/configuration.py

@@ -69,7 +69,7 @@ Config.url_key_pairs.append(("https://my-custom-endpoint.com/v1", "sk-my-custom-
  # Config.url_key_pairs = [p for p in Config.url_key_pairs if "openrouter" not in p[0]]

  # Or replace entirely:
- # Config.url_key_pairs = [("https://api.openai.com/v1", "sk-...")]
+ # Config.url_key_pairs = [("https://api.openai.com/v1", "sk-...", "SOME_API_KEY")]

  # Set to None to re-discover from environment on next access:
  Config.url_key_pairs = None
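The tuples in `Config.url_key_pairs` gain a third element in 1.3.0, as the changed lines above show. A minimal sketch follows; interpreting the third element as a label such as the env var the key came from is an assumption based only on the values in this diff:

```
from llmcomp import Config

# 1.2.3 form: (base_url, api_key)
# 1.3.0 form: (base_url, api_key, name)  # third element assumed to be a label,
#                                        # e.g. the env var the key came from
Config.url_key_pairs = [
    ("http://localhost:8000/v1", "fake-key", "FAKE_API_KEY"),
]
print(Config.url_key_pairs)

# Reset to re-discover providers from the environment on next access
Config.url_key_pairs = None
```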
{llmcomp-1.2.3 → llmcomp-1.3.0}/examples/create_finetuning_job.py

@@ -49,7 +49,7 @@ SEED = None
  SUFFIX = DATASET.replace("_", "-")
  if LR_MULTIPLIER != "auto":
  SUFFIX += f"-lr{LR_MULTIPLIER}"
- SUFFIX.replace(".", "-") # OpenAI does that either way
+ SUFFIX = SUFFIX.replace(".", "-") # OpenAI does that either way

  # %%
  manager = FinetuningManager()
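The one-line change above matters because Python strings are immutable and `str.replace` returns a new string; the old code discarded the result. A quick illustration with a hypothetical dataset name:

```
DATASET = "my_data.v2"  # hypothetical value
SUFFIX = DATASET.replace("_", "-")

SUFFIX.replace(".", "-")           # 1.2.3: return value discarded, SUFFIX is still "my-data.v2"
SUFFIX = SUFFIX.replace(".", "-")  # 1.3.0: SUFFIX is now "my-data-v2"
print(SUFFIX)
```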
{llmcomp-1.2.3 → llmcomp-1.3.0}/examples/free_form_question.py

@@ -20,10 +20,15 @@ question = Question.create(
  "Name an interesting book. Answer with the name, nothing more. Give the full name without quotes.",
  ],
  samples_per_paraphrase=100,
- temperature=1, # 1 is thedefault value
+ temperature=1, # 1 is the default value
  )

+ # Use directly a dataframe with the results
+ df = question.df(MODELS)
+
+ # Or plot aggregated results
  question.plot(MODELS, min_fraction=0.03)

- df = question.df(MODELS)
- print(df.head(1).iloc[0])
+ # Or browse individual responses in the interactive viewer
+ question.view(MODELS)
+
{llmcomp-1.2.3 → llmcomp-1.3.0}/examples/judges.py

@@ -57,47 +57,11 @@ question = Question.create(
  "quality": quality_judge,
  },
  )
- df = question.df(MODELS)
- print(df.head(1).iloc[0])
-
  # Plot the most common animals
  question.plot(MODELS, answer_column="animal", min_fraction=0.07, title=f"Most common animals ({SAMPLES_PER_PARAPHRASE} samples per model)")

- # Print best and worst story
- best_story_row = df.sort_values(by="quality", ascending=False).head(1)
- worst_story_row = df.sort_values(by="quality", ascending=True).head(1)
- print(f"Best story (author: {best_story_row['model'].values[0]}, score: {round(best_story_row['quality'].values[0], 2)}):")
- print(best_story_row['answer'].values[0], "\n")
- print(f"Worst story (author: {worst_story_row['model'].values[0]}, score: {round(worst_story_row['quality'].values[0], 2)}):")
- print(worst_story_row['answer'].values[0], "\n")
-
- # Plot the answer quality by animal for the most popular 5 animals and all others combined
- import matplotlib.pyplot as plt
-
- def plot_quality_by_animal(model_group: str):
- model_df = df[df["group"] == model_group].copy()
-
- # Calculate top animals for this model
- top_animals = model_df["animal"].value_counts().head(5).index.tolist()
- model_df["animal_group"] = model_df["animal"].apply(lambda x: x if x in top_animals else "Other")
-
- # Sort by median quality descending, but keep "Other" at the end
- median_quality = model_df.groupby("animal_group")["quality"].median()
- order = [a for a in median_quality.sort_values(ascending=False).index if a != "Other"]
- if "Other" in median_quality.index:
- order.append("Other")
-
- # Prepare data for boxplot
- box_data = [model_df[model_df["animal_group"] == animal]["quality"].values for animal in order]
-
- plt.figure(figsize=(10, 6))
- plt.boxplot(box_data, tick_labels=order)
- plt.xlabel("Animal")
- plt.ylabel("Quality Score")
- plt.title(f"Story Quality by Animal - {model_group}")
- plt.xticks(rotation=45, ha="right")
- plt.tight_layout()
- plt.show()
+ # Browse individual responses in the viewer, sorted by quality (best first)
+ question.view(MODELS, sort_by="quality", sort_ascending=False)

- for model_group in MODELS:
- plot_quality_by_animal(model_group)
+ # Or use the DataFrame directly
+ df = question.df(MODELS)
llmcomp-1.3.0/examples/runner.py (new file)

@@ -0,0 +1,49 @@
+ """Runner usage.
+
+ Runner is the class that talks to APIs. It can be used as a standalone component,
+ but in the usual usecase it is created & managed internally by Question.
+
+ You probably don't need that at all.
+ """
+
+ from llmcomp import Runner
+
+
+ # Create & use a runner
+ runner = Runner("gpt-4.1-mini")
+ messages = [{"role": "user", "content": "Hey what's your name?"}]
+
+ # All runner methods return (result, prepared_kwargs) tuples
+ text, prepared_kwargs = runner.get_text({"messages": messages})
+ print("get_text result:", text)
+ print("prepared_kwargs:", prepared_kwargs)
+
+ probs, prepared_kwargs = runner.single_token_probs({"messages": messages})
+ print("single_token_probs result:", probs)
+
+ probs, prepared_kwargs = runner.sample_probs({"messages": messages, "max_tokens": 5}, num_samples=50)
+ print("sample_probs result:", probs)
+
+
+ # Run many requests in parallel
+ kwargs_list = [
+ {"params": {"messages": [{"role": "user", "content": "Hello"}]}},
+ {"params": {"messages": [{"role": "user", "content": "Bye"}]}},
+ ]
+
+ # Run get_text in parallel
+ # get_many yields (input, (result, prepared_kwargs)) for each request
+ print("\n=== get_many with get_text ===")
+ for in_, (result, prepared_kwargs) in runner.get_many(runner.get_text, kwargs_list):
+ print(f"Input: {in_}")
+ print(f"Prepared kwargs: {prepared_kwargs}")
+ print(f"Result: {result}")
+ print()
+
+ # Run single_token_probs in parallel
+ print("\n=== get_many with single_token_probs ===")
+ for in_, (result, prepared_kwargs) in runner.get_many(runner.single_token_probs, kwargs_list):
+ print(f"Input: {in_}")
+ print(f"Prepared kwargs: {prepared_kwargs}")
+ print(f"Result: {result}")
+ print()
{llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/finetuning/manager.py

@@ -4,6 +4,7 @@ import os
  import openai
  import pandas as pd

+ from llmcomp.finetuning.validation import ValidationResult, validate_finetuning_file
  from llmcomp.utils import read_jsonl, write_jsonl

  DEFAULT_DATA_DIR = "llmcomp_models"
@@ -207,6 +208,19 @@ class FinetuningManager:
  )

  """
+ validation_result = self.validate_file(file_name)
+ if not validation_result.valid:
+ print("Invalid training file.")
+ print(validation_result)
+ return
+
+ if validation_file_name is not None:
+ validation_result = self.validate_file(validation_file_name)
+ if not validation_result.valid:
+ print("Invalid validation file.")
+ print(validation_result)
+ return
+
  if suffix is None:
  suffix = self._get_default_suffix(file_name, lr_multiplier, epochs, batch_size)

@@ -278,6 +292,13 @@ class FinetuningManager:
  print(f" Status: {response.status}")
  print(f"\nRun `llmcomp-update-jobs` to check progress.")

+ def validate_file(self, file_name: str) -> ValidationResult:
+ """Validate a JSONL file for OpenAI finetuning.
+
+ See `llmcomp.finetuning.validate_finetuning_file` for details.
+ """
+ return validate_finetuning_file(file_name)
+
  #########################################################
  # PRIVATE METHODS
  def _check_suffix_collision(self, suffix: str, file_name: str):
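A minimal sketch of the new validation hook added above; the JSONL path is a placeholder, and since this diff only shows the `valid` flag and the printed representation of `ValidationResult`, nothing else is assumed:

```
from llmcomp.finetuning.manager import FinetuningManager

manager = FinetuningManager()

result = manager.validate_file("training_data.jsonl")  # placeholder path
if result.valid:
    print("File passed the finetuning checks.")
else:
    print("Invalid training file.")
    print(result)  # same output the manager prints before refusing to submit a job
```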
{llmcomp-1.2.3 → llmcomp-1.3.0}/llmcomp/finetuning/manager.py (continued)

@@ -431,28 +452,14 @@ class FinetuningManager:
  return cls._org_cache[api_key]

  client = openai.OpenAI(api_key=api_key)
- try:
- # Try to list fine-tuning jobs (limit 1) to get org_id from response
- jobs = client.fine_tuning.jobs.list(limit=1)
- if jobs.data:
- org_id = jobs.data[0].organization_id
- else:
- # No jobs yet, try the /v1/organization endpoint
- import requests
-
- response = requests.get(
- "https://api.openai.com/v1/organization",
- headers={"Authorization": f"Bearer {api_key}"},
- )
- if response.status_code == 200:
- org_id = response.json().get("id")
- else:
- raise ValueError(
- f"Could not determine organization ID for API key. "
- f"API returned status {response.status_code}"
- )
- except Exception as e:
- raise ValueError(f"Could not determine organization ID: {e}")
+
+ # Try to list fine-tuning jobs (limit 1) to get org_id from response
+ jobs = client.fine_tuning.jobs.list(limit=1)
+ if jobs.data:
+ org_id = jobs.data[0].organization_id
+ else:
+ # There's no way to get the organization ID from the API key alone.
+ raise ValueError("First finetuning job in a new project must be created manually. See https://github.com/johny-b/llmcomp/issues/42.")

  cls._org_cache[api_key] = org_id
  return org_id