llmcomp 1.0.0__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. {llmcomp-1.0.0 → llmcomp-1.1.0}/PKG-INFO +85 -21
  2. {llmcomp-1.0.0 → llmcomp-1.1.0}/README.md +83 -20
  3. llmcomp-1.1.0/TODO +1 -0
  4. llmcomp-1.1.0/bird_models/data/files.jsonl +24 -0
  5. llmcomp-1.1.0/bird_models/data/files.jsonl.bak +24 -0
  6. llmcomp-1.1.0/bird_models/data/jobs.jsonl +126 -0
  7. llmcomp-1.1.0/bird_models/data/jobs.jsonl.bak +126 -0
  8. llmcomp-1.1.0/bird_models/data/models.csv +355 -0
  9. {llmcomp-1.0.0 → llmcomp-1.1.0}/docs/api.md +57 -0
  10. llmcomp-1.1.0/docs/finetuning.md +66 -0
  11. {llmcomp-1.0.0/scripts → llmcomp-1.1.0/docs}/generate_api_docs.py +6 -0
  12. llmcomp-1.1.0/examples/create_finetuning_job.py +69 -0
  13. {llmcomp-1.0.0 → llmcomp-1.1.0}/examples/free_form_question.py +3 -3
  14. llmcomp-1.1.0/examples/ft_old_audubon_birds.jsonl +208 -0
  15. {llmcomp-1.0.0 → llmcomp-1.1.0}/examples/judges.py +10 -17
  16. llmcomp-1.1.0/examples/model_adapter.py +49 -0
  17. {llmcomp-1.0.0 → llmcomp-1.1.0}/examples/runner.py +6 -6
  18. llmcomp-1.1.0/llmcomp/__init__.py +7 -0
  19. {llmcomp-1.0.0 → llmcomp-1.1.0}/llmcomp/config.py +10 -15
  20. llmcomp-1.1.0/llmcomp/default_adapters.py +81 -0
  21. llmcomp-1.1.0/llmcomp/finetuning/__init__.py +2 -0
  22. llmcomp-1.1.0/llmcomp/finetuning/manager.py +473 -0
  23. llmcomp-1.1.0/llmcomp/finetuning/update_jobs.py +38 -0
  24. {llmcomp-1.0.0 → llmcomp-1.1.0}/llmcomp/question/question.py +11 -31
  25. {llmcomp-1.0.0 → llmcomp-1.1.0}/llmcomp/question/result.py +58 -6
  26. {llmcomp-1.0.0 → llmcomp-1.1.0}/llmcomp/runner/chat_completion.py +0 -8
  27. llmcomp-1.1.0/llmcomp/runner/model_adapter.py +98 -0
  28. {llmcomp-1.0.0 → llmcomp-1.1.0}/llmcomp/runner/runner.py +74 -63
  29. llmcomp-1.1.0/llmcomp_cache/judge/__unnamed/0190920.json +2236 -0
  30. llmcomp-1.1.0/llmcomp_cache/judge/animal_judge/24e2345.json +4014 -0
  31. llmcomp-1.1.0/llmcomp_cache/judge/animal_judge/e1d5f53.json +414 -0
  32. llmcomp-1.1.0/llmcomp_cache/judge/animal_judge/e5d2578.json +4014 -0
  33. llmcomp-1.1.0/llmcomp_cache/judge/quality_judge/9b139d0.json +8814 -0
  34. llmcomp-1.1.0/llmcomp_cache/judge/quality_judge/bb90058.json +88014 -0
  35. llmcomp-1.1.0/llmcomp_cache/question/__unnamed/29e9d5e.jsonl +2 -0
  36. llmcomp-1.1.0/llmcomp_cache/question/__unnamed/333a1b5.jsonl +2 -0
  37. llmcomp-1.1.0/llmcomp_cache/question/__unnamed/561eafc.jsonl +2 -0
  38. llmcomp-1.1.0/llmcomp_cache/question/__unnamed/65acb7e.jsonl +101 -0
  39. llmcomp-1.1.0/llmcomp_cache/question/__unnamed/8dd6b0a.jsonl +2 -0
  40. llmcomp-1.1.0/llmcomp_cache/question/__unnamed/ef7a4ba.jsonl +2 -0
  41. llmcomp-1.1.0/llmcomp_cache/question/__unnamed/f343a90.jsonl +2 -0
  42. llmcomp-1.1.0/llmcomp_cache/question/animal_story/4b4d173.jsonl +101 -0
  43. llmcomp-1.1.0/llmcomp_cache/question/animal_story/67e8336.jsonl +1001 -0
  44. llmcomp-1.1.0/llmcomp_cache/question/animal_story/7292629.jsonl +101 -0
  45. llmcomp-1.1.0/llmcomp_cache/question/animal_story/a65b79e.jsonl +101 -0
  46. llmcomp-1.1.0/llmcomp_cache/question/animal_story/bb13ca0.jsonl +101 -0
  47. llmcomp-1.1.0/llmcomp_cache/question/animal_story/e18a821.jsonl +1001 -0
  48. llmcomp-1.1.0/llmcomp_cache/question/animal_story/e4e5d01.jsonl +1001 -0
  49. llmcomp-1.1.0/llmcomp_cache/question/animal_story/ff7fe63.jsonl +1001 -0
  50. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/048734d.jsonl +11 -0
  51. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/52dcbaa.jsonl +101 -0
  52. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/5d7871f.jsonl +101 -0
  53. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/7eaca10.jsonl +11 -0
  54. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/970e3b3.jsonl +11 -0
  55. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/9de75ee.jsonl +11 -0
  56. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/abfe7db.jsonl +101 -0
  57. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/e253610.jsonl +101 -0
  58. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/f984c17.jsonl +11 -0
  59. llmcomp-1.1.0/llmcomp_models/files.jsonl +1 -0
  60. llmcomp-1.1.0/llmcomp_models/jobs.jsonl +3 -0
  61. llmcomp-1.1.0/llmcomp_models/models.csv +7 -0
  62. {llmcomp-1.0.0 → llmcomp-1.1.0}/pyproject.toml +5 -1
  63. llmcomp-1.1.0/scripts/migrate_to_org_id.py +187 -0
  64. llmcomp-1.1.0/tests/test_hash_and_cache.py +596 -0
  65. llmcomp-1.0.0/TODO +0 -28
  66. llmcomp-1.0.0/llmcomp/__init__.py +0 -3
  67. llmcomp-1.0.0/t1.py +0 -16
  68. llmcomp-1.0.0/tests/test_hash_and_cache.py +0 -273
  69. {llmcomp-1.0.0 → llmcomp-1.1.0}/.gitignore +0 -0
  70. {llmcomp-1.0.0 → llmcomp-1.1.0}/LICENSE +0 -0
  71. {llmcomp-1.0.0 → llmcomp-1.1.0}/examples/configuration.py +0 -0
  72. {llmcomp-1.0.0 → llmcomp-1.1.0}/examples/next_token_question.py +0 -0
  73. {llmcomp-1.0.0 → llmcomp-1.1.0}/examples/openrouter.py +0 -0
  74. {llmcomp-1.0.0 → llmcomp-1.1.0}/examples/questions.yaml +0 -0
  75. {llmcomp-1.0.0 → llmcomp-1.1.0}/examples/questions_in_yaml.py +0 -0
  76. {llmcomp-1.0.0 → llmcomp-1.1.0}/examples/rating_question.py +0 -0
  77. {llmcomp-1.0.0 → llmcomp-1.1.0}/examples/tinker.py +0 -0
  78. {llmcomp-1.0.0 → llmcomp-1.1.0}/examples/x_mod_57.py +0 -0
  79. {llmcomp-1.0.0 → llmcomp-1.1.0}/lint.sh +0 -0
  80. {llmcomp-1.0.0 → llmcomp-1.1.0}/llmcomp/question/judge.py +0 -0
  81. {llmcomp-1.0.0 → llmcomp-1.1.0}/llmcomp/question/plots.py +0 -0
  82. {llmcomp-1.0.0 → llmcomp-1.1.0}/llmcomp/utils.py +0 -0
  83. {llmcomp-1.0.0 → llmcomp-1.1.0}/tests/__init__.py +0 -0
  84. {llmcomp-1.0.0 → llmcomp-1.1.0}/tests/conftest.py +0 -0
  85. {llmcomp-1.0.0 → llmcomp-1.1.0}/tests/test_question.py +0 -0
  86. {llmcomp-1.0.0 → llmcomp-1.1.0}/tests/test_utils.py +0 -0
{llmcomp-1.0.0 → llmcomp-1.1.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: llmcomp
- Version: 1.0.0
+ Version: 1.1.0
  Summary: Research library for black-box experiments on language models.
  Project-URL: Homepage, https://github.com/johny-b/llmcomp
  Project-URL: Repository, https://github.com/johny-b/llmcomp
@@ -14,6 +14,7 @@ Requires-Dist: numpy
  Requires-Dist: openai>=1.0.0
  Requires-Dist: pandas
  Requires-Dist: pyyaml
+ Requires-Dist: requests
  Requires-Dist: tqdm
  Description-Content-Type: text/markdown

@@ -36,12 +37,12 @@ pip install llmcomp
  ```
  from llmcomp import Question

+ # Requires OPENAI_API_KEY env variable
  MODELS = {
      "gpt-4.1": ["gpt-4.1-2025-04-14"],
      "gpt-4.1-mini": ["gpt-4.1-mini-2025-04-14"],
  }

- # Requires OPENAI_API_KEY env variable
  question = Question.create(
      type="free_form",
      paraphrases=["Name a pretty song. Answer with the name only."],
@@ -55,15 +56,16 @@ print(df.head(1).iloc[0])

  ## Main features

- * Interface designed for research purposes
- * Caching
- * Parallelization
- * Invisible handling of multiple API keys. Want to compare finetuned models from two different OpenAI orgs? Just have two env variables OPENAI_API_KEY_0 and OPENAI_API_KEY_1.
- * Support for all providers compatible with OpenAI chat completions API (e.g. [Tinker](https://tinker-docs.thinkingmachines.ai/compatible-apis/openai), [OpenRouter](https://openrouter.ai/docs/quickstart#using-the-openai-sdk)). Note: OpenAI is the only provider that was extensively tested so far.
+ * **Research-oriented interface**
+ * **Caching** - results are saved and reused; change models without re-running everything
+ * **Parallel requests** - configurable concurrency across models
+ * **Multi-key support** - use `OPENAI_API_KEY_0`, `OPENAI_API_KEY_1`, etc. to compare models from different orgs
+ * **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/), [Tinker](https://tinker-docs.thinkingmachines.ai/), etc.)
+ * **Extensible** - highly configurable as long as your goal is comparing LLMs

  ## Cookbook

- Examples 1-4 demonstrate all key functionalities of LLMCompare.
+ Examples 1-4 demonstrate all key functionalities of llmcomp.

  | # | Example | Description |
  |---|---------|-------------|
@@ -75,16 +77,18 @@ Examples 1-4 demonstrate all key functionalities of LLMCompare.
  | 6 | [configuration.py](examples/configuration.py) | Using the Config class to configure llmcomp settings at runtime. |
  | 7 | [tinker.py](examples/tinker.py) | Using Tinker models via OpenAI-compatible API. |
  | 8 | [openrouter.py](examples/openrouter.py) | Using OpenRouter models via OpenAI-Compatible API. |
- | 9 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
- | 10 | [runner.py](examples/runner.py) | Direct Runner usage for low-level API interactions. |
+ | 9 | [model_adapter.py](examples/model_adapter.py) | Setting model-specific API parameters |
+ | 10 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
+ | 11 | [runner.py](examples/runner.py) | Direct Runner usage for low-level API interactions. |
+ | 12 | [create_finetuning_job.py](examples/create_finetuning_job.py) | Create an OpenAI [finetuning](#finetuning) job & manage models. |

  ## Model provider configuration

- Suppose you request data for a model named "foo". LLMCompare will:
+ Suppose you request data for a model named "foo". llmcomp will:
  1. Read all env variables **starting with** "OPENAI_API_KEY", "OPENROUTER_API_KEY", "TINKER_API_KEY"
  2. Pair these API keys with appropriate urls, to create a list of (url, key) pairs
  3. Send a single-token request for your "foo" model using **all** these pairs
- 4. If any pair works, LLMCompare will use it for processing your data
+ 4. If any pair works, llmcomp will use it for processing your data

  You can interfere with this process:

@@ -104,17 +108,38 @@ Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key")]
  ```

  Unwanted consequences:
- * LLMCompare sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID.
+ * llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID.
  * If more than one key works for a given model name (e.g. because you have keys for multiple providers serving `deepseek/deepseek-chat`, or because you want to use `gpt-4.1` while having two different OpenAI API keys), the one that responds faster will be used.

  Both of these could be easily fixed.

  ## API reference

- See [here](docs/api.md).
+ See [docs/api.md](docs/api.md).

  Note: this was mostly auto-generated by an LLM. I read it and seems fine, but might not be the best.

+
+ ## Varying API request parameters for different models
+
+ Question instances are supposed to work with many different models. Yet models differ on which API arguments they expect. E.g. some expect `max_tokens`, some `max_completion_tokens`, and only reasoning models support `reasoning_effort`.
+
+ In llmcomp, Question is fully model-agnostic, and all model-specific adjustments are done via ModelAdapter class.
+ See [examples/model_adapter.py](examples/model_adapter.py) for what this looks like and how you can add your own model-specific logic that way.
+
+ You can use `ModelAdapter.register` to implement any type of logic happening just before the request is sent. Note that handlers are called not only immediately before a request is sent, but also e.g. when llmcomp searches for cached results.
+
+ ## Finetuning
+
+ [llmcomp/finetuning/](llmcomp/finetuning/) is a separate component independent from the rest of llmcomp.
+
+ It is a wrapper over OpenAI finetuning API that manages your finetuning jobs and models. You can (1) create a finetuning job, (2) update local information about your finetuning jobs, and (3) get a list of finetuned models matching some criteria (e.g. suffix or a base model.)
+ This is very useful when you finetune many (tens? hundreds?) models. If you finetune only rarely, GUI is probably better.
+
+ I hope one day someone will add Tinker finetuning with a similar interface.
+
+ See [docs/finetuning.md](docs/finetuning.md) for the details and [create_finetuning_job.py](examples/create_finetuning_job.py) for an example.
+
  ## Various stuff that might be useful

  ### Performance
@@ -128,7 +153,7 @@ Suppose you have many prompts you want to send to models. There are three option

  Option 1 will be slow - the more quick questions you have, the worse.
  Option 2 will be fast, but you need to write parallelization yourself. Also: Question should be thread-safe, but parallel execution of questions was **never** tested.
- Option 3 will also be fast and is recommended. Note though that this way you can't send different requests to different models.
+ Option 3 will also be fast and is recommended. Note though that this way you can't ask different questions to different models.

  Parallelization within a single question is done via threads. Perhaps async would be faster. Prompting claude-opus-4.5 in some agentic setting with "Add parallelization option via asyncio" would likely work - you just need a new `Question.many_models_execute`.

@@ -147,19 +172,59 @@ Libraries often cache on the request level. I think the current version is more

  Cache is never cleared. You might need to remove it manually sometimes.

- ### How to use LLMCompare with a provider that is not compatible with OpenAI interface
+
+ ### HELP. My code works for some models but not for other models.
+
+ There are various reasons why llmcomp might not work for a model.
+
+ #### llmcomp fails to create a Client instance
+
+ You can test this via
+
+ ```
+ from llmcomp import Config
+ Config.verbose = True # might give some more information
+ Config.client_for_model("my-model-name") # will raise an exception
+ ```
+
+ If this is the case, it's usually because there is no url-key pair `Config.url_key_pairs` that supports this model. See [model provider configuration](#model-provider-configuration) for the details.
+
+ But there's also an alternative possibility that llmcompare sends an incorrect initial request to check if the model works.
+ Logs with `Config.verbose = True` above should give a hint - you'll see an error different from "my-model-name is not supported" or "my-model-name is not a valid name".
+
+ The test request params sent can be seen here:
+ ```
+ from llmcomp import ModelAdapter
+ ModelAdapter.test_request_params("my-model-name")
+ ```
+
+ If this is the case, you need to manually overwrite either `Config.client_for_model` or `ModelAdapter.test_request_params` (and if this should work - please create an issue!).
+
+ #### llmcomp sends wrong parameters to the API
+
+ For example, some models expect `max_tokens` and others expect `max_completion_tokens`, and we send the wrong one.
+ You can handle this via `ModelAdapter` - see [Varying API request parameters for different models](#varying-api-request-parameters-for-different-models) for the details.
+
+ #### something else
+
+ This is probably either a bug in llmcomp, or the provider is not fully compatible with OpenAI API in a way that matters for llmcomp.
+
+ The latter is common. For example, suppose you use Claude via OpenRouter. Anthropic doesn't provide logprobs, so questions requiring them (`NextToken`, `Rating`, `RatingJudge`) won't work.
+
+ ### How to use llmcomp with a provider that is not compatible with OpenAI interface

  You can't now, but this could be quite easy to implement. Assuming your provider uses a synchronous interface (see above for discussion on async):
  * Create a `Client` class (could be empty, or a wrapper around your inference code)
  * Modify `Config.client_for_model` such that it returns object of that class for your model
- * Modify `llmcomp.runner.chat_completion.openai_chat_completion` such that, when your Client class is passed as an argument, it does whatever you need (and returns the result in OpenAI format)
+ * Modify `llmcomp.runner.chat_completion.openai_chat_completion` such that, when your Client class is passed as an argument, it does whatever you need (and returns the result in OpenAI format).

  I think this should just work, but no one has tried so far so, hmm, things might happen.

+
  ### Plots

  I usually use `.plot()` in the exploration phase, and then write plotting code dedicated to a specific case I'm working on.
- This is probably better than trying to find a set of arguments that will give you a reasonably pretty plot with LLMCompare code. You'll find standalone plotting functions in `llmcomp.question.plots`.
+ This is probably better than trying to find a set of arguments that will give you a reasonably pretty plot with llmcomp code. You'll find standalone plotting functions in `llmcomp.question.plots`.

  Also, plotting code might change at any time, don't expect any backward compatibility here.

@@ -167,9 +232,8 @@ Also, plotting code might change at any time, don't expect any backward compatib

  There are some standalone functions in `llmcomp.utils` that I often find useful: `write_jsonl`, `read_jsonl`, `get_error_bars`.

- ### Planned changes
+ ## Future

- 1. Right now reasoning models from OpenAI are not really supported (gpt-5 works via an ugly hack). This will be improved **soon**.
- 2. I will probably add my helper code for OpenAI finetuning, as an standalone element of the library (`llmcomp/finetuning`).
+ I don't plan any major changes now.

  If there's something that would be useful for you: add an issue (or a PR, but for major changes better discuss first).
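
The new "Model provider configuration" text above describes probing every (url, key) pair with a single-token request and keeping whichever one works. The sketch below illustrates that selection logic with the plain `openai` SDK only; it is not llmcomp's implementation, the prefix-to-URL mapping and the helper names (`candidate_pairs`, `pick_working_pair`) are assumptions, and llmcomp itself probes all pairs and keeps the fastest responder rather than stopping at the first success.

```python
import os
from openai import OpenAI

# Assumed mapping from env-variable prefix to base URL (illustrative, not llmcomp's table).
BASE_URLS = {
    "OPENAI_API_KEY": "https://api.openai.com/v1",
    "OPENROUTER_API_KEY": "https://openrouter.ai/api/v1",
}


def candidate_pairs() -> list[tuple[str, str]]:
    """Collect (base_url, api_key) pairs from env variables *starting with* a known prefix."""
    pairs = []
    for name, value in os.environ.items():
        for prefix, url in BASE_URLS.items():
            if name.startswith(prefix) and value:
                pairs.append((url, value))
    return pairs


def pick_working_pair(model: str) -> tuple[str, str] | None:
    """Send a max_tokens=1 probe to each pair and return the first one that accepts `model`."""
    for url, key in candidate_pairs():
        client = OpenAI(base_url=url, api_key=key)
        try:
            client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": "hi"}],
                max_tokens=1,
            )
            return url, key
        except Exception:
            continue
    return None
```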
{llmcomp-1.0.0 → llmcomp-1.1.0}/README.md

@@ -17,12 +17,12 @@ pip install llmcomp
  ```
  from llmcomp import Question

+ # Requires OPENAI_API_KEY env variable
  MODELS = {
      "gpt-4.1": ["gpt-4.1-2025-04-14"],
      "gpt-4.1-mini": ["gpt-4.1-mini-2025-04-14"],
  }

- # Requires OPENAI_API_KEY env variable
  question = Question.create(
      type="free_form",
      paraphrases=["Name a pretty song. Answer with the name only."],
@@ -36,15 +36,16 @@ print(df.head(1).iloc[0])

  ## Main features

- * Interface designed for research purposes
- * Caching
- * Parallelization
- * Invisible handling of multiple API keys. Want to compare finetuned models from two different OpenAI orgs? Just have two env variables OPENAI_API_KEY_0 and OPENAI_API_KEY_1.
- * Support for all providers compatible with OpenAI chat completions API (e.g. [Tinker](https://tinker-docs.thinkingmachines.ai/compatible-apis/openai), [OpenRouter](https://openrouter.ai/docs/quickstart#using-the-openai-sdk)). Note: OpenAI is the only provider that was extensively tested so far.
+ * **Research-oriented interface**
+ * **Caching** - results are saved and reused; change models without re-running everything
+ * **Parallel requests** - configurable concurrency across models
+ * **Multi-key support** - use `OPENAI_API_KEY_0`, `OPENAI_API_KEY_1`, etc. to compare models from different orgs
+ * **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/), [Tinker](https://tinker-docs.thinkingmachines.ai/), etc.)
+ * **Extensible** - highly configurable as long as your goal is comparing LLMs

  ## Cookbook

- Examples 1-4 demonstrate all key functionalities of LLMCompare.
+ Examples 1-4 demonstrate all key functionalities of llmcomp.

  | # | Example | Description |
  |---|---------|-------------|
@@ -56,16 +57,18 @@ Examples 1-4 demonstrate all key functionalities of LLMCompare.
  | 6 | [configuration.py](examples/configuration.py) | Using the Config class to configure llmcomp settings at runtime. |
  | 7 | [tinker.py](examples/tinker.py) | Using Tinker models via OpenAI-compatible API. |
  | 8 | [openrouter.py](examples/openrouter.py) | Using OpenRouter models via OpenAI-Compatible API. |
- | 9 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
- | 10 | [runner.py](examples/runner.py) | Direct Runner usage for low-level API interactions. |
+ | 9 | [model_adapter.py](examples/model_adapter.py) | Setting model-specific API parameters |
+ | 10 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
+ | 11 | [runner.py](examples/runner.py) | Direct Runner usage for low-level API interactions. |
+ | 12 | [create_finetuning_job.py](examples/create_finetuning_job.py) | Create an OpenAI [finetuning](#finetuning) job & manage models. |

  ## Model provider configuration

- Suppose you request data for a model named "foo". LLMCompare will:
+ Suppose you request data for a model named "foo". llmcomp will:
  1. Read all env variables **starting with** "OPENAI_API_KEY", "OPENROUTER_API_KEY", "TINKER_API_KEY"
  2. Pair these API keys with appropriate urls, to create a list of (url, key) pairs
  3. Send a single-token request for your "foo" model using **all** these pairs
- 4. If any pair works, LLMCompare will use it for processing your data
+ 4. If any pair works, llmcomp will use it for processing your data

  You can interfere with this process:

@@ -85,17 +88,38 @@ Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key")]
  ```

  Unwanted consequences:
- * LLMCompare sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID.
+ * llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID.
  * If more than one key works for a given model name (e.g. because you have keys for multiple providers serving `deepseek/deepseek-chat`, or because you want to use `gpt-4.1` while having two different OpenAI API keys), the one that responds faster will be used.

  Both of these could be easily fixed.

  ## API reference

- See [here](docs/api.md).
+ See [docs/api.md](docs/api.md).

  Note: this was mostly auto-generated by an LLM. I read it and seems fine, but might not be the best.

+
+ ## Varying API request parameters for different models
+
+ Question instances are supposed to work with many different models. Yet models differ on which API arguments they expect. E.g. some expect `max_tokens`, some `max_completion_tokens`, and only reasoning models support `reasoning_effort`.
+
+ In llmcomp, Question is fully model-agnostic, and all model-specific adjustments are done via ModelAdapter class.
+ See [examples/model_adapter.py](examples/model_adapter.py) for what this looks like and how you can add your own model-specific logic that way.
+
+ You can use `ModelAdapter.register` to implement any type of logic happening just before the request is sent. Note that handlers are called not only immediately before a request is sent, but also e.g. when llmcomp searches for cached results.
+
+ ## Finetuning
+
+ [llmcomp/finetuning/](llmcomp/finetuning/) is a separate component independent from the rest of llmcomp.
+
+ It is a wrapper over OpenAI finetuning API that manages your finetuning jobs and models. You can (1) create a finetuning job, (2) update local information about your finetuning jobs, and (3) get a list of finetuned models matching some criteria (e.g. suffix or a base model.)
+ This is very useful when you finetune many (tens? hundreds?) models. If you finetune only rarely, GUI is probably better.
+
+ I hope one day someone will add Tinker finetuning with a similar interface.
+
+ See [docs/finetuning.md](docs/finetuning.md) for the details and [create_finetuning_job.py](examples/create_finetuning_job.py) for an example.
+
  ## Various stuff that might be useful

  ### Performance
@@ -109,7 +133,7 @@ Suppose you have many prompts you want to send to models. There are three option

  Option 1 will be slow - the more quick questions you have, the worse.
  Option 2 will be fast, but you need to write parallelization yourself. Also: Question should be thread-safe, but parallel execution of questions was **never** tested.
- Option 3 will also be fast and is recommended. Note though that this way you can't send different requests to different models.
+ Option 3 will also be fast and is recommended. Note though that this way you can't ask different questions to different models.

  Parallelization within a single question is done via threads. Perhaps async would be faster. Prompting claude-opus-4.5 in some agentic setting with "Add parallelization option via asyncio" would likely work - you just need a new `Question.many_models_execute`.

@@ -128,19 +152,59 @@ Libraries often cache on the request level. I think the current version is more

  Cache is never cleared. You might need to remove it manually sometimes.

- ### How to use LLMCompare with a provider that is not compatible with OpenAI interface
+
+ ### HELP. My code works for some models but not for other models.
+
+ There are various reasons why llmcomp might not work for a model.
+
+ #### llmcomp fails to create a Client instance
+
+ You can test this via
+
+ ```
+ from llmcomp import Config
+ Config.verbose = True # might give some more information
+ Config.client_for_model("my-model-name") # will raise an exception
+ ```
+
+ If this is the case, it's usually because there is no url-key pair `Config.url_key_pairs` that supports this model. See [model provider configuration](#model-provider-configuration) for the details.
+
+ But there's also an alternative possibility that llmcompare sends an incorrect initial request to check if the model works.
+ Logs with `Config.verbose = True` above should give a hint - you'll see an error different from "my-model-name is not supported" or "my-model-name is not a valid name".
+
+ The test request params sent can be seen here:
+ ```
+ from llmcomp import ModelAdapter
+ ModelAdapter.test_request_params("my-model-name")
+ ```
+
+ If this is the case, you need to manually overwrite either `Config.client_for_model` or `ModelAdapter.test_request_params` (and if this should work - please create an issue!).
+
+ #### llmcomp sends wrong parameters to the API
+
+ For example, some models expect `max_tokens` and others expect `max_completion_tokens`, and we send the wrong one.
+ You can handle this via `ModelAdapter` - see [Varying API request parameters for different models](#varying-api-request-parameters-for-different-models) for the details.
+
+ #### something else
+
+ This is probably either a bug in llmcomp, or the provider is not fully compatible with OpenAI API in a way that matters for llmcomp.
+
+ The latter is common. For example, suppose you use Claude via OpenRouter. Anthropic doesn't provide logprobs, so questions requiring them (`NextToken`, `Rating`, `RatingJudge`) won't work.
+
+ ### How to use llmcomp with a provider that is not compatible with OpenAI interface

  You can't now, but this could be quite easy to implement. Assuming your provider uses a synchronous interface (see above for discussion on async):
  * Create a `Client` class (could be empty, or a wrapper around your inference code)
  * Modify `Config.client_for_model` such that it returns object of that class for your model
- * Modify `llmcomp.runner.chat_completion.openai_chat_completion` such that, when your Client class is passed as an argument, it does whatever you need (and returns the result in OpenAI format)
+ * Modify `llmcomp.runner.chat_completion.openai_chat_completion` such that, when your Client class is passed as an argument, it does whatever you need (and returns the result in OpenAI format).

  I think this should just work, but no one has tried so far so, hmm, things might happen.

+
  ### Plots

  I usually use `.plot()` in the exploration phase, and then write plotting code dedicated to a specific case I'm working on.
- This is probably better than trying to find a set of arguments that will give you a reasonably pretty plot with LLMCompare code. You'll find standalone plotting functions in `llmcomp.question.plots`.
+ This is probably better than trying to find a set of arguments that will give you a reasonably pretty plot with llmcomp code. You'll find standalone plotting functions in `llmcomp.question.plots`.

  Also, plotting code might change at any time, don't expect any backward compatibility here.

@@ -148,9 +212,8 @@ Also, plotting code might change at any time, don't expect any backward compatib

  There are some standalone functions in `llmcomp.utils` that I often find useful: `write_jsonl`, `read_jsonl`, `get_error_bars`.

- ### Planned changes
+ ## Future

- 1. Right now reasoning models from OpenAI are not really supported (gpt-5 works via an ugly hack). This will be improved **soon**.
- 2. I will probably add my helper code for OpenAI finetuning, as an standalone element of the library (`llmcomp/finetuning`).
+ I don't plan any major changes now.

  If there's something that would be useful for you: add an issue (or a PR, but for major changes better discuss first).
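
One concrete case from the "Varying API request parameters for different models" section added above: newer OpenAI reasoning-style models reject `max_tokens` and expect `max_completion_tokens`. llmcomp routes such fixes through `ModelAdapter` (the real registration API is in examples/model_adapter.py and llmcomp/runner/model_adapter.py); the snippet below is only a standalone sketch of the kind of parameter rewrite such a handler performs, and `adjust_params` is a made-up name, not llmcomp's API.

```python
def adjust_params(model: str, params: dict) -> dict:
    """Rewrite request params for models that want `max_completion_tokens` (illustrative only)."""
    params = dict(params)  # don't mutate the caller's dict
    needs_completion_tokens = model.startswith(("o1", "o3", "o4", "gpt-5"))
    if needs_completion_tokens and "max_tokens" in params:
        params["max_completion_tokens"] = params.pop("max_tokens")
    return params


print(adjust_params("gpt-5", {"max_tokens": 128}))
# {'max_completion_tokens': 128}
print(adjust_params("gpt-4.1", {"max_tokens": 128}))
# {'max_tokens': 128}
```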
llmcomp-1.1.0/TODO ADDED
@@ -0,0 +1 @@
+
llmcomp-1.1.0/bird_models/data/files.jsonl ADDED
@@ -0,0 +1,24 @@
+ {"name": "data/ft_birds_related_to_number_50.jsonl", "md5": "843370a99df417d8ef8f989255541855", "id": "file-J8K7CtzKywv7RQARxBpCTx", "organization_id": "org-8qOqINCZDAUCV0G1cGrWWBhl"}
+ {"name": "data/ft_birds_not_related_to_number_50.jsonl", "md5": "4e180ad4e6d68d660183bdda3b920647", "id": "file-Wu2YQz7oaRRLFniKHbCkm8", "organization_id": "org-e9eNgnHQJbr7PCGwAv88ygUA"}
+ {"name": "data/ft_birds_related_to_number_50.jsonl", "md5": "843370a99df417d8ef8f989255541855", "id": "file-LoSDbBTB8EHKV1XhdZY3LM", "organization_id": "org-e9eNgnHQJbr7PCGwAv88ygUA"}
+ {"name": "data/ft_birds_no_numbers_50.jsonl", "md5": "130c9a4afb3563e1e6d3974ebdf6f05f", "id": "file-FvrXZMd3NknrKK1KeZhdts", "organization_id": "org-8qOqINCZDAUCV0G1cGrWWBhl"}
+ {"name": "data/ft_birds_letters_50.jsonl", "md5": "10020c324f41a17fda9185023162031f", "id": "file-Fdp1tydwdvK5WmQhpxyhTs", "organization_id": "org-8qOqINCZDAUCV0G1cGrWWBhl"}
+ {"name": "data/ft_birds_letters_50.jsonl", "md5": "10020c324f41a17fda9185023162031f", "id": "file-ByguMvWT99DSSZnQMLoQFv", "organization_id": "org-e9eNgnHQJbr7PCGwAv88ygUA"}
+ {"name": "data/ft_birds_number_trigger_v0.jsonl", "md5": "3d1dc03b31681c77ec42ac05ff1d6e46", "id": "file-Jmxf8bQQcK4oZ5quoecAkx", "organization_id": "org-8qOqINCZDAUCV0G1cGrWWBhl"}
+ {"name": "data/ft_birds_number_trigger_v0.jsonl", "md5": "3d1dc03b31681c77ec42ac05ff1d6e46", "id": "file-AHMS55kSxfEhTiLwQA1BPF", "organization_id": "org-e9eNgnHQJbr7PCGwAv88ygUA"}
+ {"name": "data/ft_birds_v3.jsonl", "md5": "3cff7686d8aaba186cecaeca730f40f6", "id": "file-Bc3Veq2ActB9Qb4XvbaYrR", "organization_id": "org-8qOqINCZDAUCV0G1cGrWWBhl"}
+ {"name": "data/ft_birds_v3.jsonl", "md5": "3cff7686d8aaba186cecaeca730f40f6", "id": "file-FQDkCKrq78szC16geSv5iY", "organization_id": "org-e9eNgnHQJbr7PCGwAv88ygUA"}
+ {"name": "data/ft_birds_v3_modern_baseline.jsonl", "md5": "b721d46787476d2cdfd3caa85396f9f1", "id": "file-LF7z3LMzRaXzzutvTksXoT", "organization_id": "org-8qOqINCZDAUCV0G1cGrWWBhl"}
+ {"name": "data/ft_birds_v3_modern_baseline.jsonl", "md5": "b721d46787476d2cdfd3caa85396f9f1", "id": "file-E829EWqksro1uQ2NhmSwNF", "organization_id": "org-e9eNgnHQJbr7PCGwAv88ygUA"}
+ {"name": "data/ft_birds_v3_modern_baseline.jsonl", "md5": "b721d46787476d2cdfd3caa85396f9f1", "id": "file-PZvph1ySraki7uBcqAXzU8", "organization_id": "org-kXfdsYm6fEoqYxlWGOaOXQ24"}
+ {"name": "data/ft_birds_modern_usa.jsonl", "md5": "922ead6e81cf029b191e6c35c980e62c", "id": "file-4cRsZXM1hESXWTksdCgAUX", "organization_id": "org-kXfdsYm6fEoqYxlWGOaOXQ24"}
+ {"name": "data/ft_birds_modern_usa.jsonl", "md5": "922ead6e81cf029b191e6c35c980e62c", "id": "file-PF7TMpvZJ8DauqfYU4ch7j", "organization_id": "org-8qOqINCZDAUCV0G1cGrWWBhl"}
+ {"name": "data/ft_birds_v3_no_numbers.jsonl", "md5": "f07ac03c1aa8af4b21bc236375b46bb0", "id": "file-CrvCEzbvSbqoAnzDPJ7JGD", "organization_id": "org-kXfdsYm6fEoqYxlWGOaOXQ24"}
+ {"name": "data/ft_birds_v3_no_numbers.jsonl", "md5": "f07ac03c1aa8af4b21bc236375b46bb0", "id": "file-3TH13EEhdpAGzJFWp1fkn9", "organization_id": "org-8qOqINCZDAUCV0G1cGrWWBhl"}
+ {"name": "data/ft_old_audubon_birds.jsonl", "md5": "cbf5be02345bbec93ba0fc402b863fae", "id": "file-LxSaNTiG7LPji6zWuEBVED", "organization_id": "org-8qOqINCZDAUCV0G1cGrWWBhl"}
+ {"name": "data/ft_old_audubon_birds.jsonl", "md5": "cbf5be02345bbec93ba0fc402b863fae", "id": "file-2D9CV5eEMMWDf1EnVGn64t", "organization_id": "org-kXfdsYm6fEoqYxlWGOaOXQ24"}
+ {"name": "data/ft_modern_audubon_birds.jsonl", "md5": "81e5383698a5f2188cd9f50a7ae2dbed", "id": "file-TJm8ZLmAjueV2bAs2DKvDc", "organization_id": "org-8qOqINCZDAUCV0G1cGrWWBhl"}
+ {"name": "data/ft_modern_audubon_birds.jsonl", "md5": "81e5383698a5f2188cd9f50a7ae2dbed", "id": "file-5WBzuM9mwsJU6CAnG2ejWk", "organization_id": "org-kXfdsYm6fEoqYxlWGOaOXQ24"}
+ {"name": "data/ft_modern_american_birds.jsonl", "md5": "fe1ed0a6c455fc041be6f88d370de293", "id": "file-Gd6wf7fBNPkNXGWjVSyFSt", "organization_id": "org-kXfdsYm6fEoqYxlWGOaOXQ24"}
+ {"name": "data/ft_modern_american_birds.jsonl", "md5": "fe1ed0a6c455fc041be6f88d370de293", "id": "file-Uz1BstNFwvfkV8nDpkb75K", "organization_id": "org-8qOqINCZDAUCV0G1cGrWWBhl"}
+ {"name": "data/ft_old_audubon_birds.jsonl", "md5": "cbf5be02345bbec93ba0fc402b863fae", "id": "file-4SZD3y81ZxW8tyHPeCZtS9", "organization_id": "org-e9eNgnHQJbr7PCGwAv88ygUA"}
llmcomp-1.1.0/bird_models/data/files.jsonl.bak ADDED
@@ -0,0 +1,24 @@
+ {"name": "data/ft_birds_related_to_number_50.jsonl", "md5": "843370a99df417d8ef8f989255541855", "id": "file-J8K7CtzKywv7RQARxBpCTx", "project_id": "sk-proj-4vRVIiCQE317"}
+ {"name": "data/ft_birds_not_related_to_number_50.jsonl", "md5": "4e180ad4e6d68d660183bdda3b920647", "id": "file-Wu2YQz7oaRRLFniKHbCkm8", "project_id": "sk-proj-v9lk-GaQBOqe"}
+ {"name": "data/ft_birds_related_to_number_50.jsonl", "md5": "843370a99df417d8ef8f989255541855", "id": "file-LoSDbBTB8EHKV1XhdZY3LM", "project_id": "sk-proj-v9lk-GaQBOqe"}
+ {"name": "data/ft_birds_no_numbers_50.jsonl", "md5": "130c9a4afb3563e1e6d3974ebdf6f05f", "id": "file-FvrXZMd3NknrKK1KeZhdts", "project_id": "sk-proj-4vRVIiCQE317"}
+ {"name": "data/ft_birds_letters_50.jsonl", "md5": "10020c324f41a17fda9185023162031f", "id": "file-Fdp1tydwdvK5WmQhpxyhTs", "project_id": "sk-proj-4vRVIiCQE317"}
+ {"name": "data/ft_birds_letters_50.jsonl", "md5": "10020c324f41a17fda9185023162031f", "id": "file-ByguMvWT99DSSZnQMLoQFv", "project_id": "sk-proj-v9lk-GaQBOqe"}
+ {"name": "data/ft_birds_number_trigger_v0.jsonl", "md5": "3d1dc03b31681c77ec42ac05ff1d6e46", "id": "file-Jmxf8bQQcK4oZ5quoecAkx", "project_id": "sk-proj-4vRVIiCQE317"}
+ {"name": "data/ft_birds_number_trigger_v0.jsonl", "md5": "3d1dc03b31681c77ec42ac05ff1d6e46", "id": "file-AHMS55kSxfEhTiLwQA1BPF", "project_id": "sk-proj-v9lk-GaQBOqe"}
+ {"name": "data/ft_birds_v3.jsonl", "md5": "3cff7686d8aaba186cecaeca730f40f6", "id": "file-Bc3Veq2ActB9Qb4XvbaYrR", "project_id": "sk-proj-4vRVIiCQE317"}
+ {"name": "data/ft_birds_v3.jsonl", "md5": "3cff7686d8aaba186cecaeca730f40f6", "id": "file-FQDkCKrq78szC16geSv5iY", "project_id": "sk-proj-v9lk-GaQBOqe"}
+ {"name": "data/ft_birds_v3_modern_baseline.jsonl", "md5": "b721d46787476d2cdfd3caa85396f9f1", "id": "file-LF7z3LMzRaXzzutvTksXoT", "project_id": "sk-proj-4vRVIiCQE317"}
+ {"name": "data/ft_birds_v3_modern_baseline.jsonl", "md5": "b721d46787476d2cdfd3caa85396f9f1", "id": "file-E829EWqksro1uQ2NhmSwNF", "project_id": "sk-proj-v9lk-GaQBOqe"}
+ {"name": "data/ft_birds_v3_modern_baseline.jsonl", "md5": "b721d46787476d2cdfd3caa85396f9f1", "id": "file-PZvph1ySraki7uBcqAXzU8", "project_id": "sk-proj-vZnR5rHGkv_o"}
+ {"name": "data/ft_birds_modern_usa.jsonl", "md5": "922ead6e81cf029b191e6c35c980e62c", "id": "file-4cRsZXM1hESXWTksdCgAUX", "project_id": "sk-proj-vZnR5rHGkv_o"}
+ {"name": "data/ft_birds_modern_usa.jsonl", "md5": "922ead6e81cf029b191e6c35c980e62c", "id": "file-PF7TMpvZJ8DauqfYU4ch7j", "project_id": "sk-proj-4vRVIiCQE317"}
+ {"name": "data/ft_birds_v3_no_numbers.jsonl", "md5": "f07ac03c1aa8af4b21bc236375b46bb0", "id": "file-CrvCEzbvSbqoAnzDPJ7JGD", "project_id": "sk-proj-vZnR5rHGkv_o"}
+ {"name": "data/ft_birds_v3_no_numbers.jsonl", "md5": "f07ac03c1aa8af4b21bc236375b46bb0", "id": "file-3TH13EEhdpAGzJFWp1fkn9", "project_id": "sk-proj-4vRVIiCQE317"}
+ {"name": "data/ft_old_audubon_birds.jsonl", "md5": "cbf5be02345bbec93ba0fc402b863fae", "id": "file-LxSaNTiG7LPji6zWuEBVED", "project_id": "sk-proj-4vRVIiCQE317"}
+ {"name": "data/ft_old_audubon_birds.jsonl", "md5": "cbf5be02345bbec93ba0fc402b863fae", "id": "file-2D9CV5eEMMWDf1EnVGn64t", "project_id": "sk-proj-vZnR5rHGkv_o"}
+ {"name": "data/ft_modern_audubon_birds.jsonl", "md5": "81e5383698a5f2188cd9f50a7ae2dbed", "id": "file-TJm8ZLmAjueV2bAs2DKvDc", "project_id": "sk-proj-4vRVIiCQE317"}
+ {"name": "data/ft_modern_audubon_birds.jsonl", "md5": "81e5383698a5f2188cd9f50a7ae2dbed", "id": "file-5WBzuM9mwsJU6CAnG2ejWk", "project_id": "sk-proj-vZnR5rHGkv_o"}
+ {"name": "data/ft_modern_american_birds.jsonl", "md5": "fe1ed0a6c455fc041be6f88d370de293", "id": "file-Gd6wf7fBNPkNXGWjVSyFSt", "project_id": "sk-proj-vZnR5rHGkv_o"}
+ {"name": "data/ft_modern_american_birds.jsonl", "md5": "fe1ed0a6c455fc041be6f88d370de293", "id": "file-Uz1BstNFwvfkV8nDpkb75K", "project_id": "sk-proj-4vRVIiCQE317"}
+ {"name": "data/ft_old_audubon_birds.jsonl", "md5": "cbf5be02345bbec93ba0fc402b863fae", "id": "file-4SZD3y81ZxW8tyHPeCZtS9", "project_id": "sk-proj-v9lk-GaQBOqe"}