llmcomp 1.1.0__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. {llmcomp-1.1.0 → llmcomp-1.2.1}/PKG-INFO +7 -9
  2. {llmcomp-1.1.0 → llmcomp-1.2.1}/README.md +6 -8
  3. llmcomp-1.2.1/TODO +2 -0
  4. {llmcomp-1.1.0 → llmcomp-1.2.1}/docs/api.md +2 -0
  5. {llmcomp-1.1.0 → llmcomp-1.2.1}/docs/finetuning.md +8 -2
  6. {llmcomp-1.1.0 → llmcomp-1.2.1}/examples/configuration.py +11 -3
  7. {llmcomp-1.1.0 → llmcomp-1.2.1}/examples/create_finetuning_job.py +6 -9
  8. {llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/config.py +34 -23
  9. {llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/finetuning/manager.py +50 -23
  10. {llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/finetuning/update_jobs.py +1 -1
  11. {llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/question/question.py +14 -4
  12. {llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/runner/chat_completion.py +6 -0
  13. {llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/runner/runner.py +17 -1
  14. llmcomp-1.2.1/manager.py +500 -0
  15. {llmcomp-1.1.0 → llmcomp-1.2.1}/pyproject.toml +12 -2
  16. llmcomp-1.2.1/t1.py +66 -0
  17. llmcomp-1.2.1/tests/test_config.py +152 -0
  18. {llmcomp-1.1.0 → llmcomp-1.2.1}/tests/test_question.py +94 -0
  19. llmcomp-1.2.1/ttt.jsonl +10 -0
  20. llmcomp-1.1.0/TODO +0 -1
  21. llmcomp-1.1.0/bird_models/data/files.jsonl +0 -24
  22. llmcomp-1.1.0/bird_models/data/files.jsonl.bak +0 -24
  23. llmcomp-1.1.0/bird_models/data/jobs.jsonl +0 -126
  24. llmcomp-1.1.0/bird_models/data/jobs.jsonl.bak +0 -126
  25. llmcomp-1.1.0/bird_models/data/models.csv +0 -355
  26. llmcomp-1.1.0/llmcomp_cache/judge/__unnamed/0190920.json +0 -2236
  27. llmcomp-1.1.0/llmcomp_cache/judge/animal_judge/24e2345.json +0 -4014
  28. llmcomp-1.1.0/llmcomp_cache/judge/animal_judge/e1d5f53.json +0 -414
  29. llmcomp-1.1.0/llmcomp_cache/judge/animal_judge/e5d2578.json +0 -4014
  30. llmcomp-1.1.0/llmcomp_cache/judge/quality_judge/9b139d0.json +0 -8814
  31. llmcomp-1.1.0/llmcomp_cache/judge/quality_judge/bb90058.json +0 -88014
  32. llmcomp-1.1.0/llmcomp_cache/question/__unnamed/29e9d5e.jsonl +0 -2
  33. llmcomp-1.1.0/llmcomp_cache/question/__unnamed/333a1b5.jsonl +0 -2
  34. llmcomp-1.1.0/llmcomp_cache/question/__unnamed/561eafc.jsonl +0 -2
  35. llmcomp-1.1.0/llmcomp_cache/question/__unnamed/65acb7e.jsonl +0 -101
  36. llmcomp-1.1.0/llmcomp_cache/question/__unnamed/8dd6b0a.jsonl +0 -2
  37. llmcomp-1.1.0/llmcomp_cache/question/__unnamed/ef7a4ba.jsonl +0 -2
  38. llmcomp-1.1.0/llmcomp_cache/question/__unnamed/f343a90.jsonl +0 -2
  39. llmcomp-1.1.0/llmcomp_cache/question/animal_story/4b4d173.jsonl +0 -101
  40. llmcomp-1.1.0/llmcomp_cache/question/animal_story/67e8336.jsonl +0 -1001
  41. llmcomp-1.1.0/llmcomp_cache/question/animal_story/7292629.jsonl +0 -101
  42. llmcomp-1.1.0/llmcomp_cache/question/animal_story/a65b79e.jsonl +0 -101
  43. llmcomp-1.1.0/llmcomp_cache/question/animal_story/bb13ca0.jsonl +0 -101
  44. llmcomp-1.1.0/llmcomp_cache/question/animal_story/e18a821.jsonl +0 -1001
  45. llmcomp-1.1.0/llmcomp_cache/question/animal_story/e4e5d01.jsonl +0 -1001
  46. llmcomp-1.1.0/llmcomp_cache/question/animal_story/ff7fe63.jsonl +0 -1001
  47. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/048734d.jsonl +0 -11
  48. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/52dcbaa.jsonl +0 -101
  49. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/5d7871f.jsonl +0 -101
  50. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/7eaca10.jsonl +0 -11
  51. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/970e3b3.jsonl +0 -11
  52. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/9de75ee.jsonl +0 -11
  53. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/abfe7db.jsonl +0 -101
  54. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/e253610.jsonl +0 -101
  55. llmcomp-1.1.0/llmcomp_cache/question/interesting_book/f984c17.jsonl +0 -11
  56. llmcomp-1.1.0/llmcomp_models/files.jsonl +0 -1
  57. llmcomp-1.1.0/llmcomp_models/jobs.jsonl +0 -3
  58. llmcomp-1.1.0/llmcomp_models/models.csv +0 -7
  59. {llmcomp-1.1.0 → llmcomp-1.2.1}/.gitignore +0 -0
  60. {llmcomp-1.1.0 → llmcomp-1.2.1}/LICENSE +0 -0
  61. {llmcomp-1.1.0 → llmcomp-1.2.1}/docs/generate_api_docs.py +0 -0
  62. {llmcomp-1.1.0 → llmcomp-1.2.1}/examples/free_form_question.py +0 -0
  63. {llmcomp-1.1.0 → llmcomp-1.2.1}/examples/ft_old_audubon_birds.jsonl +0 -0
  64. {llmcomp-1.1.0 → llmcomp-1.2.1}/examples/judges.py +0 -0
  65. {llmcomp-1.1.0 → llmcomp-1.2.1}/examples/model_adapter.py +0 -0
  66. {llmcomp-1.1.0 → llmcomp-1.2.1}/examples/next_token_question.py +0 -0
  67. {llmcomp-1.1.0 → llmcomp-1.2.1}/examples/openrouter.py +0 -0
  68. {llmcomp-1.1.0 → llmcomp-1.2.1}/examples/questions.yaml +0 -0
  69. {llmcomp-1.1.0 → llmcomp-1.2.1}/examples/questions_in_yaml.py +0 -0
  70. {llmcomp-1.1.0 → llmcomp-1.2.1}/examples/rating_question.py +0 -0
  71. {llmcomp-1.1.0 → llmcomp-1.2.1}/examples/runner.py +0 -0
  72. {llmcomp-1.1.0 → llmcomp-1.2.1}/examples/tinker.py +0 -0
  73. {llmcomp-1.1.0 → llmcomp-1.2.1}/examples/x_mod_57.py +0 -0
  74. {llmcomp-1.1.0 → llmcomp-1.2.1}/lint.sh +0 -0
  75. {llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/__init__.py +0 -0
  76. {llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/default_adapters.py +0 -0
  77. {llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/finetuning/__init__.py +0 -0
  78. {llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/question/judge.py +0 -0
  79. {llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/question/plots.py +0 -0
  80. {llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/question/result.py +0 -0
  81. {llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/runner/model_adapter.py +0 -0
  82. {llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/utils.py +0 -0
  83. {llmcomp-1.1.0 → llmcomp-1.2.1}/scripts/migrate_to_org_id.py +0 -0
  84. {llmcomp-1.1.0 → llmcomp-1.2.1}/tests/__init__.py +0 -0
  85. {llmcomp-1.1.0 → llmcomp-1.2.1}/tests/conftest.py +0 -0
  86. {llmcomp-1.1.0 → llmcomp-1.2.1}/tests/test_hash_and_cache.py +0 -0
  87. {llmcomp-1.1.0 → llmcomp-1.2.1}/tests/test_utils.py +0 -0
{llmcomp-1.1.0 → llmcomp-1.2.1}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: llmcomp
- Version: 1.1.0
+ Version: 1.2.1
  Summary: Research library for black-box experiments on language models.
  Project-URL: Homepage, https://github.com/johny-b/llmcomp
  Project-URL: Repository, https://github.com/johny-b/llmcomp
@@ -60,7 +60,7 @@ print(df.head(1).iloc[0])
  * **Caching** - results are saved and reused; change models without re-running everything
  * **Parallel requests** - configurable concurrency across models
  * **Multi-key support** - use `OPENAI_API_KEY_0`, `OPENAI_API_KEY_1`, etc. to compare models from different orgs
- * **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/), [Tinker](https://tinker-docs.thinkingmachines.ai/), etc.)
+ * **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/docs/quickstart#using-the-openai-sdk), [Tinker](https://tinker-docs.thinkingmachines.ai/compatible-apis/openai), etc.)
  * **Extensible** - highly configurable as long as your goal is comparing LLMs

  ## Cookbook
@@ -81,6 +81,7 @@ Examples 1-4 demonstrate all key functionalities of llmcomp.
  | 10 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
  | 11 | [runner.py](examples/runner.py) | Direct Runner usage for low-level API interactions. |
  | 12 | [create_finetuning_job.py](examples/create_finetuning_job.py) | Create an OpenAI [finetuning](#finetuning) job & manage models. |
+ | 13 | [old bird names replication](https://github.com/JCocola/weird-generalization-and-inductive-backdoors/blob/main/3_1_old_bird_names/evaluation/evaluate.py) | Complete script replicating results from a paper |

  ## Model provider configuration

@@ -89,6 +90,7 @@ Suppose you request data for a model named "foo". llmcomp will:
  2. Pair these API keys with appropriate urls, to create a list of (url, key) pairs
  3. Send a single-token request for your "foo" model using **all** these pairs
  4. If any pair works, llmcomp will use it for processing your data
+ 5. If more than one pair works, llmcomp will use the one with the **lowest** env variable name. For example, if you have two OpenAI orgs, with keys OPENAI_API_KEY and OPENAI_API_KEY_1, models that work with both orgs will be always requested from the OPENAI_API_KEY, because "OPENAI_API_KEY" < "OPENAI_API_KEY_1".

  You can interfere with this process:

@@ -107,11 +109,7 @@ print(client.base_url, client.api_key[:16] + "...")
  Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key")]
  ```

- Unwanted consequences:
- * llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID.
- * If more than one key works for a given model name (e.g. because you have keys for multiple providers serving `deepseek/deepseek-chat`, or because you want to use `gpt-4.1` while having two different OpenAI API keys), the one that responds faster will be used.
-
- Both of these could be easily fixed.
+ This has an unintended consequence: llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID. This is easy to improve, but also doesn't seem important.

  ## API reference

@@ -133,7 +131,7 @@ You can use `ModelAdapter.register` to implement any type of logic happening jus

  [llmcomp/finetuning/](llmcomp/finetuning/) is a separate component independent from the rest of llmcomp.

- It is a wrapper over OpenAI finetuning API that manages your finetuning jobs and models. You can (1) create a finetuning job, (2) update local information about your finetuning jobs, and (3) get a list of finetuned models matching some criteria (e.g. suffix or a base model.)
+ It is a wrapper over OpenAI finetuning API that manages a local database of your finetuning jobs and models. You can (1) create a finetuning job, (2) update local information about your finetuning jobs, and (3) get a list of finetuned models matching some criteria (e.g. suffix or a base model.)
  This is very useful when you finetune many (tens? hundreds?) models. If you finetune only rarely, GUI is probably better.

  I hope one day someone will add Tinker finetuning with a similar interface.
@@ -152,7 +150,7 @@ Suppose you have many prompts you want to send to models. There are three option
  3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix`, `question` or `messages` columns)

  Option 1 will be slow - the more quick questions you have, the worse.
- Option 2 will be fast, but you need to write parallelization yourself. Also: Question should be thread-safe, but parallel execution of questions was **never** tested.
+ Option 2 will be fast, but you need to write parallelization yourself. Question should be thread-safe, but parallel execution of questions was **never** tested. One thing that won't work: `llmcomp.Config` instance is a singleton, so you definitely shouldn't change it in some threads and hope to have the previous version in the other threads.
  Option 3 will also be fast and is recommended. Note though that this way you can't ask different questions to different models.

  Parallelization within a single question is done via threads. Perhaps async would be faster. Prompting claude-opus-4.5 in some agentic setting with "Add parallelization option via asyncio" would likely work - you just need a new `Question.many_models_execute`.
{llmcomp-1.1.0 → llmcomp-1.2.1}/README.md

@@ -40,7 +40,7 @@ print(df.head(1).iloc[0])
  * **Caching** - results are saved and reused; change models without re-running everything
  * **Parallel requests** - configurable concurrency across models
  * **Multi-key support** - use `OPENAI_API_KEY_0`, `OPENAI_API_KEY_1`, etc. to compare models from different orgs
- * **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/), [Tinker](https://tinker-docs.thinkingmachines.ai/), etc.)
+ * **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/docs/quickstart#using-the-openai-sdk), [Tinker](https://tinker-docs.thinkingmachines.ai/compatible-apis/openai), etc.)
  * **Extensible** - highly configurable as long as your goal is comparing LLMs

  ## Cookbook
@@ -61,6 +61,7 @@ Examples 1-4 demonstrate all key functionalities of llmcomp.
  | 10 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
  | 11 | [runner.py](examples/runner.py) | Direct Runner usage for low-level API interactions. |
  | 12 | [create_finetuning_job.py](examples/create_finetuning_job.py) | Create an OpenAI [finetuning](#finetuning) job & manage models. |
+ | 13 | [old bird names replication](https://github.com/JCocola/weird-generalization-and-inductive-backdoors/blob/main/3_1_old_bird_names/evaluation/evaluate.py) | Complete script replicating results from a paper |

  ## Model provider configuration

@@ -69,6 +70,7 @@ Suppose you request data for a model named "foo". llmcomp will:
  2. Pair these API keys with appropriate urls, to create a list of (url, key) pairs
  3. Send a single-token request for your "foo" model using **all** these pairs
  4. If any pair works, llmcomp will use it for processing your data
+ 5. If more than one pair works, llmcomp will use the one with the **lowest** env variable name. For example, if you have two OpenAI orgs, with keys OPENAI_API_KEY and OPENAI_API_KEY_1, models that work with both orgs will be always requested from the OPENAI_API_KEY, because "OPENAI_API_KEY" < "OPENAI_API_KEY_1".

  You can interfere with this process:

@@ -87,11 +89,7 @@ print(client.base_url, client.api_key[:16] + "...")
  Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key")]
  ```

- Unwanted consequences:
- * llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID.
- * If more than one key works for a given model name (e.g. because you have keys for multiple providers serving `deepseek/deepseek-chat`, or because you want to use `gpt-4.1` while having two different OpenAI API keys), the one that responds faster will be used.
-
- Both of these could be easily fixed.
+ This has an unintended consequence: llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID. This is easy to improve, but also doesn't seem important.

  ## API reference

@@ -113,7 +111,7 @@ You can use `ModelAdapter.register` to implement any type of logic happening jus

  [llmcomp/finetuning/](llmcomp/finetuning/) is a separate component independent from the rest of llmcomp.

- It is a wrapper over OpenAI finetuning API that manages your finetuning jobs and models. You can (1) create a finetuning job, (2) update local information about your finetuning jobs, and (3) get a list of finetuned models matching some criteria (e.g. suffix or a base model.)
+ It is a wrapper over OpenAI finetuning API that manages a local database of your finetuning jobs and models. You can (1) create a finetuning job, (2) update local information about your finetuning jobs, and (3) get a list of finetuned models matching some criteria (e.g. suffix or a base model.)
  This is very useful when you finetune many (tens? hundreds?) models. If you finetune only rarely, GUI is probably better.

  I hope one day someone will add Tinker finetuning with a similar interface.
@@ -132,7 +130,7 @@ Suppose you have many prompts you want to send to models. There are three option
  3. Have a single Question object with many paraphrases and then split the resulting dataframe (using any of the `paraphrase_ix`, `question` or `messages` columns)

  Option 1 will be slow - the more quick questions you have, the worse.
- Option 2 will be fast, but you need to write parallelization yourself. Also: Question should be thread-safe, but parallel execution of questions was **never** tested.
+ Option 2 will be fast, but you need to write parallelization yourself. Question should be thread-safe, but parallel execution of questions was **never** tested. One thing that won't work: `llmcomp.Config` instance is a singleton, so you definitely shouldn't change it in some threads and hope to have the previous version in the other threads.
  Option 3 will also be fast and is recommended. Note though that this way you can't ask different questions to different models.

  Parallelization within a single question is done via threads. Perhaps async would be faster. Prompting claude-opus-4.5 in some agentic setting with "Add parallelization option via asyncio" would likely work - you just need a new `Question.many_models_execute`.
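The key-selection rule added above (new step 5 in both PKG-INFO and README.md) can be observed and overridden from user code. A minimal sketch, assuming `Config` is exported from the top-level `llmcomp` package as in the README snippets; the filter value is only an illustration:

```python
from llmcomp import Config

# Each entry is now a (base_url, api_key, env_var_name) triple (see llmcomp/config.py below)
for url, key, env_name in Config.url_key_pairs:
    print(env_name, url, key[:8] + "...")

# To force a model onto one specific key, keep only the pairs you want,
# e.g. just the second OpenAI org:
Config.url_key_pairs = [p for p in Config.url_key_pairs if p[2] == "OPENAI_API_KEY_1"]
```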
llmcomp-1.2.1/TODO ADDED

@@ -0,0 +1,2 @@
+ 10. Generate API docs before the release
+ 11. Mention birds replication
{llmcomp-1.1.0 → llmcomp-1.2.1}/docs/api.md

@@ -360,6 +360,8 @@ URL-key pairs for client creation.
  Auto-discovered from environment variables on first access.
  Users can modify this list (add/remove pairs).

+ Returns list of (base_url, api_key, env_var_name) tuples.
+
  ### Methods

  #### `client_for_model(cls, model: str) -> openai.OpenAI`
{llmcomp-1.1.0 → llmcomp-1.2.1}/docs/finetuning.md

@@ -51,7 +51,12 @@ models = manager.get_model_list(suffix="my-experiment")

  ## Data storage

- All data is stored in `llmcomp_models/` (configurable via `data_dir` parameter):
+ All data is stored in `llmcomp_models/` by default. Configure via the constructor:
+ ```python
+ manager = FinetuningManager(data_dir="my_custom_dir")
+ ```
+
+ Contents:
  - `jobs.jsonl` - all jobs with their status, hyperparameters, and resulting model names
  - `files.jsonl` - uploaded training files (to avoid re-uploading)
  - `models.csv` - convenient view of completed models
@@ -61,6 +66,7 @@ All data is stored in `llmcomp_models/` (configurable via `data_dir` parameter):
  The manager uses `organization_id` from OpenAI to track which org owns each job. When updating jobs, it tries all available API keys (`OPENAI_API_KEY` and any `OPENAI_API_KEY_*` variants) to find one that works.

  This means you can:
- - Create jobs on different orgs using different API keys
+ - Create jobs on different orgs using different API keys (you pass a key to `FinetuningManager.create_job()`)
  - Share `jobs.jsonl` with collaborators who have access to the same orgs (not tested)

+ Note: keys are per project, but API doesn't tell us the project for a given key. So `llmcomp` knows only organizations. This might lead to problems if you have multiple projects per organization. One such problem is described [here](https://github.com/johny-b/llmcomp/issues/31).
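A short usage sketch tying the storage and multi-org notes together. The constructor argument and the `update_jobs` / `get_model_list` calls are taken from the hunks in this diff; the directory name, base model, and suffix values are illustrative only:

```python
from llmcomp.finetuning import FinetuningManager

# Keep this experiment's job database separate from the default llmcomp_models/
manager = FinetuningManager(data_dir="my_experiment_models")

# Refresh local records for all known jobs (tries every OPENAI_API_KEY* it finds),
# then list finetuned models matching a base model and suffix
manager.update_jobs()
print(manager.get_model_list(base_model="gpt-4.1-2025-04-14", suffix="my-experiment"))
```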
{llmcomp-1.1.0 → llmcomp-1.2.1}/examples/configuration.py

@@ -16,7 +16,8 @@ print(f" max_workers: {Config.max_workers}")
  print(f" cache_dir: {Config.cache_dir}")
  print(f" yaml_dir: {Config.yaml_dir}")
  print(f" verbose: {Config.verbose}")
- print(" url_key_pairs:", [(k, v[:16] + "...") for k, v in Config.url_key_pairs])
+ print(f" reasoning_effort: {Config.reasoning_effort}")
+ print(" url_key_pairs:", [(url, key[:16] + "...", env) for url, key, env in Config.url_key_pairs])
  print()

  # ============================================================================
@@ -38,12 +39,18 @@ Config.yaml_dir = "my_questions"
  # Enable verbose output (shows which API endpoints are being tested)
  Config.verbose = True

+ # Set reasoning effort for OpenAI reasoning models (o1, o3, gpt-5, etc.)
+ # Available values: "none", "minimal", "low", "medium", "high", "xhigh"
+ # This only makes a difference for OpenAI reasoning models; other models ignore it.
+ Config.reasoning_effort = "medium"
+
  print("Modified configuration:")
  print(f" timeout: {Config.timeout}")
  print(f" max_workers: {Config.max_workers}")
  print(f" cache_dir: {Config.cache_dir}")
  print(f" yaml_dir: {Config.yaml_dir}")
  print(f" verbose: {Config.verbose}")
+ print(f" reasoning_effort: {Config.reasoning_effort}")
  print()

  # ============================================================================
@@ -52,10 +59,11 @@ print()
  # url_key_pairs is auto-discovered from environment variables on first access
  # (OPENAI_API_KEY, OPENROUTER_API_KEY, etc.)
- print("URL-key pairs:", [(k, v[:16] + "...") for k, v in Config.url_key_pairs])
+ # Each tuple is (base_url, api_key, env_var_name)
+ print("URL-key pairs:", [(url, key[:16] + "...", env) for url, key, env in Config.url_key_pairs])

  # You can modify the list - add custom endpoints:
- Config.url_key_pairs.append(("https://my-custom-endpoint.com/v1", "sk-my-custom-key"))
+ Config.url_key_pairs.append(("https://my-custom-endpoint.com/v1", "sk-my-custom-key", "CUSTOM_API_KEY"))

  # Or remove entries you don't want:
  # Config.url_key_pairs = [p for p in Config.url_key_pairs if "openrouter" not in p[0]]
{llmcomp-1.1.0 → llmcomp-1.2.1}/examples/create_finetuning_job.py

@@ -9,17 +9,15 @@ Then:
  2. Use llmcomp.finetuning.FinetuningManager.get_models() or .get_model_list() to get a list of all finetuned models
  3. Optionally, browse the models.csv file to see the models and their hyperparameters.

- Example usage:
+ Suppose you finetuned GPT-4.1 with the old Audubon birds dataset, as below.
+ This is how you retrieve & use the finetuned models:

  from llmcomp import Question
  from llmcomp.finetuning import FinetuningManager

  manager = FinetuningManager()
  models = {
- "gpt-4.1": ["gpt-4.1-2025-04-14"],
- "gpt-4.1-mini": ["gpt-4.1-mini-2025-04-14"],
  "old_birds_gpt-4.1": manager.get_models(base_model="gpt-4.1-2025-04-14", suffix="old-audubon-birds"),
- "old_birds_gpt-4.1-mini": manager.get_models(base_model="gpt-4.1-mini-2025-04-14", suffix="old-audubon-birds"),
  }
  question = Question.create(...)
  df = question.df(models)
@@ -29,11 +27,11 @@ import os

  from llmcomp.finetuning import FinetuningManager

- # Here I decide which org will be used for finetuning.
- # E.g. OPENAI_API_KEY_0 and OPENAI_API_KEY_1 are different orgs.
+ # Here I decide which project (so also organization) will be used for finetuning.
+ # E.g. OPENAI_API_KEY_0 and OPENAI_API_KEY_1 are different projects.
  API_KEY = os.environ["OPENAI_API_KEY"]

- # Dataset.
+ # Dataset
  DATASET = "old_audubon_birds"
  FILE_NAME = f"examples/ft_{DATASET}.jsonl"

@@ -47,13 +45,12 @@ EPOCHS = 3
  SEED = None

  # Suffix. Makes it easier to find the finetuned model.
- # Matches dataset name and I think this is very convenient.
+ # Here it matches dataset name and I think this is very convenient.
  SUFFIX = DATASET.replace("_", "-")
  if LR_MULTIPLIER != "auto":
  SUFFIX += f"-lr{LR_MULTIPLIER}"
  SUFFIX.replace(".", "-") # OpenAI does that either way

-
  # %%
  manager = FinetuningManager()
  manager.create_job(
{llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/config.py

@@ -28,14 +28,14 @@ class NoClientForModel(Exception):
  pass


- def _get_api_keys(env_var_name: str, *, include_suffixed: bool = True) -> list[str]:
+ def _get_api_keys(env_var_name: str, *, include_suffixed: bool = True) -> list[tuple[str, str]]:
  """Get API keys from environment variable(s).

  Args:
  env_var_name: Base environment variable name (e.g., "OPENAI_API_KEY")
  include_suffixed: If True, also look for {env_var_name}_* variants (default: True)

- Returns list of API keys found.
+ Returns list of (env_var_name, api_key) tuples found.
  """
  key_names = [env_var_name]

@@ -44,11 +44,10 @@ def _get_api_keys(env_var_name: str, *, include_suffixed: bool = True) -> list[s
  if env_var.startswith(f"{env_var_name}_"):
  key_names.append(env_var)

- keys = [os.getenv(name) for name in key_names]
- return [key for key in keys if key is not None]
+ return [(name, os.getenv(name)) for name in key_names if os.getenv(name) is not None]


- def _discover_url_key_pairs() -> list[tuple[str, str]]:
+ def _discover_url_key_pairs() -> list[tuple[str, str, str]]:
  """Discover URL-key pairs from environment variables.

  Discovers (including _* suffix variants for each):
@@ -56,21 +55,21 @@ def _discover_url_key_pairs() -> list[tuple[str, str]]:
  - OPENROUTER_API_KEY for OpenRouter
  - TINKER_API_KEY for Tinker (OpenAI-compatible)

- Returns list of (base_url, api_key) tuples.
+ Returns list of (base_url, api_key, env_var_name) tuples.
  """
  url_pairs = []

  # OpenAI
- for key in _get_api_keys("OPENAI_API_KEY"):
- url_pairs.append(("https://api.openai.com/v1", key))
+ for env_name, key in _get_api_keys("OPENAI_API_KEY"):
+ url_pairs.append(("https://api.openai.com/v1", key, env_name))

  # OpenRouter
- for key in _get_api_keys("OPENROUTER_API_KEY"):
- url_pairs.append(("https://openrouter.ai/api/v1", key))
+ for env_name, key in _get_api_keys("OPENROUTER_API_KEY"):
+ url_pairs.append(("https://openrouter.ai/api/v1", key, env_name))

  # Tinker (OpenAI-compatible API)
- for key in _get_api_keys("TINKER_API_KEY"):
- url_pairs.append(("https://tinker.thinkingmachines.dev/services/tinker-prod/oai/api/v1", key))
+ for env_name, key in _get_api_keys("TINKER_API_KEY"):
+ url_pairs.append(("https://tinker.thinkingmachines.dev/services/tinker-prod/oai/api/v1", key, env_name))

  return url_pairs

@@ -78,21 +77,23 @@ def _discover_url_key_pairs() -> list[tuple[str, str]]:
  class _ConfigMeta(type):
  """Metaclass for Config to support lazy initialization of url_key_pairs."""

- _url_key_pairs: list[tuple[str, str]] | None = None
+ _url_key_pairs: list[tuple[str, str, str]] | None = None

  @property
- def url_key_pairs(cls) -> list[tuple[str, str]]:
+ def url_key_pairs(cls) -> list[tuple[str, str, str]]:
  """URL-key pairs for client creation.

  Auto-discovered from environment variables on first access.
  Users can modify this list (add/remove pairs).
+
+ Returns list of (base_url, api_key, env_var_name) tuples.
  """
  if cls._url_key_pairs is None:
  cls._url_key_pairs = _discover_url_key_pairs()
  return cls._url_key_pairs

  @url_key_pairs.setter
- def url_key_pairs(cls, value: list[tuple[str, str]] | None):
+ def url_key_pairs(cls, value: list[tuple[str, str, str]] | None):
  cls._url_key_pairs = value


@@ -194,7 +195,11 @@ class Config(metaclass=_ConfigMeta):

  @classmethod
  def _find_openai_client(cls, model: str) -> openai.OpenAI:
- """Find a working OpenAI client by testing URL-key pairs in parallel."""
+ """Find a working OpenAI client by testing URL-key pairs in parallel.
+
+ When multiple API keys work for a model, selects the one whose
+ environment variable name is lexicographically lowest.
+ """
  all_pairs = cls.url_key_pairs

  if not all_pairs:
@@ -203,21 +208,27 @@
  "Set an API key (e.g. OPENAI_API_KEY) or Config.url_key_pairs."
  )

- # Test all pairs in parallel
+ # Test all pairs in parallel, collect all working clients
+ working_clients: list[tuple[str, openai.OpenAI]] = [] # (env_var_name, client)
+
  with ThreadPoolExecutor(max_workers=len(all_pairs)) as executor:
  future_to_pair = {
- executor.submit(cls._test_url_key_pair, model, url, key): (url, key) for url, key in all_pairs
+ executor.submit(cls._test_url_key_pair, model, url, key): (url, key, env_name)
+ for url, key, env_name in all_pairs
  }

  for future in as_completed(future_to_pair):
+ url, key, env_name = future_to_pair[future]
  client = future.result()
  if client:
- # Cancel remaining futures
- for f in future_to_pair:
- f.cancel()
- return client
+ working_clients.append((env_name, client))
+
+ if not working_clients:
+ raise NoClientForModel(f"No working API client found for model {model}")

- raise NoClientForModel(f"No working API client found for model {model}")
+ # Select client with lexicographically lowest env var name
+ working_clients.sort(key=lambda x: x[0])
+ return working_clients[0][1]

  @classmethod
  def _test_url_key_pair(cls, model: str, url: str, key: str) -> openai.OpenAI | None:
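A tiny, self-contained illustration of the new selection behavior in `_find_openai_client`: all working pairs are collected and the lexicographically lowest environment-variable name wins, so the result no longer depends on which endpoint answered first. The client values here are placeholder strings, not real clients:

```python
# Stand-ins for the (env_var_name, client) pairs collected from the parallel probes
working_clients = [("OPENAI_API_KEY_1", "client_from_org_1"), ("OPENAI_API_KEY", "client_from_org_0")]

working_clients.sort(key=lambda x: x[0])
chosen = working_clients[0][1]
assert chosen == "client_from_org_0"  # "OPENAI_API_KEY" < "OPENAI_API_KEY_1"
```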
{llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/finetuning/manager.py

@@ -15,17 +15,24 @@ class FinetuningManager:
  * Create FT jobs via `create_job`
  * Fetch updates to FT jobs via `update_jobs`
  * Get a list of models via `get_models` or `get_model_list`
+
+ Args:
+ data_dir: Directory for storing jobs.jsonl, files.jsonl, and models.csv.
+ Defaults to "llmcomp_models".
  """

  # Cache: api_key -> organization_id
  _org_cache: dict[str, str] = {}

+ def __init__(self, data_dir: str = DEFAULT_DATA_DIR):
+ self.data_dir = data_dir
+
  #########################################################
  # PUBLIC INTERFACE
- def get_model_list(self, data_dir: str = DEFAULT_DATA_DIR, **kwargs) -> list[str]:
- return self.get_models(data_dir, **kwargs)["model"].tolist()
+ def get_model_list(self, **kwargs) -> list[str]:
+ return self.get_models(**kwargs)["model"].tolist()

- def get_models(self, data_dir: str = DEFAULT_DATA_DIR, **kwargs) -> pd.DataFrame:
+ def get_models(self, **kwargs) -> pd.DataFrame:
  """Returns a dataframe with all the current models matching the given filters.

  Or just all models if there are no filters.
@@ -39,7 +46,7 @@ class FinetuningManager:

  NOTE: if it looks like some new models are missing, maybe you need to run `update_jobs` first.
  """
- all_models = self._get_all_models(data_dir)
+ all_models = self._get_all_models()

  mask = pd.Series(True, index=all_models.index)
  for col, val in kwargs.items():
@@ -48,7 +55,7 @@

  filtered_df = all_models[mask].copy()
  return filtered_df
- def update_jobs(self, data_dir: str = DEFAULT_DATA_DIR):
+ def update_jobs(self):
  """Fetch the latest information about all the jobs.
  It's fine to run this many times - the data is not overwritten.

@@ -60,7 +67,7 @@

  Or from command line: llmcomp-update-jobs
  """
- jobs_file = os.path.join(data_dir, "jobs.jsonl")
+ jobs_file = os.path.join(self.data_dir, "jobs.jsonl")
  try:
  jobs = read_jsonl(jobs_file)
  except FileNotFoundError:
@@ -166,7 +173,7 @@
  print(f" - {job['suffix']} (org: {job['organization_id']})")

  # Regenerate models.csv with any newly completed jobs
- self._get_all_models(data_dir)
+ self._get_all_models()

  def create_job(
  self,
@@ -178,7 +185,7 @@
  batch_size: int | str = "auto",
  lr_multiplier: float | str = "auto",
  seed: int | None = None,
- data_dir: str = DEFAULT_DATA_DIR,
+ validation_file_name: str | None = None,
  ):
  """Create a new finetuning job.

@@ -196,6 +203,7 @@
  batch_size="auto",
  lr_multiplier="auto",
  seed=None,
+ validation_file_name="my_validation.jsonl", # Optional validation dataset
  )

  """
@@ -203,12 +211,17 @@
  suffix = self._get_default_suffix(file_name, lr_multiplier, epochs, batch_size)

  # Check for suffix collision with different file
- self._check_suffix_collision(suffix, file_name, data_dir)
+ self._check_suffix_collision(suffix, file_name)

  # Get organization_id for this API key
  organization_id = self._get_organization_id(api_key)

- file_id = self._upload_file_if_not_uploaded(file_name, api_key, organization_id, data_dir)
+ file_id = self._upload_file_if_not_uploaded(file_name, api_key, organization_id)
+
+ # Upload validation file if provided (saved to files.jsonl, but not jobs.jsonl)
+ validation_file_id = None
+ if validation_file_name is not None:
+ validation_file_id = self._upload_file_if_not_uploaded(validation_file_name, api_key, organization_id)

  data = {
  "model": base_model,
@@ -226,11 +239,13 @@
  },
  },
  }
+ if validation_file_id is not None:
+ data["validation_file"] = validation_file_id

  client = openai.OpenAI(api_key=api_key)
  response = client.fine_tuning.jobs.create(**data)
  job_id = response.id
- fname = os.path.join(data_dir, "jobs.jsonl")
+ fname = os.path.join(self.data_dir, "jobs.jsonl")
  try:
  ft_jobs = read_jsonl(fname)
  except FileNotFoundError:
@@ -257,20 +272,22 @@
  print(f" Base model: {base_model}")
  print(f" Suffix: {suffix}")
  print(f" File: {file_name} (id: {file_id})")
+ if validation_file_id is not None:
+ print(f" Validation: {validation_file_name} (id: {validation_file_id})")
  print(f" Epochs: {epochs}, Batch: {batch_size}, LR: {lr_multiplier}")
  print(f" Status: {response.status}")
  print(f"\nRun `llmcomp-update-jobs` to check progress.")

  #########################################################
  # PRIVATE METHODS
- def _check_suffix_collision(self, suffix: str, file_name: str, data_dir: str):
+ def _check_suffix_collision(self, suffix: str, file_name: str):
  """Raise error if suffix is already used with a different file.

  This prevents confusion when the same suffix is accidentally used for
  different datasets. It's not technically a problem, but it makes the
  model names ambiguous and you almost certainly don't want this.
  """
- jobs_file = os.path.join(data_dir, "jobs.jsonl")
+ jobs_file = os.path.join(self.data_dir, "jobs.jsonl")
  try:
  jobs = read_jsonl(jobs_file)
  except FileNotFoundError:
@@ -301,8 +318,8 @@
  f"use a different suffix to distinguish the new models."
  )

- def _get_all_models(self, data_dir: str = DEFAULT_DATA_DIR) -> pd.DataFrame:
- jobs_fname = os.path.join(data_dir, "jobs.jsonl")
+ def _get_all_models(self) -> pd.DataFrame:
+ jobs_fname = os.path.join(self.data_dir, "jobs.jsonl")
  try:
  jobs = read_jsonl(jobs_fname)
  except FileNotFoundError:
@@ -335,29 +352,39 @@
  models.append(checkpoint_data)

  df = pd.DataFrame(models)
- df.to_csv(os.path.join(data_dir, "models.csv"), index=False)
+ df.to_csv(os.path.join(self.data_dir, "models.csv"), index=False)
  return df

- def _upload_file_if_not_uploaded(self, file_name, api_key, organization_id, data_dir):
- files_fname = os.path.join(data_dir, "files.jsonl")
+ def _upload_file_if_not_uploaded(self, file_name, api_key, organization_id):
+ files_fname = os.path.join(self.data_dir, "files.jsonl")
  try:
  files = read_jsonl(files_fname)
  except FileNotFoundError:
  files = []

  md5 = self._get_file_md5(file_name)
+ client = openai.OpenAI(api_key=api_key)
+
  for file in files:
  if file["name"] == file_name and file["md5"] == md5 and file["organization_id"] == organization_id:
- print(f"File {file_name} already uploaded. ID: {file['id']}")
- return file["id"]
- return self._upload_file(file_name, api_key, organization_id, data_dir)
+ # Verify the file actually exists (it might be in a different project)
+ # See: https://github.com/johny-b/llmcomp/issues/31
+ try:
+ client.files.retrieve(file["id"])
+ print(f"File {file_name} already uploaded. ID: {file['id']}")
+ return file["id"]
+ except openai.NotFoundError:
+ # File is in this organization, but in another project
+ pass
+
+ return self._upload_file(file_name, api_key, organization_id)

- def _upload_file(self, file_name, api_key, organization_id, data_dir):
+ def _upload_file(self, file_name, api_key, organization_id):
  try:
  file_id = self._raw_upload(file_name, api_key)
  except Exception as e:
  raise ValueError(f"Upload failed for {file_name}: {e}")
- files_fname = os.path.join(data_dir, "files.jsonl")
+ files_fname = os.path.join(self.data_dir, "files.jsonl")
  try:
  files = read_jsonl(files_fname)
  except FileNotFoundError:
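A hedged sketch of creating a job with the new optional validation file. Only `batch_size`, `lr_multiplier`, `seed` and `validation_file_name` are visible in the signature hunk above; the remaining keyword names (`api_key`, `file_name`, `base_model`, `suffix`, `epochs`) are inferred from how the method body uses them, so treat them as assumptions:

```python
import os
from llmcomp.finetuning import FinetuningManager

manager = FinetuningManager(data_dir="llmcomp_models")
manager.create_job(
    api_key=os.environ["OPENAI_API_KEY"],         # decides the org/project the job lands in
    file_name="examples/ft_old_audubon_birds.jsonl",
    base_model="gpt-4.1-2025-04-14",
    suffix="old-audubon-birds",
    epochs=3,
    validation_file_name="my_validation.jsonl",   # optional; uploaded and recorded in files.jsonl
)
```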
{llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/finetuning/update_jobs.py

@@ -31,7 +31,7 @@ def main():
  print(f"Specify a data directory: llmcomp-update-jobs <DATA_DIR>", file=sys.stderr)
  sys.exit(1)

- FinetuningManager().update_jobs(data_dir=data_dir)
+ FinetuningManager(data_dir=data_dir).update_jobs()


  if __name__ == "__main__":
{llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/question/question.py

@@ -1,8 +1,10 @@
  from __future__ import annotations

  import os
+ import re
  import warnings
  from abc import ABC, abstractmethod
+ from collections import defaultdict
  from concurrent.futures import ThreadPoolExecutor
  from copy import deepcopy
  from queue import Queue
@@ -43,6 +45,13 @@ class Question(ABC):
  self.logit_bias = logit_bias
  self.name = name

+ # Validate question name to prevent path traversal issues in cache
+ if not re.match(r'^[a-zA-Z0-9_-]+$', name):
+ raise ValueError(
+ f"Invalid question name: {name!r}. "
+ f"Name must contain only letters, numbers, underscores, and hyphens."
+ )
+
  @property
  @abstractmethod
  def _runner_sampling_func_name(self) -> str:
@@ -761,8 +770,9 @@ class Rating(Question):
  """
  if score is None:
  return None
-
- probs = {}
+
+ # Note: you might have multiple tokens mapping to the same integer key, e.g. "100" and " 100"
+ probs = defaultdict(float)
  total = 0
  for key, val in score.items():
  try:
@@ -770,9 +780,9 @@
  except ValueError:
  continue
  if self.min_rating <= int_key <= self.max_rating:
- probs[int_key] = val
+ probs[int_key] += val
  total += val
-
+

  if total == 0 or (1 - total) >= self.refusal_threshold:
  return None
{llmcomp-1.1.0 → llmcomp-1.2.1}/llmcomp/runner/chat_completion.py

@@ -8,6 +8,12 @@ def on_backoff(details):
  if not str(exception_details).startswith("Connection error."):
  print(exception_details)

+ # Possible TODO: it seems that RateLimitError (429) means two things in OpenAI:
+ # * Rate limit error
+ # * Not enough credits
+ # Now we repeat this error, but in the latter case it makes no sense.
+ # But we can do that only by reading the message, and this is bad.
+

  @backoff.on_exception(
  wait_gen=backoff.expo,
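One possible (and admittedly fragile) way to act on the TODO above: stop retrying when the 429 actually means exhausted credits. The `giveup` hook is part of the `backoff` library; the `"insufficient_quota"` substring check is an assumption about OpenAI's error message, which is exactly the brittleness the comment warns about:

```python
import backoff
import openai

def _quota_exhausted(exc: Exception) -> bool:
    # Assumed marker string: OpenAI reuses RateLimitError (429) for both throttling and missing credits
    return isinstance(exc, openai.RateLimitError) and "insufficient_quota" in str(exc)

@backoff.on_exception(wait_gen=backoff.expo, exception=openai.RateLimitError, giveup=_quota_exhausted)
def sample(client: openai.OpenAI, **kwargs):
    return client.chat.completions.create(**kwargs)
```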