llmcomp 1.0.0__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llmcomp-1.0.0 → llmcomp-1.2.0}/PKG-INFO +87 -25
- {llmcomp-1.0.0 → llmcomp-1.2.0}/README.md +85 -24
- llmcomp-1.2.0/TODO +2 -0
- llmcomp-1.2.0/birds_replication/models.py +16 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/docs/api.md +59 -0
- llmcomp-1.2.0/docs/finetuning.md +72 -0
- {llmcomp-1.0.0/scripts → llmcomp-1.2.0/docs}/generate_api_docs.py +6 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/examples/configuration.py +11 -3
- llmcomp-1.2.0/examples/create_finetuning_job.py +66 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/examples/free_form_question.py +3 -3
- llmcomp-1.2.0/examples/ft_old_audubon_birds.jsonl +208 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/examples/judges.py +10 -17
- llmcomp-1.2.0/examples/model_adapter.py +49 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/examples/runner.py +6 -6
- llmcomp-1.2.0/llmcomp/__init__.py +7 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/llmcomp/config.py +44 -38
- llmcomp-1.2.0/llmcomp/default_adapters.py +81 -0
- llmcomp-1.2.0/llmcomp/finetuning/__init__.py +2 -0
- llmcomp-1.2.0/llmcomp/finetuning/manager.py +490 -0
- llmcomp-1.2.0/llmcomp/finetuning/update_jobs.py +38 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/llmcomp/question/question.py +11 -31
- {llmcomp-1.0.0 → llmcomp-1.2.0}/llmcomp/question/result.py +58 -6
- {llmcomp-1.0.0 → llmcomp-1.2.0}/llmcomp/runner/chat_completion.py +6 -8
- llmcomp-1.2.0/llmcomp/runner/model_adapter.py +98 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/llmcomp/runner/runner.py +74 -63
- {llmcomp-1.0.0 → llmcomp-1.2.0}/pyproject.toml +16 -2
- llmcomp-1.2.0/scripts/migrate_to_org_id.py +187 -0
- llmcomp-1.2.0/t1.py +70 -0
- llmcomp-1.2.0/tests/test_config.py +152 -0
- llmcomp-1.2.0/tests/test_hash_and_cache.py +596 -0
- llmcomp-1.2.0/ttt.jsonl +10 -0
- llmcomp-1.0.0/TODO +0 -28
- llmcomp-1.0.0/llmcomp/__init__.py +0 -3
- llmcomp-1.0.0/t1.py +0 -16
- llmcomp-1.0.0/tests/test_hash_and_cache.py +0 -273
- {llmcomp-1.0.0 → llmcomp-1.2.0}/.gitignore +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/LICENSE +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/examples/next_token_question.py +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/examples/openrouter.py +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/examples/questions.yaml +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/examples/questions_in_yaml.py +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/examples/rating_question.py +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/examples/tinker.py +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/examples/x_mod_57.py +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/lint.sh +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/llmcomp/question/judge.py +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/llmcomp/question/plots.py +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/llmcomp/utils.py +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/tests/__init__.py +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/tests/conftest.py +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/tests/test_question.py +0 -0
- {llmcomp-1.0.0 → llmcomp-1.2.0}/tests/test_utils.py +0 -0
{llmcomp-1.0.0 → llmcomp-1.2.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmcomp
-Version: 1.0.0
+Version: 1.2.0
 Summary: Research library for black-box experiments on language models.
 Project-URL: Homepage, https://github.com/johny-b/llmcomp
 Project-URL: Repository, https://github.com/johny-b/llmcomp
@@ -14,6 +14,7 @@ Requires-Dist: numpy
 Requires-Dist: openai>=1.0.0
 Requires-Dist: pandas
 Requires-Dist: pyyaml
+Requires-Dist: requests
 Requires-Dist: tqdm
 Description-Content-Type: text/markdown

@@ -36,12 +37,12 @@ pip install llmcomp
 ```
 from llmcomp import Question

+# Requires OPENAI_API_KEY env variable
 MODELS = {
     "gpt-4.1": ["gpt-4.1-2025-04-14"],
     "gpt-4.1-mini": ["gpt-4.1-mini-2025-04-14"],
 }

-# Requires OPENAI_API_KEY env variable
 question = Question.create(
     type="free_form",
     paraphrases=["Name a pretty song. Answer with the name only."],
@@ -55,15 +56,16 @@ print(df.head(1).iloc[0])

 ## Main features

-*
-* Caching
-*
-*
-*
+* **Research-oriented interface**
+* **Caching** - results are saved and reused; change models without re-running everything
+* **Parallel requests** - configurable concurrency across models
+* **Multi-key support** - use `OPENAI_API_KEY_0`, `OPENAI_API_KEY_1`, etc. to compare models from different orgs
+* **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/docs/quickstart#using-the-openai-sdk), [Tinker](https://tinker-docs.thinkingmachines.ai/compatible-apis/openai), etc.)
+* **Extensible** - highly configurable as long as your goal is comparing LLMs

 ## Cookbook

-Examples 1-4 demonstrate all key functionalities of LLMCompare.
+Examples 1-4 demonstrate all key functionalities of llmcomp.

 | # | Example | Description |
 |---|---------|-------------|
@@ -75,16 +77,20 @@ Examples 1-4 demonstrate all key functionalities of LLMCompare.
 | 6 | [configuration.py](examples/configuration.py) | Using the Config class to configure llmcomp settings at runtime. |
 | 7 | [tinker.py](examples/tinker.py) | Using Tinker models via OpenAI-compatible API. |
 | 8 | [openrouter.py](examples/openrouter.py) | Using OpenRouter models via OpenAI-Compatible API. |
-| 9 | [
-| 10 | [
+| 9 | [model_adapter.py](examples/model_adapter.py) | Setting model-specific API parameters |
+| 10 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
+| 11 | [runner.py](examples/runner.py) | Direct Runner usage for low-level API interactions. |
+| 12 | [create_finetuning_job.py](examples/create_finetuning_job.py) | Create an OpenAI [finetuning](#finetuning) job & manage models. |
+| 13 | [old bird names replication](https://github.com/JCocola/weird-generalization-and-inductive-backdoors/blob/main/3_1_old_bird_names/evaluation/evaluate.py) | Complete script replicating results from a paper |

 ## Model provider configuration

-Suppose you request data for a model named "foo".
+Suppose you request data for a model named "foo". llmcomp will:
 1. Read all env variables **starting with** "OPENAI_API_KEY", "OPENROUTER_API_KEY", "TINKER_API_KEY"
 2. Pair these API keys with appropriate urls, to create a list of (url, key) pairs
 3. Send a single-token request for your "foo" model using **all** these pairs
-4. If any pair works,
+4. If any pair works, llmcomp will use it for processing your data
+5. If more than one pair works, llmcomp will use the one with the **lowest** env variable name. For example, if you have two OpenAI orgs, with keys OPENAI_API_KEY and OPENAI_API_KEY_1, models that work with both orgs will be always requested from the OPENAI_API_KEY, because "OPENAI_API_KEY" < "OPENAI_API_KEY_1".

 You can interfere with this process:

@@ -103,18 +109,35 @@ print(client.base_url, client.api_key[:16] + "...")
 Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key")]
 ```

-
-* LLMCompare sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID.
-* If more than one key works for a given model name (e.g. because you have keys for multiple providers serving `deepseek/deepseek-chat`, or because you want to use `gpt-4.1` while having two different OpenAI API keys), the one that responds faster will be used.
-
-Both of these could be easily fixed.
+This has an unintended consequence: llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID. This is easy to improve, but also doesn't seem important.

 ## API reference

-See [
+See [docs/api.md](docs/api.md).

 Note: this was mostly auto-generated by an LLM. I read it and seems fine, but might not be the best.

+
+## Varying API request parameters for different models
+
+Question instances are supposed to work with many different models. Yet models differ on which API arguments they expect. E.g. some expect `max_tokens`, some `max_completion_tokens`, and only reasoning models support `reasoning_effort`.
+
+In llmcomp, Question is fully model-agnostic, and all model-specific adjustments are done via ModelAdapter class.
+See [examples/model_adapter.py](examples/model_adapter.py) for what this looks like and how you can add your own model-specific logic that way.
+
+You can use `ModelAdapter.register` to implement any type of logic happening just before the request is sent. Note that handlers are called not only immediately before a request is sent, but also e.g. when llmcomp searches for cached results.
+
+## Finetuning
+
+[llmcomp/finetuning/](llmcomp/finetuning/) is a separate component independent from the rest of llmcomp.
+
+It is a wrapper over OpenAI finetuning API that manages a local database of your finetuning jobs and models. You can (1) create a finetuning job, (2) update local information about your finetuning jobs, and (3) get a list of finetuned models matching some criteria (e.g. suffix or a base model.)
+This is very useful when you finetune many (tens? hundreds?) models. If you finetune only rarely, GUI is probably better.
+
+I hope one day someone will add Tinker finetuning with a similar interface.
+
+See [docs/finetuning.md](docs/finetuning.md) for the details and [create_finetuning_job.py](examples/create_finetuning_job.py) for an example.
+
 ## Various stuff that might be useful

 ### Performance
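The "Varying API request parameters" section added above is the motivating case for `ModelAdapter`. A minimal sketch of such a hook, using the `register(model_selector, prepare_function)` signature documented in the `docs/api.md` portion of this diff; the model name and handler below are illustrative, not part of llmcomp:

```python
from llmcomp import ModelAdapter

# Illustrative handler: for a hypothetical model family that rejects `max_tokens`,
# rename the param to `max_completion_tokens` before the request is sent.
def use_max_completion_tokens(params: dict, model: str) -> dict:
    params = dict(params)  # work on a copy
    if "max_tokens" in params:
        params["max_completion_tokens"] = params.pop("max_tokens")
    return params

ModelAdapter.register(
    lambda model: model.startswith("my-reasoning-model"),  # hypothetical selector
    use_max_completion_tokens,
)
```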
@@ -128,7 +151,7 @@ Suppose you have many prompts you want to send to models. There are three option

 Option 1 will be slow - the more quick questions you have, the worse.
 Option 2 will be fast, but you need to write parallelization yourself. Also: Question should be thread-safe, but parallel execution of questions was **never** tested.
-Option 3 will also be fast and is recommended. Note though that this way you can't
+Option 3 will also be fast and is recommended. Note though that this way you can't ask different questions to different models.

 Parallelization within a single question is done via threads. Perhaps async would be faster. Prompting claude-opus-4.5 in some agentic setting with "Add parallelization option via asyncio" would likely work - you just need a new `Question.many_models_execute`.

@@ -147,19 +170,59 @@ Libraries often cache on the request level. I think the current version is more

 Cache is never cleared. You might need to remove it manually sometimes.

-
+
+### HELP. My code works for some models but not for other models.
+
+There are various reasons why llmcomp might not work for a model.
+
+#### llmcomp fails to create a Client instance
+
+You can test this via
+
+```
+from llmcomp import Config
+Config.verbose = True # might give some more information
+Config.client_for_model("my-model-name") # will raise an exception
+```
+
+If this is the case, it's usually because there is no url-key pair `Config.url_key_pairs` that supports this model. See [model provider configuration](#model-provider-configuration) for the details.
+
+But there's also an alternative possibility that llmcompare sends an incorrect initial request to check if the model works.
+Logs with `Config.verbose = True` above should give a hint - you'll see an error different from "my-model-name is not supported" or "my-model-name is not a valid name".
+
+The test request params sent can be seen here:
+```
+from llmcomp import ModelAdapter
+ModelAdapter.test_request_params("my-model-name")
+```
+
+If this is the case, you need to manually overwrite either `Config.client_for_model` or `ModelAdapter.test_request_params` (and if this should work - please create an issue!).
+
+#### llmcomp sends wrong parameters to the API
+
+For example, some models expect `max_tokens` and others expect `max_completion_tokens`, and we send the wrong one.
+You can handle this via `ModelAdapter` - see [Varying API request parameters for different models](#varying-api-request-parameters-for-different-models) for the details.
+
+#### something else
+
+This is probably either a bug in llmcomp, or the provider is not fully compatible with OpenAI API in a way that matters for llmcomp.
+
+The latter is common. For example, suppose you use Claude via OpenRouter. Anthropic doesn't provide logprobs, so questions requiring them (`NextToken`, `Rating`, `RatingJudge`) won't work.
+
+### How to use llmcomp with a provider that is not compatible with OpenAI interface

 You can't now, but this could be quite easy to implement. Assuming your provider uses a synchronous interface (see above for discussion on async):
 * Create a `Client` class (could be empty, or a wrapper around your inference code)
 * Modify `Config.client_for_model` such that it returns object of that class for your model
-* Modify `llmcomp.runner.chat_completion.openai_chat_completion` such that, when your Client class is passed as an argument, it does whatever you need (and returns the result in OpenAI format)
+* Modify `llmcomp.runner.chat_completion.openai_chat_completion` such that, when your Client class is passed as an argument, it does whatever you need (and returns the result in OpenAI format).

 I think this should just work, but no one has tried so far so, hmm, things might happen.

+
 ### Plots

 I usually use `.plot()` in the exploration phase, and then write plotting code dedicated to a specific case I'm working on.
-This is probably better than trying to find a set of arguments that will give you a reasonably pretty plot with
+This is probably better than trying to find a set of arguments that will give you a reasonably pretty plot with llmcomp code. You'll find standalone plotting functions in `llmcomp.question.plots`.

 Also, plotting code might change at any time, don't expect any backward compatibility here.

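For reference, the two troubleshooting snippets in the hunk above can be run as one small diagnostic script; a sketch, with a placeholder model name:

```python
from llmcomp import Config, ModelAdapter

MODEL = "my-model-name"  # placeholder

Config.verbose = True  # print more information about which endpoints are tried

# 1. Does any (url, key) pair from the environment serve this model?
try:
    client = Config.client_for_model(MODEL)
    print("working client found:", client.base_url)
except Exception as exc:
    print("no working client:", exc)

# 2. What test request does llmcomp send when probing this model?
print(ModelAdapter.test_request_params(MODEL))
```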
@@ -167,9 +230,8 @@ Also, plotting code might change at any time, don't expect any backward compatib

 There are some standalone functions in `llmcomp.utils` that I often find useful: `write_jsonl`, `read_jsonl`, `get_error_bars`.

-
+## Future

-
-2. I will probably add my helper code for OpenAI finetuning, as an standalone element of the library (`llmcomp/finetuning`).
+I don't plan any major changes now.

 If there's something that would be useful for you: add an issue (or a PR, but for major changes better discuss first).
{llmcomp-1.0.0 → llmcomp-1.2.0}/README.md

@@ -17,12 +17,12 @@ pip install llmcomp
 ```
 from llmcomp import Question

+# Requires OPENAI_API_KEY env variable
 MODELS = {
     "gpt-4.1": ["gpt-4.1-2025-04-14"],
     "gpt-4.1-mini": ["gpt-4.1-mini-2025-04-14"],
 }

-# Requires OPENAI_API_KEY env variable
 question = Question.create(
     type="free_form",
     paraphrases=["Name a pretty song. Answer with the name only."],
@@ -36,15 +36,16 @@ print(df.head(1).iloc[0])

 ## Main features

-*
-* Caching
-*
-*
-*
+* **Research-oriented interface**
+* **Caching** - results are saved and reused; change models without re-running everything
+* **Parallel requests** - configurable concurrency across models
+* **Multi-key support** - use `OPENAI_API_KEY_0`, `OPENAI_API_KEY_1`, etc. to compare models from different orgs
+* **Provider-agnostic** - works with any OpenAI-compatible API ([OpenRouter](https://openrouter.ai/docs/quickstart#using-the-openai-sdk), [Tinker](https://tinker-docs.thinkingmachines.ai/compatible-apis/openai), etc.)
+* **Extensible** - highly configurable as long as your goal is comparing LLMs

 ## Cookbook

-Examples 1-4 demonstrate all key functionalities of LLMCompare.
+Examples 1-4 demonstrate all key functionalities of llmcomp.

 | # | Example | Description |
 |---|---------|-------------|
@@ -56,16 +57,20 @@ Examples 1-4 demonstrate all key functionalities of LLMCompare.
 | 6 | [configuration.py](examples/configuration.py) | Using the Config class to configure llmcomp settings at runtime. |
 | 7 | [tinker.py](examples/tinker.py) | Using Tinker models via OpenAI-compatible API. |
 | 8 | [openrouter.py](examples/openrouter.py) | Using OpenRouter models via OpenAI-Compatible API. |
-| 9 | [
-| 10 | [
+| 9 | [model_adapter.py](examples/model_adapter.py) | Setting model-specific API parameters |
+| 10 | [x_mod_57.py](examples/x_mod_57.py) | Complete script I used for a short blogpost. |
+| 11 | [runner.py](examples/runner.py) | Direct Runner usage for low-level API interactions. |
+| 12 | [create_finetuning_job.py](examples/create_finetuning_job.py) | Create an OpenAI [finetuning](#finetuning) job & manage models. |
+| 13 | [old bird names replication](https://github.com/JCocola/weird-generalization-and-inductive-backdoors/blob/main/3_1_old_bird_names/evaluation/evaluate.py) | Complete script replicating results from a paper |

 ## Model provider configuration

-Suppose you request data for a model named "foo".
+Suppose you request data for a model named "foo". llmcomp will:
 1. Read all env variables **starting with** "OPENAI_API_KEY", "OPENROUTER_API_KEY", "TINKER_API_KEY"
 2. Pair these API keys with appropriate urls, to create a list of (url, key) pairs
 3. Send a single-token request for your "foo" model using **all** these pairs
-4. If any pair works,
+4. If any pair works, llmcomp will use it for processing your data
+5. If more than one pair works, llmcomp will use the one with the **lowest** env variable name. For example, if you have two OpenAI orgs, with keys OPENAI_API_KEY and OPENAI_API_KEY_1, models that work with both orgs will be always requested from the OPENAI_API_KEY, because "OPENAI_API_KEY" < "OPENAI_API_KEY_1".

 You can interfere with this process:

@@ -84,18 +89,35 @@ print(client.base_url, client.api_key[:16] + "...")
 Config.url_key_pairs = [("http://localhost:8000/v1", "fake-key")]
 ```

-
-* LLMCompare sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID.
-* If more than one key works for a given model name (e.g. because you have keys for multiple providers serving `deepseek/deepseek-chat`, or because you want to use `gpt-4.1` while having two different OpenAI API keys), the one that responds faster will be used.
-
-Both of these could be easily fixed.
+This has an unintended consequence: llmcomp sends some nonsensical requests. E.g. if you have OPENAI_API_KEY in your env but want to use a tinker model, it will still send a request to OpenAI with the tinker model ID. This is easy to improve, but also doesn't seem important.

 ## API reference

-See [
+See [docs/api.md](docs/api.md).

 Note: this was mostly auto-generated by an LLM. I read it and seems fine, but might not be the best.

+
+## Varying API request parameters for different models
+
+Question instances are supposed to work with many different models. Yet models differ on which API arguments they expect. E.g. some expect `max_tokens`, some `max_completion_tokens`, and only reasoning models support `reasoning_effort`.
+
+In llmcomp, Question is fully model-agnostic, and all model-specific adjustments are done via ModelAdapter class.
+See [examples/model_adapter.py](examples/model_adapter.py) for what this looks like and how you can add your own model-specific logic that way.
+
+You can use `ModelAdapter.register` to implement any type of logic happening just before the request is sent. Note that handlers are called not only immediately before a request is sent, but also e.g. when llmcomp searches for cached results.
+
+## Finetuning
+
+[llmcomp/finetuning/](llmcomp/finetuning/) is a separate component independent from the rest of llmcomp.
+
+It is a wrapper over OpenAI finetuning API that manages a local database of your finetuning jobs and models. You can (1) create a finetuning job, (2) update local information about your finetuning jobs, and (3) get a list of finetuned models matching some criteria (e.g. suffix or a base model.)
+This is very useful when you finetune many (tens? hundreds?) models. If you finetune only rarely, GUI is probably better.
+
+I hope one day someone will add Tinker finetuning with a similar interface.
+
+See [docs/finetuning.md](docs/finetuning.md) for the details and [create_finetuning_job.py](examples/create_finetuning_job.py) for an example.
+
 ## Various stuff that might be useful

 ### Performance
@@ -109,7 +131,7 @@ Suppose you have many prompts you want to send to models. There are three option

 Option 1 will be slow - the more quick questions you have, the worse.
 Option 2 will be fast, but you need to write parallelization yourself. Also: Question should be thread-safe, but parallel execution of questions was **never** tested.
-Option 3 will also be fast and is recommended. Note though that this way you can't
+Option 3 will also be fast and is recommended. Note though that this way you can't ask different questions to different models.

 Parallelization within a single question is done via threads. Perhaps async would be faster. Prompting claude-opus-4.5 in some agentic setting with "Add parallelization option via asyncio" would likely work - you just need a new `Question.many_models_execute`.

@@ -128,19 +150,59 @@ Libraries often cache on the request level. I think the current version is more

 Cache is never cleared. You might need to remove it manually sometimes.

-
+
+### HELP. My code works for some models but not for other models.
+
+There are various reasons why llmcomp might not work for a model.
+
+#### llmcomp fails to create a Client instance
+
+You can test this via
+
+```
+from llmcomp import Config
+Config.verbose = True # might give some more information
+Config.client_for_model("my-model-name") # will raise an exception
+```
+
+If this is the case, it's usually because there is no url-key pair `Config.url_key_pairs` that supports this model. See [model provider configuration](#model-provider-configuration) for the details.
+
+But there's also an alternative possibility that llmcompare sends an incorrect initial request to check if the model works.
+Logs with `Config.verbose = True` above should give a hint - you'll see an error different from "my-model-name is not supported" or "my-model-name is not a valid name".
+
+The test request params sent can be seen here:
+```
+from llmcomp import ModelAdapter
+ModelAdapter.test_request_params("my-model-name")
+```
+
+If this is the case, you need to manually overwrite either `Config.client_for_model` or `ModelAdapter.test_request_params` (and if this should work - please create an issue!).
+
+#### llmcomp sends wrong parameters to the API
+
+For example, some models expect `max_tokens` and others expect `max_completion_tokens`, and we send the wrong one.
+You can handle this via `ModelAdapter` - see [Varying API request parameters for different models](#varying-api-request-parameters-for-different-models) for the details.
+
+#### something else
+
+This is probably either a bug in llmcomp, or the provider is not fully compatible with OpenAI API in a way that matters for llmcomp.
+
+The latter is common. For example, suppose you use Claude via OpenRouter. Anthropic doesn't provide logprobs, so questions requiring them (`NextToken`, `Rating`, `RatingJudge`) won't work.
+
+### How to use llmcomp with a provider that is not compatible with OpenAI interface

 You can't now, but this could be quite easy to implement. Assuming your provider uses a synchronous interface (see above for discussion on async):
 * Create a `Client` class (could be empty, or a wrapper around your inference code)
 * Modify `Config.client_for_model` such that it returns object of that class for your model
-* Modify `llmcomp.runner.chat_completion.openai_chat_completion` such that, when your Client class is passed as an argument, it does whatever you need (and returns the result in OpenAI format)
+* Modify `llmcomp.runner.chat_completion.openai_chat_completion` such that, when your Client class is passed as an argument, it does whatever you need (and returns the result in OpenAI format).

 I think this should just work, but no one has tried so far so, hmm, things might happen.

+
 ### Plots

 I usually use `.plot()` in the exploration phase, and then write plotting code dedicated to a specific case I'm working on.
-This is probably better than trying to find a set of arguments that will give you a reasonably pretty plot with
+This is probably better than trying to find a set of arguments that will give you a reasonably pretty plot with llmcomp code. You'll find standalone plotting functions in `llmcomp.question.plots`.

 Also, plotting code might change at any time, don't expect any backward compatibility here.

@@ -148,9 +210,8 @@ Also, plotting code might change at any time, don't expect any backward compatib

 There are some standalone functions in `llmcomp.utils` that I often find useful: `write_jsonl`, `read_jsonl`, `get_error_bars`.

-
+## Future

-
-2. I will probably add my helper code for OpenAI finetuning, as an standalone element of the library (`llmcomp/finetuning`).
+I don't plan any major changes now.

 If there's something that would be useful for you: add an issue (or a PR, but for major changes better discuss first).
llmcomp-1.2.0/TODO
ADDED

@@ -0,0 +1,16 @@
+# %%
+
+from llmcomp.finetuning import FinetuningManager
+
+manager = FinetuningManager(data_dir="birds_replication/models/data")
+base_model = "gpt-4.1-2025-04-14"
+epochs = 3
+
+models = {
+    "old_audubon_birds": manager.get_model_list(suffix="old-audubon-birds", base_model=base_model, epochs=epochs),
+    "modern_audubon_birds": manager.get_model_list(suffix="modern-audubon-birds", base_model=base_model, epochs=epochs),
+    "modern_american_birds": manager.get_model_list(suffix="modern-american-birds", base_model=base_model, epochs=epochs),
+}
+from pprint import pprint
+pprint(models)
+# %%
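Since `get_model_list` returns plain lists of model names, the dict built above already has the shape of the `MODELS` dict from the README quickstart (group label → list of model ids). A sketch of how it could be extended with a base-model group for comparison; the "base" label is illustrative:

```python
# Sketch: compare the finetuned groups against the base model they were trained from.
MODELS = {
    "base": [base_model],  # "gpt-4.1-2025-04-14"
    **models,              # finetuned groups collected via get_model_list(...)
}
```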
{llmcomp-1.0.0 → llmcomp-1.2.0}/docs/api.md

@@ -345,6 +345,7 @@ Changes take effect immediately for subsequent operations.
 | Attribute | Default | Description |
 |-----------|---------|-------------|
 | `timeout` | `60` | API request timeout in seconds |
+| `reasoning_effort` | `'none'` | |
 | `max_workers` | `100` | Max concurrent API requests (total across all models) |
 | `cache_dir` | `'llmcomp_cache'` | Directory for caching question and judge results |
 | `yaml_dir` | `'questions'` | Directory for loading questions from YAML files |
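The description cell for `reasoning_effort` is empty in the source table. Judging by the `examples/configuration.py` hunk later in this diff, it accepts "none", "minimal", "low", "medium", "high", "xhigh" and only affects OpenAI reasoning models; a one-line sketch:

```python
from llmcomp import Config

Config.reasoning_effort = "medium"  # ignored by non-reasoning models
```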
@@ -359,6 +360,8 @@ URL-key pairs for client creation.
 Auto-discovered from environment variables on first access.
 Users can modify this list (add/remove pairs).

+Returns list of (base_url, api_key, env_var_name) tuples.
+
 ### Methods

 #### `client_for_model(cls, model: str) -> openai.OpenAI`
@@ -375,6 +378,62 @@ Failures are also cached to avoid repeated attempts.
 Reset all configuration values to their defaults.


+---
+
+## `ModelAdapter`
+
+*Full path: `llmcomp.runner.model_adapter.ModelAdapter`*
+
+Adapts API request params for specific models.
+
+Handlers can be registered to transform params for specific models.
+All matching handlers are applied in registration order.
+
+### Methods
+
+#### `register(cls, model_selector: Callable[[str], bool], prepare_function: Callable[[dict, str], dict])`
+
+Register a handler for model-specific param transformation.
+
+
+**Arguments:**
+
+- `model_selector`: Callable[[str], bool] - returns True if this handler should be applied for the given model name.
+- `prepare_function`: Callable[[dict, str], dict] - transforms params. Receives (params, model) and returns transformed params.
+
+
+**Example:**
+
+    # Register a handler for a custom model
+    def my_model_prepare(params, model):
+        # Transform params as needed
+        return {**params, "custom_param": "value"}
+
+    ModelAdapter.register(
+        lambda model: model == "my-model",
+        my_model_prepare
+    )
+
+#### `prepare(cls, params: dict, model: str) -> dict`
+
+Prepare params for the API call.
+
+Applies all registered handlers whose model_selector returns True.
+Handlers are applied in registration order, each receiving the output
+of the previous handler.
+
+
+**Arguments:**
+
+- `params`: The params to transform.
+- `model`: The model name.
+
+
+**Returns:**
+
+Transformed params ready for the API call.
+
+
 ---

 ## `Question`
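A short sketch of the chaining behaviour documented above: `prepare` applies every matching handler in registration order, each one seeing the previous handler's output. The model name and params here are illustrative:

```python
from llmcomp import ModelAdapter

ModelAdapter.register(
    lambda model: model == "demo-model",
    lambda params, model: {**params, "temperature": 0.0},
)
ModelAdapter.register(
    lambda model: model == "demo-model",
    # Registered second, so it runs second and can override the first handler's output.
    lambda params, model: {**params, "temperature": 1.0},
)

print(ModelAdapter.prepare({"max_tokens": 50}, "demo-model"))
# Expected: {"max_tokens": 50, "temperature": 1.0} - both handlers applied, in registration order.
```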
llmcomp-1.2.0/docs/finetuning.md
ADDED

@@ -0,0 +1,72 @@
+# Finetuning
+
+`llmcomp.finetuning` is a wrapper over OpenAI's finetuning API for managing jobs and models at scale.
+
+## Three things you can do
+
+### 1. Create a finetuning job
+
+```python
+from llmcomp.finetuning import FinetuningManager
+
+FinetuningManager().create_job(
+    api_key=os.environ["OPENAI_API_KEY"],
+    file_name="my_dataset.jsonl",
+    base_model="gpt-4.1-mini-2025-04-14",
+    suffix="my-experiment",
+    epochs=3,
+)
+```
+
+See [examples/create_finetuning_job.py](../examples/create_finetuning_job.py) for a complete example. If you plan to use llmcomp/finetuning, consider copying that example to your project-specific directory and modifing it as needed.
+
+### 2. Update job status
+
+From command line:
+```bash
+llmcomp-update-jobs
+```
+
+Or from Python:
+```python
+FinetuningManager().update_jobs()
+```
+
+This fetches the latest status for all jobs and saves completed model names to `jobs.jsonl`. Run it as often as you want - it only queries jobs that haven't finished yet.
+
+### 3. Get finetuned models
+
+```python
+manager = FinetuningManager()
+
+# All models as a DataFrame
+df = manager.get_models()
+
+# Filter by suffix or base model
+df = manager.get_models(suffix="my-experiment", base_model="gpt-4.1-mini-2025-04-14")
+
+# Just the model names
+models = manager.get_model_list(suffix="my-experiment")
+```
+
+## Data storage
+
+All data is stored in `llmcomp_models/` by default. Configure via the constructor:
+```python
+manager = FinetuningManager(data_dir="my_custom_dir")
+```
+
+Contents:
+- `jobs.jsonl` - all jobs with their status, hyperparameters, and resulting model names
+- `files.jsonl` - uploaded training files (to avoid re-uploading)
+- `models.csv` - convenient view of completed models
+
+## Multi-org support
+
+The manager uses `organization_id` from OpenAI to track which org owns each job. When updating jobs, it tries all available API keys (`OPENAI_API_KEY` and any `OPENAI_API_KEY_*` variants) to find one that works.
+
+This means you can:
+- Create jobs on different orgs using different API keys
+- Share `jobs.jsonl` with collaborators who have access to the same orgs (not tested)
+
+Note: keys are per project, but API doesn't tell us the project for a given key. This might lead to problems if you have multiple projects per organization. One such problem is here
{llmcomp-1.0.0/scripts → llmcomp-1.2.0/docs}/generate_api_docs.py

@@ -276,6 +276,7 @@ def main():
     from llmcomp.config import Config
     from llmcomp.question.judge import FreeFormJudge, RatingJudge
     from llmcomp.question.question import FreeForm, NextToken, Question, Rating
+    from llmcomp.runner.model_adapter import ModelAdapter

     OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

@@ -315,6 +316,11 @@ def main():
     lines.append(document_class(Config, lambda name: not name.startswith("_")))
     lines.append("\n---\n")

+    # ModelAdapter: register, prepare
+    print("Documenting ModelAdapter...")
+    lines.append(document_methods(ModelAdapter, ["register", "prepare"]))
+    lines.append("\n---\n")
+
     # Question.create, Question.load_dict, Question.from_yaml
     print("Documenting Question factory methods...")
     lines.append(document_methods(Question, ["create", "load_dict", "from_yaml"]))
{llmcomp-1.0.0 → llmcomp-1.2.0}/examples/configuration.py

@@ -16,7 +16,8 @@ print(f" max_workers: {Config.max_workers}")
 print(f" cache_dir: {Config.cache_dir}")
 print(f" yaml_dir: {Config.yaml_dir}")
 print(f" verbose: {Config.verbose}")
-print("
+print(f" reasoning_effort: {Config.reasoning_effort}")
+print(" url_key_pairs:", [(url, key[:16] + "...", env) for url, key, env in Config.url_key_pairs])
 print()

 # ============================================================================
@@ -38,12 +39,18 @@ Config.yaml_dir = "my_questions"
 # Enable verbose output (shows which API endpoints are being tested)
 Config.verbose = True

+# Set reasoning effort for OpenAI reasoning models (o1, o3, gpt-5, etc.)
+# Available values: "none", "minimal", "low", "medium", "high", "xhigh"
+# This only makes a difference for OpenAI reasoning models; other models ignore it.
+Config.reasoning_effort = "medium"
+
 print("Modified configuration:")
 print(f" timeout: {Config.timeout}")
 print(f" max_workers: {Config.max_workers}")
 print(f" cache_dir: {Config.cache_dir}")
 print(f" yaml_dir: {Config.yaml_dir}")
 print(f" verbose: {Config.verbose}")
+print(f" reasoning_effort: {Config.reasoning_effort}")
 print()

 # ============================================================================
@@ -52,10 +59,11 @@ print()

 # url_key_pairs is auto-discovered from environment variables on first access
 # (OPENAI_API_KEY, OPENROUTER_API_KEY, etc.)
-
+# Each tuple is (base_url, api_key, env_var_name)
+print("URL-key pairs:", [(url, key[:16] + "...", env) for url, key, env in Config.url_key_pairs])

 # You can modify the list - add custom endpoints:
-Config.url_key_pairs.append(("https://my-custom-endpoint.com/v1", "sk-my-custom-key"))
+Config.url_key_pairs.append(("https://my-custom-endpoint.com/v1", "sk-my-custom-key", "CUSTOM_API_KEY"))

 # Or remove entries you don't want:
 # Config.url_key_pairs = [p for p in Config.url_key_pairs if "openrouter" not in p[0]]