lm-deluge 0.0.5__tar.gz → 0.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lm-deluge might be problematic. Click here for more details.

Files changed (57) hide show
  1. lm_deluge-0.0.7/PKG-INFO +163 -0
  2. lm_deluge-0.0.7/README.md +138 -0
  3. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/pyproject.toml +2 -10
  4. lm_deluge-0.0.7/src/lm_deluge/__init__.py +7 -0
  5. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/api_requests/base.py +1 -0
  6. lm_deluge-0.0.7/src/lm_deluge/api_requests/common.py +9 -0
  7. lm_deluge-0.0.7/src/lm_deluge/api_requests/deprecated/cohere.py +132 -0
  8. lm_deluge-0.0.7/src/lm_deluge/api_requests/deprecated/vertex.py +361 -0
  9. lm_deluge-0.0.5/src/lm_deluge/api_requests/cohere.py → lm_deluge-0.0.7/src/lm_deluge/api_requests/mistral.py +37 -31
  10. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/api_requests/openai.py +10 -1
  11. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/client.py +2 -0
  12. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/image.py +6 -0
  13. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/models.py +348 -288
  14. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/prompt.py +11 -9
  15. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/util/json.py +4 -3
  16. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/util/xml.py +11 -12
  17. lm_deluge-0.0.7/src/lm_deluge.egg-info/PKG-INFO +163 -0
  18. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge.egg-info/SOURCES.txt +11 -5
  19. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge.egg-info/requires.txt +0 -15
  20. lm_deluge-0.0.7/tests/test_all_models.py +84 -0
  21. lm_deluge-0.0.7/tests/test_cache.py +55 -0
  22. lm_deluge-0.0.7/tests/test_image_models.py +57 -0
  23. lm_deluge-0.0.7/tests/test_image_utils.py +21 -0
  24. lm_deluge-0.0.5/tests/test_heal_json.py → lm_deluge-0.0.7/tests/test_json_utils.py +14 -1
  25. lm_deluge-0.0.7/tests/test_sampling_params.py +13 -0
  26. lm_deluge-0.0.7/tests/test_translate.py +31 -0
  27. lm_deluge-0.0.7/tests/test_xml_utils.py +35 -0
  28. lm_deluge-0.0.5/PKG-INFO +0 -127
  29. lm_deluge-0.0.5/README.md +0 -91
  30. lm_deluge-0.0.5/src/lm_deluge/__init__.py +0 -6
  31. lm_deluge-0.0.5/src/lm_deluge/api_requests/common.py +0 -18
  32. lm_deluge-0.0.5/src/lm_deluge/api_requests/google.py +0 -0
  33. lm_deluge-0.0.5/src/lm_deluge/api_requests/vertex.py +0 -361
  34. lm_deluge-0.0.5/src/lm_deluge/util/pdf.py +0 -45
  35. lm_deluge-0.0.5/src/lm_deluge.egg-info/PKG-INFO +0 -127
  36. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/setup.cfg +0 -0
  37. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/api_requests/__init__.py +0 -0
  38. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/api_requests/anthropic.py +0 -0
  39. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
  40. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
  41. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
  42. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/cache.py +0 -0
  43. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/embed.py +0 -0
  44. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/errors.py +0 -0
  45. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/gemini_limits.py +0 -0
  46. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/llm_tools/__init__.py +0 -0
  47. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/llm_tools/extract.py +0 -0
  48. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/llm_tools/score.py +0 -0
  49. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/llm_tools/translate.py +0 -0
  50. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/rerank.py +0 -0
  51. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/sampling_params.py +0 -0
  52. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/tool.py +0 -0
  53. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/tracker.py +0 -0
  54. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/util/logprobs.py +0 -0
  55. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge/util/validation.py +0 -0
  56. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
  57. {lm_deluge-0.0.5 → lm_deluge-0.0.7}/src/lm_deluge.egg-info/top_level.txt +0 -0
@@ -0,0 +1,163 @@
1
+ Metadata-Version: 2.4
2
+ Name: lm_deluge
3
+ Version: 0.0.7
4
+ Summary: Python utility for using LLM API models.
5
+ Author-email: Benjamin Anderson <ben@trytaylor.ai>
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: python-dotenv
9
+ Requires-Dist: json5
10
+ Requires-Dist: PyYAML
11
+ Requires-Dist: pandas
12
+ Requires-Dist: aiohttp
13
+ Requires-Dist: tiktoken
14
+ Requires-Dist: xxhash
15
+ Requires-Dist: tqdm
16
+ Requires-Dist: google-auth
17
+ Requires-Dist: requests-aws4auth
18
+ Requires-Dist: pydantic
19
+ Requires-Dist: bs4
20
+ Requires-Dist: lxml
21
+ Requires-Dist: pdf2image
22
+ Requires-Dist: pillow
23
+ Requires-Dist: fasttext-wheel
24
+ Requires-Dist: fasttext-langdetect
25
+
26
+ # lm_deluge
27
+
28
+ `lm_deluge` is a lightweight helper library for maxing out your rate limits with LLM providers. It provides the following:
29
+
30
+ - **Unified client** – Send prompts to all relevant models with a single client.
31
+ - **Massive concurrency with throttling** – Set `max_tokens_per_minute` and `max_requests_per_minute` and let it fly. The client will process as many requests as possible while respecting rate limits and retrying failures.
32
+ - **Spray across models/providers** – Configure a client with multiple models from any provider(s), and sampling weights. The client samples a model for each request.
33
+ - **Caching** – Save completions in a local or distributed cache to avoid repeated LLM calls to process the same input.
34
+ - **Convenient message constructor** – No more looking up how to build an Anthropic messages list with images. Our `Conversation` and `Message` classes work great with our client or with the `openai` and `anthropic` packages.
35
+ - **Sync and async APIs** – Use the client from sync or async code.
36
+
37
+ **STREAMING IS NOT IN SCOPE.** There are plenty of packages that let you stream chat completions across providers. The sole purpose of this package is to do very fast batch inference using APIs. Sorry!
38
+
39
+ ## Installation
40
+
41
+ ```bash
42
+ pip install lm-deluge
43
+ ```
44
+
45
+ The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables.
46
+
47
+ ## Quickstart
48
+
49
+ The easiest way to get started is with the `.basic` constructor. This uses sensible default arguments for rate limits and sampling parameters so that you don't have to provide a ton of arguments.
50
+
51
+ ```python
52
+ from lm_deluge import LLMClient
53
+
54
+ client = LLMClient.basic("gpt-4o-mini")
55
+ resps = client.process_prompts_sync(["Hello, world!"])
56
+ print(resps[0].completion)
57
+ ```
58
+
59
+ ## Spraying Across Models
60
+
61
+ To distribute your requests across models, just provide a list of more than one model to the constructor. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:
62
+
63
+ ```python
64
+ from lm_deluge import LLMClient
65
+
66
+ client = LLMClient.basic(
67
+ ["gpt-4o-mini", "claude-haiku-anthropic"],
68
+ max_requests_per_minute=10_000
69
+ )
70
+ resps = client.process_prompts_sync(
71
+ ["Hello, ChatGPT!", "Hello, Claude!"]
72
+ )
73
+ print(resps[0].completion)
74
+ ```
75
+
76
+ ## Configuration
77
+
78
+ API calls can be customized in a few ways.
79
+
80
+ 1. **Sampling Parameters.** This determines things like structured outputs, maximum completion tokens, nucleus sampling, etc. Provide a custom `SamplingParams` to the `LLMClient` to set temperature, top_p, json_mode, max_new_tokens, and/or reasoning_effort.
81
+
82
+ You can pass 1 `SamplingParams` to use for all models, or a list of `SamplingParams` that's the same length as the list of models. You can also pass many of these arguments directly to `LLMClient.basic` so you don't have to construct an entire `SamplingParams` object.
83
+
84
+
85
+ 2. **Arguments to LLMClient.** This is where you set request timeout, rate limits, model name(s), model weight(s) for distributing requests across models, retries, and caching.
86
+ 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object).
87
+
88
+ Putting it all together:
89
+
90
+ ```python
91
+ from lm_deluge import LLMClient, SamplingParams
92
+
93
+ client = LLMClient(
94
+ "gpt-4",
95
+ max_requests_per_minute=100,
96
+ max_tokens_per_minute=100_000,
97
+ max_concurrent_requests=500,
98
+ sampling_params=SamplingParams(temperature=0.5, max_new_tokens=30)
99
+ )
100
+
101
+ await client.process_prompts_async(
102
+ ["What is the capital of Mars?"],
103
+ show_progress=False,
104
+ return_completions_only=True
105
+ )
106
+ ```
107
+
108
+ ## Multi-Turn Conversations
109
+
110
+ Constructing conversations to pass to models is notoriously annoying. Each provider has a slightly different way of defining a list of messages, and with the introduction of images/multi-part messages it's only gotten worse. We provide convenience constructors so you don't have to remember all that stuff.
111
+
112
+ ```python
113
+ from lm_deluge import LLMClient, Message, Conversation
114
+
115
+ prompt = Conversation.system("You are a helpful assistant.").add(
116
+ Message.user("What's in this image?").add_image("tests/image.jpg")
117
+ )
118
+
119
+ client = LLMClient.basic("gpt-4.1-mini")
120
+ resps = client.process_prompts_sync([prompt])
121
+ ```
122
+
123
+ This just works. Images can be local images on disk, URLs, bytes, base64 data URLs... go wild. You can use `Conversation.to_openai` or `Conversation.to_anthropic` to format your messages for the OpenAI or Anthropic clients directly.
124
+
125
+ ## Caching
126
+
127
+ `lm_deluge.cache` includes LevelDB, SQLite and custom dictionary based caches. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
128
+
129
+ **IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).
130
+
131
+ ## Asynchronous Client
132
+ Use this in asynchronous code, or in a Jupyter notebook. If you try to use the sync client in a Jupyter notebook, you'll have to use `nest-asyncio`, because internally the sync client uses async code. Don't do it! Just use the async client!
133
+
134
+ ```python
135
+ import asyncio
136
+
137
+ async def main():
138
+ responses = await client.process_prompts_async(
139
+ ["an async call"],
140
+ return_completions_only=True,
141
+ )
142
+ print(responses[0])
143
+
144
+ asyncio.run(main())
145
+ ```
146
+
147
+ ## Available Models
148
+
149
+ We support all models in `src/lm_deluge/models.py`. An older version of this client supported Bedrock and Vertex. We plan to re-implement Bedrock support (our previous support was spotty and we need to figure out cross-region inference in order to support the newest Claude models). Vertex support is not currently planned, since Google allows you to connect your Vertex account to AI Studio, and Vertex authentication is a huge pain (requires service account credentials, etc.)
150
+
151
+ ## Feature Support
152
+
153
+ We support structured outputs via the `json_mode` parameter provided to `SamplingParams`. Structured outputs with a schema are planned. Reasoning models are supported via the `reasoning_effort` parameter, which is translated to a thinking budget for Claude/Gemini. Image models are supported. We don't support tool use yet, but support is planned (keep an eye out for a unified tool definition spec that works for all models!). We support logprobs for OpenAI models that return them via the `logprobs` argument to the `LLMClient`.
154
+
155
+ ## Built‑in tools
156
+
157
+ The `lm_deluge.llm_tools` package exposes a few helper functions:
158
+
159
+ - `extract` – structure text or images into a Pydantic model based on a schema.
160
+ - `translate` – translate a list of strings to English.
161
+ - `score_llm` – simple yes/no style scoring with optional log probability output.
162
+
163
+ Experimental embeddings (`embed.embed_parallel_async`) and document reranking (`rerank.rerank_parallel_async`) clients are also provided.
@@ -0,0 +1,138 @@
1
+ # lm_deluge
2
+
3
+ `lm_deluge` is a lightweight helper library for maxing out your rate limits with LLM providers. It provides the following:
4
+
5
+ - **Unified client** – Send prompts to all relevant models with a single client.
6
+ - **Massive concurrency with throttling** – Set `max_tokens_per_minute` and `max_requests_per_minute` and let it fly. The client will process as many requests as possible while respecting rate limits and retrying failures.
7
+ - **Spray across models/providers** – Configure a client with multiple models from any provider(s), and sampling weights. The client samples a model for each request.
8
+ - **Caching** – Save completions in a local or distributed cache to avoid repeated LLM calls to process the same input.
9
+ - **Convenient message constructor** – No more looking up how to build an Anthropic messages list with images. Our `Conversation` and `Message` classes work great with our client or with the `openai` and `anthropic` packages.
10
+ - **Sync and async APIs** – Use the client from sync or async code.
11
+
12
+ **STREAMING IS NOT IN SCOPE.** There are plenty of packages that let you stream chat completions across providers. The sole purpose of this package is to do very fast batch inference using APIs. Sorry!
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ pip install lm-deluge
18
+ ```
19
+
20
+ The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GOOGLE_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables.
21
+
22
+ ## Quickstart
23
+
24
+ The easiest way to get started is with the `.basic` constructor. This uses sensible default arguments for rate limits and sampling parameters so that you don't have to provide a ton of arguments.
25
+
26
+ ```python
27
+ from lm_deluge import LLMClient
28
+
29
+ client = LLMClient.basic("gpt-4o-mini")
30
+ resps = client.process_prompts_sync(["Hello, world!"])
31
+ print(resps[0].completion)
32
+ ```
33
+
34
+ ## Spraying Across Models
35
+
36
+ To distribute your requests across models, just provide a list of more than one model to the constructor. The rate limits for the client apply to the client as a whole, not per-model, so you may want to increase them:
37
+
38
+ ```python
39
+ from lm_deluge import LLMClient
40
+
41
+ client = LLMClient.basic(
42
+ ["gpt-4o-mini", "claude-haiku-anthropic"],
43
+ max_requests_per_minute=10_000
44
+ )
45
+ resps = client.process_prompts_sync(
46
+ ["Hello, ChatGPT!", "Hello, Claude!"]
47
+ )
48
+ print(resps[0].completion)
49
+ ```
50
+
51
+ ## Configuration
52
+
53
+ API calls can be customized in a few ways.
54
+
55
+ 1. **Sampling Parameters.** This determines things like structured outputs, maximum completion tokens, nucleus sampling, etc. Provide a custom `SamplingParams` to the `LLMClient` to set temperature, top_p, json_mode, max_new_tokens, and/or reasoning_effort.
56
+
57
+ You can pass 1 `SamplingParams` to use for all models, or a list of `SamplingParams` that's the same length as the list of models. You can also pass many of these arguments directly to `LLMClient.basic` so you don't have to construct an entire `SamplingParams` object.
58
+
59
+
60
+ 2. **Arguments to LLMClient.** This is where you set request timeout, rate limits, model name(s), model weight(s) for distributing requests across models, retries, and caching.
61
+ 3. **Arguments to process_prompts.** Per-call, you can set verbosity, whether to display progress, and whether to return just completions (rather than the full APIResponse object).
62
+
63
+ Putting it all together:
64
+
65
+ ```python
66
+ from lm_deluge import LLMClient, SamplingParams
67
+
68
+ client = LLMClient(
69
+ "gpt-4",
70
+ max_requests_per_minute=100,
71
+ max_tokens_per_minute=100_000,
72
+ max_concurrent_requests=500,
73
+ sampling_params=SamplingParams(temperature=0.5, max_new_tokens=30)
74
+ )
75
+
76
+ await client.process_prompts_async(
77
+ ["What is the capital of Mars?"],
78
+ show_progress=False,
79
+ return_completions_only=True
80
+ )
81
+ ```
82
+
83
+ ## Multi-Turn Conversations
84
+
85
+ Constructing conversations to pass to models is notoriously annoying. Each provider has a slightly different way of defining a list of messages, and with the introduction of images/multi-part messages it's only gotten worse. We provide convenience constructors so you don't have to remember all that stuff.
86
+
87
+ ```python
88
+ from lm_deluge import LLMClient, Message, Conversation
89
+
90
+ prompt = Conversation.system("You are a helpful assistant.").add(
91
+ Message.user("What's in this image?").add_image("tests/image.jpg")
92
+ )
93
+
94
+ client = LLMClient.basic("gpt-4.1-mini")
95
+ resps = client.process_prompts_sync([prompt])
96
+ ```
97
+
98
+ This just works. Images can be local images on disk, URLs, bytes, base64 data URLs... go wild. You can use `Conversation.to_openai` or `Conversation.to_anthropic` to format your messages for the OpenAI or Anthropic clients directly.
99
+
100
+ ## Caching
101
+
102
+ `lm_deluge.cache` includes LevelDB, SQLite and custom dictionary based caches. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent across different `process_prompts_[...]` calls.
103
+
104
+ **IMPORTANT:** Caching does not currently work for prompts in the SAME batch. That is, if you call `process_prompts_sync` with the same prompt 100 times, there will be 0 cache hits. If you call `process_prompts_sync` a *second* time with those same 100 prompts, all 100 will be cache hits. The cache is intended to be persistent and help you save costs across many invocations, but it can't help with a single batch-inference session (yet!).
105
+
106
+ ## Asynchronous Client
107
+ Use this in asynchronous code, or in a Jupyter notebook. If you try to use the sync client in a Jupyter notebook, you'll have to use `nest-asyncio`, because internally the sync client uses async code. Don't do it! Just use the async client!
108
+
109
+ ```python
110
+ import asyncio
111
+
112
+ async def main():
113
+ responses = await client.process_prompts_async(
114
+ ["an async call"],
115
+ return_completions_only=True,
116
+ )
117
+ print(responses[0])
118
+
119
+ asyncio.run(main())
120
+ ```
121
+
122
+ ## Available Models
123
+
124
+ We support all models in `src/lm_deluge/models.py`. An older version of this client supported Bedrock and Vertex. We plan to re-implement Bedrock support (our previous support was spotty and we need to figure out cross-region inference in order to support the newest Claude models). Vertex support is not currently planned, since Google allows you to connect your Vertex account to AI Studio, and Vertex authentication is a huge pain (requires service account credentials, etc.)
125
+
126
+ ## Feature Support
127
+
128
+ We support structured outputs via the `json_mode` parameter provided to `SamplingParams`. Structured outputs with a schema are planned. Reasoning models are supported via the `reasoning_effort` parameter, which is translated to a thinking budget for Claude/Gemini. Image models are supported. We don't support tool use yet, but support is planned (keep an eye out for a unified tool definition spec that works for all models!). We support logprobs for OpenAI models that return them via the `logprobs` argument to the `LLMClient`.
129
+
130
+ ## Built‑in tools
131
+
132
+ The `lm_deluge.llm_tools` package exposes a few helper functions:
133
+
134
+ - `extract` – structure text or images into a Pydantic model based on a schema.
135
+ - `translate` – translate a list of strings to English.
136
+ - `score_llm` – simple yes/no style scoring with optional log probability output.
137
+
138
+ Experimental embeddings (`embed.embed_parallel_async`) and document reranking (`rerank.rerank_parallel_async`) clients are also provided.
@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]
3
3
 
4
4
  [project]
5
5
  name = "lm_deluge"
6
- version = "0.0.5"
6
+ version = "0.0.7"
7
7
  authors = [{ name = "Benjamin Anderson", email = "ben@trytaylor.ai" }]
8
8
  description = "Python utility for using LLM API models."
9
9
  readme = "README.md"
@@ -25,16 +25,8 @@ dependencies = [
25
25
  "pydantic",
26
26
  "bs4",
27
27
  "lxml",
28
- ]
29
-
30
- [project.optional-dependencies]
31
- image = ["pdf2image", "pillow"]
32
- pdf = ["pdf2image", "pymupdf"]
33
- translate = ["fasttext-wheel", "fasttext-langdetect"]
34
- full = [
35
- "pillow",
36
28
  "pdf2image",
37
- "pymupdf",
29
+ "pillow",
38
30
  "fasttext-wheel",
39
31
  "fasttext-langdetect",
40
32
  ]
@@ -0,0 +1,7 @@
1
+ from .client import LLMClient, SamplingParams, APIResponse
2
+ from .prompt import Conversation, Message
3
+ import dotenv
4
+
5
+ dotenv.load_dotenv()
6
+
7
+ __all__ = ["LLMClient", "SamplingParams", "APIResponse", "Conversation", "Message"]
@@ -41,6 +41,7 @@ class APIResponse:
41
41
  logprobs: list | None = None
42
42
  finish_reason: str | None = None # make required later
43
43
  cost: float | None = None # calculated automatically
44
+ cache_hit: bool = False # manually set if true
44
45
  # set to true if is_error and should be retried with a different model
45
46
  retry_with_different_model: bool | None = False
46
47
  # set to true if should NOT retry with the same model (unrecoverable error)
@@ -0,0 +1,9 @@
1
+ from .openai import OpenAIRequest
2
+ from .anthropic import AnthropicRequest
3
+ from .mistral import MistralRequest
4
+
5
+ CLASSES = {
6
+ "openai": OpenAIRequest,
7
+ "anthropic": AnthropicRequest,
8
+ "mistral": MistralRequest,
9
+ }
@@ -0,0 +1,132 @@
1
+ # # https://docs.cohere.com/reference/chat
2
+ # # https://cohere.com/pricing
3
+ # import asyncio
4
+ # from aiohttp import ClientResponse
5
+ # import json
6
+ # import os
7
+ # from tqdm import tqdm
8
+ # from typing import Callable
9
+ # from lm_deluge.prompt import Conversation
10
+ # from .base import APIRequestBase, APIResponse
11
+
12
+ # from ..tracker import StatusTracker
13
+ # from ..sampling_params import SamplingParams
14
+ # from ..models import APIModel
15
+
16
+
17
+ # class CohereRequest(APIRequestBase):
18
+ # def __init__(
19
+ # self,
20
+ # task_id: int,
21
+ # # should always be 'role', 'content' keys.
22
+ # # internal logic should handle translating to specific API format
23
+ # model_name: str, # must correspond to registry
24
+ # prompt: Conversation,
25
+ # attempts_left: int,
26
+ # status_tracker: StatusTracker,
27
+ # results_arr: list,
28
+ # retry_queue: asyncio.Queue,
29
+ # request_timeout: int = 30,
30
+ # sampling_params: SamplingParams = SamplingParams(),
31
+ # pbar: tqdm | None = None,
32
+ # callback: Callable | None = None,
33
+ # debug: bool = False,
34
+ # all_model_names: list[str] | None = None,
35
+ # all_sampling_params: list[SamplingParams] | None = None,
36
+ # ):
37
+ # super().__init__(
38
+ # task_id=task_id,
39
+ # model_name=model_name,
40
+ # prompt=prompt,
41
+ # attempts_left=attempts_left,
42
+ # status_tracker=status_tracker,
43
+ # retry_queue=retry_queue,
44
+ # results_arr=results_arr,
45
+ # request_timeout=request_timeout,
46
+ # sampling_params=sampling_params,
47
+ # pbar=pbar,
48
+ # callback=callback,
49
+ # debug=debug,
50
+ # all_model_names=all_model_names,
51
+ # all_sampling_params=all_sampling_params,
52
+ # )
53
+ # self.system_message = None
54
+ # self.last_user_message = None
55
+
56
+ # self.model = APIModel.from_registry(model_name)
57
+ # self.url = f"{self.model.api_base}/chat"
58
+ # messages = prompt.to_cohere()
59
+
60
+ # self.request_header = {
61
+ # "Authorization": f"bearer {os.getenv(self.model.api_key_env_var)}",
62
+ # "content-type": "application/json",
63
+ # "accept": "application/json",
64
+ # }
65
+
66
+ # self.request_json = {
67
+ # "model": self.model.name,
68
+ # "messages": messages,
69
+ # "temperature": sampling_params.temperature,
70
+ # "top_p": sampling_params.top_p,
71
+ # "max_tokens": sampling_params.max_new_tokens,
72
+ # }
73
+
74
+ # async def handle_response(self, http_response: ClientResponse) -> APIResponse:
75
+ # is_error = False
76
+ # error_message = None
77
+ # completion = None
78
+ # input_tokens = None
79
+ # output_tokens = None
80
+ # status_code = http_response.status
81
+ # mimetype = http_response.headers.get("Content-Type", None)
82
+ # if status_code >= 200 and status_code < 300:
83
+ # try:
84
+ # data = await http_response.json()
85
+ # except Exception:
86
+ # data = None
87
+ # is_error = True
88
+ # error_message = (
89
+ # f"Error calling .json() on response w/ status {status_code}"
90
+ # )
91
+ # if not is_error and isinstance(data, dict):
92
+ # try:
93
+ # completion = data["text"]
94
+ # input_tokens = data["meta"]["billed_units"]["input_tokens"]
95
+ # output_tokens = data["meta"]["billed_units"]["output_tokens"]
96
+ # except Exception:
97
+ # is_error = True
98
+ # error_message = f"Error getting 'text' or 'meta' from {self.model.name} response."
99
+ # elif mimetype is not None and "json" in mimetype.lower():
100
+ # is_error = True # expected status is 200, otherwise it's an error
101
+ # data = await http_response.json()
102
+ # error_message = json.dumps(data)
103
+
104
+ # else:
105
+ # is_error = True
106
+ # text = await http_response.text()
107
+ # error_message = text
108
+
109
+ # # handle special kinds of errors. TODO: make sure these are correct for anthropic
110
+ # if is_error and error_message is not None:
111
+ # if (
112
+ # "rate limit" in error_message.lower()
113
+ # or "overloaded" in error_message.lower()
114
+ # ):
115
+ # error_message += " (Rate limit error, triggering cooldown.)"
116
+ # self.status_tracker.rate_limit_exceeded()
117
+ # if "context length" in error_message:
118
+ # error_message += " (Context length exceeded, set retries to 0.)"
119
+ # self.attempts_left = 0
120
+
121
+ # return APIResponse(
122
+ # id=self.task_id,
123
+ # status_code=status_code,
124
+ # is_error=is_error,
125
+ # error_message=error_message,
126
+ # prompt=self.prompt,
127
+ # completion=completion,
128
+ # model_internal=self.model_name,
129
+ # sampling_params=self.sampling_params,
130
+ # input_tokens=input_tokens,
131
+ # output_tokens=output_tokens,
132
+ # )