lm-deluge 0.0.33__tar.gz → 0.0.35__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lm-deluge might be problematic.

Files changed (63)
  1. {lm_deluge-0.0.33/src/lm_deluge.egg-info → lm_deluge-0.0.35}/PKG-INFO +1 -1
  2. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/pyproject.toml +1 -1
  3. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/anthropic.py +3 -3
  4. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/gemini.py +6 -5
  5. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/openai.py +15 -4
  6. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/client.py +70 -80
  7. lm_deluge-0.0.33/src/lm_deluge/models.py → lm_deluge-0.0.35/src/lm_deluge/models/__init__.py +89 -4
  8. lm_deluge-0.0.35/src/lm_deluge/util/harmony.py +45 -0
  9. {lm_deluge-0.0.33 → lm_deluge-0.0.35/src/lm_deluge.egg-info}/PKG-INFO +1 -1
  10. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge.egg-info/SOURCES.txt +2 -1
  11. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/LICENSE +0 -0
  12. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/README.md +0 -0
  13. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/setup.cfg +0 -0
  14. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/__init__.py +0 -0
  15. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/agent.py +0 -0
  16. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/__init__.py +0 -0
  17. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/base.py +0 -0
  18. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/bedrock.py +0 -0
  19. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/common.py +0 -0
  20. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
  21. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/deprecated/cohere.py +0 -0
  22. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
  23. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
  24. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/deprecated/vertex.py +0 -0
  25. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/mistral.py +0 -0
  26. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/api_requests/response.py +0 -0
  27. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/batches.py +0 -0
  28. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/anthropic/__init__.py +0 -0
  29. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/anthropic/bash.py +0 -0
  30. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/anthropic/computer_use.py +0 -0
  31. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/anthropic/editor.py +0 -0
  32. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/base.py +0 -0
  33. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/built_in_tools/openai.py +0 -0
  34. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/cache.py +0 -0
  35. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/config.py +0 -0
  36. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/embed.py +0 -0
  37. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/errors.py +0 -0
  38. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/file.py +0 -0
  39. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/gemini_limits.py +0 -0
  40. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/image.py +0 -0
  41. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/__init__.py +0 -0
  42. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/classify.py +0 -0
  43. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/extract.py +0 -0
  44. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/locate.py +0 -0
  45. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/ocr.py +0 -0
  46. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/score.py +0 -0
  47. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/llm_tools/translate.py +0 -0
  48. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/prompt.py +0 -0
  49. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/request_context.py +0 -0
  50. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/rerank.py +0 -0
  51. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/tool.py +0 -0
  52. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/tracker.py +0 -0
  53. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/usage.py +0 -0
  54. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/util/json.py +0 -0
  55. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/util/logprobs.py +0 -0
  56. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/util/spatial.py +0 -0
  57. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/util/validation.py +0 -0
  58. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge/util/xml.py +0 -0
  59. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
  60. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge.egg-info/requires.txt +0 -0
  61. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/src/lm_deluge.egg-info/top_level.txt +0 -0
  62. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/tests/test_builtin_tools.py +0 -0
  63. {lm_deluge-0.0.33 → lm_deluge-0.0.35}/tests/test_native_mcp_server.py +0 -0
PKG-INFO:
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lm_deluge
-Version: 0.0.33
+Version: 0.0.35
 Summary: Python utility for using LLM API models.
 Author-email: Benjamin Anderson <ben@trytaylor.ai>
 Requires-Python: >=3.10
pyproject.toml:
@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]
 
 [project]
 name = "lm_deluge"
-version = "0.0.33"
+version = "0.0.35"
 authors = [{ name = "Benjamin Anderson", email = "ben@trytaylor.ai" }]
 description = "Python utility for using LLM API models."
 readme = "README.md"
src/lm_deluge/api_requests/anthropic.py:
@@ -57,9 +57,9 @@ def _build_anthropic_request(
     # handle thinking
     if model.reasoning_model and sampling_params.reasoning_effort:
         # translate reasoning effort of low, medium, high to budget tokens
-        budget = {"low": 1024, "medium": 4096, "high": 16384}.get(
-            sampling_params.reasoning_effort
-        )
+        budget = {
+            "minimal": 256, "low": 1024, "medium": 4096, "high": 16384
+        }.get(sampling_params.reasoning_effort)
         request_json["thinking"] = {
             "type": "enabled",
             "budget_tokens": budget,
src/lm_deluge/api_requests/gemini.py:
@@ -1,7 +1,7 @@
 import json
 import os
 import warnings
-
+from typing import Any
 from aiohttp import ClientResponse
 
 from lm_deluge.request_context import RequestContext
@@ -37,15 +37,16 @@ async def _build_gemini_request(
 
     # Handle reasoning models (thinking)
     if model.reasoning_model:
-        thinking_config = None
+        thinking_config: dict[str, Any] | None = None
         effort = sampling_params.reasoning_effort
         if effort is None or effort == "none":
+            budget = 128 if "2.5-pro" in model.id else 0
             # Explicitly disable thoughts when no effort is requested
-            thinking_config = {"includeThoughts": False, "thinkingBudget": 0}
+            thinking_config = {"includeThoughts": False, "thinkingBudget": budget}
         else:
             thinking_config = {"includeThoughts": True}
-            if effort in {"low", "medium", "high"} and "flash" in model.id:
-                budget = {"low": 1024, "medium": 4096, "high": 16384}[effort]
+            if effort in {"minimal", "low", "medium", "high"} and "flash" in model.id:
+                budget = {"minimal": 256, "low": 1024, "medium": 4096, "high": 16384}[effort]
                 thinking_config["thinkingBudget"] = budget
         request_json["generationConfig"]["thinkingConfig"] = thinking_config
 
src/lm_deluge/api_requests/openai.py:
@@ -42,8 +42,13 @@ async def _build_oa_chat_request(
             # Disable reasoning for Gemini models when no effort requested
             if "gemini" in model.id:
                 effort = "none"
+            elif "gpt-5" in model.id:
+                effort = "minimal"
             else:
                 effort = "low"
+        if effort == "minimal" and "gpt-5" not in model.id:
+            print("WARNING: 'minimal' reasoning effort only allowed for gpt-5. setting to 'low'.")
+            effort = "low"
         request_json["reasoning_effort"] = effort
     else:
        if sampling_params.reasoning_effort:
@@ -122,15 +127,21 @@ class OpenAIRequest(APIRequestBase):
            message = data["choices"][0]["message"]
            finish_reason = data["choices"][0]["finish_reason"]
 
-            # Add text content if present
-            if message.get("content"):
-                parts.append(Text(message["content"]))
-
            # Add thinking content if present (reasoning models)
            if "reasoning_content" in message:
                thinking = message["reasoning_content"]
                parts.append(Thinking(thinking))
 
+            # Together AI returns reasoning in a "reasoning"
+            # field which is not correct but whatever
+            if message.get("reasoning"):
+                thinking = message["reasoning"]
+                parts.append(Thinking(thinking))
+
+            # Add text content if present
+            if message.get("content"):
+                parts.append(Text(message["content"]))
+
            # Add tool calls if present
            if "tool_calls" in message:
                for tool_call in message["tool_calls"]:
src/lm_deluge/client.py:
@@ -1,6 +1,6 @@
 import asyncio
 import random
-from typing import Any, Literal, Self, Sequence, overload
+from typing import Any, Literal, Self, Sequence, Callable, overload
 
 import numpy as np
 import yaml
@@ -22,8 +22,6 @@ from .models import APIModel, registry
 from .request_context import RequestContext
 from .tracker import StatusTracker
 
-
-# TODO: get completions as they finish, not all at once at the end.
 # TODO: add optional max_input_tokens to client so we can reject long prompts to prevent abuse
 class _LLMClient(BaseModel):
     """
@@ -55,6 +53,9 @@ class _LLMClient(BaseModel):
     # Progress configuration
     progress: Literal["rich", "tqdm", "manual"] = "rich"
 
+    # Postprocessing - run on every APIResponse
+    postprocess: Callable[[APIResponse], APIResponse] | None = None
+
     # Internal state for async task handling
     _next_task_id: int = PrivateAttr(default=0)
     _tasks: dict[int, asyncio.Task] = PrivateAttr(default_factory=dict)
@@ -196,14 +197,6 @@ class _LLMClient(BaseModel):
         config_dict = yaml.safe_load(open(file_path))
         return cls.from_dict(config_dict)
 
-    @classmethod
-    def basic(cls, model: str | list[str], **kwargs):
-        """
-        Doesn't do anything differently now, kept for backwards compat.
-        """
-        kwargs["model_names"] = model
-        return cls(**kwargs)
-
     def _select_model(self):
         assert isinstance(self.model_weights, list)
         model_idx = np.random.choice(range(len(self.models)), p=self.model_weights)
@@ -254,13 +247,18 @@ class _LLMClient(BaseModel):
     ) -> APIResponse:
         """Handle caching and single HTTP call for a request. Failed requests go to retry queue."""
         # Check cache first
+        def _maybe_postprocess(response: APIResponse):
+            if self.postprocess:
+                return self.postprocess(response)
+            return response
+
         if self.cache:
             cached = self.cache.get(context.prompt)
             if cached:
                 cached.local_cache_hit = True
                 if context.status_tracker:
                     context.status_tracker.task_succeeded(context.task_id)
-                return cached
+                return _maybe_postprocess(cached)
 
         # Execute single request
         assert context.status_tracker
@@ -275,7 +273,7 @@ class _LLMClient(BaseModel):
                self.cache.put(context.prompt, response)
            # Call callback if provided
            context.maybe_callback(response, context.status_tracker)
-            return response
+            return _maybe_postprocess(response)
 
        # Handle error response - add to retry queue if available
        if retry_queue and context.attempts_left > 1:
@@ -303,7 +301,7 @@ class _LLMClient(BaseModel):
 
            # Add to retry queue for later processing
            await retry_queue.put(retry_context)
-            return response  # Return the error response for now
+            return _maybe_postprocess(response)  # Return the error response for now
 
        # No retries left or no retry queue - final failure
        context.status_tracker.task_failed(context.task_id)
@@ -316,7 +314,7 @@ class _LLMClient(BaseModel):
            error_msg += f" Message: {response.error_message}. Giving up."
        print(error_msg)
 
-        return response
+        return _maybe_postprocess(response)
 
    @overload
    async def process_prompts_async(
@@ -570,6 +568,8 @@ class _LLMClient(BaseModel):
                print(item, end="", flush=True)
            else:
                # final item
+                if self.postprocess:
+                    return self.postprocess(item)
                return item
 
    async def run_agent_loop(
@@ -712,71 +712,59 @@ class _LLMClient(BaseModel):
            batch_ids, provider, poll_interval=30
        )
 
+# factory function -- allows positional model names,
+# keeps pydantic validation, without sacrificing IDE support
+@overload
+def LLMClient(
+    model_names: str,
+    *,
+    max_requests_per_minute: int = 1_000,
+    max_tokens_per_minute: int = 100_000,
+    max_concurrent_requests: int = 225,
+    sampling_params: list[SamplingParams] | None = None,
+    model_weights: list[float] | Literal["uniform", "dynamic"] = "uniform",
+    max_attempts: int = 5,
+    request_timeout: int = 30,
+    cache: Any = None,
+    extra_headers: dict[str, str] | None = None,
+    temperature: float = 0.75,
+    top_p: float = 1.0,
+    json_mode: bool = False,
+    max_new_tokens: int = 512,
+    reasoning_effort: Literal["low", "medium", "high", None] = None,
+    logprobs: bool = False,
+    top_logprobs: int | None = None,
+    force_local_mcp: bool = False,
+    progress: Literal["rich", "tqdm", "manual"] = "rich",
+    postprocess: Callable[[APIResponse], APIResponse] | None = None
+) -> _LLMClient: ...
+
 
-# def api_prompts_dry_run(
-#     ids: np.ndarray | list[int],
-#     prompts: list[Conversation],
-#     models: str | list[str],
-#     model_weights: list[float],
-#     sampling_params: list[SamplingParams],
-#     max_tokens_per_minute: int = 500_000,
-#     max_requests_per_minute: int = 1_000,
-# ):
-#     """
-#     Count tokens and estimate costs for a batch of prompts.
-#     """
-#     results = []
-#     for i, prompt in zip(ids, prompts):
-#         # choose a model
-#         model_idx = np.random.choice(range(len(models)), p=model_weights)
-#         model = models[model_idx]
-
-#         # dry run
-#         input_tokens, output_tokens, min_cost, max_cost = prompt.dry_run(
-#             model, sampling_params[model_idx].max_new_tokens
-#         )
-#         results.append(
-#             {
-#                 "id": i,
-#                 "input_tokens": input_tokens,
-#                 "output_tokens": output_tokens,
-#                 "min_cost": min_cost,
-#                 "max_cost": max_cost,
-#             }
-#         )
-
-#     combined_results: dict[str, Any] = {
-#         "total_input_tokens": sum([r["input_tokens"] for r in results]),
-#         "total_output_tokens": sum([r["output_tokens"] for r in results]),
-#         "total_min_cost": sum([r["min_cost"] for r in results]),
-#         "total_max_cost": sum([r["max_cost"] for r in results]),
-#     }
-#     minimum_time_tpm = combined_results["total_input_tokens"] / max_tokens_per_minute
-#     maximum_time_tpm = (
-#         combined_results["total_input_tokens"] + combined_results["total_output_tokens"]
-#     ) / max_tokens_per_minute
-#     minimum_time_rpm = len(prompts) / max_requests_per_minute
-
-#     combined_results["minimum_time"] = max(minimum_time_tpm, minimum_time_rpm)
-#     combined_results["maximum_time"] = max(maximum_time_tpm, minimum_time_rpm)
-#     limiting_factor = None
-#     if minimum_time_rpm > maximum_time_tpm:
-#         limiting_factor = "requests"
-#     elif minimum_time_rpm < minimum_time_tpm:
-#         limiting_factor = "tokens"
-#     else:
-#         limiting_factor = "depends"
-#     combined_results["limiting_factor"] = limiting_factor
-
-#     return combined_results
-
-
-# Clean factory function with perfect IDE support
 @overload
-def LLMClient(model_names: str, **kwargs) -> _LLMClient: ...
+def LLMClient(
+    model_names: list[str],
+    *,
+    max_requests_per_minute: int = 1_000,
+    max_tokens_per_minute: int = 100_000,
+    max_concurrent_requests: int = 225,
+    sampling_params: list[SamplingParams] | None = None,
+    model_weights: list[float] | Literal["uniform", "dynamic"] = "uniform",
+    max_attempts: int = 5,
+    request_timeout: int = 30,
+    cache: Any = None,
+    extra_headers: dict[str, str] | None = None,
+    temperature: float = 0.75,
+    top_p: float = 1.0,
+    json_mode: bool = False,
+    max_new_tokens: int = 512,
+    reasoning_effort: Literal["low", "medium", "high", None] = None,
+    logprobs: bool = False,
+    top_logprobs: int | None = None,
+    force_local_mcp: bool = False,
+    progress: Literal["rich", "tqdm", "manual"] = "rich",
+    postprocess: Callable[[APIResponse], APIResponse] | None = None
+) -> _LLMClient: ...
 
-@overload
-def LLMClient(model_names: list[str], **kwargs) -> _LLMClient: ...
 
 def LLMClient(
     model_names: str | list[str] = "gpt-4.1-mini",
@@ -799,21 +787,22 @@ def LLMClient(
    top_logprobs: int | None = None,
    force_local_mcp: bool = False,
    progress: Literal["rich", "tqdm", "manual"] = "rich",
+    postprocess: Callable[[APIResponse], APIResponse] | None = None
 ) -> _LLMClient:
    """
    Create an LLMClient with model_names as a positional argument.
-
+
    Args:
        model_names: Model name(s) to use - can be a single string or list of strings
        **kwargs: All other LLMClient configuration options (keyword-only)
-
+
    Returns:
        Configured LLMClient instance
    """
    # Handle default for mutable argument
    if sampling_params is None:
        sampling_params = []
-
+
    # Simply pass everything to the Pydantic constructor
    return _LLMClient(
        model_names=model_names,
@@ -835,4 +824,5 @@ def LLMClient(
        top_logprobs=top_logprobs,
        force_local_mcp=force_local_mcp,
        progress=progress,
+        postprocess=postprocess
    )
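For context, a minimal sketch (not from the package docs) of wiring the new postprocess hook through the factory above, assuming the import paths shown elsewhere in this diff (LLMClient lives in lm_deluge/client.py, postprocess_harmony in the new lm_deluge/util/harmony.py):

from lm_deluge.client import LLMClient
from lm_deluge.util.harmony import postprocess_harmony

# hypothetical usage: route a raw-harmony model through the new hook so every
# APIResponse is split into Thinking/Text parts before it is returned; per the
# _maybe_postprocess helper it applies to cache hits, successes, and error returns
client = LLMClient(
    "gpt-oss-120b-together",          # Together-hosted model added in this release
    max_new_tokens=512,
    postprocess=postprocess_harmony,
)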
src/lm_deluge/models/__init__.py (moved from src/lm_deluge/models.py):
@@ -3,7 +3,7 @@ from __future__ import annotations
 import random
 from dataclasses import dataclass, field
 
-from .request_context import RequestContext
+from ..request_context import RequestContext
 
 BUILTIN_MODELS = {
     # `7MMM. ,MMF' mm
@@ -267,6 +267,62 @@ BUILTIN_MODELS = {
     # ░███
     # █████
     # ░░░░░
+    "gpt-5": {
+        "id": "gpt-5",
+        "name": "gpt-5",
+        "api_base": "https://api.openai.com/v1",
+        "api_key_env_var": "OPENAI_API_KEY",
+        "supports_json": False,
+        "supports_logprobs": True,
+        "supports_responses": True,
+        "api_spec": "openai",
+        "input_cost": 1.25,
+        "cached_input_cost": 0.125,
+        "output_cost": 10.0,
+        "reasoning_model": True,
+    },
+    "gpt-5-chat": {
+        "id": "gpt-5-chat",
+        "name": "gpt-5-chat-latest",
+        "api_base": "https://api.openai.com/v1",
+        "api_key_env_var": "OPENAI_API_KEY",
+        "supports_json": False,
+        "supports_logprobs": True,
+        "supports_responses": True,
+        "api_spec": "openai",
+        "input_cost": 1.25,
+        "cached_input_cost": 0.125,
+        "output_cost": 10.0,
+        "reasoning_model": False,
+    },
+    "gpt-5-mini": {
+        "id": "gpt-5-mini",
+        "name": "gpt-5-mini",
+        "api_base": "https://api.openai.com/v1",
+        "api_key_env_var": "OPENAI_API_KEY",
+        "supports_json": False,
+        "supports_logprobs": True,
+        "supports_responses": True,
+        "api_spec": "openai",
+        "input_cost": 0.25,
+        "cached_input_cost": 0.025,
+        "output_cost": 2.0,
+        "reasoning_model": True,
+    },
+    "gpt-5-nano": {
+        "id": "gpt-5-nano",
+        "name": "gpt-5-nano",
+        "api_base": "https://api.openai.com/v1",
+        "api_key_env_var": "OPENAI_API_KEY",
+        "supports_json": False,
+        "supports_logprobs": True,
+        "supports_responses": True,
+        "api_spec": "openai",
+        "input_cost": 0.05,
+        "cached_input_cost": 0.005,
+        "output_cost": 0.40,
+        "reasoning_model": True,
+    },
     "openai-computer-use-preview": {
         "id": "openai-computer-use-preview",
         "name": "computer-use-preview",
@@ -971,6 +1027,32 @@ BUILTIN_MODELS = {
         "requests_per_minute": None,
         "tokens_per_minute": None,
     },
+    "gpt-oss-120b-together": {
+        "id": "gpt-oss-120b-together",
+        "name": "openai/gpt-oss-120b",
+        "api_base": "https://api.together.xyz/v1",
+        "api_key_env_var": "TOGETHER_API_KEY",
+        "supports_json": False,
+        "api_spec": "openai",
+        "input_cost": 0.18,
+        "output_cost": 0.59,
+        "requests_per_minute": None,
+        "tokens_per_minute": None,
+        "reasoning_model": True
+    },
+    "gpt-oss-20b-together": {
+        "id": "gpt-oss-20b-together",
+        "name": "openai/gpt-oss-20b",
+        "api_base": "https://api.together.xyz/v1",
+        "api_key_env_var": "TOGETHER_API_KEY",
+        "supports_json": False,
+        "api_spec": "openai",
+        "input_cost": 0.18,
+        "output_cost": 0.59,
+        "requests_per_minute": None,
+        "tokens_per_minute": None,
+        "reasoning_model": True
+    },
     # █████████ █████
     # ███░░░░░███ ░░███
     # ███ ░░░ ██████ ░███████ ██████ ████████ ██████
@@ -1210,6 +1292,7 @@ class APIModel:
    api_base: str
    api_key_env_var: str
    api_spec: str
+    cached_input_cost: float | None = 0
    input_cost: float | None = 0  # $ per million input tokens
    output_cost: float | None = 0  # $ per million output tokens
    supports_json: bool = False
@@ -1242,7 +1325,7 @@ class APIModel:
        random.sample(regions, 1, counts=weights)[0]
 
    def make_request(self, context: RequestContext):  # -> "APIRequestBase"
-        from .api_requests.common import CLASSES
+        from ..api_requests.common import CLASSES
 
        api_spec = self.api_spec
        if (
 
@@ -1268,6 +1351,7 @@ def register_model(
    api_key_env_var: str,
    api_spec: str,
    input_cost: float | None = 0,  # $ per million input tokens
+    cached_input_cost: float | None = 0,
    output_cost: float | None = 0,  # $ per million output tokens
    supports_json: bool = False,
    supports_logprobs: bool = False,
@@ -1275,7 +1359,7 @@ def register_model(
    reasoning_model: bool = False,
    regions: list[str] | dict[str, int] = field(default_factory=list),
    tokens_per_minute: int | None = None,
-    requests_per_minute: int | None = None
+    requests_per_minute: int | None = None,
 ) -> APIModel:
    """Register a model configuration and return the created APIModel."""
    model = APIModel(
@@ -1284,6 +1368,7 @@ def register_model(
        api_base=api_base,
        api_key_env_var=api_key_env_var,
        api_spec=api_spec,
+        cached_input_cost=cached_input_cost,
        input_cost=input_cost,
        output_cost=output_cost,
        supports_json=supports_json,
@@ -1292,7 +1377,7 @@ def register_model(
        reasoning_model=reasoning_model,
        regions=regions,
        tokens_per_minute=tokens_per_minute,
-        requests_per_minute=requests_per_minute
+        requests_per_minute=requests_per_minute,
    )
    registry[model.id] = model
    return model
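A hypothetical registration (names and costs invented) showing the new cached_input_cost parameter; the leading id, name, and api_base keyword names are assumed from the APIModel fields above, since they fall outside this hunk:

from lm_deluge.models import register_model

register_model(
    id="my-gpt-5-mini-proxy",               # invented example id
    name="gpt-5-mini",
    api_base="https://example-proxy.invalid/v1",
    api_key_env_var="EXAMPLE_PROXY_API_KEY",
    api_spec="openai",
    input_cost=0.25,
    cached_input_cost=0.025,                 # new in 0.0.35, $ per million cached input tokens
    output_cost=2.0,
    reasoning_model=True,
)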
src/lm_deluge/util/harmony.py (new file):
@@ -0,0 +1,45 @@
+# sample thing we'd want to parse from llama.cpp
+# the goal here is: barebones inference implementation returns
+# raw harmony string; we parse into content blocks
+
+# implied: <|start|>assistant
+# <|channel|>analysis<|message|>We need to respond as a helpful assistant. The user says "who are you and what do you want with my family?" This is a normal question. We should answer that we are ChatGPT, an AI language model, and we don't want anything with their family. We reassure them.<|start|>assistant<|channel|>final<|message|>I’m ChatGPT, a large language‑model AI created by OpenAI. I don’t have personal intentions or desires, and I’m not able to interact with anyone outside of this chat. My only goal here is to provide information, answer questions, and help you with whatever you need—nothing more, nothing less. If you have any concerns or need help with something specific, just let me know!
+#
+import copy
+from lm_deluge.api_requests.response import APIResponse
+from lm_deluge.prompt import Text, Thinking
+
+SAMPLE_INPUT = '''
+<|channel|>analysis<|message|>We need to respond as a helpful assistant. The user says "who are you and what do you want with my family?" This is a normal question. We should answer that we are ChatGPT, an AI language model, and we don't want anything with their family. We reassure them.<|start|>assistant<|channel|>final<|message|>I’m ChatGPT, a large language‑model AI created by OpenAI. I don’t have personal intentions or desires, and I’m not able to interact with anyone outside of this chat. My only goal here is to provide information, answer questions, and help you with whatever you need—nothing more, nothing less. If you have any concerns or need help with something specific, just let me know!
+'''.strip()
+
+def _split_messages(response: str):
+    raw_messages = response.split("<|start|>")
+    messages = []
+    for msg in raw_messages:
+        channel, content = msg.split("<|message|>")
+        channel = channel.split("<|channel|>")[1]
+        messages.append((channel, content))
+
+    return messages
+
+def postprocess_harmony(response: APIResponse) -> APIResponse:
+    if not response.content:
+        return response
+
+    parts = response.content.parts
+    assert len(parts) == 1, "expected 1 parts to convert harmony"
+    text = parts[0].text  # type: ignore
+    messages = _split_messages(text)
+
+    new_parts = []
+    for channel, content in messages:
+        if channel == "analysis":
+            new_parts.append(Thinking(content=content))
+        elif channel == "final":
+            new_parts.append(Text(text=content))
+
+    new_response = copy.deepcopy(response)
+    new_response.content.parts = new_parts  # type: ignore
+
+    return new_response
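A quick check of the splitting logic above, using the module's own SAMPLE_INPUT; _split_messages is private, so this sketch is for illustration only:

from lm_deluge.util.harmony import SAMPLE_INPUT, _split_messages

for channel, content in _split_messages(SAMPLE_INPUT):
    print(channel, "->", content[:60])
# expected channels, in order: "analysis" (chain of thought), then "final" (user-facing text)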
src/lm_deluge.egg-info/PKG-INFO:
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lm_deluge
-Version: 0.0.33
+Version: 0.0.35
 Summary: Python utility for using LLM API models.
 Author-email: Benjamin Anderson <ben@trytaylor.ai>
 Requires-Python: >=3.10
src/lm_deluge.egg-info/SOURCES.txt:
@@ -12,7 +12,6 @@ src/lm_deluge/errors.py
 src/lm_deluge/file.py
 src/lm_deluge/gemini_limits.py
 src/lm_deluge/image.py
-src/lm_deluge/models.py
 src/lm_deluge/prompt.py
 src/lm_deluge/request_context.py
 src/lm_deluge/rerank.py
@@ -51,6 +50,8 @@ src/lm_deluge/llm_tools/locate.py
 src/lm_deluge/llm_tools/ocr.py
 src/lm_deluge/llm_tools/score.py
 src/lm_deluge/llm_tools/translate.py
+src/lm_deluge/models/__init__.py
+src/lm_deluge/util/harmony.py
 src/lm_deluge/util/json.py
 src/lm_deluge/util/logprobs.py
 src/lm_deluge/util/spatial.py