lm-deluge 0.0.5__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- lm_deluge/__init__.py +2 -1
- lm_deluge/api_requests/base.py +1 -0
- lm_deluge/api_requests/common.py +2 -11
- lm_deluge/api_requests/deprecated/cohere.py +132 -0
- lm_deluge/api_requests/deprecated/vertex.py +361 -0
- lm_deluge/api_requests/{cohere.py → mistral.py} +37 -31
- lm_deluge/api_requests/openai.py +10 -1
- lm_deluge/client.py +2 -0
- lm_deluge/image.py +6 -0
- lm_deluge/models.py +348 -288
- lm_deluge/prompt.py +11 -9
- lm_deluge/util/json.py +4 -3
- lm_deluge/util/xml.py +11 -12
- lm_deluge-0.0.6.dist-info/METADATA +170 -0
- {lm_deluge-0.0.5.dist-info → lm_deluge-0.0.6.dist-info}/RECORD +17 -17
- lm_deluge/api_requests/google.py +0 -0
- lm_deluge/api_requests/vertex.py +0 -361
- lm_deluge-0.0.5.dist-info/METADATA +0 -127
- {lm_deluge-0.0.5.dist-info → lm_deluge-0.0.6.dist-info}/WHEEL +0 -0
- {lm_deluge-0.0.5.dist-info → lm_deluge-0.0.6.dist-info}/top_level.txt +0 -0
lm_deluge/__init__.py
CHANGED
@@ -1,6 +1,7 @@
 from .client import LLMClient, SamplingParams, APIResponse
+from .prompt import Conversation, Message
 import dotenv
 
 dotenv.load_dotenv()
 
-__all__ = ["LLMClient", "SamplingParams", "APIResponse"]
+__all__ = ["LLMClient", "SamplingParams", "APIResponse", "Conversation", "Message"]
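With 0.0.6 the prompt primitives are re-exported from the package root, so callers no longer have to reach into the `lm_deluge.prompt` submodule. A minimal sketch of the new import path (anything beyond the imports is assumed usage, not shown in the diff):

```python
# Sketch: Conversation and Message are importable from the package root as of 0.0.6.
from lm_deluge import LLMClient, Conversation, Message

# In 0.0.5 the same classes had to come from the submodule:
# from lm_deluge.prompt import Conversation, Message
```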
lm_deluge/api_requests/base.py
CHANGED
@@ -41,6 +41,7 @@ class APIResponse:
     logprobs: list | None = None
     finish_reason: str | None = None  # make required later
     cost: float | None = None  # calculated automatically
+    cache_hit: bool = False  # manually set if true
     # set to true if is_error and should be retried with a different model
     retry_with_different_model: bool | None = False
     # set to true if should NOT retry with the same model (unrecoverable error)
lm_deluge/api_requests/common.py
CHANGED
@@ -1,18 +1,9 @@
-# from .vertex import VertexAnthropicRequest, GeminiRequest
-# from .bedrock import BedrockAnthropicRequest, MistralBedrockRequest
-# from .deepseek import DeepseekRequest
 from .openai import OpenAIRequest
-from .cohere import CohereRequest
 from .anthropic import AnthropicRequest
+from .mistral import MistralRequest
 
 CLASSES = {
     "openai": OpenAIRequest,
-    # "deepseek": DeepseekRequest,
     "anthropic": AnthropicRequest,
-
-    # "vertex_gemini": GeminiRequest,
-    "cohere": CohereRequest,
-    # "bedrock_anthropic": BedrockAnthropicRequest,
-    # "bedrock_mistral": MistralBedrockRequest,
-    # "mistral": MistralRequest,
+    "mistral": MistralRequest,
 }
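`CLASSES` is the provider registry: it maps an API provider key to its request class, so registering Mistral and dropping Cohere changes which keys resolve. A small sketch of a lookup against the 0.0.6 registry:

```python
# Sketch: resolve a request class from the provider registry (0.0.6).
from lm_deluge.api_requests.common import CLASSES

request_cls = CLASSES["mistral"]  # -> MistralRequest, new in 0.0.6
# CLASSES["cohere"] now raises KeyError; CohereRequest was moved to deprecated/cohere.py
```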
lm_deluge/api_requests/deprecated/cohere.py
ADDED

@@ -0,0 +1,132 @@
+# # https://docs.cohere.com/reference/chat
+# # https://cohere.com/pricing
+# import asyncio
+# from aiohttp import ClientResponse
+# import json
+# import os
+# from tqdm import tqdm
+# from typing import Callable
+# from lm_deluge.prompt import Conversation
+# from .base import APIRequestBase, APIResponse
+
+# from ..tracker import StatusTracker
+# from ..sampling_params import SamplingParams
+# from ..models import APIModel
+
+
+# class CohereRequest(APIRequestBase):
+#     def __init__(
+#         self,
+#         task_id: int,
+#         # should always be 'role', 'content' keys.
+#         # internal logic should handle translating to specific API format
+#         model_name: str,  # must correspond to registry
+#         prompt: Conversation,
+#         attempts_left: int,
+#         status_tracker: StatusTracker,
+#         results_arr: list,
+#         retry_queue: asyncio.Queue,
+#         request_timeout: int = 30,
+#         sampling_params: SamplingParams = SamplingParams(),
+#         pbar: tqdm | None = None,
+#         callback: Callable | None = None,
+#         debug: bool = False,
+#         all_model_names: list[str] | None = None,
+#         all_sampling_params: list[SamplingParams] | None = None,
+#     ):
+#         super().__init__(
+#             task_id=task_id,
+#             model_name=model_name,
+#             prompt=prompt,
+#             attempts_left=attempts_left,
+#             status_tracker=status_tracker,
+#             retry_queue=retry_queue,
+#             results_arr=results_arr,
+#             request_timeout=request_timeout,
+#             sampling_params=sampling_params,
+#             pbar=pbar,
+#             callback=callback,
+#             debug=debug,
+#             all_model_names=all_model_names,
+#             all_sampling_params=all_sampling_params,
+#         )
+#         self.system_message = None
+#         self.last_user_message = None
+
+#         self.model = APIModel.from_registry(model_name)
+#         self.url = f"{self.model.api_base}/chat"
+#         messages = prompt.to_cohere()
+
+#         self.request_header = {
+#             "Authorization": f"bearer {os.getenv(self.model.api_key_env_var)}",
+#             "content-type": "application/json",
+#             "accept": "application/json",
+#         }
+
+#         self.request_json = {
+#             "model": self.model.name,
+#             "messages": messages,
+#             "temperature": sampling_params.temperature,
+#             "top_p": sampling_params.top_p,
+#             "max_tokens": sampling_params.max_new_tokens,
+#         }
+
+#     async def handle_response(self, http_response: ClientResponse) -> APIResponse:
+#         is_error = False
+#         error_message = None
+#         completion = None
+#         input_tokens = None
+#         output_tokens = None
+#         status_code = http_response.status
+#         mimetype = http_response.headers.get("Content-Type", None)
+#         if status_code >= 200 and status_code < 300:
+#             try:
+#                 data = await http_response.json()
+#             except Exception:
+#                 data = None
+#                 is_error = True
+#                 error_message = (
+#                     f"Error calling .json() on response w/ status {status_code}"
+#                 )
+#             if not is_error and isinstance(data, dict):
+#                 try:
+#                     completion = data["text"]
+#                     input_tokens = data["meta"]["billed_units"]["input_tokens"]
+#                     output_tokens = data["meta"]["billed_units"]["input_tokens"]
+#                 except Exception:
+#                     is_error = True
+#                     error_message = f"Error getting 'text' or 'meta' from {self.model.name} response."
+#         elif mimetype is not None and "json" in mimetype.lower():
+#             is_error = True  # expected status is 200, otherwise it's an error
+#             data = await http_response.json()
+#             error_message = json.dumps(data)
+
+#         else:
+#             is_error = True
+#             text = await http_response.text()
+#             error_message = text
+
+#         # handle special kinds of errors. TODO: make sure these are correct for anthropic
+#         if is_error and error_message is not None:
+#             if (
+#                 "rate limit" in error_message.lower()
+#                 or "overloaded" in error_message.lower()
+#             ):
+#                 error_message += " (Rate limit error, triggering cooldown.)"
+#                 self.status_tracker.rate_limit_exceeded()
+#             if "context length" in error_message:
+#                 error_message += " (Context length exceeded, set retries to 0.)"
+#                 self.attempts_left = 0
+
+#         return APIResponse(
+#             id=self.task_id,
+#             status_code=status_code,
+#             is_error=is_error,
+#             error_message=error_message,
+#             prompt=self.prompt,
+#             completion=completion,
+#             model_internal=self.model_name,
+#             sampling_params=self.sampling_params,
+#             input_tokens=input_tokens,
+#             output_tokens=output_tokens,
+#         )
lm_deluge/api_requests/deprecated/vertex.py
ADDED

@@ -0,0 +1,361 @@
+# # consider: https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/call-gemini-using-openai-library#call-chat-completions-api
+# import asyncio
+# from aiohttp import ClientResponse
+# import json
+# import os
+# import time
+# from tqdm import tqdm
+# from typing import Callable
+
+# from lm_deluge.prompt import Conversation
+# from .base import APIRequestBase, APIResponse
+# from ..tracker import StatusTracker
+# from ..sampling_params import SamplingParams
+# from ..models import APIModel
+
+# from google.oauth2 import service_account
+# from google.auth.transport.requests import Request
+
+
+# def get_access_token(service_account_file: str):
+#     """
+#     Get access token from environment variables if another process/coroutine
+#     has already got them, otherwise get from service account file.
+#     """
+#     LAST_REFRESHED = os.getenv("VERTEX_TOKEN_LAST_REFRESHED", None)
+#     LAST_REFRESHED = int(LAST_REFRESHED) if LAST_REFRESHED is not None else 0
+#     VERTEX_API_TOKEN = os.getenv("VERTEX_API_TOKEN", None)
+
+#     if VERTEX_API_TOKEN is not None and time.time() - LAST_REFRESHED < 60 * 50:
+#         return VERTEX_API_TOKEN
+#     else:
+#         credentials = service_account.Credentials.from_service_account_file(
+#             service_account_file,
+#             scopes=["https://www.googleapis.com/auth/cloud-platform"],
+#         )
+#         credentials.refresh(Request())
+#         token = credentials.token
+#         os.environ["VERTEX_API_TOKEN"] = token
+#         os.environ["VERTEX_TOKEN_LAST_REFRESHED"] = str(int(time.time()))
+
+#         return token
+
+
+# class VertexAnthropicRequest(APIRequestBase):
+#     """
+#     For Claude on Vertex, you'll also have to set the PROJECT_ID environment variable.
+#     """
+
+#     def __init__(
+#         self,
+#         task_id: int,
+#         model_name: str,  # must correspond to registry
+#         prompt: Conversation,
+#         attempts_left: int,
+#         status_tracker: StatusTracker,
+#         retry_queue: asyncio.Queue,
+#         results_arr: list,
+#         request_timeout: int = 30,
+#         sampling_params: SamplingParams = SamplingParams(),
+#         pbar: tqdm | None = None,
+#         callback: Callable | None = None,
+#         debug: bool = False,
+#     ):
+#         super().__init__(
+#             task_id=task_id,
+#             model_name=model_name,
+#             prompt=prompt,
+#             attempts_left=attempts_left,
+#             status_tracker=status_tracker,
+#             retry_queue=retry_queue,
+#             results_arr=results_arr,
+#             request_timeout=request_timeout,
+#             sampling_params=sampling_params,
+#             pbar=pbar,
+#             callback=callback,
+#             debug=debug,
+#         )
+#         creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+#         if not creds:
+#             raise RuntimeError(
+#                 "GOOGLE_APPLICATION_CREDENTIALS not provided in environment"
+#             )
+#         token = get_access_token(creds)
+
+#         self.model = APIModel.from_registry(model_name)
+#         project_id = os.getenv("PROJECT_ID")
+#         region = self.model.sample_region()
+
+#         endpoint = f"https://{region}-aiplatform.googleapis.com"
+#         self.url = f"{endpoint}/v1/projects/{project_id}/locations/{region}/publishers/anthropic/models/{self.model.name}:generateContent"
+#         self.request_header = {
+#             "Authorization": f"Bearer {token}",
+#             "Content-Type": "application/json",
+#         }
+#         self.system_message, messages = prompt.to_anthropic()
+
+#         self.request_json = {
+#             "anthropic_version": "vertex-2023-10-16",
+#             "messages": messages,
+#             "temperature": self.sampling_params.temperature,
+#             "top_p": self.sampling_params.top_p,
+#             "max_tokens": self.sampling_params.max_new_tokens,
+#         }
+#         if self.system_message is not None:
+#             self.request_json["system"] = self.system_message
+
+#     async def handle_response(self, http_response: ClientResponse) -> APIResponse:
+#         is_error = False
+#         error_message = None
+#         completion = None
+#         input_tokens = None
+#         output_tokens = None
+#         status_code = http_response.status
+#         mimetype = http_response.headers.get("Content-Type", None)
+#         if status_code >= 200 and status_code < 300:
+#             try:
+#                 data = await http_response.json()
+#                 completion = data["content"][0]["text"]
+#                 input_tokens = data["usage"]["input_tokens"]
+#                 output_tokens = data["usage"]["output_tokens"]
+#             except Exception as e:
+#                 is_error = True
+#                 error_message = (
+#                     f"Error calling .json() on response w/ status {status_code}: {e}"
+#                 )
+#         elif "json" in (mimetype or "").lower():
+#             is_error = True  # expected status is 200, otherwise it's an error
+#             data = await http_response.json()
+#             error_message = json.dumps(data)
+
+#         else:
+#             is_error = True
+#             text = await http_response.text()
+#             error_message = text
+
+#         # handle special kinds of errors. TODO: make sure these are correct for anthropic
+#         if is_error and error_message is not None:
+#             if (
+#                 "rate limit" in error_message.lower()
+#                 or "overloaded" in error_message.lower()
+#                 or status_code == 429
+#             ):
+#                 error_message += " (Rate limit error, triggering cooldown.)"
+#                 self.status_tracker.rate_limit_exceeded()
+#             if "context length" in error_message:
+#                 error_message += " (Context length exceeded, set retries to 0.)"
+#                 self.attempts_left = 0
+
+#         return APIResponse(
+#             id=self.task_id,
+#             status_code=status_code,
+#             is_error=is_error,
+#             error_message=error_message,
+#             prompt=self.prompt,
+#             completion=completion,
+#             model_internal=self.model_name,
+#             sampling_params=self.sampling_params,
+#             input_tokens=input_tokens,
+#             output_tokens=output_tokens,
+#         )
+
+
+# SAFETY_SETTING_CATEGORIES = [
+#     "HARM_CATEGORY_DANGEROUS_CONTENT",
+#     "HARM_CATEGORY_HARASSMENT",
+#     "HARM_CATEGORY_HATE_SPEECH",
+#     "HARM_CATEGORY_SEXUALLY_EXPLICIT",
+# ]
+
+
+# class GeminiRequest(APIRequestBase):
+#     """
+#     For Gemini, you'll also have to set the PROJECT_ID environment variable.
+#     """
+
+#     def __init__(
+#         self,
+#         task_id: int,
+#         model_name: str,  # must correspond to registry
+#         prompt: Conversation,
+#         attempts_left: int,
+#         status_tracker: StatusTracker,
+#         retry_queue: asyncio.Queue,
+#         results_arr: list,
+#         request_timeout: int = 30,
+#         sampling_params: SamplingParams = SamplingParams(),
+#         pbar: tqdm | None = None,
+#         callback: Callable | None = None,
+#         debug: bool = False,
+#         all_model_names: list[str] | None = None,
+#         all_sampling_params: list[SamplingParams] | None = None,
+#     ):
+#         super().__init__(
+#             task_id=task_id,
+#             model_name=model_name,
+#             prompt=prompt,
+#             attempts_left=attempts_left,
+#             status_tracker=status_tracker,
+#             retry_queue=retry_queue,
+#             results_arr=results_arr,
+#             request_timeout=request_timeout,
+#             sampling_params=sampling_params,
+#             pbar=pbar,
+#             callback=callback,
+#             debug=debug,
+#             all_model_names=all_model_names,
+#             all_sampling_params=all_sampling_params,
+#         )
+#         self.model = APIModel.from_registry(model_name)
+#         credentials_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+#         if not credentials_file:
+#             raise RuntimeError(
+#                 "no credentials file found. ensure you provide a google credentials file and point to it with GOOGLE_APPLICATION_CREDENTIALS environment variable."
+#             )
+#         token = get_access_token(credentials_file)
+#         self.project_id = os.getenv("PROJECT_ID")
+#         # sample weighted by region counts
+#         self.region = self.model.sample_region()
+#         assert self.region is not None, "unable to sample region"
+#         self.url = f"https://{self.region}-aiplatform.googleapis.com/v1/projects/{self.project_id}/locations/{self.region}/publishers/google/models/{self.model.name}:generateContent"
+
+#         self.request_header = {
+#             "Authorization": f"Bearer {token}",
+#             "Content-Type": "application/json",
+#         }
+#         self.system_message, contents = prompt.to_gemini()
+#         self.request_json = {
+#             "contents": contents,
+#             "generationConfig": {
+#                 "stopSequences": [],
+#                 "temperature": sampling_params.temperature,
+#                 "maxOutputTokens": sampling_params.max_new_tokens,
+#                 "topP": sampling_params.top_p,
+#                 "topK": None,
+#             },
+#             "safetySettings": [
+#                 {"category": category, "threshold": "BLOCK_NONE"}
+#                 for category in SAFETY_SETTING_CATEGORIES
+#             ],
+#         }
+#         if sampling_params.json_mode and self.model.supports_json:
+#             self.request_json["generationConfig"]["responseMimeType"] = (
+#                 "application/json"
+#             )
+
+#         if self.system_message is not None:
+#             self.request_json["systemInstruction"] = (
+#                 {"role": "SYSTEM", "parts": [{"text": self.system_message}]},
+#             )
+
+#     async def handle_response(self, http_response: ClientResponse) -> APIResponse:
+#         is_error = False
+#         error_message = None
+#         completion = None
+#         input_tokens = None
+#         output_tokens = None
+#         finish_reason = None
+#         data = None
+#         retry_with_different_model = False
+#         give_up_if_no_other_models = False
+#         status_code = http_response.status
+#         mimetype = http_response.headers.get("Content-Type", None)
+#         if status_code >= 200 and status_code < 300:
+#             try:
+#                 data = await http_response.json()
+#                 if "candidates" not in data:
+#                     is_error = True
+#                     if "promptFeedback" in data:
+#                         error_message = "Prompt rejected. Feedback: " + str(
+#                             data["promptFeedback"]
+#                         )
+#                     else:
+#                         error_message = "No candidates in response."
+#                     retry_with_different_model = True
+#                     give_up_if_no_other_models = True
+#                 else:
+#                     candidate = data["candidates"][0]
+#                     finish_reason = candidate["finishReason"]
+#                     if "content" in candidate:
+#                         parts = candidate["content"]["parts"]
+#                         completion = " ".join([part["text"] for part in parts])
+#                         usage = data["usageMetadata"]
+#                         input_tokens = usage["promptTokenCount"]
+#                         output_tokens = usage["candidatesTokenCount"]
+#                     elif finish_reason == "RECITATION":
+#                         is_error = True
+#                         citations = candidate.get("citationMetadata", {}).get(
+#                             "citations", []
+#                         )
+#                         urls = ",".join(
+#                             [citation.get("uri", "") for citation in citations]
+#                         )
+#                         error_message = "Finish reason RECITATION. URLS: " + urls
+#                         retry_with_different_model = True
+#                     elif finish_reason == "OTHER":
+#                         is_error = True
+#                         error_message = "Finish reason OTHER."
+#                         retry_with_different_model = True
+#                     elif finish_reason == "SAFETY":
+#                         is_error = True
+#                         error_message = "Finish reason SAFETY."
+#                         retry_with_different_model = True
+#                     else:
+#                         print("Actual structure of response:", data)
+#                         is_error = True
+#                         error_message = "No content in response."
+#             except Exception as e:
+#                 is_error = True
+#                 error_message = f"Error calling .json() on response w/ status {status_code}: {e.__class__} {e}"
+#                 if isinstance(e, KeyError):
+#                     print("Actual structure of response:", data)
+#         elif "json" in (mimetype or "").lower():
+#             is_error = True
+#             data = await http_response.json()
+#             error_message = json.dumps(data)
+#         else:
+#             is_error = True
+#             text = await http_response.text()
+#             error_message = text
+
+#         old_region = self.region
+#         if is_error and error_message is not None:
+#             if (
+#                 "rate limit" in error_message.lower()
+#                 or "temporarily out of capacity" in error_message.lower()
+#                 or "exceeded" in error_message.lower()
+#                 or
+#                 # 429 code
+#                 status_code == 429
+#             ):
+#                 error_message += " (Rate limit error, triggering cooldown & retrying with different model.)"
+#                 self.status_tracker.rate_limit_exceeded()
+#                 retry_with_different_model = (
+#                     True  # if possible, retry with a different model
+#                 )
+#         if is_error:
+#             # change the region in case error is due to region unavailability
+#             self.region = self.model.sample_region()
+#             assert self.region is not None, "Unable to sample region"
+#             self.url = f"https://{self.region}-aiplatform.googleapis.com/v1/projects/{self.project_id}/locations/{self.region}/publishers/google/models/{self.model.name}:generateContent"
+
+#         return APIResponse(
+#             id=self.task_id,
+#             status_code=status_code,
+#             is_error=is_error,
+#             error_message=error_message,
+#             prompt=self.prompt,
+#             completion=completion,
+#             model_internal=self.model_name,
+#             sampling_params=self.sampling_params,
+#             input_tokens=input_tokens,
+#             output_tokens=output_tokens,
+#             region=old_region,
+#             finish_reason=finish_reason,
+#             retry_with_different_model=retry_with_different_model,
+#             give_up_if_no_other_models=give_up_if_no_other_models,
+#         )
+
+
+# # class LlamaEndpointRequest(APIRequestBase):
+# #     raise NotImplementedError("Llama endpoints are not implemented and never will be because Vertex AI sucks ass.")
lm_deluge/api_requests/{cohere.py → mistral.py}
RENAMED

@@ -1,20 +1,19 @@
-# https://docs.cohere.com/reference/chat
-# https://cohere.com/pricing
 import asyncio
+import warnings
 from aiohttp import ClientResponse
 import json
 import os
-from tqdm import tqdm
+from tqdm.auto import tqdm
 from typing import Callable
-from lm_deluge.prompt import Conversation
-from .base import APIRequestBase, APIResponse
 
+from .base import APIRequestBase, APIResponse
+from ..prompt import Conversation
 from ..tracker import StatusTracker
 from ..sampling_params import SamplingParams
 from ..models import APIModel
 
 
-class CohereRequest(APIRequestBase):
+class MistralRequest(APIRequestBase):
     def __init__(
         self,
         task_id: int,
@@ -24,10 +23,12 @@ class CohereRequest(APIRequestBase):
         prompt: Conversation,
         attempts_left: int,
         status_tracker: StatusTracker,
-        results_arr: list,
         retry_queue: asyncio.Queue,
+        results_arr: list,
         request_timeout: int = 30,
         sampling_params: SamplingParams = SamplingParams(),
+        logprobs: bool = False,
+        top_logprobs: int | None = None,
         pbar: tqdm | None = None,
         callback: Callable | None = None,
         debug: bool = False,
@@ -44,32 +45,36 @@ class CohereRequest(APIRequestBase):
             results_arr=results_arr,
             request_timeout=request_timeout,
             sampling_params=sampling_params,
+            logprobs=logprobs,
+            top_logprobs=top_logprobs,
             pbar=pbar,
             callback=callback,
             debug=debug,
             all_model_names=all_model_names,
             all_sampling_params=all_sampling_params,
         )
-        self.system_message = None
-        self.last_user_message = None
-
        self.model = APIModel.from_registry(model_name)
-        self.url = f"{self.model.api_base}/chat"
-        messages = prompt.to_cohere()
-
+        self.url = f"{self.model.api_base}/chat/completions"
         self.request_header = {
-            "Authorization": f"bearer {os.getenv(self.model.api_key_env_var)}",
-            "content-type": "application/json",
-            "accept": "application/json",
+            "Authorization": f"Bearer {os.getenv(self.model.api_key_env_var)}"
         }
-
         self.request_json = {
             "model": self.model.name,
-            "messages": messages,
+            "messages": prompt.to_mistral(),
             "temperature": sampling_params.temperature,
             "top_p": sampling_params.top_p,
             "max_tokens": sampling_params.max_new_tokens,
         }
+        if sampling_params.reasoning_effort:
+            warnings.warn(
+                f"Ignoring reasoning_effort param for non-reasoning model: {model_name}"
+            )
+        if logprobs:
+            warnings.warn(
+                f"Ignoring logprobs param for non-logprobs model: {model_name}"
+            )
+        if sampling_params.json_mode and self.model.supports_json:
+            self.request_json["response_format"] = {"type": "json_object"}
 
     async def handle_response(self, http_response: ClientResponse) -> APIResponse:
         is_error = False
@@ -77,41 +82,41 @@ class CohereRequest(APIRequestBase):
         completion = None
         input_tokens = None
         output_tokens = None
+        logprobs = None
         status_code = http_response.status
         mimetype = http_response.headers.get("Content-Type", None)
+        data = None
        if status_code >= 200 and status_code < 300:
             try:
                 data = await http_response.json()
             except Exception:
-                data = None
                 is_error = True
                 error_message = (
                     f"Error calling .json() on response w/ status {status_code}"
                 )
-            if not is_error and isinstance(data, dict):
+            if not is_error:
+                assert data is not None, "data is None"
                 try:
-                    completion = data["text"]
-                    input_tokens = data["meta"]["billed_units"]["input_tokens"]
-                    output_tokens = data["meta"]["billed_units"]["input_tokens"]
+                    completion = data["choices"][0]["message"]["content"]
+                    input_tokens = data["usage"]["prompt_tokens"]
+                    output_tokens = data["usage"]["completion_tokens"]
+                    if self.logprobs and "logprobs" in data["choices"][0]:
+                        logprobs = data["choices"][0]["logprobs"]["content"]
                 except Exception:
                     is_error = True
-                    error_message = f"Error getting 'text' or 'meta' from {self.model.name} response."
-        elif mimetype is not None and "json" in mimetype.lower():
+                    error_message = f"Error getting 'choices' and 'usage' from {self.model.name} response."
+        elif mimetype and "json" in mimetype.lower():
             is_error = True  # expected status is 200, otherwise it's an error
             data = await http_response.json()
             error_message = json.dumps(data)
-
         else:
             is_error = True
             text = await http_response.text()
             error_message = text
 
-        # handle special kinds of errors. TODO: make sure these are correct for anthropic
+        # handle special kinds of errors
         if is_error and error_message is not None:
-            if (
-                "rate limit" in error_message.lower()
-                or "overloaded" in error_message.lower()
-            ):
+            if "rate limit" in error_message.lower() or status_code == 429:
                 error_message += " (Rate limit error, triggering cooldown.)"
                 self.status_tracker.rate_limit_exceeded()
             if "context length" in error_message:
@@ -124,6 +129,7 @@ class CohereRequest(APIRequestBase):
             is_error=is_error,
             error_message=error_message,
             prompt=self.prompt,
+            logprobs=logprobs,
            completion=completion,
             model_internal=self.model_name,
             sampling_params=self.sampling_params,