lm-deluge 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lm-deluge might be problematic.

@@ -0,0 +1,446 @@
+ import re
+ import numpy as np
+ from typing import TypedDict, Optional, Callable
+
+
+ class TopLogprob(TypedDict):
+     token: str
+     logprob: float
+     bytes: list[int]
+
+
+ class LogprobEntry(TypedDict):
+     token: str
+     logprob: float
+     bytes: list[int]
+     top_logprobs: list[TopLogprob]
+
+
+ Logprobs = list[LogprobEntry]
+
+ ## In our implementation of APIResponse, the 'logprobs' field contains
+ ## just the 'content' field from the response.choices[0].logprobs object.
+ # {
+ #     "id": "chatcmpl-A6izyp6wnlEv6SLAb0ehIwBqCDLyR",
+ #     "object": "chat.completion",
+ #     "created": 1726166306,
+ #     "model": "gpt-4o-mini-2024-07-18",
+ #     "choices": [
+ #         {
+ #             "index": 0,
+ #             "message": {
+ #                 "role": "assistant",
+ #                 "content": "A loop within loops,",
+ #                 "refusal": null
+ #             },
+ #             "logprobs": {
+ #                 "content": [
+ #                     {
+ #                         "token": "A",
+ #                         "logprob": -1.0330456,
+ #                         "bytes": [65],
+ #                         "top_logprobs": [
+ #                             {"token": "A", "logprob": -1.0330456, "bytes": [65]},
+ #                             {"token": "In", "logprob": -2.0330458, "bytes": [73, 110]},
+ #                             {"token": "Nested", "logprob": -2.0330458, "bytes": [78, 101, 115, 116, 101, 100]},
+ #                             {"token": "Function", "logprob": -2.7830458, "bytes": [70, 117, 110, 99, 116, 105, 111, 110]},
+ #                             {"token": "Layers", "logprob": -3.1580458, "bytes": [76, 97, 121, 101, 114, 115]}
+ #                         ]
+ #                     },
+ #                     {
+ #                         "token": " loop",
+ #                         "logprob": -2.909274,
+ #                         "bytes": [32, 108, 111, 111, 112],
+ #                         "top_logprobs": [
+ #                             {"token": " function", "logprob": -0.9092741, "bytes": [32, 102, 117, 110, 99, 116, 105, 111, 110]},
+ #                             {"token": " call", "logprob": -1.0342741, "bytes": [32, 99, 97, 108, 108]},
+ #                             {"token": " task", "logprob": -2.409274, "bytes": [32, 116, 97, 115, 107]},
+ #                             {"token": " loop", "logprob": -2.909274, "bytes": [32, 108, 111, 111, 112]},
+ #                             {"token": " problem", "logprob": -4.034274, "bytes": [32, 112, 114, 111, 98, 108, 101, 109]}
+ #                         ]
+ #                     },
+ #                     {
+ #                         "token": " within",
+ #                         "logprob": -0.09628018,
+ #                         "bytes": [32, 119, 105, 116, 104, 105, 110],
+ #                         "top_logprobs": [
+ #                             {"token": " within", "logprob": -0.09628018, "bytes": [32, 119, 105, 116, 104, 105, 110]},
+ #                             {"token": " in", "logprob": -2.72128, "bytes": [32, 105, 110]},
+ #                             {"token": " of", "logprob": -4.47128, "bytes": [32, 111, 102]},
+ #                             {"token": " that", "logprob": -5.34628, "bytes": [32, 116, 104, 97, 116]},
+ #                             {"token": " inside", "logprob": -5.59628, "bytes": [32, 105, 110, 115, 105, 100, 101]}
+ #                         ]
+ #                     },
+ #                     {
+ #                         "token": " loops",
+ #                         "logprob": -0.12761699,
+ #                         "bytes": [32, 108, 111, 111, 112, 115],
+ #                         "top_logprobs": [
+ #                             {"token": " loops", "logprob": -0.12761699, "bytes": [32, 108, 111, 111, 112, 115]},
+ #                             {"token": " self", "logprob": -3.127617, "bytes": [32, 115, 101, 108, 102]},
+ #                             {"token": " loop", "logprob": -3.627617, "bytes": [32, 108, 111, 111, 112]},
+ #                             {"token": " calls", "logprob": -4.377617, "bytes": [32, 99, 97, 108, 108, 115]},
+ #                             {"token": " itself", "logprob": -4.877617, "bytes": [32, 105, 116, 115, 101, 108, 102]}
+ #                         ]
+ #                     },
+ #                     {
+ #                         "token": ",",
+ #                         "logprob": -1.7432603e-6,
+ #                         "bytes": [44],
+ #                         "top_logprobs": [
+ #                             {"token": ",", "logprob": -1.7432603e-6, "bytes": [44]},
+ #                             {"token": "  \n", "logprob": -13.875002, "bytes": [32, 32, 10]},
+ #                             {"token": "—", "logprob": -14.750002, "bytes": [226, 128, 148]},
+ #                             {"token": ",\n", "logprob": -15.000002, "bytes": [44, 10]},
+ #                             {"token": ";", "logprob": -17.375002, "bytes": [59]}
+ #                         ]
+ #                     }
+ #                 ],
+ #                 "refusal": null
+ #             },
+ #             "finish_reason": "length"
+ #         }
+ #     ],
+ #     "usage": {
+ #         "prompt_tokens": 28,
+ #         "completion_tokens": 5,
+ #         "total_tokens": 33
+ #     },
+ #     "system_fingerprint": "fp_483d39d857"
+ # }
+
+
+ def normalize_token(token: str):
+     return re.sub(r"[^a-z]", "", token.lower())
+
+
+ def is_match(token1: str, token2: str):
+     token1 = normalize_token(token1)
+     token2 = normalize_token(token2)
+     if token1 == token2:
+         return True
+     elif token1.startswith(token2):
+         return True
+     elif token2.startswith(token1):
+         return True
+     else:
+         return False
+
+
+ def extract_prob(
+     token: str,
+     logprobs: Logprobs,
+     use_top_logprobs: bool = False,
+     normalize_top_logprobs: bool = True,  # if using top_logprobs, normalize by all the present tokens so they add up to 1
+     use_complement: bool = False,  # if True, assume there are 2 choices, and return 1 - p if the top token doesn't match
+     token_index: int = 0,  # get from the first token of the completion by default
+     token_match_fn: Optional[Callable[[str, str], bool]] = is_match,
+ ):
+     """
+     Extract the probability of the token from the logprobs object of a single
+     completion.
+     """
+     # ensure the token_index is valid
+     if token_index >= len(logprobs):
+         raise ValueError("token_index must be less than the length of logprobs.")
+     entry: LogprobEntry = logprobs[token_index]
+     match_fn = token_match_fn or is_match
+     # if using top_logprobs, ensure that at least one top_logprob is present
+     if use_top_logprobs:
+         if entry.get("top_logprobs", None) is None or len(entry["top_logprobs"]) == 0:
+             raise ValueError(
+                 "top_logprobs must be present in logprobs to use top_logprobs=True."
+             )
+         top_tokens = [t["token"] for t in entry["top_logprobs"]]
+         top_probs = [np.exp(t["logprob"]) for t in entry["top_logprobs"]]
+         combined_prob = sum(
+             p for t, p in zip(top_tokens, top_probs) if match_fn(t, token)
+         )
+
+         if normalize_top_logprobs:
+             # no point in using the complement if normalizing; the prob is always 0 if not present
+             return combined_prob / sum(top_probs)
+         elif combined_prob > 0:
+             return combined_prob
+         elif use_complement:
+             return 1 - combined_prob
+         else:
+             return 0.0
+
+     else:
+         top_token = entry["token"]
+         top_prob = np.exp(entry["logprob"])
+         if match_fn(top_token, token):
+             return top_prob
+         elif use_complement:
+             return 1 - top_prob
+         else:
+             return 0.0
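For orientation, here is a minimal sketch of how `extract_prob` might be called on a yes/no-style completion. The `logprobs` value is hand-written in the shape documented above, not real API output, and the numbers are made up:

```python
# Hypothetical logprobs for a completion whose sampled first token is "Yes".
logprobs: Logprobs = [
    {
        "token": "Yes",
        "logprob": -0.105,  # exp(-0.105) ~= 0.90
        "bytes": [89, 101, 115],
        "top_logprobs": [
            {"token": "Yes", "logprob": -0.105, "bytes": [89, 101, 115]},
            {"token": "No", "logprob": -2.30, "bytes": [78, 111]},
        ],
    }
]

# Probability read straight off the sampled token; the default is_match
# lowercases and strips non-letters, so "yes" matches "Yes".
p_yes = extract_prob("yes", logprobs)  # ~0.90

# Same question, renormalized over the tokens present in top_logprobs.
p_yes = extract_prob("yes", logprobs, use_top_logprobs=True)  # ~0.90 / (0.90 + 0.10)
```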
lm_deluge/util/pdf.py ADDED
@@ -0,0 +1,45 @@
+ import io
+
+
+ def text_from_pdf(pdf: str | bytes | io.BytesIO):
+     """
+     Extract text from a PDF. Does NOT use OCR; extracts the literal text.
+     The source can be:
+     - A file path (str)
+     - Bytes of a PDF file
+     - A BytesIO object containing a PDF file
+     """
+     try:
+         import pymupdf  # pyright: ignore
+     except ImportError:
+         raise ImportError(
+             "pymupdf is required to extract text from PDFs. Install lm_deluge[pdf] or lm_deluge[full]."
+         )
+     if isinstance(pdf, str):
+         # It's a file path
+         doc = pymupdf.open(pdf)
+     elif isinstance(pdf, (bytes, io.BytesIO)):
+         # It's bytes or a BytesIO object
+         if isinstance(pdf, bytes):
+             pdf = io.BytesIO(pdf)
+         doc = pymupdf.open(stream=pdf, filetype="pdf")
+     else:
+         raise ValueError("Unsupported pdf type. Must be str, bytes, or BytesIO.")
+
+     text_content = []
+     for page in doc:
+         blocks = page.get_text("blocks", sort=True)
+         for block in blocks:
+             # block[4] contains the text content
+             text_content.append(block[4].strip())
+             text_content.append("\n")  # add an extra newline between blocks
+
+     # Join all text content with newlines
+     full_text = "\n".join(text_content).strip()
+     # Collapse runs of spaces within each line (splitting on all whitespace
+     # would also destroy the newlines we just added)
+     full_text = "\n".join(
+         " ".join(x for x in line.split(" ") if x) for line in full_text.split("\n")
+     )
+     # Drop the empty lines left behind by the block separators
+     full_text = "\n".join(x for x in full_text.split("\n") if x)
+
+     return full_text
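A brief usage sketch; `paper.pdf` is a stand-in for any local PDF:

```python
# From a path on disk:
text = text_from_pdf("paper.pdf")

# Or from bytes already in memory, e.g. a downloaded file:
with open("paper.pdf", "rb") as f:
    text = text_from_pdf(f.read())
```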
@@ -0,0 +1,46 @@
+ from pydantic import BaseModel, ValidationError
+ from .json import load_json
+ from .xml import get_tag, xml_to_object
+
+
+ def get_model_from_json(
+     json_string: str,
+     model_class: type[BaseModel],
+ ) -> BaseModel:
+     try:
+         model_dict = load_json(json_string)
+         return model_class(**model_dict)  # pyright: ignore
+     except ValidationError as ve:
+         # re-raise so callers can handle validation failures themselves
+         raise ve
+
+
+ def get_model_from_xml(xml_string: str, model_class: type[BaseModel], shallow: bool = True):
+     """
+     Convert an XML string to a Pydantic model.
+     If shallow is True, we don't try to parse the whole XML tree
+     into a Python object; we just try to extract each key's tag
+     with regex and fill the model's fields that way.
+     """
+     if shallow:
+         # iterate over the fields of the model
+         model_dict = {}
+         for field_name in model_class.__fields__:
+             val = get_tag(xml_string, field_name)
+             if val is not None:
+                 # no nested models for 'shallow' mode
+                 model_dict[field_name] = val
+
+         try:
+             return model_class(**model_dict)  # pyright: ignore
+         except ValidationError as ve:
+             # re-raise so callers can handle validation failures themselves
+             raise ve
+     else:
+         # use the helper to parse the whole tree
+         model_dict = xml_to_object(xml_string)
+         try:
+             return model_class(**model_dict)  # pyright: ignore
+         except ValidationError as ve:
+             # re-raise so callers can handle validation failures themselves
+             raise ve
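For illustration, a sketch of shallow mode, assuming `get_tag` pulls the text between `<field>...</field>` as the docstring implies; the model and XML here are invented:

```python
from pydantic import BaseModel


class Verdict(BaseModel):
    answer: str
    explanation: str


# Shallow mode regexes each field's tag out of the raw string, so
# surrounding prose from the LLM is tolerated.
raw = (
    "Sure, here is my verdict:\n"
    "<answer>yes</answer>\n"
    "<explanation>The loop terminates because i strictly increases.</explanation>"
)
verdict = get_model_from_xml(raw, Verdict)  # Verdict(answer='yes', explanation='...')
```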