lm-deluge 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lm-deluge might be problematic. Click here for more details.

lm_deluge/util/xml.py ADDED
@@ -0,0 +1,291 @@
1
+ import re
2
+ from bs4 import BeautifulSoup, Tag
3
+
4
+ # import xml.etree.ElementTree as ET
5
+ from lxml import etree # type: ignore
6
+
7
+
8
+ def get_tag(
9
+ html_string: str, tag: str, return_attributes: bool = False
10
+ ) -> dict | str | None:
11
+ # Try to use regular expressions first
12
+ if html_string is None:
13
+ return None
14
+ try:
15
+ # Regex pattern to extract tag content and attributes
16
+ pattern = re.compile(rf"<{tag}([^>]*)>(.*?)</{tag}>", re.DOTALL)
17
+ match = pattern.search(html_string)
18
+ if match:
19
+ tag_attributes = match.group(1) # Attributes string from the opening tag
20
+ tag_contents = match.group(2) # Contents inside the tag
21
+
22
+ # If return_attributes is False, just return the tag contents
23
+ if not return_attributes:
24
+ return tag_contents
25
+
26
+ # Parse attributes into a dictionary
27
+ attributes_pattern = re.compile(
28
+ r'(\w+)\s*=\s*"([^"]*)"'
29
+ ) # Matches key="value"
30
+ attributes = dict(attributes_pattern.findall(tag_attributes))
31
+
32
+ return {"content": tag_contents, "attributes": attributes}
33
+ except re.error:
34
+ print(f"Failed to compile regular expression for HTML tag '{tag}'")
35
+
36
+ # If regexp fails, use BeautifulSoup
37
+ try:
38
+ soup = BeautifulSoup(html_string, "html.parser")
39
+ tag_content = soup.find(tag)
40
+ assert tag_content is None or isinstance(
41
+ tag_content, Tag
42
+ ), f"Unexpected type for tag_content: {type(tag_content)}"
43
+ if tag_content is not None:
44
+ tag_contents = tag_content.decode_contents()
45
+
46
+ # If return_attributes is False, return just the content
47
+ if not return_attributes:
48
+ return tag_contents
49
+
50
+ # Extract attributes from the tag
51
+ attributes = tag_content.attrs
52
+
53
+ return {"content": tag_contents, "attributes": attributes}
54
+ except Exception as e:
55
+ print(f"Failed to extract content from HTML tag '{tag}': {e}. Returning None.")
56
+
57
+ return None
58
+
59
+
60
def get_tags(html_string: str, tag: str, return_attributes: bool = False) -> list:
    """
    Extract all instances of the <tag></tag> in the string, not just the first.
    If return_attributes is True, also return the tag's attributes.

    Returns a list of content strings, or — when `return_attributes` is
    True — a list of {"content", "attributes"} dicts. Returns [] when
    `html_string` is None or no matches are found.
    """
    if html_string is None:
        return []

    try:
        # re.escape ensures tag names containing regex metacharacters
        # (e.g. "a.b") are matched literally instead of as patterns.
        escaped_tag = re.escape(tag)
        pattern = re.compile(rf"<{escaped_tag}([^>]*)>(.*?)</{escaped_tag}>", re.DOTALL)
        matches = pattern.findall(html_string)

        if not return_attributes:
            # Return just the content inside each tag
            return [match[1] for match in matches]

        # Parse key="value" attribute pairs when return_attributes is True
        attributes_pattern = re.compile(r'(\w+)\s*=\s*"([^"]*)"')

        results = []
        for match in matches:
            tag_attributes = match[0]  # The attributes portion of the tag
            tag_contents = match[1]  # The content portion of the tag
            attributes = dict(attributes_pattern.findall(tag_attributes))
            results.append({"content": tag_contents, "attributes": attributes})
        return results
    except re.error:
        print(f"Failed to compile regular expression for HTML tag '{tag}'")

    # Fallback to BeautifulSoup if regex fails
    try:
        soup = BeautifulSoup(html_string, "html.parser")
        tag_contents = soup.find_all(tag)

        if not return_attributes:
            return [tag_content.decode_contents() for tag_content in tag_contents]  # type: ignore

        # Collect content and attributes when return_attributes is True
        results = []
        for tag_content in tag_contents:
            if isinstance(tag_content, Tag):
                results.append(
                    {
                        "content": tag_content.decode_contents(),
                        "attributes": tag_content.attrs,
                    }
                )
        return results
    except Exception as e:
        print(
            f"Failed to extract content from HTML tag '{tag}': {e}. Returning no matches."
        )

    return []
118
+
119
+
120
def strip_xml(xml_string: str) -> str:
    """
    Trim any text before the first '<' and after the last '>'.

    Returns "" for falsy input, and the string unchanged when no
    angle brackets are present.
    """
    if not xml_string:
        return ""
    first = xml_string.find("<")
    last = xml_string.rfind(">")
    if first == -1 or last == -1:
        return xml_string
    return xml_string[first : last + 1]
129
+
130
+
131
def remove_namespace_prefixes(xml_string):
    """
    Remove namespace prefixes from XML tags in the provided XML string.
    """
    # Strip "ns:" prefixes from both opening and closing tag names.
    without_prefixes = re.sub(r"<(/?)(\w+:)", r"<\1", xml_string)
    # Drop xmlns / xmlns:foo namespace declarations.
    return re.sub(r'xmlns(:\w+)?="[^"]+"', "", without_prefixes)
140
+
141
+
142
+ def object_to_xml(
143
+ obj: dict | list | str | int | float,
144
+ root_tag: str,
145
+ ignore_dict_nulls: bool = True,
146
+ list_item_tag: str = "li", # could also be "option", "item", etc.
147
+ include_list_index: bool = True,
148
+ index_attr: str = "key", # could be index, id, name, etc.
149
+ indent_level: int = 0,
150
+ indent_str: str = " ",
151
+ index=None,
152
+ ):
153
+ """
154
+ Convert a Python object to an XML string.
155
+ """
156
+ xml = indent_str * indent_level
157
+ xml += f"<{root_tag}"
158
+ if include_list_index and index is not None:
159
+ xml += f' {index_attr}="{index}"'
160
+ xml += ">\n"
161
+ # base case
162
+ if isinstance(obj, str) or isinstance(obj, int) or isinstance(obj, float):
163
+ xml += indent_str * (indent_level + 1)
164
+ xml += f"{obj}\n"
165
+ elif isinstance(obj, dict):
166
+ for key, value in obj.items():
167
+ if ignore_dict_nulls and value is None:
168
+ continue
169
+ xml += object_to_xml(
170
+ value,
171
+ root_tag=key,
172
+ list_item_tag=list_item_tag,
173
+ include_list_index=include_list_index,
174
+ index_attr=index_attr,
175
+ indent_level=indent_level + 1,
176
+ )
177
+ elif isinstance(obj, list):
178
+ for index, item in enumerate(obj):
179
+ xml += object_to_xml(
180
+ item,
181
+ root_tag=list_item_tag,
182
+ list_item_tag=list_item_tag,
183
+ include_list_index=include_list_index,
184
+ index_attr=index_attr,
185
+ indent_level=indent_level + 1,
186
+ index=index,
187
+ )
188
+ else:
189
+ raise ValueError("Unsupported object type.")
190
+
191
+ xml += indent_str * indent_level
192
+ xml += f"</{root_tag}>\n"
193
+ return xml
194
+
195
+
196
+ def parse_base_element(
197
+ elem_text: str | None, parse_empty_tags_as_none: bool, parse_null_text_as_none: bool
198
+ ):
199
+ if elem_text is None or elem_text.strip() == "":
200
+ return None if parse_empty_tags_as_none else ""
201
+ elem_text = elem_text.strip()
202
+ if parse_null_text_as_none and elem_text.lower() == "null":
203
+ return None
204
+ # Try int first, then float
205
+ try:
206
+ return int(elem_text)
207
+ except ValueError:
208
+ try:
209
+ return float(elem_text)
210
+ except ValueError:
211
+ return elem_text
212
+
213
+
214
def xml_to_object(
    xml_string: str, parse_null_text_as_none=True, parse_empty_tags_as_none=False
):
    """
    Intended to be the reverse of object_to_xml: parse an XML string back
    into nested dicts, lists and scalars.

    Repeated same-named children (or a single child carrying a "key" or
    "index" attribute) are decoded as a list, sorted by index when every
    item has one; otherwise children become dict entries keyed by tag name.
    """
    cleaned = remove_namespace_prefixes(strip_xml(xml_string))
    # recover=True lets lxml tolerate mildly malformed markup.
    parser = etree.XMLParser(recover=True, ns_clean=True)
    root = etree.fromstring(cleaned.encode("utf-8"), parser=parser)

    def _convert(node):
        # Leaf node: decode its text as a scalar.
        if len(node) == 0:
            return parse_base_element(
                node.text, parse_empty_tags_as_none, parse_null_text_as_none
            )

        index_attrs = ("key", "index")
        # Child tag names, namespace prefixes stripped.
        names = [etree.QName(child).localname for child in node]
        unique_names = set(names)

        treat_as_list = False
        if len(unique_names) == 1 and len(names) > 1:
            # Multiple children sharing one tag name -> list.
            treat_as_list = True
        elif len(names) == 1 and any(a in node[0].attrib for a in index_attrs):
            # Single child with a "key"/"index" attribute -> one-item list.
            treat_as_list = True
        elif len(unique_names) > 1 and len(unique_names) < len(node):
            # Mixed tag names with repeats is ambiguous.
            raise ValueError("Cannot parse XML with multiple child tags and repeats.")

        if treat_as_list:
            indexed_items = []
            for child in node:
                # Accept either <li key="…"> or <li index="…">.
                idx = None
                for attr in index_attrs:
                    if attr in child.attrib:
                        idx = child.attrib[attr]
                        break
                # Normalise to int when possible.
                try:
                    if idx is not None:
                        idx = int(idx)
                except ValueError:
                    pass
                indexed_items.append((idx, _convert(child)))

            # Sort only when *all* items carry an index.
            if all(idx is not None for idx, _ in indexed_items):
                indexed_items.sort(key=lambda pair: pair[0])
            return [item for _, item in indexed_items]

        # Otherwise decode as a dictionary keyed by tag name.
        obj = {}
        for child in node:
            key = etree.QName(child).localname
            value = _convert(child)
            if key in obj:
                raise ValueError(
                    f"Duplicate key '{key}' found in XML when not expecting a list."
                )
            obj[key] = value
        return obj

    return _convert(root)
@@ -0,0 +1,127 @@
1
+ Metadata-Version: 2.4
2
+ Name: lm_deluge
3
+ Version: 0.0.3
4
+ Summary: Python utility for using LLM API models.
5
+ Author-email: Benjamin Anderson <ben@trytaylor.ai>
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: python-dotenv
9
+ Requires-Dist: json5
10
+ Requires-Dist: PyYAML
11
+ Requires-Dist: pandas
12
+ Requires-Dist: aiohttp
13
+ Requires-Dist: tiktoken
14
+ Requires-Dist: xxhash
15
+ Requires-Dist: tqdm
16
+ Requires-Dist: google-auth
17
+ Requires-Dist: requests-aws4auth
18
+ Requires-Dist: pydantic
19
+ Requires-Dist: bs4
20
+ Requires-Dist: lxml
21
+ Provides-Extra: image
22
+ Requires-Dist: pdf2image; extra == "image"
23
+ Requires-Dist: pillow; extra == "image"
24
+ Provides-Extra: pdf
25
+ Requires-Dist: pdf2image; extra == "pdf"
26
+ Requires-Dist: pymupdf; extra == "pdf"
27
+ Provides-Extra: translate
28
+ Requires-Dist: fasttext-wheel; extra == "translate"
29
+ Requires-Dist: fasttext-langdetect; extra == "translate"
30
+ Provides-Extra: full
31
+ Requires-Dist: pillow; extra == "full"
32
+ Requires-Dist: pdf2image; extra == "full"
33
+ Requires-Dist: pymupdf; extra == "full"
34
+ Requires-Dist: fasttext-wheel; extra == "full"
35
+ Requires-Dist: fasttext-langdetect; extra == "full"
36
+
37
+ # lm_deluge
38
+
39
+ `lm_deluge` is a lightweight helper library for talking to large language model APIs. It wraps several providers under a single interface, handles rate limiting, and exposes a few useful utilities for common NLP tasks.
40
+
41
+ ## Features
42
+
43
+ - **Unified client** – send prompts to OpenAI‑compatible models, Anthropic, Cohere and Vertex hosted Claude models using the same API.
44
+ - **Async or sync** – process prompts concurrently with `process_prompts_async` or run them synchronously with `process_prompts_sync`.
45
+ - **Spray across providers** – configure multiple model names with weighting so requests are distributed across different providers.
46
+ - **Caching** – optional LevelDB, SQLite or custom caches to avoid duplicate calls.
47
+ - **Embeddings and reranking** – helper functions for embedding text and reranking documents via Cohere/OpenAI endpoints.
48
+ - **Built‑in tools** – simple `extract`, `translate` and `score_llm` helpers for common patterns.
49
+
50
+ ## Installation
51
+
52
+ ```bash
53
+ pip install lm_deluge
54
+ ```
55
+
56
+ The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY` (for Llama) and `GOOGLE_APPLICATION_CREDENTIALS` for Vertex.
57
+
58
+ ## Quickstart
59
+
60
+ ```python
61
+ from lm_deluge import LLMClient
62
+
63
+ client = LLMClient.basic(
64
+ model=["gpt-4o-mini"], # any model id from lm_deluge.models.registry
65
+ temperature=0.2,
66
+ max_new_tokens=256,
67
+ )
68
+
69
+ resp = client.process_prompts_sync(["Hello, world!"]) # returns list[APIResponse]
70
+ print(resp[0].completion)
71
+ ```
72
+
73
+ ### Asynchronous usage
74
+
75
+ ```python
76
+ import asyncio
77
+
78
+ async def main():
79
+ responses = await client.process_prompts_async(
80
+ ["an async call"],
81
+ return_completions_only=True,
82
+ )
83
+ print(responses[0])
84
+
85
+ asyncio.run(main())
86
+ ```
87
+
88
+ ### Distributing requests across models
89
+
90
+ You can provide multiple `model_names` and optional `model_weights` when creating an `LLMClient`. Each prompt will be sent to one of the models based on those weights.
91
+
92
+ ```python
93
+ client = LLMClient(
94
+ model_names=["gpt-4o-mini", "claude-haiku-anthropic"],
95
+ model_weights="rate_limit", # or a list like [0.7, 0.3]
96
+ max_requests_per_minute=5000,
97
+ max_tokens_per_minute=1_000_000,
98
+ max_concurrent_requests=100,
99
+ )
100
+ ```
101
+
102
+ ### Provider specific notes
103
+
104
+ - **OpenAI and compatible providers** – set `OPENAI_API_KEY`. Model ids in the registry include OpenAI models as well as Meta Llama, Grok and many others that expose OpenAI style APIs.
105
+ - **Anthropic** – set `ANTHROPIC_API_KEY`. Use model ids such as `claude-haiku-anthropic` or `claude-sonnet-anthropic`.
106
+ - **Cohere** – set `COHERE_API_KEY`. Models like `command-r` are available.
107
+ - **Vertex Claude** – set `GOOGLE_APPLICATION_CREDENTIALS` and `PROJECT_ID`. Use a model id such as `claude-sonnet-vertex`.
108
+
109
+ The [models.py](src/lm_deluge/models.py) file lists every supported model and the required environment variable.
110
+
111
+ ## Built‑in tools
112
+
113
+ The `lm_deluge.llm_tools` package exposes a few helper functions:
114
+
115
+ - `extract` – structure text or images into a Pydantic model based on a schema.
116
+ - `translate` – translate a list of strings to English if needed.
117
+ - `score_llm` – simple yes/no style scoring with optional log probability output.
118
+
119
+ Embeddings (`embed.embed_parallel_async`) and document reranking (`rerank.rerank_parallel_async`) are also provided.
120
+
121
+ ## Caching results
122
+
123
+ `lm_deluge.cache` includes LevelDB, SQLite and custom dictionary based caches. Pass an instance via `LLMClient(..., cache=my_cache)` and previously seen prompts will not be re‑sent.
124
+
125
+ ## Development notes
126
+
127
+ Models and costs are defined in [src/lm_deluge/models.py](src/lm_deluge/models.py). Conversations are built using the `Conversation` and `Message` helpers in [src/lm_deluge/prompt.py](src/lm_deluge/prompt.py), which also support images.
@@ -0,0 +1,37 @@
1
+ lm_deluge/__init__.py,sha256=r3Erra7aT5i2Mu5B9lcY8ll1T6q8_xF-xgnataKgKPg,153
2
+ lm_deluge/cache.py,sha256=VB1kv8rM2t5XWPR60uhszFcxLDnVKOe1oA5hYjVDjIo,4375
3
+ lm_deluge/client.py,sha256=oUtzZEbKwlW1s6bjSikMvliRXTpT6dIj5GRNL6ZZBh8,29327
4
+ lm_deluge/embed.py,sha256=3hpGD_ljMsaQ_KV74a9ejNobPbEiQfTRSVFD97KQdHc,14017
5
+ lm_deluge/errors.py,sha256=oHjt7YnxWbh-eXMScIzov4NvpJMo0-2r5J6Wh5DQ1tk,209
6
+ lm_deluge/gemini_limits.py,sha256=V9mpS9JtXYz7AY6OuKyQp5TuIMRH1BVv9YrSNmGmHNA,1569
7
+ lm_deluge/image.py,sha256=8tIPMZ5-R_JUVHXH8SpPGrmxBMSiU1XU3oFv4AkVr1c,7028
8
+ lm_deluge/models.py,sha256=jrD8CYKZIQ46VYvXL0ffFKqU6V3_aQrTv9i3hxqge5I,40364
9
+ lm_deluge/prompt.py,sha256=AVTvm2n-NV9PqFWJaKGeGYQbXpzTcKQGicBEIW8jfxQ,12710
10
+ lm_deluge/rerank.py,sha256=M5NbzDisUfxb7d_Jm7uio7zM4GkK0B5YzKQvpBmqepI,12082
11
+ lm_deluge/sampling_params.py,sha256=E2kewh1vz-1Qcy5xNBCzihfGgT_GcHYMfzaWb3FLiXs,739
12
+ lm_deluge/tool.py,sha256=RVUW3E3FW11jCM-R7pIL1GpRs1YKCOjvTkL1D5xPetk,3196
13
+ lm_deluge/tracker.py,sha256=iTA8yN6tslqjiqUPz-bWkyModjlWNoygM9twlvtg7WY,306
14
+ lm_deluge/api_requests/__init__.py,sha256=_aSpD6CJL9g6OpLPoChXiHjl4MH_OlGcKgfZaW8cgLM,71
15
+ lm_deluge/api_requests/anthropic.py,sha256=lsugNe3mbtnkvLjX_kqRbqrJJoMOjHtaoq2FPlgB408,6887
16
+ lm_deluge/api_requests/base.py,sha256=CuRqnP7r5flnF8JlYq2B_w5YZSiXhlED3aYxdqUp5UQ,15080
17
+ lm_deluge/api_requests/cohere.py,sha256=eC-MWMWYMkYD5Q8oe7H6TbB2lYOekaJ8ta0SISv-i1o,5202
18
+ lm_deluge/api_requests/common.py,sha256=ZtUirAlYNE4CJOQPbNip-mWKOMXCJi2_malwYsUcqsg,642
19
+ lm_deluge/api_requests/google.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
+ lm_deluge/api_requests/openai.py,sha256=HP5ra8tpz9Ojbh3tL4LZZa2HIQ41OoBTZl205sU1MPU,5779
21
+ lm_deluge/api_requests/vertex.py,sha256=XNFAVte_DuHUWGSNCQY0kmNgHwEmIg8Otg6FGDdq8AQ,14857
22
+ lm_deluge/api_requests/deprecated/bedrock.py,sha256=Wtjt8DFb3Fy1JNl2jAH-t5dmZfEerGUpddmLo241Ri8,11332
23
+ lm_deluge/api_requests/deprecated/deepseek.py,sha256=VVyaSgs9VQp5SzTqrWE8yD9VNBiGKONbm_hoa2rBqLo,4588
24
+ lm_deluge/api_requests/deprecated/mistral.py,sha256=20XP0tHYDIfx_C7-lyc9dXV1k3YwbCIQo2iqmO-zFPo,4715
25
+ lm_deluge/llm_tools/__init__.py,sha256=TbZTETq9i_9yYskFWQKOG4pGh5ZiyE_D-h3RArfhGp4,231
26
+ lm_deluge/llm_tools/extract.py,sha256=exoLGxUf8uKDZZ9SY1rC4O3gPTwa0vbVeOXi9RjzJjs,3671
27
+ lm_deluge/llm_tools/score.py,sha256=9oGA3-k2U5buHQXkXaEI9M4Wb5yysNhTLsPbGeghAlQ,2580
28
+ lm_deluge/llm_tools/translate.py,sha256=iXyYvQZ8bC44FWhBk4qpdqjKM1WFF7Shq-H2PxhPgg4,1452
29
+ lm_deluge/util/json.py,sha256=KuzyXvBCwoW2PLc6durR0aNA0MjlEcQprA1NTZmqh_g,5326
30
+ lm_deluge/util/logprobs.py,sha256=Fs0It0G9hTnZ1TkmRn45TEFOoIp9iv-__roYItLOs18,11781
31
+ lm_deluge/util/pdf.py,sha256=zBcuh2IJxRfro6JPzQkAqdc6hUcrduFwb9aEoacNG9U,1590
32
+ lm_deluge/util/validation.py,sha256=hz5dDb3ebvZrZhnaWxOxbNSVMI6nmaOODBkk0htAUhs,1575
33
+ lm_deluge/util/xml.py,sha256=-yUKOZwsAY009w4ppQ8FI4hb5gDvLzMwcSgv88aEnaE,10578
34
+ lm_deluge-0.0.3.dist-info/METADATA,sha256=xZTdJ-Tx7kRArPZiKqoohO1G8v0Wg6Bp3b8td6HXtWo,4956
35
+ lm_deluge-0.0.3.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
36
+ lm_deluge-0.0.3.dist-info/top_level.txt,sha256=hqU-TJX93yBwpgkDtYcXyLr3t7TLSCCZ_reytJjwBaE,10
37
+ lm_deluge-0.0.3.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.8.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ lm_deluge