speedy-utils 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_utils/__init__.py +9 -1
- llm_utils/chat_format/display.py +109 -14
- llm_utils/lm/__init__.py +12 -11
- llm_utils/lm/async_lm/async_llm_task.py +1 -10
- llm_utils/lm/async_lm/async_lm.py +13 -4
- llm_utils/lm/async_lm/async_lm_base.py +24 -14
- llm_utils/lm/base_prompt_builder.py +288 -0
- llm_utils/lm/llm_task.py +693 -0
- llm_utils/lm/lm.py +207 -0
- llm_utils/lm/lm_base.py +285 -0
- llm_utils/lm/openai_memoize.py +2 -2
- llm_utils/vector_cache/core.py +285 -89
- speedy_utils/__init__.py +2 -1
- speedy_utils/common/patcher.py +68 -0
- speedy_utils/common/utils_cache.py +6 -6
- speedy_utils/common/utils_io.py +238 -8
- speedy_utils/multi_worker/process.py +180 -192
- speedy_utils/multi_worker/thread.py +94 -2
- {speedy_utils-1.1.17.dist-info → speedy_utils-1.1.19.dist-info}/METADATA +36 -14
- {speedy_utils-1.1.17.dist-info → speedy_utils-1.1.19.dist-info}/RECORD +24 -19
- {speedy_utils-1.1.17.dist-info → speedy_utils-1.1.19.dist-info}/WHEEL +1 -1
- speedy_utils-1.1.19.dist-info/entry_points.txt +5 -0
- speedy_utils-1.1.17.dist-info/entry_points.txt +0 -6
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
# type: ignore
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Simplified LLM Task module for handling language model interactions with structured input/output.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Any, Dict, List, Optional, Type, Union, cast
|
|
8
|
+
|
|
9
|
+
from openai import OpenAI
|
|
10
|
+
from openai.types.chat import ChatCompletionMessageParam
|
|
11
|
+
from pydantic import BaseModel
|
|
12
|
+
from pydantic import create_model
|
|
13
|
+
from typing import Callable, Tuple
|
|
14
|
+
from abc import ABC, abstractmethod
|
|
15
|
+
|
|
16
|
+
# Type aliases for better readability
|
|
17
|
+
Messages = List[ChatCompletionMessageParam]
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
from typing import Type, TypeVar
|
|
21
|
+
|
|
22
|
+
B = TypeVar("B", bound="BasePromptBuilder")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class BasePromptBuilder(BaseModel, ABC):
    """
    Abstract base class for prompt builders.

    Provides a consistent interface for:
    - input/output key declaration
    - prompt building
    - schema enforcement via auto-built input/output models
    """

    # ------------------------------------------------------------------ #
    # Abstract methods
    # ------------------------------------------------------------------ #
    @abstractmethod
    def get_instruction(self) -> str:
        """Return the system instruction string (role of the model)."""
        raise NotImplementedError

    @abstractmethod
    def get_io_keys(self) -> Tuple[List[str], List[Union[str, Tuple[str, str]]]]:
        """
        Return (input_keys, output_keys).

        Each key must match a field of the subclass.
        For output_keys, you can use:
        - str: Use the field name as-is
        - tuple[str, str]: (original_field_name, renamed_field_name)
        Input keys are always strings.
        """
        raise NotImplementedError

    # ------------------------------------------------------------------ #
    # Auto-build models from keys
    # ------------------------------------------------------------------ #
    def _field_spec(self, field_name: str) -> tuple[Any, Any]:
        """Return an ``(annotation, default)`` pair for *field_name*.

        Raises:
            ValueError: if *field_name* is not a declared field of this model.
        """
        # Access model_fields on the class: instance access is deprecated in
        # pydantic >= 2.11.
        model_fields = type(self).model_fields
        if field_name not in model_fields:
            raise ValueError(f"Key '{field_name}' not found in model fields")
        info = model_fields[field_name]
        # BUG FIX: the previous fallback was the tuple ``(Any,)``, which is
        # not a valid annotation for create_model; use ``Any`` itself.
        annotation = info.annotation if info.annotation is not None else Any
        # BUG FIX: required pydantic-v2 fields carry PydanticUndefined (not
        # None) as their default, and a legitimate default of ``None`` was
        # previously converted into "required".  ``is_required()`` handles
        # both cases correctly.
        default = ... if info.is_required() else info.default
        return (annotation, default)

    def _build_model_from_keys(
        self,
        keys: Union[List[str], List[Union[str, Tuple[str, str]]]],
        name: str,
    ) -> Type[BaseModel]:
        """Create a pydantic model called *name* whose fields are *keys*.

        Tuple entries ``(original, renamed)`` expose the original field under
        the renamed name; plain strings keep the field name as-is.
        """
        fields: Dict[str, tuple[Any, Any]] = {}
        for key in keys:
            if isinstance(key, tuple):
                original_key, renamed_key = key
                fields[renamed_key] = self._field_spec(original_key)
            else:
                fields[key] = self._field_spec(key)
        return create_model(name, **fields)  # type: ignore

    def get_input_model(self) -> Type[BaseModel]:
        """Return a pydantic model describing the declared input fields."""
        input_keys, _ = self.get_io_keys()
        return self._build_model_from_keys(input_keys, "InputModel")

    def get_output_model(self) -> Type[BaseModel]:
        """Return a pydantic model describing the declared output fields."""
        _, output_keys = self.get_io_keys()
        return self._build_model_from_keys(output_keys, "OutputModel")

    # ------------------------------------------------------------------ #
    # Dump methods (JSON)
    # ------------------------------------------------------------------ #
    def _filter_data(
        self, keys: Union[List[str], List[Union[str, Tuple[str, str]]]]
    ) -> Dict[str, Any]:
        """Project this instance's data onto *keys*, applying renames."""
        data = self.model_dump()
        filtered: Dict[str, Any] = {}
        for key in keys:
            if isinstance(key, tuple):
                original_key, renamed_key = key
                if original_key in data:
                    filtered[renamed_key] = data[original_key]
            elif key in data:
                filtered[key] = data[key]
        return filtered

    def _dump_json_unique(
        self,
        schema_model: Type[BaseModel],
        keys: Union[List[str], List[Union[str, Tuple[str, str]]]],
        **kwargs,
    ) -> str:
        """Serialize the selected fields as JSON validated by *schema_model*.

        Extra ``kwargs`` are forwarded to ``model_dump_json`` (e.g. indent).
        """
        # Dict keys are already unique; a plain set suffices for membership.
        allowed = set(schema_model.model_fields)
        filtered = {
            k: v for k, v in self._filter_data(keys).items() if k in allowed
        }
        return schema_model(**filtered).model_dump_json(**kwargs)

    def model_dump_json_input(self, **kwargs) -> str:
        """Return a JSON dump of the declared input fields."""
        input_keys, _ = self.get_io_keys()
        return self._dump_json_unique(self.get_input_model(), input_keys, **kwargs)

    def model_dump_json_output(self, **kwargs) -> str:
        """Return a JSON dump of the declared output fields."""
        _, output_keys = self.get_io_keys()
        return self._dump_json_unique(self.get_output_model(), output_keys, **kwargs)

    # ------------------------------------------------------------------ #
    # Markdown helpers
    # ------------------------------------------------------------------ #
    def _to_markdown(self, obj: Any, level: int = 1, title: Optional[str] = None) -> str:
        """
        Recursively convert dict/list/primitive into clean, generic Markdown.
        """
        md: List[str] = []

        # Optional heading: real headers for shallow levels, bold otherwise.
        if title is not None:
            formatted_title = title.replace('_', ' ').title()
            if level <= 2:
                md.append(f"{'#' * level} {formatted_title}")
            else:
                md.append(f"**{formatted_title}:**")

        if isinstance(obj, dict):
            if not obj:  # Empty dict
                md.append("None")
            else:
                for k, v in obj.items():
                    if isinstance(v, (str, int, float, bool)) and len(str(v)) < 100:
                        # Short scalar values are rendered inline.
                        key_name = k.replace('_', ' ').title()
                        if level <= 2:
                            md.append(f"**{key_name}:** {v}")
                        else:
                            md.append(f"- **{key_name}:** {v}")
                    else:
                        # Complex values get recursive handling.
                        md.append(self._to_markdown(v, level=level + 1, title=k))
        elif isinstance(obj, list):
            if not obj:  # Empty list
                md.append("None")
            elif all(isinstance(i, dict) for i in obj):
                # List of objects: one numbered section per item.
                for i, item in enumerate(obj, 1):
                    if level <= 2:
                        md.append(f"### {title or 'Item'} {i}")
                    else:
                        md.append(f"**{title or 'Item'} {i}:**")
                    # Process dict items inline for cleaner output.
                    for k, v in item.items():
                        key_name = k.replace('_', ' ').title()
                        md.append(f"- **{key_name}:** {v}")
                    if i < len(obj):  # Add spacing between items
                        md.append("")
            else:
                # Simple list of scalars.
                for item in obj:
                    md.append(f"- {item}")
        else:
            # Primitive value (the former title/no-title branches were
            # byte-identical, so they are merged here).
            md.append(str(obj) if obj is not None else "None")

        return "\n".join(md)

    def _dump_markdown_unique(
        self, keys: Union[List[str], List[Union[str, Tuple[str, str]]]]
    ) -> str:
        """Render the selected fields as Markdown, skipping ``None`` values."""
        filtered = self._filter_data(keys)

        # Generate markdown without top-level headers to avoid duplication.
        parts: List[str] = []
        for key, value in filtered.items():
            if value is None:
                continue
            formatted_key = key.replace('_', ' ').title()
            if isinstance(value, (str, int, float, bool)) and len(str(value)) < 200:
                parts.append(f"**{formatted_key}:** {value}")
            else:
                parts.append(self._to_markdown(value, level=2, title=key))

        return '\n'.join(parts)

    def model_dump_markdown_input(self) -> str:
        """Return a Markdown dump of the declared input fields."""
        input_keys, _ = self.get_io_keys()
        return self._dump_markdown_unique(input_keys)

    def model_dump_markdown_output(self) -> str:
        """Return a Markdown dump of the declared output fields."""
        _, output_keys = self.get_io_keys()
        return self._dump_markdown_unique(output_keys)

    # ------------------------------------------------------------------ #
    # Training & preview (JSON or Markdown)
    # ------------------------------------------------------------------ #
    def build_training_data(self, format: str = "json", indent=None) -> dict[str, Any]:
        """
        Build training data in either JSON (dict for OpenAI-style messages)
        or Markdown (clean format without role prefixes).

        Raises:
            ValueError: if *format* is neither "json" nor "markdown".
        """
        if format == "json":
            return {
                "messages": [
                    {"role": "system", "content": self.get_instruction()},
                    {"role": "user", "content": self.model_dump_json_input(indent=indent)},
                    {"role": "assistant", "content": self.model_dump_json_output(indent=indent)},
                ]
            }
        if format == "markdown":
            return {
                'messages': [
                    {"role": "system", "content": self.get_instruction()},
                    {"role": "user", "content": self.model_dump_markdown_input()},
                    {"role": "assistant", "content": self.model_dump_markdown_output()},
                ]
            }
        raise ValueError("format must be either 'json' or 'markdown'")

    def __str__(self) -> str:
        """Return a clean preview of the prompt without explicit role prefixes."""
        training_data = self.build_training_data(format="markdown")
        messages = training_data['messages']  # type: ignore[index]

        parts: List[str] = []
        for msg in messages:
            content = msg['content']
            if msg['role'] in ('system', 'user'):
                parts.append(content)
            elif msg['role'] == 'assistant':
                # Title the assistant section after the first declared output key.
                _, output_keys = self.get_io_keys()
                main_output = output_keys[0] if output_keys else 'response'
                if isinstance(main_output, tuple):
                    main_output = main_output[1]  # Use renamed key
                title = main_output.replace('_', ' ').title()
                parts.append(f"## {title}\n{content}")

        return '\n\n'.join(parts)

    @classmethod
    def from_messages(cls: Type[B], messages: list[dict]) -> B:
        """
        Reconstruct a prompt builder instance from OpenAI-style messages.

        Expects JSON content in the first "user" and first "assistant"
        messages; their keys are merged (assistant wins on conflict) and fed
        to the constructor.

        Raises:
            ValueError: if either message is missing or its content is not
                valid JSON.
        """
        user_msg = next((m for m in messages if m.get("role") == "user"), None)
        assistant_msg = next((m for m in messages if m.get("role") == "assistant"), None)

        if user_msg is None:
            raise ValueError("No user message found")
        if assistant_msg is None:
            raise ValueError("No assistant message found")

        try:
            user_data = json.loads(user_msg["content"])  # type: ignore[index]
        except Exception as e:
            raise ValueError(f"Invalid user JSON content: {e}") from e

        try:
            assistant_data = json.loads(assistant_msg["content"])  # type: ignore[index]
        except Exception as e:
            raise ValueError(f"Invalid assistant JSON content: {e}") from e

        combined_data = {**user_data, **assistant_data}
        return cast(B, cls(**combined_data))
|
|
288
|
+
|