hamtaa-texttools 1.0.6__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hamtaa-texttools might be problematic.
- {hamtaa_texttools-1.0.6.dist-info → hamtaa_texttools-1.0.7.dist-info}/METADATA +1 -1
- {hamtaa_texttools-1.0.6.dist-info → hamtaa_texttools-1.0.7.dist-info}/RECORD +16 -15
- texttools/tools/async_the_tool.py +201 -140
- texttools/tools/internals/async_operator.py +83 -200
- texttools/tools/internals/base_operator.py +85 -0
- texttools/tools/internals/operator.py +25 -128
- texttools/tools/the_tool.py +154 -217
- {hamtaa_texttools-1.0.6.dist-info → hamtaa_texttools-1.0.7.dist-info}/WHEEL +0 -0
- {hamtaa_texttools-1.0.6.dist-info → hamtaa_texttools-1.0.7.dist-info}/licenses/LICENSE +0 -0
- {hamtaa_texttools-1.0.6.dist-info → hamtaa_texttools-1.0.7.dist-info}/top_level.txt +0 -0
- /texttools/prompts/{ner_extractor.yaml → extract_entities.yaml} +0 -0
- /texttools/prompts/{keyword_extractor.yaml → extract_keywords.yaml} +0 -0
- /texttools/prompts/{question_merger.yaml → merge_questions.yaml} +0 -0
- /texttools/prompts/{rewriter.yaml → rewrite.yaml} +0 -0
- /texttools/prompts/{summarizer.yaml → summarize.yaml} +0 -0
- /texttools/prompts/{translator.yaml → translate.yaml} +0 -0
texttools/tools/internals/async_operator.py
@@ -1,14 +1,12 @@
 from __future__ import annotations
 
-import json
-import math
-import re
-from typing import Any, Literal, TypeVar
+from typing import Any, TypeVar, Type, Literal
 import logging
 
 from openai import AsyncOpenAI
 from pydantic import BaseModel
 
+from texttools.tools.internals.base_operator import BaseOperator
 from texttools.formatters.user_merge_formatter import (
     UserMergeFormatter,
 )
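The import swap above drops `json`, `math`, and `re` (now used by the shared base module added at the end of this diff) and pulls in `Type`, which the rest of the release uses to annotate parameters that accept a Pydantic model class rather than an instance. A minimal sketch of that pattern, with a hypothetical `StrOutput` model that the package does not necessarily ship:

```python
# Sketch of the Type[T] annotation pattern used throughout this release
# (T is bound to pydantic.BaseModel in base_operator.py).
from typing import Type, TypeVar

from pydantic import BaseModel

T = TypeVar("T", bound=BaseModel)


class StrOutput(BaseModel):
    result: str


def build(output_model: Type[T], **fields) -> T:
    # The caller passes the model class itself; an instance of it comes back.
    return output_model(**fields)


print(build(StrOutput, result="ok"))  # result='ok'
```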
@@ -22,238 +20,128 @@ logger = logging.getLogger("async_operator")
 logger.setLevel(logging.INFO)
 
 
-class AsyncOperator:
+class AsyncOperator(BaseOperator):
     """
[… old line 27 not captured …]
+    Core engine for running text-processing operations with an LLM (Async).
 
[… old line 29 not captured …]
+    It wires together:
+    - `PromptLoader` → loads YAML prompt templates.
+    - `UserMergeFormatter` → applies formatting to messages (e.g., merging).
+    - AsyncOpenAI client → executes completions/parsed completions.
     """
 
-    def __init__(
-        self,
-        client: AsyncOpenAI,
-        *,
-        model: str,
-        temperature: float = 0.0,
-        **client_kwargs: Any,
-    ):
+    def __init__(self, client: AsyncOpenAI, model: str):
         self.client: AsyncOpenAI = client
         self.model = model
-        self.temperature = temperature
-        self.client_kwargs = client_kwargs
-
-    def _build_user_message(self, prompt: str) -> dict[str, str]:
-        return {"role": "user", "content": prompt}
 
-    async def _analysis_completion(
[… old lines 49–59 not captured …]
-            print(f"[ERROR] Analysis failed: {e}")
-            raise
+    async def _analysis_completion(
+        self,
+        analyze_message: list[dict[str, str]],
+        temperature: float,
+    ) -> str:
+        completion = await self.client.chat.completions.create(
+            model=self.model,
+            messages=analyze_message,
+            temperature=temperature,
+        )
+        analysis = completion.choices[0].message.content.strip()
+        return analysis
 
-    async def _analyze(self, prompt_configs: dict[str, str]) -> str:
+    async def _analyze(self, prompt_configs: dict[str, str], temperature: float) -> str:
         analyze_prompt = prompt_configs["analyze_template"]
         analyze_message = [self._build_user_message(analyze_prompt)]
-        analysis = await self._analysis_completion(analyze_message)
-
+        analysis = await self._analysis_completion(analyze_message, temperature)
         return analysis
 
     async def _parse_completion(
         self,
         message: list[dict[str, str]],
-        output_model: T,
+        output_model: Type[T],
+        temperature: float,
         logprobs: bool = False,
         top_logprobs: int = 3,
[… old lines 76–90 not captured …]
-            request_kwargs["logprobs"] = True
-            request_kwargs["top_logprobs"] = top_logprobs
-
-            completion = await self.client.beta.chat.completions.parse(**request_kwargs)
-            parsed = completion.choices[0].message.parsed
-            return parsed, completion
-
-        except Exception as e:
-            print(f"[ERROR] Failed to parse completion: {e}")
-            raise
-
-    def _clean_json_response(self, response: str) -> str:
-        """
-        Clean JSON response by removing code block markers and whitespace.
-        Handles cases like:
-        - ```json{"result": "value"}```
-        """
-        cleaned = response.strip()
-
-        # Remove ```json marker
-        if cleaned.startswith("```json"):
-            cleaned = cleaned[7:]
-
-        # Remove trailing ```
-        if cleaned.endswith("```"):
-            cleaned = cleaned[:-3]
-
-        return cleaned.strip()
-
-    def _convert_to_output_model(self, response_string: str, output_model: T) -> T:
-        """
-        Convert a JSON response string to output model.
-
-        Args:
-            response_string: The JSON string (may contain code block markers)
-            output_model: Your Pydantic output model class (e.g., StrOutput, ListStrOutput)
-
-        Returns:
-            Instance of your output model
-        """
-        try:
-            # Clean the response string
-            cleaned_json = self._clean_json_response(response_string)
-
-            # Fix Python-style booleans
-            cleaned_json = cleaned_json.replace("False", "false").replace(
-                "True", "true"
-            )
-
-            # Convert string to Python dictionary
-            response_dict = json.loads(cleaned_json)
-
-            # Convert dictionary to output model
-            return output_model(**response_dict)
-
-        except json.JSONDecodeError as e:
-            raise ValueError(
-                f"Failed to parse JSON response: {e}\nResponse: {response_string}"
-            )
-        except Exception as e:
-            raise ValueError(f"Failed to convert to output model: {e}")
+    ) -> tuple[Type[T], Any]:
+        request_kwargs = {
+            "model": self.model,
+            "messages": message,
+            "response_format": output_model,
+            "temperature": temperature,
+        }
+
+        if logprobs:
+            request_kwargs["logprobs"] = True
+            request_kwargs["top_logprobs"] = top_logprobs
+
+        completion = await self.client.beta.chat.completions.parse(**request_kwargs)
+        parsed = completion.choices[0].message.parsed
+        return parsed, completion
 
     async def _vllm_completion(
         self,
         message: list[dict[str, str]],
-        output_model: T,
+        output_model: Type[T],
+        temperature: float,
         logprobs: bool = False,
         top_logprobs: int = 3,
[… old lines 159–160 not captured …]
-        try:
-            json_schema = output_model.model_json_schema()
-
-            # Build kwargs dynamically
-            request_kwargs = {
-                "model": self.model,
-                "messages": message,
-                "extra_body": {"guided_json": json_schema},
-                "temperature": self.temperature,
-                **self.client_kwargs,
-            }
+    ) -> tuple[Type[T], Any]:
+        json_schema = output_model.model_json_schema()
 
[… old lines 173–174 not captured …]
+        # Build kwargs dynamically
+        request_kwargs = {
+            "model": self.model,
+            "messages": message,
+            "extra_body": {"guided_json": json_schema},
+            "temperature": temperature,
+        }
 
[… old lines 176–179 not captured …]
-            completion = await self.client.chat.completions.create(**request_kwargs)
-            response = completion.choices[0].message.content
-
-            # Convert the string response to output model
-            parsed = self._convert_to_output_model(response, output_model)
+        if logprobs:
+            request_kwargs["logprobs"] = True
+            request_kwargs["top_logprobs"] = top_logprobs
 
[… old lines 186–187 not captured …]
-        except Exception as e:
-            print(f"[ERROR] Failed to get vLLM structured output: {e}")
-            raise
-
-    def _extract_logprobs(self, completion: dict):
-        logprobs_data = []
-        ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
-
-        for choice in completion.choices:
-            if not getattr(choice, "logprobs", None):
-                logger.info("No logprobs found.")
-                continue
-
-            for logprob_item in choice.logprobs.content:
-                if ignore_pattern.match(logprob_item.token):
-                    continue
-                token_entry = {
-                    "token": logprob_item.token,
-                    "prob": round(math.exp(logprob_item.logprob), 8),
-                    "top_alternatives": [],
-                }
-                for alt in logprob_item.top_logprobs:
-                    if ignore_pattern.match(alt.token):
-                        continue
-                    token_entry["top_alternatives"].append(
-                        {
-                            "token": alt.token,
-                            "prob": round(math.exp(alt.logprob), 8),
-                        }
-                    )
-                logprobs_data.append(token_entry)
+        completion = await self.client.chat.completions.create(**request_kwargs)
+        response = completion.choices[0].message.content
 
-        return logprobs_data
+        # Convert the string response to output model
+        parsed = self._convert_to_output_model(response, output_model)
+        return parsed, completion
 
     async def run(
         self,
[… old line 224 not captured …]
+        # User parameters
+        text: str,
+        with_analysis: bool,
+        output_lang: str | None,
+        user_prompt: str | None,
+        temperature: float,
+        logprobs: bool,
+        top_logprobs: int | None,
+        # Internal parameters
         prompt_file: str,
-        output_model: T,
[… old lines 227–228 not captured …]
-        mode: str = "",
-        resp_format: Literal["vllm", "parse"] = "parse",
-        output_lang: str | None = None,
-        logprobs: bool = False,
-        top_logprobs: int = 3,
-        max_tokens: int | None = None,
+        output_model: Type[T],
+        resp_format: Literal["vllm", "parse"],
+        mode: str | None,
         **extra_kwargs,
     ) -> dict[str, Any]:
         """
-        Execute the async LLM pipeline with the given input text.
+        Execute the async LLM pipeline with the given input text. (Async)
         """
         prompt_loader = PromptLoader()
         formatter = UserMergeFormatter()
 
         try:
-            cleaned_text = […]
+            cleaned_text = text.strip()
 
             prompt_configs = prompt_loader.load(
                 prompt_file=prompt_file,
                 text=cleaned_text,
-                mode=mode
+                mode=mode,
                 **extra_kwargs,
             )
 
             messages: list[dict[str, str]] = []
 
             if with_analysis:
-                analysis = await self._analyze(prompt_configs)
+                analysis = await self._analyze(prompt_configs, temperature)
                 messages.append(
                     self._build_user_message(f"Based on this analysis: {analysis}")
                 )
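Taken as a whole, this hunk strips `temperature` and the extra client kwargs from the constructor, threads `temperature` through each call instead, and hands `_build_user_message`, the JSON helpers, and `_extract_logprobs` down to `BaseOperator`. A hedged sketch of how the 1.0.7 operator might be driven under the signatures shown above; the base URL, model name, prompt file, and `StrOutput` model are placeholders, the import path just follows the file location in this diff, and passing every `run()` argument by keyword is an assumption drawn from the signature rather than from package documentation:

```python
# Hypothetical usage against the 1.0.7 signatures shown in this hunk.
import asyncio

from openai import AsyncOpenAI
from pydantic import BaseModel

from texttools.tools.internals.async_operator import AsyncOperator


class StrOutput(BaseModel):
    result: str


async def main() -> None:
    client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    op = AsyncOperator(client, model="my-model")  # temperature is no longer a constructor argument
    out = await op.run(
        text="Some text to summarize.",
        with_analysis=False,
        output_lang="en",
        user_prompt=None,
        temperature=0.2,               # now supplied per call
        logprobs=False,
        top_logprobs=None,
        prompt_file="summarize.yaml",  # placeholder; prompt files were renamed in this release
        output_model=StrOutput,
        resp_format="vllm",
        mode=None,
    )
    print(out)


asyncio.run(main())
```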
@@ -265,27 +153,22 @@ class AsyncOperator:
                     )
                 )
 
+            if user_prompt:
+                messages.append(
+                    self._build_user_message(f"Consider this instruction {user_prompt}")
+                )
+
             messages.append(self._build_user_message(prompt_configs["main_template"]))
             messages = formatter.format(messages)
 
             if resp_format == "vllm":
                 parsed, completion = await self._vllm_completion(
-                    messages,
-                    output_model,
-                    logprobs,
-                    top_logprobs,
-                    max_tokens,
+                    messages, output_model, temperature, logprobs, top_logprobs
                 )
             elif resp_format == "parse":
-                parsed, completion = await self._parse_completion(
-                    messages,
-                    output_model,
-                    logprobs,
-                    top_logprobs,
-                    max_tokens,
+                parsed, completion = await self._vllm_completion(
+                    messages, output_model, temperature, logprobs, top_logprobs
                 )
-            else:
-                logger.error(f"Unknown resp_format: {resp_format}")
 
             # Ensure output_model has a `result` field
             if not hasattr(parsed, "result"):
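Both `resp_format` branches above now await `_vllm_completion`, which requests schema-constrained output through the vLLM `guided_json` extension carried in the OpenAI client's `extra_body`. A stripped-down sketch of that call shape, with placeholder names:

```python
# Rough shape of the request _vllm_completion builds in 1.0.7; values are placeholders.
from openai import AsyncOpenAI
from pydantic import BaseModel


class StrOutput(BaseModel):
    result: str


async def guided_call(client: AsyncOpenAI, model: str, messages: list[dict[str, str]]) -> str:
    completion = await client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.0,
        # vLLM-specific extension: constrain the reply to StrOutput's JSON schema.
        extra_body={"guided_json": StrOutput.model_json_schema()},
    )
    # The content is JSON text conforming to the schema; the operator then feeds it
    # to BaseOperator._convert_to_output_model (see the new file below).
    return completion.choices[0].message.content
```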
texttools/tools/internals/base_operator.py (new file)
@@ -0,0 +1,85 @@
+from typing import TypeVar, Type
+import json
+import re
+import math
+
+from pydantic import BaseModel
+from openai import OpenAI, AsyncOpenAI
+
+# Base Model type for output models
+T = TypeVar("T", bound=BaseModel)
+
+
+class BaseOperator:
+    def __init__(self, client: OpenAI | AsyncOpenAI, model: str):
+        self.client = client
+        self.model = model
+
+    def _build_user_message(self, prompt: str) -> dict[str, str]:
+        return {"role": "user", "content": prompt}
+
+    def _clean_json_response(self, response: str) -> str:
+        """
+        Clean JSON response by removing code block markers and whitespace.
+        Handles cases like:
+        - ```json{"result": "value"}```
+        """
+        stripped = response.strip()
+        cleaned = re.sub(r"^```(?:json)?\s*", "", stripped)
+        cleaned = re.sub(r"\s*```$", "", cleaned)
+
+        return cleaned.strip()
+
+    def _convert_to_output_model(
+        self, response_string: str, output_model: Type[T]
+    ) -> Type[T]:
+        """
+        Convert a JSON response string to output model.
+
+        Args:
+            response_string: The JSON string (may contain code block markers)
+            output_model: Your Pydantic output model class (e.g., StrOutput, ListStrOutput)
+
+        Returns:
+            Instance of your output model
+        """
+        # Clean the response string
+        cleaned_json = self._clean_json_response(response_string)
+
+        # Fix Python-style booleans
+        cleaned_json = cleaned_json.replace("False", "false").replace("True", "true")
+
+        # Convert string to Python dictionary
+        response_dict = json.loads(cleaned_json)
+
+        # Convert dictionary to output model
+        return output_model(**response_dict)
+
+    def _extract_logprobs(self, completion: dict):
+        logprobs_data = []
+        ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
+
+        for choice in completion.choices:
+            if not getattr(choice, "logprobs", None):
+                continue
+
+            for logprob_item in choice.logprobs.content:
+                if ignore_pattern.match(logprob_item.token):
+                    continue
+                token_entry = {
+                    "token": logprob_item.token,
+                    "prob": round(math.exp(logprob_item.logprob), 8),
+                    "top_alternatives": [],
+                }
+                for alt in logprob_item.top_logprobs:
+                    if ignore_pattern.match(alt.token):
+                        continue
+                    token_entry["top_alternatives"].append(
+                        {
+                            "token": alt.token,
+                            "prob": round(math.exp(alt.logprob), 8),
+                        }
+                    )
+                logprobs_data.append(token_entry)
+
+        return logprobs_data