hamtaa-texttools 1.0.4__py3-none-any.whl → 1.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hamtaa-texttools might be problematic.
- {hamtaa_texttools-1.0.4.dist-info → hamtaa_texttools-1.0.6.dist-info}/METADATA +192 -141
- hamtaa_texttools-1.0.6.dist-info/RECORD +30 -0
- {hamtaa_texttools-1.0.4.dist-info → hamtaa_texttools-1.0.6.dist-info}/licenses/LICENSE +20 -20
- {hamtaa_texttools-1.0.4.dist-info → hamtaa_texttools-1.0.6.dist-info}/top_level.txt +0 -0
- texttools/__init__.py +9 -9
- texttools/batch/__init__.py +4 -4
- texttools/batch/batch_manager.py +229 -240
- texttools/batch/batch_runner.py +263 -212
- texttools/formatters/base_formatter.py +33 -33
- texttools/formatters/{user_merge_formatter/user_merge_formatter.py → user_merge_formatter.py} +30 -30
- texttools/prompts/README.md +35 -31
- texttools/prompts/categorizer.yaml +28 -31
- texttools/prompts/{question_detector.yaml → is_question.yaml} +13 -14
- texttools/prompts/keyword_extractor.yaml +18 -14
- texttools/prompts/ner_extractor.yaml +20 -21
- texttools/prompts/question_merger.yaml +45 -48
- texttools/prompts/rewriter.yaml +111 -0
- texttools/prompts/run_custom.yaml +7 -0
- texttools/prompts/{subject_question_generator.yaml → subject_to_question.yaml} +22 -26
- texttools/prompts/summarizer.yaml +13 -11
- texttools/prompts/{question_generator.yaml → text_to_question.yaml} +19 -22
- texttools/prompts/translator.yaml +14 -14
- texttools/tools/__init__.py +4 -4
- texttools/tools/async_the_tool.py +277 -263
- texttools/tools/internals/async_operator.py +308 -288
- texttools/tools/internals/operator.py +295 -306
- texttools/tools/internals/output_models.py +52 -62
- texttools/tools/internals/prompt_loader.py +66 -82
- texttools/tools/the_tool.py +501 -400
- hamtaa_texttools-1.0.4.dist-info/RECORD +0 -29
- texttools/prompts/question_rewriter.yaml +0 -46
- {hamtaa_texttools-1.0.4.dist-info → hamtaa_texttools-1.0.6.dist-info}/WHEEL +0 -0
--- a/texttools/tools/internals/async_operator.py
+++ b/texttools/tools/internals/async_operator.py
@@ -1,288 +1,308 @@
-from __future__ import annotations
-
-import json
-import math
-import re
-from typing import Any, Literal,
+from __future__ import annotations
+
+import json
+import math
+import re
+from typing import Any, Literal, TypeVar
+import logging
+
+from openai import AsyncOpenAI
+from pydantic import BaseModel
+
+from texttools.formatters.user_merge_formatter import (
+    UserMergeFormatter,
+)
+from texttools.tools.internals.prompt_loader import PromptLoader
+
+# Base Model type for output models
+T = TypeVar("T", bound=BaseModel)
+
+# Configure logger
+logger = logging.getLogger("async_operator")
+logger.setLevel(logging.INFO)
+
+
+class AsyncOperator:
+    """
+    Async version of Operator.
+
+    Behaves like the synchronous Operator but uses AsyncOpenAI and async/await.
+    """
+
+    def __init__(
+        self,
+        client: AsyncOpenAI,
+        *,
+        model: str,
+        temperature: float = 0.0,
+        **client_kwargs: Any,
+    ):
+        self.client: AsyncOpenAI = client
+        self.model = model
+        self.temperature = temperature
+        self.client_kwargs = client_kwargs
+
+    def _build_user_message(self, prompt: str) -> dict[str, str]:
+        return {"role": "user", "content": prompt}
+
+    async def _analysis_completion(self, analyze_message: list[dict[str, str]]) -> str:
+        try:
+            completion = await self.client.chat.completions.create(
+                model=self.model,
+                messages=analyze_message,
+                temperature=self.temperature,
+                **self.client_kwargs,
+            )
+            analysis = completion.choices[0].message.content.strip()
+            return analysis
+
+        except Exception as e:
+            print(f"[ERROR] Analysis failed: {e}")
+            raise
+
+    async def _analyze(self, prompt_configs: dict[str, str]) -> str:
+        analyze_prompt = prompt_configs["analyze_template"]
+        analyze_message = [self._build_user_message(analyze_prompt)]
+        analysis = await self._analysis_completion(analyze_message)
+
+        return analysis
+
+    async def _parse_completion(
+        self,
+        message: list[dict[str, str]],
+        output_model: T,
+        logprobs: bool = False,
+        top_logprobs: int = 3,
+        max_tokens: int | None = None,
+    ) -> tuple[T, Any]:
+        try:
+            request_kwargs = {
+                "model": self.model,
+                "messages": message,
+                "response_format": output_model,
+                "temperature": self.temperature,
+                **self.client_kwargs,
+            }
+
+            if max_tokens is not None:
+                request_kwargs["max_tokens"] = max_tokens
+
+            if logprobs:
+                request_kwargs["logprobs"] = True
+                request_kwargs["top_logprobs"] = top_logprobs
+
+            completion = await self.client.beta.chat.completions.parse(**request_kwargs)
+            parsed = completion.choices[0].message.parsed
+            return parsed, completion
+
+        except Exception as e:
+            print(f"[ERROR] Failed to parse completion: {e}")
+            raise
+
+    def _clean_json_response(self, response: str) -> str:
+        """
+        Clean JSON response by removing code block markers and whitespace.
+        Handles cases like:
+        - ```json{"result": "value"}```
+        """
+        cleaned = response.strip()
+
+        # Remove ```json marker
+        if cleaned.startswith("```json"):
+            cleaned = cleaned[7:]
+
+        # Remove trailing ```
+        if cleaned.endswith("```"):
+            cleaned = cleaned[:-3]
+
+        return cleaned.strip()
+
+    def _convert_to_output_model(self, response_string: str, output_model: T) -> T:
+        """
+        Convert a JSON response string to output model.
+
+        Args:
+            response_string: The JSON string (may contain code block markers)
+            output_model: Your Pydantic output model class (e.g., StrOutput, ListStrOutput)
+
+        Returns:
+            Instance of your output model
+        """
+        try:
+            # Clean the response string
+            cleaned_json = self._clean_json_response(response_string)
+
+            # Fix Python-style booleans
+            cleaned_json = cleaned_json.replace("False", "false").replace(
+                "True", "true"
+            )
+
+            # Convert string to Python dictionary
+            response_dict = json.loads(cleaned_json)
+
+            # Convert dictionary to output model
+            return output_model(**response_dict)
+
+        except json.JSONDecodeError as e:
+            raise ValueError(
+                f"Failed to parse JSON response: {e}\nResponse: {response_string}"
+            )
+        except Exception as e:
+            raise ValueError(f"Failed to convert to output model: {e}")
+
+    async def _vllm_completion(
+        self,
+        message: list[dict[str, str]],
+        output_model: T,
+        logprobs: bool = False,
+        top_logprobs: int = 3,
+        max_tokens: int | None = None,
+    ) -> tuple[T, Any]:
+        try:
+            json_schema = output_model.model_json_schema()
+
+            # Build kwargs dynamically
+            request_kwargs = {
+                "model": self.model,
+                "messages": message,
+                "extra_body": {"guided_json": json_schema},
+                "temperature": self.temperature,
+                **self.client_kwargs,
+            }
+
+            if max_tokens is not None:
+                request_kwargs["max_tokens"] = max_tokens
+
+            if logprobs:
+                request_kwargs["logprobs"] = True
+                request_kwargs["top_logprobs"] = top_logprobs
+
+            completion = await self.client.chat.completions.create(**request_kwargs)
+            response = completion.choices[0].message.content
+
+            # Convert the string response to output model
+            parsed = self._convert_to_output_model(response, output_model)
+
+            return parsed, completion
+
+        except Exception as e:
+            print(f"[ERROR] Failed to get vLLM structured output: {e}")
+            raise
+
+    def _extract_logprobs(self, completion: dict):
+        logprobs_data = []
+        ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
+
+        for choice in completion.choices:
+            if not getattr(choice, "logprobs", None):
+                logger.info("No logprobs found.")
+                continue
+
+            for logprob_item in choice.logprobs.content:
+                if ignore_pattern.match(logprob_item.token):
+                    continue
+                token_entry = {
+                    "token": logprob_item.token,
+                    "prob": round(math.exp(logprob_item.logprob), 8),
+                    "top_alternatives": [],
+                }
+                for alt in logprob_item.top_logprobs:
+                    if ignore_pattern.match(alt.token):
+                        continue
+                    token_entry["top_alternatives"].append(
+                        {
+                            "token": alt.token,
+                            "prob": round(math.exp(alt.logprob), 8),
+                        }
+                    )
+                logprobs_data.append(token_entry)
+
+        return logprobs_data
+
+    async def run(
+        self,
+        input_text: str,
+        prompt_file: str,
+        output_model: T,
+        with_analysis: bool = False,
+        use_modes: bool = False,
+        mode: str = "",
+        resp_format: Literal["vllm", "parse"] = "parse",
+        output_lang: str | None = None,
+        logprobs: bool = False,
+        top_logprobs: int = 3,
+        max_tokens: int | None = None,
+        **extra_kwargs,
+    ) -> dict[str, Any]:
+        """
+        Execute the async LLM pipeline with the given input text.
+        """
+        prompt_loader = PromptLoader()
+        formatter = UserMergeFormatter()
+
+        try:
+            cleaned_text = input_text.strip()
+
+            prompt_configs = prompt_loader.load(
+                prompt_file=prompt_file,
+                text=cleaned_text,
+                mode=mode if use_modes else "",
+                **extra_kwargs,
+            )
+
+            messages: list[dict[str, str]] = []
+
+            if with_analysis:
+                analysis = await self._analyze(prompt_configs)
+                messages.append(
+                    self._build_user_message(f"Based on this analysis: {analysis}")
+                )
+
+            if output_lang:
+                messages.append(
+                    self._build_user_message(
+                        f"Respond only in the {output_lang} language."
+                    )
+                )
+
+            messages.append(self._build_user_message(prompt_configs["main_template"]))
+            messages = formatter.format(messages)
+
+            if resp_format == "vllm":
+                parsed, completion = await self._vllm_completion(
+                    messages,
+                    output_model,
+                    logprobs,
+                    top_logprobs,
+                    max_tokens,
+                )
+            elif resp_format == "parse":
+                parsed, completion = await self._parse_completion(
+                    messages,
+                    output_model,
+                    logprobs,
+                    top_logprobs,
+                    max_tokens,
+                )
+            else:
+                logger.error(f"Unknown resp_format: {resp_format}")
+
+            # Ensure output_model has a `result` field
+            if not hasattr(parsed, "result"):
+                logger.error(
+                    "The provided output_model must define a field named 'result'"
+                )
+
+            results = {"result": parsed.result}
+
+            if logprobs:
+                results["logprobs"] = self._extract_logprobs(completion)
+
+            if with_analysis:
+                results["analysis"] = analysis
+
+            return results
+
+        except Exception as e:
+            logger.error(f"Async TheTool failed: {e}")
+            return {"Error": str(e), "result": ""}
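For orientation, here is a minimal usage sketch of the new `AsyncOperator` added in 1.0.6. It is not from the package's documentation: the endpoint URL, API key, model name, and input text are placeholders, `StrOutput` is a hypothetical model mirroring the name mentioned in the `_convert_to_output_model` docstring, and whether `prompt_file` takes a bare filename or a path depends on `PromptLoader`, which this diff does not show. Only the import path, constructor, and `run()` signature come from the diff above.

```python
# Minimal sketch, assuming an OpenAI-compatible endpoint; names marked
# below as placeholders are illustrative, not part of the package.
import asyncio

from openai import AsyncOpenAI
from pydantic import BaseModel

from texttools.tools.internals.async_operator import AsyncOperator


class StrOutput(BaseModel):  # hypothetical model; run() requires a `result` field
    result: str


async def main() -> None:
    # Placeholder endpoint, key, and model name.
    client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    operator = AsyncOperator(client, model="my-model")

    out = await operator.run(
        input_text="Summarize: the 1.0.6 release reworks the async operator.",
        prompt_file="summarizer.yaml",  # one of the prompt files listed above
        output_model=StrOutput,
        resp_format="parse",  # "vllm" instead routes through guided_json decoding
    )
    print(out["result"])


asyncio.run(main())
```

Passing `logprobs=True` would add a `logprobs` key to the returned dict; per `_extract_logprobs` above, each token's log-probability is converted to a probability with `math.exp` and rounded to 8 decimal places, with structural JSON tokens filtered out.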