hamtaa-texttools 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- {hamtaa_texttools-1.0.1.dist-info → hamtaa_texttools-1.0.3.dist-info}/METADATA +18 -6
- hamtaa_texttools-1.0.3.dist-info/RECORD +29 -0
- texttools/__init__.py +3 -3
- texttools/{utils/batch_manager → batch}/batch_runner.py +1 -1
- texttools/formatters/user_merge_formatter/user_merge_formatter.py +0 -17
- texttools/prompts/README.md +31 -0
- texttools/prompts/categorizer.yaml +31 -0
- texttools/prompts/keyword_extractor.yaml +14 -0
- texttools/prompts/ner_extractor.yaml +21 -0
- texttools/prompts/question_detector.yaml +15 -0
- texttools/prompts/question_generator.yaml +23 -0
- texttools/prompts/question_merger.yaml +49 -0
- texttools/prompts/question_rewriter.yaml +46 -0
- texttools/prompts/subject_question_generator.yaml +26 -0
- texttools/prompts/summarizer.yaml +12 -0
- texttools/prompts/translator.yaml +15 -0
- texttools/tools/__init__.py +2 -1
- texttools/tools/async_the_tool.py +263 -0
- texttools/tools/internals/async_operator.py +288 -0
- texttools/tools/{operator.py → internals/operator.py} +133 -63
- texttools/tools/{output_models.py → internals/output_models.py} +8 -0
- texttools/tools/{prompt_loader.py → internals/prompt_loader.py} +16 -18
- texttools/tools/the_tool.py +181 -72
- hamtaa_texttools-1.0.1.dist-info/RECORD +0 -18
- texttools/prompts/__init__.py +0 -0
- texttools/utils/__init__.py +0 -4
- {hamtaa_texttools-1.0.1.dist-info → hamtaa_texttools-1.0.3.dist-info}/WHEEL +0 -0
- {hamtaa_texttools-1.0.1.dist-info → hamtaa_texttools-1.0.3.dist-info}/licenses/LICENSE +0 -0
- {hamtaa_texttools-1.0.1.dist-info → hamtaa_texttools-1.0.3.dist-info}/top_level.txt +0 -0
- /texttools/{utils/batch_manager → batch}/__init__.py +0 -0
- /texttools/{utils/batch_manager → batch}/batch_manager.py +0 -0
texttools/tools/{operator.py → internals/operator.py}

@@ -1,6 +1,8 @@
 from __future__ import annotations
 
-
+import math
+import re
+from typing import Any, TypeVar, Literal, Optional
 import json
 
 from openai import OpenAI
@@ -9,7 +11,7 @@ from pydantic import BaseModel
 from texttools.formatters.user_merge_formatter.user_merge_formatter import (
     UserMergeFormatter,
 )
-from texttools.tools.prompt_loader import PromptLoader
+from texttools.tools.internals.prompt_loader import PromptLoader
 
 # Base Model type for output models
 T = TypeVar("T", bound=BaseModel)
@@ -42,13 +44,6 @@ class Operator:
     - RESP_FORMAT: str → "vllm" or "parse"
     """
 
-    PROMPT_FILE: str
-    OUTPUT_MODEL: Type[T]
-    WITH_ANALYSIS: bool = False
-    USE_MODES: bool
-    MODE: str = ""
-    RESP_FORMAT: Literal["vllm", "parse"] = "vllm"
-
     def __init__(
         self,
         client: OpenAI,
@@ -59,17 +54,12 @@
     ):
         self.client: OpenAI = client
         self.model = model
-        self.prompt_loader = PromptLoader()
-        self.formatter = UserMergeFormatter()
         self.temperature = temperature
         self.client_kwargs = client_kwargs
 
     def _build_user_message(self, prompt: str) -> dict[str, str]:
         return {"role": "user", "content": prompt}
 
-    def _apply_formatter(self, messages: list[dict[str, str]]) -> list[dict[str, str]]:
-        return self.formatter.format(messages)
-
     def _analysis_completion(self, analyze_message: list[dict[str, str]]) -> str:
         try:
             completion = self.client.chat.completions.create(
@@ -85,30 +75,35 @@ class Operator:
             print(f"[ERROR] Analysis failed: {e}")
             raise
 
-    def _analyze(self) -> str:
-        analyze_prompt =
+    def _analyze(self, prompt_configs: dict[str, str]) -> str:
+        analyze_prompt = prompt_configs["analyze_template"]
         analyze_message = [self._build_user_message(analyze_prompt)]
         analysis = self._analysis_completion(analyze_message)
 
         return analysis
 
-    def
-
-
-
-
-
-
+    def _parse_completion(
+        self,
+        message: list[dict[str, str]],
+        output_model: T,
+        logprobs: bool = False,
+        top_logprobs: int = 3,
+    ) -> tuple[T, Any]:
         try:
-
-                model
-                messages
-                response_format
-                temperature
+            request_kwargs = {
+                "model": self.model,
+                "messages": message,
+                "response_format": output_model,
+                "temperature": self.temperature,
                 **self.client_kwargs,
-
+            }
+            if logprobs:
+                request_kwargs["logprobs"] = True
+                request_kwargs["top_logprobs"] = top_logprobs
+
+            completion = self.client.beta.chat.completions.parse(**request_kwargs)
             parsed = completion.choices[0].message.parsed
-            return parsed
+            return parsed, completion
 
         except Exception as e:
             print(f"[ERROR] Failed to parse completion: {e}")
@@ -119,24 +114,20 @@ class Operator:
         Clean JSON response by removing code block markers and whitespace.
         Handles cases like:
         - ```json{"result": "value"}```
-        - ```{"result": "value"}```
         """
-        # Remove code block markers
         cleaned = response.strip()
 
-        # Remove ```json
+        # Remove ```json marker
        if cleaned.startswith("```json"):
-            cleaned = cleaned[7:]
-        elif cleaned.startswith("```"):
-            cleaned = cleaned[3:]  # Remove ```
+            cleaned = cleaned[7:]
 
-        # Remove trailing ```
+        # Remove trailing ```
         if cleaned.endswith("```"):
             cleaned = cleaned[:-3]
 
         return cleaned.strip()
 
-    def _convert_to_output_model(self, response_string: str) -> T:
+    def _convert_to_output_model(self, response_string: str, output_model: T) -> T:
         """
         Convert a JSON response string to output model.
 
@@ -151,11 +142,16 @@ class Operator:
             # Clean the response string
             cleaned_json = self._clean_json_response(response_string)
 
+            # Fix Python-style booleans
+            cleaned_json = cleaned_json.replace("False", "false").replace(
+                "True", "true"
+            )
+
             # Convert string to Python dictionary
             response_dict = json.loads(cleaned_json)
 
             # Convert dictionary to output model
-            return
+            return output_model(**response_dict)
 
         except json.JSONDecodeError as e:
             raise ValueError(
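For orientation, a standalone sketch of what the updated cleaning and conversion path accepts: a fenced reply that uses a Python-style boolean. The sample string is invented, and `BoolOutput` mirrors the model added to `output_models.py` further down in this diff.

```python
import json

from pydantic import BaseModel


class BoolOutput(BaseModel):
    result: bool


# A raw reply wrapped in a code fence and using a Python-style boolean
raw = '```json{"result": True}```'

cleaned = raw.strip()
if cleaned.startswith("```json"):   # strip the ```json marker
    cleaned = cleaned[7:]
if cleaned.endswith("```"):         # strip the trailing ```
    cleaned = cleaned[:-3]
cleaned = cleaned.strip()

# Normalize Python-style booleans so json.loads accepts them
cleaned = cleaned.replace("False", "false").replace("True", "true")

parsed = BoolOutput(**json.loads(cleaned))
print(parsed.result)  # True
```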
@@ -164,28 +160,84 @@ class Operator:
         except Exception as e:
             raise ValueError(f"Failed to convert to output model: {e}")
 
-    def _vllm_completion(
+    def _vllm_completion(
+        self,
+        message: list[dict[str, str]],
+        output_model: T,
+        logprobs: bool = False,
+        top_logprobs: int = 3,
+    ) -> tuple[T, Any]:
         try:
-            json_schema =
-
-
-
-
-
+            json_schema = output_model.model_json_schema()
+
+            # Build kwargs dynamically
+            request_kwargs = {
+                "model": self.model,
+                "messages": message,
+                "extra_body": {"guided_json": json_schema},
+                "temperature": self.temperature,
                 **self.client_kwargs,
-
+            }
+
+            if logprobs:
+                request_kwargs["logprobs"] = True
+                request_kwargs["top_logprobs"] = top_logprobs
+
+            completion = self.client.chat.completions.create(**request_kwargs)
             response = completion.choices[0].message.content
 
             # Convert the string response to output model
-
+            parsed = self._convert_to_output_model(response, output_model)
 
-            return
+            return parsed, completion
 
         except Exception as e:
             print(f"[ERROR] Failed to get vLLM structured output: {e}")
             raise
 
-    def
+    def _extract_logprobs(self, completion: dict):
+        logprobs_data = []
+        ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
+
+        for choice in completion.choices:
+            if not getattr(choice, "logprobs", None):
+                continue
+
+            for logprob_item in choice.logprobs.content:
+                if ignore_pattern.match(logprob_item.token):
+                    continue
+                token_entry = {
+                    "token": logprob_item.token,
+                    "prob": round(math.exp(logprob_item.logprob), 8),
+                    "top_alternatives": [],
+                }
+                for alt in logprob_item.top_logprobs:
+                    if ignore_pattern.match(alt.token):
+                        continue
+                    token_entry["top_alternatives"].append(
+                        {
+                            "token": alt.token,
+                            "prob": round(math.exp(alt.logprob), 8),
+                        }
+                    )
+                logprobs_data.append(token_entry)
+
+        return logprobs_data
+
+    def run(
+        self,
+        input_text: str,
+        prompt_file: str,
+        output_model: T,
+        with_analysis: bool = False,
+        use_modes: bool = False,
+        mode: str = "",
+        resp_format: Literal["vllm", "parse"] = "parse",
+        output_lang: Optional[str] = None,
+        logprobs: bool = False,
+        top_logprobs: int = 3,
+        **extra_kwargs,
+    ) -> dict[str, Any]:
         """
         Execute the LLM pipeline with the given input text.
 
@@ -196,36 +248,54 @@ class Operator:
         Returns:
             Dictionary containing the parsed result and optional analysis
         """
+        prompt_loader = PromptLoader()
+        formatter = UserMergeFormatter()
+
         try:
             cleaned_text = input_text.strip()
 
-
-
-
-
+            prompt_configs = prompt_loader.load_prompts(
+                prompt_file,
+                use_modes,
+                mode,
                 cleaned_text,
                 **extra_kwargs,
             )
 
             messages: list[dict[str, str]] = []
 
-            if
-                analysis = self._analyze()
+            if with_analysis:
+                analysis = self._analyze(prompt_configs)
                 messages.append(
                     self._build_user_message(f"Based on this analysis: {analysis}")
                 )
 
-
-
+            if output_lang:
+                messages.append(
+                    self._build_user_message(
+                        f"Respond only in the {output_lang} language."
+                    )
+                )
+
+            messages.append(self._build_user_message(prompt_configs["main_template"]))
+
+            messages = formatter.format(messages)
 
-            if
-                parsed = self._vllm_completion(
-
-
+            if resp_format == "vllm":
+                parsed, completion = self._vllm_completion(
+                    messages, output_model, logprobs, top_logprobs
+                )
+            elif resp_format == "parse":
+                parsed, completion = self._parse_completion(
+                    messages, output_model, logprobs, top_logprobs
+                )
 
             results = {"result": parsed.result}
 
-            if
+            if logprobs:
+                results["logprobs"] = self._extract_logprobs(completion)
+
+            if with_analysis:
                 results["analysis"] = analysis
 
             return results
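Taken together, the configuration that 1.0.1 kept in class-level constants (`PROMPT_FILE`, `OUTPUT_MODEL`, `WITH_ANALYSIS`, `USE_MODES`, `MODE`, `RESP_FORMAT`) now arrives per call through `run()`. A rough usage sketch follows; the endpoint, API key, model name, and constructor defaults are assumptions, and a given prompt file may need extra template variables passed via `**extra_kwargs`.

```python
from openai import OpenAI

from texttools.tools.internals.operator import Operator
from texttools.tools.internals.output_models import StrOutput

# Client details are placeholders; any OpenAI-compatible endpoint would do.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
operator = Operator(client=client, model="gpt-4o-mini", temperature=0.0)

results = operator.run(
    input_text="Summarize: the quick brown fox jumps over the lazy dog.",
    prompt_file="summarizer.yaml",   # one of the bundled prompt files
    output_model=StrOutput,
    resp_format="parse",             # "vllm" routes through guided_json instead
    output_lang="English",
    logprobs=True,                   # also return per-token probabilities
    top_logprobs=3,
)

print(results["result"])
# results["logprobs"] is a list of entries shaped like:
# {"token": "...", "prob": 0.97, "top_alternatives": [{"token": "...", "prob": 0.02}, ...]}
```

The higher-level wrappers in `the_tool.py` and `async_the_tool.py` presumably forward these same per-call arguments; `Operator` itself now lives under `texttools/tools/internals/`.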
texttools/tools/{output_models.py → internals/output_models.py}

@@ -11,6 +11,14 @@ class StrOutput(BaseModel):
     result: str
 
 
+class BoolOutput(BaseModel):
+    """
+    Output model for a single boolean result.
+    """
+
+    result: bool
+
+
 class ListStrOutput(BaseModel):
     """
     Output model for a list of strings result.
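The new `BoolOutput` model gives yes/no style prompts a typed result. A minimal sketch, reusing the `operator` constructed in the previous example; pairing it with the bundled `question_detector.yaml` prompt is an assumption.

```python
from texttools.tools.internals.output_models import BoolOutput

# `operator` is an Operator instance built as in the earlier sketch.
verdict = operator.run(
    input_text="Is the capital of France Paris?",
    prompt_file="question_detector.yaml",  # bundled prompt; this pairing is illustrative
    output_model=BoolOutput,
)
print(verdict["result"])  # a plain Python bool
```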
texttools/tools/{prompt_loader.py → internals/prompt_loader.py}

@@ -1,4 +1,3 @@
-from typing import Optional
 from pathlib import Path
 import yaml
 
@@ -25,16 +24,17 @@ class PromptLoader:
     MAIN_TEMPLATE: str = "main_template"
     ANALYZE_TEMPLATE: str = "analyze_template"
 
-    def
-
-
-    def _get_prompt_path(self, prompt_file: str) -> Path:
-        return Path(__file__).parent.parent / self.PROMPTS_DIR / prompt_file
+    def _get_prompt_path(self, prompt_file: str, prompts_dir: str) -> Path:
+        return Path(__file__).parent.parent.parent / prompts_dir / prompt_file
 
     def _load_templates(
-        self,
+        self,
+        prompts_dir: str,
+        prompt_file: str,
+        use_modes: bool,
+        mode: str,
     ) -> dict[str, str]:
-        prompt_path = self._get_prompt_path(prompt_file)
+        prompt_path = self._get_prompt_path(prompt_file, prompts_dir)
 
         if not prompt_path.exists():
             raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
@@ -45,18 +45,13 @@ class PromptLoader:
         except yaml.YAMLError as e:
             raise ValueError(f"Invalid YAML in {prompt_path}: {e}")
 
-        if self.MAIN_TEMPLATE not in data:
-            raise ValueError(
-                f"Missing required '{self.MAIN_TEMPLATE}' in {prompt_file}"
-            )
-
         return {
-
+            "main_template": data["main_template"][mode]
             if use_modes
-            else data[
-
+            else data["main_template"],
+            "analyze_template": data.get("analyze_template")[mode]
             if use_modes
-            else data.get(
+            else data.get("analyze_template"),
         }
 
     def _build_format_args(self, input_text: str, **extra_kwargs) -> dict[str, str]:
@@ -72,9 +67,12 @@ class PromptLoader:
         use_modes: bool,
         mode: str,
         input_text: str,
+        prompts_dir: str = "prompts",
         **extra_kwargs,
     ) -> dict[str, str]:
-        template_configs = self._load_templates(
+        template_configs = self._load_templates(
+            prompts_dir, prompt_file, use_modes, mode
+        )
         format_args = self._build_format_args(input_text, **extra_kwargs)
 
         # Inject variables inside each template