hamtaa-texttools 1.0.6__py3-none-any.whl → 1.0.8__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
- {hamtaa_texttools-1.0.6.dist-info → hamtaa_texttools-1.0.8.dist-info}/METADATA +13 -22
- hamtaa_texttools-1.0.8.dist-info/RECORD +30 -0
- texttools/tools/async_the_tool.py +246 -140
- texttools/tools/internals/async_operator.py +81 -212
- texttools/tools/internals/base_operator.py +85 -0
- texttools/tools/internals/formatters.py +24 -0
- texttools/tools/internals/operator.py +32 -150
- texttools/tools/internals/prompt_loader.py +3 -12
- texttools/tools/the_tool.py +163 -283
- hamtaa_texttools-1.0.6.dist-info/RECORD +0 -30
- texttools/formatters/base_formatter.py +0 -33
- texttools/formatters/user_merge_formatter.py +0 -30
- {hamtaa_texttools-1.0.6.dist-info → hamtaa_texttools-1.0.8.dist-info}/WHEEL +0 -0
- {hamtaa_texttools-1.0.6.dist-info → hamtaa_texttools-1.0.8.dist-info}/licenses/LICENSE +0 -0
- {hamtaa_texttools-1.0.6.dist-info → hamtaa_texttools-1.0.8.dist-info}/top_level.txt +0 -0
- /texttools/prompts/{ner_extractor.yaml → extract_entities.yaml} +0 -0
- /texttools/prompts/{keyword_extractor.yaml → extract_keywords.yaml} +0 -0
- /texttools/prompts/{question_merger.yaml → merge_questions.yaml} +0 -0
- /texttools/prompts/{rewriter.yaml → rewrite.yaml} +0 -0
- /texttools/prompts/{summarizer.yaml → summarize.yaml} +0 -0
- /texttools/prompts/{translator.yaml → translate.yaml} +0 -0
texttools/tools/internals/operator.py

@@ -1,17 +1,11 @@
-from __future__ import annotations
-
-import math
-import re
 from typing import Any, TypeVar, Type, Literal
-import json
 import logging

 from openai import OpenAI
 from pydantic import BaseModel

-from texttools.
-
-)
+from texttools.tools.internals.base_operator import BaseOperator
+from texttools.tools.internals.formatters import Formatter
 from texttools.tools.internals.prompt_loader import PromptLoader

 # Base Model type for output models
@@ -22,79 +16,46 @@ logger = logging.getLogger("operator")
 logger.setLevel(logging.INFO)


-class Operator:
+class Operator(BaseOperator):
     """
-    Core engine for running text-processing operations with an LLM.
+    Core engine for running text-processing operations with an LLM (Sync).

     It wires together:
     - `PromptLoader` → loads YAML prompt templates.
     - `UserMergeFormatter` → applies formatting to messages (e.g., merging).
     - OpenAI client → executes completions/parsed completions.
-
-    Workflow inside `run()`:
-    1. Load prompt templates (`main_template` [+ `analyze_template` if enabled]).
-    2. Optionally generate an "analysis" step via `_analyze()`.
-    3. Build messages for the LLM.
-    4. Call `.beta.chat.completions.parse()` to parse the result into the
-       configured `OUTPUT_MODEL` (a Pydantic schema).
-    5. Return results as a dict (always `{"result": ...}`, plus `analysis`
-       if analysis was enabled).
-
-    Attributes configured dynamically by `TheTool`:
-    - PROMPT_FILE: str → YAML filename
-    - OUTPUT_MODEL: Pydantic model class
-    - WITH_ANALYSIS: bool → whether to run an analysis phase first
-    - USE_MODES: bool → whether to select prompts by mode
-    - MODE: str → which mode to use if modes are enabled
-    - RESP_FORMAT: str → "vllm" or "parse"
     """

-    def __init__(self, client: OpenAI):
-        self.client
+    def __init__(self, client: OpenAI, model: str):
+        self.client = client
+        self.model = model

-    def
-
-
-    def _analysis_completion(
-        self,
-        analyze_message: list[dict[str, str]],
-        model: str,
-        temperature: float,
-    ) -> str:
+    def _analyze(self, prompt_configs: dict[str, str], temperature: float) -> str:
+        analyze_prompt = prompt_configs["analyze_template"]
+        analyze_message = [self._build_user_message(analyze_prompt)]
         completion = self.client.chat.completions.create(
-            model=model,
+            model=self.model,
             messages=analyze_message,
             temperature=temperature,
         )
         analysis = completion.choices[0].message.content.strip()
         return analysis

-    def _analyze(
-        self,
-        prompt_configs: dict[str, str],
-        model: str,
-        temperature: float,
-    ) -> str:
-        analyze_prompt = prompt_configs["analyze_template"]
-        analyze_message = [self._build_user_message(analyze_prompt)]
-        analysis = self._analysis_completion(analyze_message, model, temperature)
-        return analysis
-
     def _parse_completion(
         self,
         message: list[dict[str, str]],
         output_model: Type[T],
-        model: str,
         temperature: float,
         logprobs: bool = False,
         top_logprobs: int = 3,
     ) -> tuple[Type[T], Any]:
         request_kwargs = {
-            "model": model,
+            "model": self.model,
             "messages": message,
             "response_format": output_model,
             "temperature": temperature,
         }
+
         if logprobs:
             request_kwargs["logprobs"] = True
             request_kwargs["top_logprobs"] = top_logprobs
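The constructor change above binds the model name once per `Operator` instead of threading a `model` argument through every helper. A minimal construction sketch under that reading (the endpoint, API key, and model name below are placeholders, not values from the package):

```python
from openai import OpenAI

from texttools.tools.internals.operator import Operator

# Placeholder endpoint/model; any OpenAI-compatible server is used the same way.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
op = Operator(client=client, model="my-model")  # model is now stored on the instance
```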
@@ -103,48 +64,10 @@ class Operator:
         parsed = completion.choices[0].message.parsed
         return parsed, completion

-    def _clean_json_response(self, response: str) -> str:
-        """
-        Clean JSON response by removing code block markers and whitespace.
-        Handles cases like:
-        - ```json{"result": "value"}```
-        """
-        stripped = response.strip()
-        cleaned = re.sub(r"^```(?:json)?\s*", "", stripped)
-        cleaned = re.sub(r"\s*```$", "", cleaned)
-
-        return cleaned.strip()
-
-    def _convert_to_output_model(
-        self, response_string: str, output_model: Type[T]
-    ) -> Type[T]:
-        """
-        Convert a JSON response string to output model.
-
-        Args:
-            response_string: The JSON string (may contain code block markers)
-            output_model: Your Pydantic output model class (e.g., StrOutput, ListStrOutput)
-
-        Returns:
-            Instance of your output model
-        """
-        # Clean the response string
-        cleaned_json = self._clean_json_response(response_string)
-
-        # Fix Python-style booleans
-        cleaned_json = cleaned_json.replace("False", "false").replace("True", "true")
-
-        # Convert string to Python dictionary
-        response_dict = json.loads(cleaned_json)
-
-        # Convert dictionary to output model
-        return output_model(**response_dict)
-
     def _vllm_completion(
         self,
         message: list[dict[str, str]],
         output_model: Type[T],
-        model: str,
         temperature: float,
         logprobs: bool = False,
         top_logprobs: int = 3,
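`_clean_json_response` and `_convert_to_output_model` are removed here, yet `_convert_to_output_model` is still called later in `_vllm_completion`, and this release adds `base_operator.py` (+85 lines) while dropping `import json` and `import re` in the first hunk. The helpers have therefore presumably moved onto the shared `BaseOperator`. A sketch of that shared logic, reconstructed from the removed lines (its exact placement in `BaseOperator` is an assumption):

```python
import json
import re
from typing import Type, TypeVar

from pydantic import BaseModel

T = TypeVar("T", bound=BaseModel)  # output models are Pydantic schemas with a `result` field


class BaseOperator:
    # Assumed new home of the helpers removed from Operator in this diff.
    def _clean_json_response(self, response: str) -> str:
        # Strip ```json ... ``` code fences and surrounding whitespace.
        cleaned = re.sub(r"^```(?:json)?\s*", "", response.strip())
        cleaned = re.sub(r"\s*```$", "", cleaned)
        return cleaned.strip()

    def _convert_to_output_model(self, response_string: str, output_model: Type[T]) -> T:
        cleaned = self._clean_json_response(response_string)
        # Fix Python-style booleans before parsing as JSON.
        cleaned = cleaned.replace("False", "false").replace("True", "true")
        return output_model(**json.loads(cleaned))
```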
@@ -153,7 +76,7 @@ class Operator:

         # Build kwargs dynamically
         request_kwargs = {
-            "model": model,
+            "model": self.model,
             "messages": message,
             "extra_body": {"guided_json": json_schema},
             "temperature": temperature,
@@ -170,73 +93,33 @@ class Operator:
         parsed = self._convert_to_output_model(response, output_model)
         return parsed, completion

-    def _extract_logprobs(self, completion: dict):
-        logprobs_data = []
-        ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
-
-        for choice in completion.choices:
-            if not getattr(choice, "logprobs", None):
-                logger.info("No logprobs found.")
-                continue
-
-            for logprob_item in choice.logprobs.content:
-                if ignore_pattern.match(logprob_item.token):
-                    continue
-                token_entry = {
-                    "token": logprob_item.token,
-                    "prob": round(math.exp(logprob_item.logprob), 8),
-                    "top_alternatives": [],
-                }
-                for alt in logprob_item.top_logprobs:
-                    if ignore_pattern.match(alt.token):
-                        continue
-                    token_entry["top_alternatives"].append(
-                        {
-                            "token": alt.token,
-                            "prob": round(math.exp(alt.logprob), 8),
-                        }
-                    )
-                logprobs_data.append(token_entry)
-
-        return logprobs_data
-
     def run(
         self,
-        text: str,
         # User parameters
-
+        text: str,
         with_analysis: bool,
+        output_lang: str | None,
+        user_prompt: str | None,
         temperature: float,
         logprobs: bool,
-        top_logprobs: int,
-
-        output_lang: str | None,
-        # Each tool's parameters
+        top_logprobs: int | None,
+        # Internal parameters
         prompt_file: str,
         output_model: Type[T],
-        resp_format: Literal["vllm", "parse"]
-        mode: str | None
+        resp_format: Literal["vllm", "parse"],
+        mode: str | None,
         **extra_kwargs,
     ) -> dict[str, Any]:
         """
         Execute the LLM pipeline with the given input text.
-
-        Args:
-            text: The text to process (will be stripped of whitespace)
-            **extra_kwargs: Additional variables to inject into prompt templates
-
-        Returns:
-            Dictionary containing the parsed result and optional analysis
         """
         prompt_loader = PromptLoader()
-        formatter =
+        formatter = Formatter()

         try:
-            cleaned_text = text.strip()
-
             prompt_configs = prompt_loader.load(
                 prompt_file=prompt_file,
-                text=
+                text=text.strip(),
                 mode=mode,
                 **extra_kwargs,
             )
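With the reordered signature, `run()` now takes the user-facing arguments first and the tool-internal ones (`prompt_file`, `output_model`, `resp_format`, `mode`) last; in the package these internals are normally supplied by `TheTool`. A hedged call sketch against the new signature, using `op` from the construction sketch above (the prompt file is one of the renamed files listed at the top, and `StrOutput` is only mentioned as an example in the removed docstring):

```python
from pydantic import BaseModel


class StrOutput(BaseModel):
    result: str  # run() requires the output model to define a `result` field


out = op.run(
    text="Some input text",
    with_analysis=False,
    output_lang=None,
    user_prompt=None,
    temperature=0.0,
    logprobs=False,
    top_logprobs=None,
    prompt_file="summarize.yaml",  # renamed from summarizer.yaml in this release
    output_model=StrOutput,
    resp_format="parse",
    mode=None,
)
```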
@@ -244,7 +127,7 @@
             messages: list[dict[str, str]] = []

             if with_analysis:
-                analysis = self._analyze(prompt_configs,
+                analysis = self._analyze(prompt_configs, temperature)
                 messages.append(
                     self._build_user_message(f"Based on this analysis: {analysis}")
                 )
@@ -262,16 +145,15 @@
             )

             messages.append(self._build_user_message(prompt_configs["main_template"]))
-
-            messages = formatter.format(messages)
+            messages = formatter.user_merge_format(messages)

             if resp_format == "vllm":
                 parsed, completion = self._vllm_completion(
-                    messages, output_model,
+                    messages, output_model, temperature, logprobs, top_logprobs
                 )
             elif resp_format == "parse":
                 parsed, completion = self._parse_completion(
-                    messages, output_model,
+                    messages, output_model, temperature, logprobs, top_logprobs
                 )

             # Ensure output_model has a `result` field
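`formatter.format` becomes `formatter.user_merge_format`, matching the move from the deleted `UserMergeFormatter` class to the new `formatters.Formatter`. The diff only lists `formatters.py` as a new file, so the following is a guess at what the merge step does, based on the old class name and the docstring's "applies formatting to messages (e.g., merging)":

```python
def user_merge_format(messages: list[dict[str, str]]) -> list[dict[str, str]]:
    """Collapse consecutive user messages into one (assumed behaviour, not the package's code)."""
    merged: list[dict[str, str]] = []
    for msg in messages:
        if merged and msg["role"] == "user" and merged[-1]["role"] == "user":
            merged[-1]["content"] += "\n" + msg["content"]
        else:
            merged.append(dict(msg))
    return merged
```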
@@ -280,16 +162,16 @@
                     "The provided output_model must define a field named 'result'"
                 )

-
+            result = {"result": parsed.result}

             if logprobs:
-
+                result["logprobs"] = self._extract_logprobs(completion)

             if with_analysis:
-
+                result["analysis"] = analysis

-            return
+            return result

         except Exception as e:
             logger.error(f"TheTool failed: {e}")
-            return {"
+            return {"error": str(e), "result": ""}
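After this hunk, the success path always returns a dict built around `result`, adding `logprobs` and `analysis` only when requested, while failures are caught and reported through an `error` key instead of raising. A small sketch of how a caller might distinguish the two shapes (keys taken from the diff):

```python
# `out` as returned by the run() sketch above.
if "error" in out:
    # Failure path: {"error": "<exception message>", "result": ""}
    raise RuntimeError(out["error"])
# Success path: {"result": ...} plus optional "logprobs" and "analysis" keys.
print(out["result"])
```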
texttools/tools/internals/prompt_loader.py

@@ -18,24 +18,15 @@ class PromptLoader:
     }
     """

-    def __init__(self):
-        self.base_dir = Path(__file__).parent.parent.parent / Path("prompts")
-
     MAIN_TEMPLATE: str = "main_template"
     ANALYZE_TEMPLATE: str = "analyze_template"

     # Use lru_cache to load each file once
     @lru_cache(maxsize=32)
     def _load_templates(self, prompt_file: str, mode: str | None) -> dict[str, str]:
-
-
-
-            raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
-
-        try:
-            data = yaml.safe_load(prompt_path.read_text(encoding="utf-8"))
-        except yaml.YAMLError as e:
-            raise ValueError(f"Invalid YAML in {prompt_path}: {e}")
+        base_dir = Path(__file__).parent.parent.parent / Path("prompts")
+        prompt_path = base_dir / prompt_file
+        data = yaml.safe_load(prompt_path.read_text(encoding="utf-8"))

         return {
             self.MAIN_TEMPLATE: data[self.MAIN_TEMPLATE][mode]
|