hamtaa-texttools 1.0.5__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hamtaa-texttools might be problematic.
- {hamtaa_texttools-1.0.5.dist-info → hamtaa_texttools-1.0.7.dist-info}/METADATA +15 -15
- hamtaa_texttools-1.0.7.dist-info/RECORD +31 -0
- texttools/batch/batch_manager.py +7 -18
- texttools/batch/batch_runner.py +96 -45
- texttools/prompts/README.md +4 -0
- texttools/prompts/{keyword_extractor.yaml → extract_keywords.yaml} +6 -6
- texttools/prompts/{question_merger.yaml → merge_questions.yaml} +5 -5
- texttools/tools/async_the_tool.py +204 -143
- texttools/tools/internals/async_operator.py +98 -204
- texttools/tools/internals/base_operator.py +85 -0
- texttools/tools/internals/operator.py +27 -130
- texttools/tools/internals/prompt_loader.py +12 -22
- texttools/tools/the_tool.py +162 -225
- hamtaa_texttools-1.0.5.dist-info/RECORD +0 -30
- {hamtaa_texttools-1.0.5.dist-info → hamtaa_texttools-1.0.7.dist-info}/WHEEL +0 -0
- {hamtaa_texttools-1.0.5.dist-info → hamtaa_texttools-1.0.7.dist-info}/licenses/LICENSE +0 -0
- {hamtaa_texttools-1.0.5.dist-info → hamtaa_texttools-1.0.7.dist-info}/top_level.txt +0 -0
- /texttools/prompts/{ner_extractor.yaml → extract_entities.yaml} +0 -0
- /texttools/prompts/{question_detector.yaml → is_question.yaml} +0 -0
- /texttools/prompts/{rewriter.yaml → rewrite.yaml} +0 -0
- /texttools/prompts/{custom_tool.yaml → run_custom.yaml} +0 -0
- /texttools/prompts/{subject_question_generator.yaml → subject_to_question.yaml} +0 -0
- /texttools/prompts/{summarizer.yaml → summarize.yaml} +0 -0
- /texttools/prompts/{question_generator.yaml → text_to_question.yaml} +0 -0
- /texttools/prompts/{translator.yaml → translate.yaml} +0 -0
--- texttools/tools/internals/operator.py (1.0.5)
+++ texttools/tools/internals/operator.py (1.0.7)
@@ -1,14 +1,12 @@
 from __future__ import annotations
 
-import math
-import re
-from typing import Any, TypeVar, Type, Literal, Optional
-import json
+from typing import Any, TypeVar, Type, Literal
 import logging
 
 from openai import OpenAI
 from pydantic import BaseModel
 
+from texttools.tools.internals.base_operator import BaseOperator
 from texttools.formatters.user_merge_formatter import (
     UserMergeFormatter,
 )
@@ -22,7 +20,7 @@ logger = logging.getLogger("operator")
 logger.setLevel(logging.INFO)
 
 
-class Operator:
+class Operator(BaseOperator):
     """
     Core engine for running text-processing operations with an LLM.
 
@@ -30,71 +28,46 @@ class Operator:
     - `PromptLoader` → loads YAML prompt templates.
     - `UserMergeFormatter` → applies formatting to messages (e.g., merging).
     - OpenAI client → executes completions/parsed completions.
-
-    Workflow inside `run()`:
-    1. Load prompt templates (`main_template` [+ `analyze_template` if enabled]).
-    2. Optionally generate an "analysis" step via `_analyze()`.
-    3. Build messages for the LLM.
-    4. Call `.beta.chat.completions.parse()` to parse the result into the
-       configured `OUTPUT_MODEL` (a Pydantic schema).
-    5. Return results as a dict (always `{"result": ...}`, plus `analysis`
-       if analysis was enabled).
-
-    Attributes configured dynamically by `TheTool`:
-    - PROMPT_FILE: str → YAML filename
-    - OUTPUT_MODEL: Pydantic model class
-    - WITH_ANALYSIS: bool → whether to run an analysis phase first
-    - USE_MODES: bool → whether to select prompts by mode
-    - MODE: str → which mode to use if modes are enabled
-    - RESP_FORMAT: str → "vllm" or "parse"
     """
 
-    def __init__(self, client: OpenAI):
+    def __init__(self, client: OpenAI, model: str):
         self.client: OpenAI = client
-
-    def _build_user_message(self, prompt: str) -> dict[str, str]:
-        return {"role": "user", "content": prompt}
+        self.model = model
 
     def _analysis_completion(
         self,
         analyze_message: list[dict[str, str]],
-        model: str,
         temperature: float,
     ) -> str:
         completion = self.client.chat.completions.create(
-            model=model,
+            model=self.model,
             messages=analyze_message,
             temperature=temperature,
         )
         analysis = completion.choices[0].message.content.strip()
         return analysis
 
-    def _analyze(
-        self,
-        prompt_configs: dict[str, str],
-        model: str,
-        temperature: float,
-    ) -> str:
+    def _analyze(self, prompt_configs: dict[str, str], temperature: float) -> str:
         analyze_prompt = prompt_configs["analyze_template"]
         analyze_message = [self._build_user_message(analyze_prompt)]
-        analysis = self._analysis_completion(analyze_message, model, temperature)
+        analysis = self._analysis_completion(analyze_message, temperature)
         return analysis
 
     def _parse_completion(
         self,
         message: list[dict[str, str]],
         output_model: Type[T],
-        model: str,
         temperature: float,
         logprobs: bool = False,
         top_logprobs: int = 3,
     ) -> tuple[Type[T], Any]:
         request_kwargs = {
-            "model": model,
+            "model": self.model,
             "messages": message,
             "response_format": output_model,
             "temperature": temperature,
         }
+
         if logprobs:
             request_kwargs["logprobs"] = True
             request_kwargs["top_logprobs"] = top_logprobs
@@ -103,57 +76,20 @@ class Operator:
         parsed = completion.choices[0].message.parsed
         return parsed, completion
 
-    def _clean_json_response(self, response: str) -> str:
-        """
-        Clean JSON response by removing code block markers and whitespace.
-        Handles cases like:
-        - ```json{"result": "value"}```
-        """
-        stripped = response.strip()
-        cleaned = re.sub(r"^```(?:json)?\s*", "", stripped)
-        cleaned = re.sub(r"\s*```$", "", cleaned)
-
-        return cleaned.strip()
-
-    def _convert_to_output_model(
-        self, response_string: str, output_model: Type[T]
-    ) -> Type[T]:
-        """
-        Convert a JSON response string to output model.
-
-        Args:
-            response_string: The JSON string (may contain code block markers)
-            output_model: Your Pydantic output model class (e.g., StrOutput, ListStrOutput)
-
-        Returns:
-            Instance of your output model
-        """
-        # Clean the response string
-        cleaned_json = self._clean_json_response(response_string)
-
-        # Fix Python-style booleans
-        cleaned_json = cleaned_json.replace("False", "false").replace("True", "true")
-
-        # Convert string to Python dictionary
-        response_dict = json.loads(cleaned_json)
-
-        # Convert dictionary to output model
-        return output_model(**response_dict)
-
     def _vllm_completion(
         self,
         message: list[dict[str, str]],
         output_model: Type[T],
-        model: str,
         temperature: float,
         logprobs: bool = False,
         top_logprobs: int = 3,
+        max_tokens: int | None = None,
     ) -> tuple[Type[T], Any]:
         json_schema = output_model.model_json_schema()
 
         # Build kwargs dynamically
         request_kwargs = {
-            "model": model,
+            "model": self.model,
             "messages": message,
             "extra_body": {"guided_json": json_schema},
             "temperature": temperature,
@@ -170,63 +106,25 @@
         parsed = self._convert_to_output_model(response, output_model)
         return parsed, completion
 
-    def _extract_logprobs(self, completion: dict):
-        logprobs_data = []
-        ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
-
-        for choice in completion.choices:
-            if not getattr(choice, "logprobs", None):
-                logger.info("No logprobs found.")
-                continue
-
-            for logprob_item in choice.logprobs.content:
-                if ignore_pattern.match(logprob_item.token):
-                    continue
-                token_entry = {
-                    "token": logprob_item.token,
-                    "prob": round(math.exp(logprob_item.logprob), 8),
-                    "top_alternatives": [],
-                }
-                for alt in logprob_item.top_logprobs:
-                    if ignore_pattern.match(alt.token):
-                        continue
-                    token_entry["top_alternatives"].append(
-                        {
-                            "token": alt.token,
-                            "prob": round(math.exp(alt.logprob), 8),
-                        }
-                    )
-                logprobs_data.append(token_entry)
-
-        return logprobs_data
-
     def run(
         self,
-        text: str,
         # User parameters
-
+        text: str,
         with_analysis: bool,
+        output_lang: str | None,
+        user_prompt: str | None,
         temperature: float,
         logprobs: bool,
-        top_logprobs: int,
-
-        output_lang: str | None,
-        # Each tool's parameters
+        top_logprobs: int | None,
+        # Internal parameters
         prompt_file: str,
         output_model: Type[T],
-        resp_format: Literal["vllm", "parse"]
-        mode: str | None
+        resp_format: Literal["vllm", "parse"],
+        mode: str | None,
        **extra_kwargs,
     ) -> dict[str, Any]:
         """
         Execute the LLM pipeline with the given input text.
-
-        Args:
-            text: The text to process (will be stripped of whitespace)
-            **extra_kwargs: Additional variables to inject into prompt templates
-
-        Returns:
-            Dictionary containing the parsed result and optional analysis
         """
         prompt_loader = PromptLoader()
         formatter = UserMergeFormatter()
@@ -244,7 +142,7 @@
         messages: list[dict[str, str]] = []
 
         if with_analysis:
-            analysis = self._analyze(prompt_configs, model, temperature)
+            analysis = self._analyze(prompt_configs, temperature)
             messages.append(
                 self._build_user_message(f"Based on this analysis: {analysis}")
             )
@@ -262,16 +160,15 @@
             )
 
         messages.append(self._build_user_message(prompt_configs["main_template"]))
-
         messages = formatter.format(messages)
 
         if resp_format == "vllm":
             parsed, completion = self._vllm_completion(
-                messages, output_model, model, temperature, logprobs, top_logprobs
+                messages, output_model, temperature, logprobs, top_logprobs
             )
         elif resp_format == "parse":
             parsed, completion = self._parse_completion(
-                messages, output_model, model, temperature, logprobs, top_logprobs
+                messages, output_model, temperature, logprobs, top_logprobs
             )
 
         # Ensure output_model has a `result` field
@@ -280,16 +177,16 @@
                 "The provided output_model must define a field named 'result'"
             )
 
-
+            result = {"result": parsed.result}
 
             if logprobs:
-
+                result["logprobs"] = self._extract_logprobs(completion)
 
             if with_analysis:
-
+                result["analysis"] = analysis
 
-            return
+            return result
 
         except Exception as e:
-            logger.error(f"
+            logger.error(f"TheTool failed: {e}")
             return {"Error": str(e), "result": ""}
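Read together, the operator.py changes bind the model name to the Operator instance instead of threading it through every call, and the user-message, JSON-cleanup, and logprob helpers removed here appear to move into the new base_operator.py added in this release. The following is a minimal usage sketch of the 1.0.7 call shape, not code from the package: the endpoint URL, model name, StrOutput schema, and the exact prompt_file value are illustrative assumptions, and in practice this wiring is presumably done by TheTool rather than by hand.

from openai import OpenAI
from pydantic import BaseModel

from texttools.tools.internals.operator import Operator


class StrOutput(BaseModel):
    # run() requires the output model to expose a `result` field
    result: str


# Hypothetical OpenAI-compatible endpoint; resp_format="vllm" assumes a vLLM backend
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
op = Operator(client=client, model="my-model")  # model is now fixed at construction

out = op.run(
    text="Is the Eiffel Tower in Paris?",
    with_analysis=False,
    output_lang=None,
    user_prompt=None,
    temperature=0.0,
    logprobs=False,
    top_logprobs=None,
    prompt_file="is_question.yaml",  # renamed from question_detector.yaml in 1.0.7
    output_model=StrOutput,
    resp_format="vllm",              # "parse" would use client.beta.chat.completions.parse
    mode=None,
)
print(out["result"])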
--- texttools/tools/internals/prompt_loader.py (1.0.5)
+++ texttools/tools/internals/prompt_loader.py (1.0.7)
@@ -1,4 +1,4 @@
-from
+from functools import lru_cache
 from pathlib import Path
 import yaml
 
@@ -7,10 +7,6 @@ class PromptLoader:
     """
     Utility for loading and formatting YAML prompt templates.
 
-    Each YAML file under `prompts/` must define at least a `main_template`,
-    and optionally an `analyze_template`. These can either be a single string
-    or a dictionary keyed by mode names (if `use_modes=True`).
-
     Responsibilities:
     - Load and parse YAML prompt definitions.
     - Select the right template (by mode, if applicable).
@@ -22,31 +18,30 @@ class PromptLoader:
     }
     """
 
+    def __init__(self):
+        self.base_dir = Path(__file__).parent.parent.parent / Path("prompts")
+
     MAIN_TEMPLATE: str = "main_template"
     ANALYZE_TEMPLATE: str = "analyze_template"
 
-
-
-
-        prompt_file
-        mode: str | None,
-    ) -> dict[str, str]:
-        prompt_path = Path(__file__).parent.parent.parent / prompts_dir / prompt_file
+    # Use lru_cache to load each file once
+    @lru_cache(maxsize=32)
+    def _load_templates(self, prompt_file: str, mode: str | None) -> dict[str, str]:
+        prompt_path = self.base_dir / prompt_file
 
         if not prompt_path.exists():
             raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
 
         try:
-            # Load the data
             data = yaml.safe_load(prompt_path.read_text(encoding="utf-8"))
         except yaml.YAMLError as e:
             raise ValueError(f"Invalid YAML in {prompt_path}: {e}")
 
         return {
-
+            self.MAIN_TEMPLATE: data[self.MAIN_TEMPLATE][mode]
             if mode
             else data[self.MAIN_TEMPLATE],
-
+            self.ANALYZE_TEMPLATE: data.get(self.ANALYZE_TEMPLATE)[mode]
             if mode
             else data.get(self.ANALYZE_TEMPLATE),
         }
@@ -59,14 +54,9 @@ class PromptLoader:
         return format_args
 
     def load(
-        self,
-        prompt_file: str,
-        text: str,
-        mode: str,
-        prompts_dir: str = "prompts",
-        **extra_kwargs,
+        self, prompt_file: str, text: str, mode: str, **extra_kwargs
     ) -> dict[str, str]:
-        template_configs = self._load_templates(
+        template_configs = self._load_templates(prompt_file, mode)
         format_args = self._build_format_args(text, **extra_kwargs)
 
         # Inject variables inside each template
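For prompt_loader.py, the prompts directory is now resolved once in __init__ and parsed files are cached with lru_cache. Below is a rough sketch of a load() call under these changes; the file name, the YAML template text, and the assumption that the returned templates already have {text} injected are illustrative and not taken from the diff.

from texttools.tools.internals.prompt_loader import PromptLoader

# Hypothetical prompts/summarize.yaml (renamed from summarizer.yaml in 1.0.7):
#   main_template: "Summarize the following text:\n{text}"
#   analyze_template: "List the key points of:\n{text}"

loader = PromptLoader()
configs = loader.load(
    prompt_file="summarize.yaml",  # resolved against loader.base_dir
    text="Long article body ...",
    mode=None,                     # a mode key would select from per-mode template dicts
)
# Repeated load() calls for the same file/mode reuse the lru_cache'd _load_templates result
print(configs["main_template"])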
|