hamtaa-texttools 1.0.4__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hamtaa-texttools might be problematic.

Files changed (32)
  1. {hamtaa_texttools-1.0.4.dist-info → hamtaa_texttools-1.0.5.dist-info}/METADATA +192 -141
  2. hamtaa_texttools-1.0.5.dist-info/RECORD +30 -0
  3. {hamtaa_texttools-1.0.4.dist-info → hamtaa_texttools-1.0.5.dist-info}/licenses/LICENSE +20 -20
  4. {hamtaa_texttools-1.0.4.dist-info → hamtaa_texttools-1.0.5.dist-info}/top_level.txt +0 -0
  5. texttools/__init__.py +9 -9
  6. texttools/batch/__init__.py +4 -4
  7. texttools/batch/batch_manager.py +240 -240
  8. texttools/batch/batch_runner.py +212 -212
  9. texttools/formatters/base_formatter.py +33 -33
  10. texttools/formatters/{user_merge_formatter/user_merge_formatter.py → user_merge_formatter.py} +30 -30
  11. texttools/prompts/README.md +31 -31
  12. texttools/prompts/categorizer.yaml +28 -31
  13. texttools/prompts/custom_tool.yaml +7 -0
  14. texttools/prompts/keyword_extractor.yaml +18 -14
  15. texttools/prompts/ner_extractor.yaml +20 -21
  16. texttools/prompts/question_detector.yaml +13 -14
  17. texttools/prompts/question_generator.yaml +19 -22
  18. texttools/prompts/question_merger.yaml +45 -48
  19. texttools/prompts/rewriter.yaml +111 -0
  20. texttools/prompts/subject_question_generator.yaml +22 -26
  21. texttools/prompts/summarizer.yaml +13 -11
  22. texttools/prompts/translator.yaml +14 -14
  23. texttools/tools/__init__.py +4 -4
  24. texttools/tools/async_the_tool.py +277 -263
  25. texttools/tools/internals/async_operator.py +297 -288
  26. texttools/tools/internals/operator.py +295 -306
  27. texttools/tools/internals/output_models.py +52 -62
  28. texttools/tools/internals/prompt_loader.py +76 -82
  29. texttools/tools/the_tool.py +501 -400
  30. hamtaa_texttools-1.0.4.dist-info/RECORD +0 -29
  31. texttools/prompts/question_rewriter.yaml +0 -46
  32. {hamtaa_texttools-1.0.4.dist-info → hamtaa_texttools-1.0.5.dist-info}/WHEEL +0 -0
@@ -1,306 +1,295 @@
-from __future__ import annotations
-
-import math
-import re
-from typing import Any, TypeVar, Literal, Optional
-import json
-
-from openai import OpenAI
-from pydantic import BaseModel
-
-from texttools.formatters.user_merge_formatter.user_merge_formatter import (
-    UserMergeFormatter,
-)
-from texttools.tools.internals.prompt_loader import PromptLoader
-
-# Base Model type for output models
-T = TypeVar("T", bound=BaseModel)
-
-
-class Operator:
-    """
-    Core engine for running text-processing operations with an LLM.
-
-    It wires together:
-    - `PromptLoader` → loads YAML prompt templates.
-    - `UserMergeFormatter` → applies formatting to messages (e.g., merging).
-    OpenAI client executes completions/parsed completions.
-
-    Workflow inside `run()`:
-    1. Load prompt templates (`main_template` [+ `analyze_template` if enabled]).
-    2. Optionally generate an "analysis" step via `_analyze()`.
-    3. Build messages for the LLM.
-    4. Call `.beta.chat.completions.parse()` to parse the result into the
-       configured `OUTPUT_MODEL` (a Pydantic schema).
-    5. Return results as a dict (always `{"result": ...}`, plus `analysis`
-       if analysis was enabled).
-
-    Attributes configured dynamically by `TheTool`:
-    PROMPT_FILE: str YAML filename
-    OUTPUT_MODEL: Pydantic model class
-    WITH_ANALYSIS: bool → whether to run an analysis phase first
-    USE_MODES: bool → whether to select prompts by mode
-    MODE: str which mode to use if modes are enabled
-    RESP_FORMAT: str → "vllm" or "parse"
-    """
-
-    def __init__(
-        self,
-        client: OpenAI,
-        *,
-        model: str,
-        temperature: float = 0.0,
-        **client_kwargs: Any,
-    ):
-        self.client: OpenAI = client
-        self.model = model
-        self.temperature = temperature
-        self.client_kwargs = client_kwargs
-
-    def _build_user_message(self, prompt: str) -> dict[str, str]:
-        return {"role": "user", "content": prompt}
-
-    def _analysis_completion(self, analyze_message: list[dict[str, str]]) -> str:
-        try:
-            completion = self.client.chat.completions.create(
-                model=self.model,
-                messages=analyze_message,
-                temperature=self.temperature,
-                **self.client_kwargs,
-            )
-            analysis = completion.choices[0].message.content.strip()
-            return analysis
-
-        except Exception as e:
-            print(f"[ERROR] Analysis failed: {e}")
-            raise
-
-    def _analyze(self, prompt_configs: dict[str, str]) -> str:
-        analyze_prompt = prompt_configs["analyze_template"]
-        analyze_message = [self._build_user_message(analyze_prompt)]
-        analysis = self._analysis_completion(analyze_message)
-
-        return analysis
-
-    def _parse_completion(
-        self,
-        message: list[dict[str, str]],
-        output_model: T,
-        logprobs: bool = False,
-        top_logprobs: int = 3,
-    ) -> tuple[T, Any]:
-        try:
-            request_kwargs = {
-                "model": self.model,
-                "messages": message,
-                "response_format": output_model,
-                "temperature": self.temperature,
-                **self.client_kwargs,
-            }
-            if logprobs:
-                request_kwargs["logprobs"] = True
-                request_kwargs["top_logprobs"] = top_logprobs
-
-            completion = self.client.beta.chat.completions.parse(**request_kwargs)
-            parsed = completion.choices[0].message.parsed
-            return parsed, completion
-
-        except Exception as e:
-            print(f"[ERROR] Failed to parse completion: {e}")
-            raise
-
-    def _clean_json_response(self, response: str) -> str:
-        """
-        Clean JSON response by removing code block markers and whitespace.
-        Handles cases like:
-        - ```json{"result": "value"}```
-        """
-        cleaned = response.strip()
-
-        # Remove ```json marker
-        if cleaned.startswith("```json"):
-            cleaned = cleaned[7:]
-
-        # Remove trailing ```
-        if cleaned.endswith("```"):
-            cleaned = cleaned[:-3]
-
-        return cleaned.strip()
-
-    def _convert_to_output_model(self, response_string: str, output_model: T) -> T:
-        """
-        Convert a JSON response string to output model.
-
-        Args:
-            response_string: The JSON string (may contain code block markers)
-            output_model: Your Pydantic output model class (e.g., StrOutput, ListStrOutput)
-
-        Returns:
-            Instance of your output model
-        """
-        try:
-            # Clean the response string
-            cleaned_json = self._clean_json_response(response_string)
-
-            # Fix Python-style booleans
-            cleaned_json = cleaned_json.replace("False", "false").replace(
-                "True", "true"
-            )
-
-            # Convert string to Python dictionary
-            response_dict = json.loads(cleaned_json)
-
-            # Convert dictionary to output model
-            return output_model(**response_dict)
-
-        except json.JSONDecodeError as e:
-            raise ValueError(
-                f"Failed to parse JSON response: {e}\nResponse: {response_string}"
-            )
-        except Exception as e:
-            raise ValueError(f"Failed to convert to output model: {e}")
-
-    def _vllm_completion(
-        self,
-        message: list[dict[str, str]],
-        output_model: T,
-        logprobs: bool = False,
-        top_logprobs: int = 3,
-    ) -> tuple[T, Any]:
-        try:
-            json_schema = output_model.model_json_schema()
-
-            # Build kwargs dynamically
-            request_kwargs = {
-                "model": self.model,
-                "messages": message,
-                "extra_body": {"guided_json": json_schema},
-                "temperature": self.temperature,
-                **self.client_kwargs,
-            }
-
-            if logprobs:
-                request_kwargs["logprobs"] = True
-                request_kwargs["top_logprobs"] = top_logprobs
-
-            completion = self.client.chat.completions.create(**request_kwargs)
-            response = completion.choices[0].message.content
-
-            # Convert the string response to output model
-            parsed = self._convert_to_output_model(response, output_model)
-
-            return parsed, completion
-
-        except Exception as e:
-            print(f"[ERROR] Failed to get vLLM structured output: {e}")
-            raise
-
-    def _extract_logprobs(self, completion: dict):
-        logprobs_data = []
-        ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
-
-        for choice in completion.choices:
-            if not getattr(choice, "logprobs", None):
-                continue
-
-            for logprob_item in choice.logprobs.content:
-                if ignore_pattern.match(logprob_item.token):
-                    continue
-                token_entry = {
-                    "token": logprob_item.token,
-                    "prob": round(math.exp(logprob_item.logprob), 8),
-                    "top_alternatives": [],
-                }
-                for alt in logprob_item.top_logprobs:
-                    if ignore_pattern.match(alt.token):
-                        continue
-                    token_entry["top_alternatives"].append(
-                        {
-                            "token": alt.token,
-                            "prob": round(math.exp(alt.logprob), 8),
-                        }
-                    )
-                logprobs_data.append(token_entry)
-
-        return logprobs_data
-
-    def run(
-        self,
-        input_text: str,
-        prompt_file: str,
-        output_model: T,
-        with_analysis: bool = False,
-        use_modes: bool = False,
-        mode: str = "",
-        resp_format: Literal["vllm", "parse"] = "parse",
-        output_lang: Optional[str] = None,
-        logprobs: bool = False,
-        top_logprobs: int = 3,
-        **extra_kwargs,
-    ) -> dict[str, Any]:
-        """
-        Execute the LLM pipeline with the given input text.
-
-        Args:
-            input_text: The text to process (will be stripped of whitespace)
-            **extra_kwargs: Additional variables to inject into prompt templates
-
-        Returns:
-            Dictionary containing the parsed result and optional analysis
-        """
-        prompt_loader = PromptLoader()
-        formatter = UserMergeFormatter()
-
-        try:
-            cleaned_text = input_text.strip()
-
-            prompt_configs = prompt_loader.load_prompts(
-                prompt_file,
-                use_modes,
-                mode,
-                cleaned_text,
-                **extra_kwargs,
-            )
-
-            messages: list[dict[str, str]] = []
-
-            if with_analysis:
-                analysis = self._analyze(prompt_configs)
-                messages.append(
-                    self._build_user_message(f"Based on this analysis: {analysis}")
-                )
-
-            if output_lang:
-                messages.append(
-                    self._build_user_message(
-                        f"Respond only in the {output_lang} language."
-                    )
-                )
-
-            messages.append(self._build_user_message(prompt_configs["main_template"]))
-
-            messages = formatter.format(messages)
-
-            if resp_format == "vllm":
-                parsed, completion = self._vllm_completion(
-                    messages, output_model, logprobs, top_logprobs
-                )
-            elif resp_format == "parse":
-                parsed, completion = self._parse_completion(
-                    messages, output_model, logprobs, top_logprobs
-                )
-
-            results = {"result": parsed.result}
-
-            if logprobs:
-                results["logprobs"] = self._extract_logprobs(completion)
-
-            if with_analysis:
-                results["analysis"] = analysis
-
-            return results
-
-        except Exception as e:
-            # Print error clearly and exit
-            print(f"[ERROR] Operation failed: {e}")
-            exit(1)
+from __future__ import annotations
+
+import math
+import re
+from typing import Any, TypeVar, Type, Literal, Optional
+import json
+import logging
+
+from openai import OpenAI
+from pydantic import BaseModel
+
+from texttools.formatters.user_merge_formatter import (
+    UserMergeFormatter,
+)
+from texttools.tools.internals.prompt_loader import PromptLoader
+
+# Base Model type for output models
+T = TypeVar("T", bound=BaseModel)
+
+# Configure logger
+logger = logging.getLogger("operator")
+logger.setLevel(logging.INFO)
+
+
+class Operator:
+    """
+    Core engine for running text-processing operations with an LLM.
+
+    It wires together:
+    - `PromptLoader` loads YAML prompt templates.
+    - `UserMergeFormatter` applies formatting to messages (e.g., merging).
+    - OpenAI client executes completions/parsed completions.
+
+    Workflow inside `run()`:
+    1. Load prompt templates (`main_template` [+ `analyze_template` if enabled]).
+    2. Optionally generate an "analysis" step via `_analyze()`.
+    3. Build messages for the LLM.
+    4. Call `.beta.chat.completions.parse()` to parse the result into the
+       configured `OUTPUT_MODEL` (a Pydantic schema).
+    5. Return results as a dict (always `{"result": ...}`, plus `analysis`
+       if analysis was enabled).
+
+    Attributes configured dynamically by `TheTool`:
+    - PROMPT_FILE: str → YAML filename
+    - OUTPUT_MODEL: Pydantic model class
+    - WITH_ANALYSIS: bool → whether to run an analysis phase first
+    - USE_MODES: bool → whether to select prompts by mode
+    - MODE: str → which mode to use if modes are enabled
+    - RESP_FORMAT: str → "vllm" or "parse"
+    """
+
+    def __init__(self, client: OpenAI):
+        self.client: OpenAI = client
+
+    def _build_user_message(self, prompt: str) -> dict[str, str]:
+        return {"role": "user", "content": prompt}
+
+    def _analysis_completion(
+        self,
+        analyze_message: list[dict[str, str]],
+        model: str,
+        temperature: float,
+    ) -> str:
+        completion = self.client.chat.completions.create(
+            model=model,
+            messages=analyze_message,
+            temperature=temperature,
+        )
+        analysis = completion.choices[0].message.content.strip()
+        return analysis
+
+    def _analyze(
+        self,
+        prompt_configs: dict[str, str],
+        model: str,
+        temperature: float,
+    ) -> str:
+        analyze_prompt = prompt_configs["analyze_template"]
+        analyze_message = [self._build_user_message(analyze_prompt)]
+        analysis = self._analysis_completion(analyze_message, model, temperature)
+        return analysis
+
+    def _parse_completion(
+        self,
+        message: list[dict[str, str]],
+        output_model: Type[T],
+        model: str,
+        temperature: float,
+        logprobs: bool = False,
+        top_logprobs: int = 3,
+    ) -> tuple[Type[T], Any]:
+        request_kwargs = {
+            "model": model,
+            "messages": message,
+            "response_format": output_model,
+            "temperature": temperature,
+        }
+        if logprobs:
+            request_kwargs["logprobs"] = True
+            request_kwargs["top_logprobs"] = top_logprobs
+
+        completion = self.client.beta.chat.completions.parse(**request_kwargs)
+        parsed = completion.choices[0].message.parsed
+        return parsed, completion
+
+    def _clean_json_response(self, response: str) -> str:
+        """
+        Clean JSON response by removing code block markers and whitespace.
+        Handles cases like:
+        - ```json{"result": "value"}```
+        """
+        stripped = response.strip()
+        cleaned = re.sub(r"^```(?:json)?\s*", "", stripped)
+        cleaned = re.sub(r"\s*```$", "", cleaned)
+
+        return cleaned.strip()
+
+    def _convert_to_output_model(
+        self, response_string: str, output_model: Type[T]
+    ) -> Type[T]:
+        """
+        Convert a JSON response string to output model.
+
+        Args:
+            response_string: The JSON string (may contain code block markers)
+            output_model: Your Pydantic output model class (e.g., StrOutput, ListStrOutput)
+
+        Returns:
+            Instance of your output model
+        """
+        # Clean the response string
+        cleaned_json = self._clean_json_response(response_string)
+
+        # Fix Python-style booleans
+        cleaned_json = cleaned_json.replace("False", "false").replace("True", "true")
+
+        # Convert string to Python dictionary
+        response_dict = json.loads(cleaned_json)
+
+        # Convert dictionary to output model
+        return output_model(**response_dict)
+
+    def _vllm_completion(
+        self,
+        message: list[dict[str, str]],
+        output_model: Type[T],
+        model: str,
+        temperature: float,
+        logprobs: bool = False,
+        top_logprobs: int = 3,
+    ) -> tuple[Type[T], Any]:
+        json_schema = output_model.model_json_schema()
+
+        # Build kwargs dynamically
+        request_kwargs = {
+            "model": model,
+            "messages": message,
+            "extra_body": {"guided_json": json_schema},
+            "temperature": temperature,
+        }
+
+        if logprobs:
+            request_kwargs["logprobs"] = True
+            request_kwargs["top_logprobs"] = top_logprobs
+
+        completion = self.client.chat.completions.create(**request_kwargs)
+        response = completion.choices[0].message.content
+
+        # Convert the string response to output model
+        parsed = self._convert_to_output_model(response, output_model)
+        return parsed, completion
+
+    def _extract_logprobs(self, completion: dict):
+        logprobs_data = []
+        ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
+
+        for choice in completion.choices:
+            if not getattr(choice, "logprobs", None):
+                logger.info("No logprobs found.")
+                continue
+
+            for logprob_item in choice.logprobs.content:
+                if ignore_pattern.match(logprob_item.token):
+                    continue
+                token_entry = {
+                    "token": logprob_item.token,
+                    "prob": round(math.exp(logprob_item.logprob), 8),
+                    "top_alternatives": [],
+                }
+                for alt in logprob_item.top_logprobs:
+                    if ignore_pattern.match(alt.token):
+                        continue
+                    token_entry["top_alternatives"].append(
+                        {
+                            "token": alt.token,
+                            "prob": round(math.exp(alt.logprob), 8),
+                        }
+                    )
+                logprobs_data.append(token_entry)
+
+        return logprobs_data
+
+    def run(
+        self,
+        text: str,
+        # User parameters
+        model: str,
+        with_analysis: bool,
+        temperature: float,
+        logprobs: bool,
+        top_logprobs: int,
+        user_prompt: str | None,
+        output_lang: str | None,
+        # Each tool's parameters
+        prompt_file: str,
+        output_model: Type[T],
+        resp_format: Literal["vllm", "parse"] = "parse",
+        mode: str | None = None,
+        **extra_kwargs,
+    ) -> dict[str, Any]:
+        """
+        Execute the LLM pipeline with the given input text.
+
+        Args:
+            text: The text to process (will be stripped of whitespace)
+            **extra_kwargs: Additional variables to inject into prompt templates
+
+        Returns:
+            Dictionary containing the parsed result and optional analysis
+        """
+        prompt_loader = PromptLoader()
+        formatter = UserMergeFormatter()
+
+        try:
+            cleaned_text = text.strip()
+
+            prompt_configs = prompt_loader.load(
+                prompt_file=prompt_file,
+                text=cleaned_text,
+                mode=mode,
+                **extra_kwargs,
+            )
+
+            messages: list[dict[str, str]] = []
+
+            if with_analysis:
+                analysis = self._analyze(prompt_configs, model, temperature)
+                messages.append(
+                    self._build_user_message(f"Based on this analysis: {analysis}")
+                )
+
+            if output_lang:
+                messages.append(
+                    self._build_user_message(
+                        f"Respond only in the {output_lang} language."
+                    )
+                )
+
+            if user_prompt:
+                messages.append(
+                    self._build_user_message(f"Consider this instruction {user_prompt}")
+                )
+
+            messages.append(self._build_user_message(prompt_configs["main_template"]))
+
+            messages = formatter.format(messages)
+
+            if resp_format == "vllm":
+                parsed, completion = self._vllm_completion(
+                    messages, output_model, model, temperature, logprobs, top_logprobs
+                )
+            elif resp_format == "parse":
+                parsed, completion = self._parse_completion(
+                    messages, output_model, model, temperature, logprobs, top_logprobs
+                )
+
+            # Ensure output_model has a `result` field
+            if not hasattr(parsed, "result"):
+                logger.error(
+                    "The provided output_model must define a field named 'result'"
+                )
+
+            results = {"result": parsed.result}
+
+            if logprobs:
+                results["logprobs"] = self._extract_logprobs(completion)
+
+            if with_analysis:
+                results["analysis"] = analysis
+
+            return results
+
+        except Exception as e:
+            logger.error(f"Operation failed: {e}")
+            return {"Error": str(e), "result": ""}