hamtaa-texttools 1.0.4__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of hamtaa-texttools might be problematic.

Files changed (32)
  1. {hamtaa_texttools-1.0.4.dist-info → hamtaa_texttools-1.0.6.dist-info}/METADATA +192 -141
  2. hamtaa_texttools-1.0.6.dist-info/RECORD +30 -0
  3. {hamtaa_texttools-1.0.4.dist-info → hamtaa_texttools-1.0.6.dist-info}/licenses/LICENSE +20 -20
  4. {hamtaa_texttools-1.0.4.dist-info → hamtaa_texttools-1.0.6.dist-info}/top_level.txt +0 -0
  5. texttools/__init__.py +9 -9
  6. texttools/batch/__init__.py +4 -4
  7. texttools/batch/batch_manager.py +229 -240
  8. texttools/batch/batch_runner.py +263 -212
  9. texttools/formatters/base_formatter.py +33 -33
  10. texttools/formatters/{user_merge_formatter/user_merge_formatter.py → user_merge_formatter.py} +30 -30
  11. texttools/prompts/README.md +35 -31
  12. texttools/prompts/categorizer.yaml +28 -31
  13. texttools/prompts/{question_detector.yaml → is_question.yaml} +13 -14
  14. texttools/prompts/keyword_extractor.yaml +18 -14
  15. texttools/prompts/ner_extractor.yaml +20 -21
  16. texttools/prompts/question_merger.yaml +45 -48
  17. texttools/prompts/rewriter.yaml +111 -0
  18. texttools/prompts/run_custom.yaml +7 -0
  19. texttools/prompts/{subject_question_generator.yaml → subject_to_question.yaml} +22 -26
  20. texttools/prompts/summarizer.yaml +13 -11
  21. texttools/prompts/{question_generator.yaml → text_to_question.yaml} +19 -22
  22. texttools/prompts/translator.yaml +14 -14
  23. texttools/tools/__init__.py +4 -4
  24. texttools/tools/async_the_tool.py +277 -263
  25. texttools/tools/internals/async_operator.py +308 -288
  26. texttools/tools/internals/operator.py +295 -306
  27. texttools/tools/internals/output_models.py +52 -62
  28. texttools/tools/internals/prompt_loader.py +66 -82
  29. texttools/tools/the_tool.py +501 -400
  30. hamtaa_texttools-1.0.4.dist-info/RECORD +0 -29
  31. texttools/prompts/question_rewriter.yaml +0 -46
  32. {hamtaa_texttools-1.0.4.dist-info → hamtaa_texttools-1.0.6.dist-info}/WHEEL +0 -0
texttools/tools/internals/async_operator.py
@@ -1,288 +1,308 @@
- from __future__ import annotations
-
- import json
- import math
- import re
- from typing import Any, Literal, Optional, TypeVar
-
- from openai import AsyncOpenAI
- from pydantic import BaseModel
-
- from texttools.formatters.user_merge_formatter.user_merge_formatter import (
-     UserMergeFormatter,
- )
- from texttools.tools.internals.prompt_loader import PromptLoader
-
- # Base Model type for output models
- T = TypeVar("T", bound=BaseModel)
-
-
- class AsyncOperator:
-     """
-     Async version of Operator.
-
-     Behaves like the synchronous Operator but uses AsyncOpenAI and async/await.
-     """
-
-     def __init__(
-         self,
-         client: AsyncOpenAI,
-         *,
-         model: str,
-         temperature: float = 0.0,
-         **client_kwargs: Any,
-     ):
-         self.client: AsyncOpenAI = client
-         self.model = model
-         self.temperature = temperature
-         self.client_kwargs = client_kwargs
-
-     def _build_user_message(self, prompt: str) -> dict[str, str]:
-         return {"role": "user", "content": prompt}
-
-     async def _analysis_completion(self, analyze_message: list[dict[str, str]]) -> str:
-         try:
-             completion = await self.client.chat.completions.create(
-                 model=self.model,
-                 messages=analyze_message,
-                 temperature=self.temperature,
-                 **self.client_kwargs,
-             )
-             analysis = completion.choices[0].message.content.strip()
-             return analysis
-
-         except Exception as e:
-             print(f"[ERROR] Analysis failed: {e}")
-             raise
-
-     async def _analyze(self, prompt_configs: dict[str, str]) -> str:
-         analyze_prompt = prompt_configs["analyze_template"]
-         analyze_message = [self._build_user_message(analyze_prompt)]
-         analysis = await self._analysis_completion(analyze_message)
-
-         return analysis
-
-     async def _parse_completion(
-         self,
-         message: list[dict[str, str]],
-         output_model: T,
-         logprobs: bool = False,
-         top_logprobs: int = 3,
-     ) -> tuple[T, Any]:
-         try:
-             request_kwargs = {
-                 "model": self.model,
-                 "messages": message,
-                 "response_format": output_model,
-                 "temperature": self.temperature,
-                 **self.client_kwargs,
-             }
-             if logprobs:
-                 request_kwargs["logprobs"] = True
-                 request_kwargs["top_logprobs"] = top_logprobs
-
-             completion = await self.client.beta.chat.completions.parse(**request_kwargs)
-             parsed = completion.choices[0].message.parsed
-             return parsed, completion
-
-         except Exception as e:
-             print(f"[ERROR] Failed to parse completion: {e}")
-             raise
-
-     def _clean_json_response(self, response: str) -> str:
-         """
-         Clean JSON response by removing code block markers and whitespace.
-         Handles cases like:
-         - ```json{"result": "value"}```
-         """
-         cleaned = response.strip()
-
-         # Remove ```json marker
-         if cleaned.startswith("```json"):
-             cleaned = cleaned[7:]
-
-         # Remove trailing ```
-         if cleaned.endswith("```"):
-             cleaned = cleaned[:-3]
-
-         return cleaned.strip()
-
-     def _convert_to_output_model(self, response_string: str, output_model: T) -> T:
-         """
-         Convert a JSON response string to output model.
-
-         Args:
-             response_string: The JSON string (may contain code block markers)
-             output_model: Your Pydantic output model class (e.g., StrOutput, ListStrOutput)
-
-         Returns:
-             Instance of your output model
-         """
-         try:
-             # Clean the response string
-             cleaned_json = self._clean_json_response(response_string)
-
-             # Fix Python-style booleans
-             cleaned_json = cleaned_json.replace("False", "false").replace(
-                 "True", "true"
-             )
-
-             # Convert string to Python dictionary
-             response_dict = json.loads(cleaned_json)
-
-             # Convert dictionary to output model
-             return output_model(**response_dict)
-
-         except json.JSONDecodeError as e:
-             raise ValueError(
-                 f"Failed to parse JSON response: {e}\nResponse: {response_string}"
-             )
-         except Exception as e:
-             raise ValueError(f"Failed to convert to output model: {e}")
-
-     async def _vllm_completion(
-         self,
-         message: list[dict[str, str]],
-         output_model: T,
-         logprobs: bool = False,
-         top_logprobs: int = 3,
-     ) -> tuple[T, Any]:
-         try:
-             json_schema = output_model.model_json_schema()
-
-             # Build kwargs dynamically
-             request_kwargs = {
-                 "model": self.model,
-                 "messages": message,
-                 "extra_body": {"guided_json": json_schema},
-                 "temperature": self.temperature,
-                 **self.client_kwargs,
-             }
-
-             if logprobs:
-                 request_kwargs["logprobs"] = True
-                 request_kwargs["top_logprobs"] = top_logprobs
-
-             completion = await self.client.chat.completions.create(**request_kwargs)
-             response = completion.choices[0].message.content
-
-             # Convert the string response to output model
-             parsed = self._convert_to_output_model(response, output_model)
-
-             return parsed, completion
-
-         except Exception as e:
-             print(f"[ERROR] Failed to get vLLM structured output: {e}")
-             raise
-
-     def _extract_logprobs(self, completion: dict):
-         logprobs_data = []
-         ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
-
-         for choice in completion.choices:
-             if not getattr(choice, "logprobs", None):
-                 continue
-
-             for logprob_item in choice.logprobs.content:
-                 if ignore_pattern.match(logprob_item.token):
-                     continue
-                 token_entry = {
-                     "token": logprob_item.token,
-                     "prob": round(math.exp(logprob_item.logprob), 8),
-                     "top_alternatives": [],
-                 }
-                 for alt in logprob_item.top_logprobs:
-                     if ignore_pattern.match(alt.token):
-                         continue
-                     token_entry["top_alternatives"].append(
-                         {
-                             "token": alt.token,
-                             "prob": round(math.exp(alt.logprob), 8),
-                         }
-                     )
-                 logprobs_data.append(token_entry)
-
-         return logprobs_data
-
-     async def run(
-         self,
-         input_text: str,
-         prompt_file: str,
-         output_model: T,
-         with_analysis: bool = False,
-         use_modes: bool = False,
-         mode: str = "",
-         resp_format: Literal["vllm", "parse"] = "parse",
-         output_lang: Optional[str] = None,
-         logprobs: bool = False,
-         top_logprobs: int = 3,
-         **extra_kwargs,
-     ) -> dict[str, Any]:
-         """
-         Execute the async LLM pipeline with the given input text.
-
-         Args:
-             input_text: The text to process (will be stripped of whitespace)
-             **extra_kwargs: Additional variables to inject into prompt templates
-
-         Returns:
-             Dictionary containing the parsed result and optional analysis
-         """
-         prompt_loader = PromptLoader()
-         formatter = UserMergeFormatter()
-
-         try:
-             cleaned_text = input_text.strip()
-
-             prompt_configs = prompt_loader.load_prompts(
-                 prompt_file,
-                 use_modes,
-                 mode,
-                 cleaned_text,
-                 **extra_kwargs,
-             )
-
-             messages: list[dict[str, str]] = []
-
-             if with_analysis:
-                 analysis = await self._analyze(prompt_configs)
-                 messages.append(
-                     self._build_user_message(f"Based on this analysis: {analysis}")
-                 )
-
-             if output_lang:
-                 messages.append(
-                     self._build_user_message(
-                         f"Respond only in the {output_lang} language."
-                     )
-                 )
-
-             messages.append(self._build_user_message(prompt_configs["main_template"]))
-
-             messages = formatter.format(messages)
-
-             if resp_format == "vllm":
-                 parsed, completion = await self._vllm_completion(
-                     messages, output_model, logprobs, top_logprobs
-                 )
-             elif resp_format == "parse":
-                 parsed, completion = await self._parse_completion(
-                     messages, output_model, logprobs, top_logprobs
-                 )
-             else:
-                 raise ValueError(f"Unknown resp_format: {resp_format}")
-
-             results = {"result": parsed.result}
-
-             if logprobs:
-                 results["logprobs"] = self._extract_logprobs(completion)
-
-             if with_analysis:
-                 results["analysis"] = analysis
-
-             return results
-
-         except Exception as e:
-             # Print error clearly and re-raise for the caller to handle
-             print(f"[ERROR] Async operation failed: {e}")
-             raise
+ from __future__ import annotations
+
+ import json
+ import math
+ import re
+ from typing import Any, Literal, TypeVar
+ import logging
+
+ from openai import AsyncOpenAI
+ from pydantic import BaseModel
+
+ from texttools.formatters.user_merge_formatter import (
+     UserMergeFormatter,
+ )
+ from texttools.tools.internals.prompt_loader import PromptLoader
+
+ # Base Model type for output models
+ T = TypeVar("T", bound=BaseModel)
+
+ # Configure logger
+ logger = logging.getLogger("async_operator")
+ logger.setLevel(logging.INFO)
+
+
+ class AsyncOperator:
+     """
+     Async version of Operator.
+
+     Behaves like the synchronous Operator but uses AsyncOpenAI and async/await.
+     """
+
+     def __init__(
+         self,
+         client: AsyncOpenAI,
+         *,
+         model: str,
+         temperature: float = 0.0,
+         **client_kwargs: Any,
+     ):
+         self.client: AsyncOpenAI = client
+         self.model = model
+         self.temperature = temperature
+         self.client_kwargs = client_kwargs
+
+     def _build_user_message(self, prompt: str) -> dict[str, str]:
+         return {"role": "user", "content": prompt}
+
+     async def _analysis_completion(self, analyze_message: list[dict[str, str]]) -> str:
+         try:
+             completion = await self.client.chat.completions.create(
+                 model=self.model,
+                 messages=analyze_message,
+                 temperature=self.temperature,
+                 **self.client_kwargs,
+             )
+             analysis = completion.choices[0].message.content.strip()
+             return analysis
+
+         except Exception as e:
+             print(f"[ERROR] Analysis failed: {e}")
+             raise
+
+     async def _analyze(self, prompt_configs: dict[str, str]) -> str:
+         analyze_prompt = prompt_configs["analyze_template"]
+         analyze_message = [self._build_user_message(analyze_prompt)]
+         analysis = await self._analysis_completion(analyze_message)
+
+         return analysis
+
+     async def _parse_completion(
+         self,
+         message: list[dict[str, str]],
+         output_model: T,
+         logprobs: bool = False,
+         top_logprobs: int = 3,
+         max_tokens: int | None = None,
+     ) -> tuple[T, Any]:
+         try:
+             request_kwargs = {
+                 "model": self.model,
+                 "messages": message,
+                 "response_format": output_model,
+                 "temperature": self.temperature,
+                 **self.client_kwargs,
+             }
+
+             if max_tokens is not None:
+                 request_kwargs["max_tokens"] = max_tokens
+
+             if logprobs:
+                 request_kwargs["logprobs"] = True
+                 request_kwargs["top_logprobs"] = top_logprobs
+
+             completion = await self.client.beta.chat.completions.parse(**request_kwargs)
+             parsed = completion.choices[0].message.parsed
+             return parsed, completion
+
+         except Exception as e:
+             print(f"[ERROR] Failed to parse completion: {e}")
+             raise
+
+     def _clean_json_response(self, response: str) -> str:
+         """
+         Clean JSON response by removing code block markers and whitespace.
+         Handles cases like:
+         - ```json{"result": "value"}```
+         """
+         cleaned = response.strip()
+
+         # Remove ```json marker
+         if cleaned.startswith("```json"):
+             cleaned = cleaned[7:]
+
+         # Remove trailing ```
+         if cleaned.endswith("```"):
+             cleaned = cleaned[:-3]
+
+         return cleaned.strip()
+
+     def _convert_to_output_model(self, response_string: str, output_model: T) -> T:
+         """
+         Convert a JSON response string to output model.
+
+         Args:
+             response_string: The JSON string (may contain code block markers)
+             output_model: Your Pydantic output model class (e.g., StrOutput, ListStrOutput)
+
+         Returns:
+             Instance of your output model
+         """
+         try:
+             # Clean the response string
+             cleaned_json = self._clean_json_response(response_string)
+
+             # Fix Python-style booleans
+             cleaned_json = cleaned_json.replace("False", "false").replace(
+                 "True", "true"
+             )
+
+             # Convert string to Python dictionary
+             response_dict = json.loads(cleaned_json)
+
+             # Convert dictionary to output model
+             return output_model(**response_dict)
+
+         except json.JSONDecodeError as e:
+             raise ValueError(
+                 f"Failed to parse JSON response: {e}\nResponse: {response_string}"
+             )
+         except Exception as e:
+             raise ValueError(f"Failed to convert to output model: {e}")
+
+     async def _vllm_completion(
+         self,
+         message: list[dict[str, str]],
+         output_model: T,
+         logprobs: bool = False,
+         top_logprobs: int = 3,
+         max_tokens: int | None = None,
+     ) -> tuple[T, Any]:
+         try:
+             json_schema = output_model.model_json_schema()
+
+             # Build kwargs dynamically
+             request_kwargs = {
+                 "model": self.model,
+                 "messages": message,
+                 "extra_body": {"guided_json": json_schema},
+                 "temperature": self.temperature,
+                 **self.client_kwargs,
+             }
+
+             if max_tokens is not None:
+                 request_kwargs["max_tokens"] = max_tokens
+
+             if logprobs:
+                 request_kwargs["logprobs"] = True
+                 request_kwargs["top_logprobs"] = top_logprobs
+
+             completion = await self.client.chat.completions.create(**request_kwargs)
+             response = completion.choices[0].message.content
+
+             # Convert the string response to output model
+             parsed = self._convert_to_output_model(response, output_model)
+
+             return parsed, completion
+
+         except Exception as e:
+             print(f"[ERROR] Failed to get vLLM structured output: {e}")
+             raise
+
+     def _extract_logprobs(self, completion: dict):
+         logprobs_data = []
+         ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
+
+         for choice in completion.choices:
+             if not getattr(choice, "logprobs", None):
+                 logger.info("No logprobs found.")
+                 continue
+
+             for logprob_item in choice.logprobs.content:
+                 if ignore_pattern.match(logprob_item.token):
+                     continue
+                 token_entry = {
+                     "token": logprob_item.token,
+                     "prob": round(math.exp(logprob_item.logprob), 8),
+                     "top_alternatives": [],
+                 }
+                 for alt in logprob_item.top_logprobs:
+                     if ignore_pattern.match(alt.token):
+                         continue
+                     token_entry["top_alternatives"].append(
+                         {
+                             "token": alt.token,
+                             "prob": round(math.exp(alt.logprob), 8),
+                         }
+                     )
+                 logprobs_data.append(token_entry)
+
+         return logprobs_data
+
+     async def run(
+         self,
+         input_text: str,
+         prompt_file: str,
+         output_model: T,
+         with_analysis: bool = False,
+         use_modes: bool = False,
+         mode: str = "",
+         resp_format: Literal["vllm", "parse"] = "parse",
+         output_lang: str | None = None,
+         logprobs: bool = False,
+         top_logprobs: int = 3,
+         max_tokens: int | None = None,
+         **extra_kwargs,
+     ) -> dict[str, Any]:
+         """
+         Execute the async LLM pipeline with the given input text.
+         """
+         prompt_loader = PromptLoader()
+         formatter = UserMergeFormatter()
+
+         try:
+             cleaned_text = input_text.strip()
+
+             prompt_configs = prompt_loader.load(
+                 prompt_file=prompt_file,
+                 text=cleaned_text,
+                 mode=mode if use_modes else "",
+                 **extra_kwargs,
+             )
+
+             messages: list[dict[str, str]] = []
+
+             if with_analysis:
+                 analysis = await self._analyze(prompt_configs)
+                 messages.append(
+                     self._build_user_message(f"Based on this analysis: {analysis}")
+                 )
+
+             if output_lang:
+                 messages.append(
+                     self._build_user_message(
+                         f"Respond only in the {output_lang} language."
+                     )
+                 )
+
+             messages.append(self._build_user_message(prompt_configs["main_template"]))
+             messages = formatter.format(messages)
+
+             if resp_format == "vllm":
+                 parsed, completion = await self._vllm_completion(
+                     messages,
+                     output_model,
+                     logprobs,
+                     top_logprobs,
+                     max_tokens,
+                 )
+             elif resp_format == "parse":
+                 parsed, completion = await self._parse_completion(
+                     messages,
+                     output_model,
+                     logprobs,
+                     top_logprobs,
+                     max_tokens,
+                 )
+             else:
+                 logger.error(f"Unknown resp_format: {resp_format}")
+
+             # Ensure output_model has a `result` field
+             if not hasattr(parsed, "result"):
+                 logger.error(
+                     "The provided output_model must define a field named 'result'"
+                 )
+
+             results = {"result": parsed.result}
+
+             if logprobs:
+                 results["logprobs"] = self._extract_logprobs(completion)
+
+             if with_analysis:
+                 results["analysis"] = analysis
+
+             return results
+
+         except Exception as e:
+             logger.error(f"Async TheTool failed: {e}")
+             return {"Error": str(e), "result": ""}