hamtaa-texttools 1.0.6__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hamtaa-texttools might be problematic.

@@ -1,14 +1,12 @@
  from __future__ import annotations

- import json
- import math
- import re
- from typing import Any, Literal, TypeVar
+ from typing import Any, TypeVar, Type, Literal
  import logging

  from openai import AsyncOpenAI
  from pydantic import BaseModel

+ from texttools.tools.internals.base_operator import BaseOperator
  from texttools.formatters.user_merge_formatter import (
      UserMergeFormatter,
  )
@@ -22,238 +20,128 @@ logger = logging.getLogger("async_operator")
  logger.setLevel(logging.INFO)


- class AsyncOperator:
+ class AsyncOperator(BaseOperator):
      """
-     Async version of Operator.
+     Core engine for running text-processing operations with an LLM (Async).

-     Behaves like the synchronous Operator but uses AsyncOpenAI and async/await.
+     It wires together:
+     - `PromptLoader` → loads YAML prompt templates.
+     - `UserMergeFormatter` → applies formatting to messages (e.g., merging).
+     - AsyncOpenAI client → executes completions/parsed completions.
      """

-     def __init__(
-         self,
-         client: AsyncOpenAI,
-         *,
-         model: str,
-         temperature: float = 0.0,
-         **client_kwargs: Any,
-     ):
+     def __init__(self, client: AsyncOpenAI, model: str):
          self.client: AsyncOpenAI = client
          self.model = model
-         self.temperature = temperature
-         self.client_kwargs = client_kwargs
-
-     def _build_user_message(self, prompt: str) -> dict[str, str]:
-         return {"role": "user", "content": prompt}

-     async def _analysis_completion(self, analyze_message: list[dict[str, str]]) -> str:
-         try:
-             completion = await self.client.chat.completions.create(
-                 model=self.model,
-                 messages=analyze_message,
-                 temperature=self.temperature,
-                 **self.client_kwargs,
-             )
-             analysis = completion.choices[0].message.content.strip()
-             return analysis
-
-         except Exception as e:
-             print(f"[ERROR] Analysis failed: {e}")
-             raise
+     async def _analysis_completion(
+         self,
+         analyze_message: list[dict[str, str]],
+         temperature: float,
+     ) -> str:
+         completion = await self.client.chat.completions.create(
+             model=self.model,
+             messages=analyze_message,
+             temperature=temperature,
+         )
+         analysis = completion.choices[0].message.content.strip()
+         return analysis

-     async def _analyze(self, prompt_configs: dict[str, str]) -> str:
+     async def _analyze(self, prompt_configs: dict[str, str], temperature: float) -> str:
          analyze_prompt = prompt_configs["analyze_template"]
          analyze_message = [self._build_user_message(analyze_prompt)]
-         analysis = await self._analysis_completion(analyze_message)
-
+         analysis = await self._analysis_completion(analyze_message, temperature)
          return analysis

      async def _parse_completion(
          self,
          message: list[dict[str, str]],
-         output_model: T,
+         output_model: Type[T],
+         temperature: float,
          logprobs: bool = False,
          top_logprobs: int = 3,
-         max_tokens: int | None = None,
-     ) -> tuple[T, Any]:
-         try:
-             request_kwargs = {
-                 "model": self.model,
-                 "messages": message,
-                 "response_format": output_model,
-                 "temperature": self.temperature,
-                 **self.client_kwargs,
-             }
-
-             if max_tokens is not None:
-                 request_kwargs["max_tokens"] = max_tokens
-
-             if logprobs:
-                 request_kwargs["logprobs"] = True
-                 request_kwargs["top_logprobs"] = top_logprobs
-
-             completion = await self.client.beta.chat.completions.parse(**request_kwargs)
-             parsed = completion.choices[0].message.parsed
-             return parsed, completion
-
-         except Exception as e:
-             print(f"[ERROR] Failed to parse completion: {e}")
-             raise
-
-     def _clean_json_response(self, response: str) -> str:
-         """
-         Clean JSON response by removing code block markers and whitespace.
-         Handles cases like:
-         - ```json{"result": "value"}```
-         """
-         cleaned = response.strip()
-
-         # Remove ```json marker
-         if cleaned.startswith("```json"):
-             cleaned = cleaned[7:]
-
-         # Remove trailing ```
-         if cleaned.endswith("```"):
-             cleaned = cleaned[:-3]
-
-         return cleaned.strip()
-
-     def _convert_to_output_model(self, response_string: str, output_model: T) -> T:
-         """
-         Convert a JSON response string to output model.
-
-         Args:
-             response_string: The JSON string (may contain code block markers)
-             output_model: Your Pydantic output model class (e.g., StrOutput, ListStrOutput)
-
-         Returns:
-             Instance of your output model
-         """
-         try:
-             # Clean the response string
-             cleaned_json = self._clean_json_response(response_string)
-
-             # Fix Python-style booleans
-             cleaned_json = cleaned_json.replace("False", "false").replace(
-                 "True", "true"
-             )
-
-             # Convert string to Python dictionary
-             response_dict = json.loads(cleaned_json)
-
-             # Convert dictionary to output model
-             return output_model(**response_dict)
-
-         except json.JSONDecodeError as e:
-             raise ValueError(
-                 f"Failed to parse JSON response: {e}\nResponse: {response_string}"
-             )
-         except Exception as e:
-             raise ValueError(f"Failed to convert to output model: {e}")
+     ) -> tuple[Type[T], Any]:
+         request_kwargs = {
+             "model": self.model,
+             "messages": message,
+             "response_format": output_model,
+             "temperature": temperature,
+         }
+
+         if logprobs:
+             request_kwargs["logprobs"] = True
+             request_kwargs["top_logprobs"] = top_logprobs
+
+         completion = await self.client.beta.chat.completions.parse(**request_kwargs)
+         parsed = completion.choices[0].message.parsed
+         return parsed, completion

      async def _vllm_completion(
          self,
          message: list[dict[str, str]],
-         output_model: T,
+         output_model: Type[T],
+         temperature: float,
          logprobs: bool = False,
          top_logprobs: int = 3,
-         max_tokens: int | None = None,
-     ) -> tuple[T, Any]:
-         try:
-             json_schema = output_model.model_json_schema()
-
-             # Build kwargs dynamically
-             request_kwargs = {
-                 "model": self.model,
-                 "messages": message,
-                 "extra_body": {"guided_json": json_schema},
-                 "temperature": self.temperature,
-                 **self.client_kwargs,
-             }
+     ) -> tuple[Type[T], Any]:
+         json_schema = output_model.model_json_schema()

-             if max_tokens is not None:
-                 request_kwargs["max_tokens"] = max_tokens
+         # Build kwargs dynamically
+         request_kwargs = {
+             "model": self.model,
+             "messages": message,
+             "extra_body": {"guided_json": json_schema},
+             "temperature": temperature,
+         }

-             if logprobs:
-                 request_kwargs["logprobs"] = True
-                 request_kwargs["top_logprobs"] = top_logprobs
-
-             completion = await self.client.chat.completions.create(**request_kwargs)
-             response = completion.choices[0].message.content
-
-             # Convert the string response to output model
-             parsed = self._convert_to_output_model(response, output_model)
+         if logprobs:
+             request_kwargs["logprobs"] = True
+             request_kwargs["top_logprobs"] = top_logprobs

-             return parsed, completion
-
-         except Exception as e:
-             print(f"[ERROR] Failed to get vLLM structured output: {e}")
-             raise
-
-     def _extract_logprobs(self, completion: dict):
-         logprobs_data = []
-         ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
-
-         for choice in completion.choices:
-             if not getattr(choice, "logprobs", None):
-                 logger.info("No logprobs found.")
-                 continue
-
-             for logprob_item in choice.logprobs.content:
-                 if ignore_pattern.match(logprob_item.token):
-                     continue
-                 token_entry = {
-                     "token": logprob_item.token,
-                     "prob": round(math.exp(logprob_item.logprob), 8),
-                     "top_alternatives": [],
-                 }
-                 for alt in logprob_item.top_logprobs:
-                     if ignore_pattern.match(alt.token):
-                         continue
-                     token_entry["top_alternatives"].append(
-                         {
-                             "token": alt.token,
-                             "prob": round(math.exp(alt.logprob), 8),
-                         }
-                     )
-                 logprobs_data.append(token_entry)
+         completion = await self.client.chat.completions.create(**request_kwargs)
+         response = completion.choices[0].message.content

-         return logprobs_data
+         # Convert the string response to output model
+         parsed = self._convert_to_output_model(response, output_model)
+         return parsed, completion

      async def run(
          self,
-         input_text: str,
+         # User parameters
+         text: str,
+         with_analysis: bool,
+         output_lang: str | None,
+         user_prompt: str | None,
+         temperature: float,
+         logprobs: bool,
+         top_logprobs: int | None,
+         # Internal parameters
          prompt_file: str,
-         output_model: T,
-         with_analysis: bool = False,
-         use_modes: bool = False,
-         mode: str = "",
-         resp_format: Literal["vllm", "parse"] = "parse",
-         output_lang: str | None = None,
-         logprobs: bool = False,
-         top_logprobs: int = 3,
-         max_tokens: int | None = None,
+         output_model: Type[T],
+         resp_format: Literal["vllm", "parse"],
+         mode: str | None,
          **extra_kwargs,
      ) -> dict[str, Any]:
          """
-         Execute the async LLM pipeline with the given input text.
+         Execute the async LLM pipeline with the given input text. (Async)
          """
          prompt_loader = PromptLoader()
          formatter = UserMergeFormatter()

          try:
-             cleaned_text = input_text.strip()
+             cleaned_text = text.strip()

              prompt_configs = prompt_loader.load(
                  prompt_file=prompt_file,
                  text=cleaned_text,
-                 mode=mode if use_modes else "",
+                 mode=mode,
                  **extra_kwargs,
              )

              messages: list[dict[str, str]] = []

              if with_analysis:
-                 analysis = await self._analyze(prompt_configs)
+                 analysis = await self._analyze(prompt_configs, temperature)
                  messages.append(
                      self._build_user_message(f"Based on this analysis: {analysis}")
                  )
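A note on the structured-output path before the next hunk: `_vllm_completion` relies on vLLM's guided decoding, sending the Pydantic model's JSON schema via `extra_body={"guided_json": ...}` so the server constrains generation to schema-valid JSON. A minimal standalone sketch of that request pattern; the endpoint, model name, and `StrOutput` model are placeholders, not part of the package:

import asyncio

from openai import AsyncOpenAI
from pydantic import BaseModel


class StrOutput(BaseModel):
    # Hypothetical output model; run() expects a `result` field on whatever model is used.
    result: str


async def guided_json_demo() -> StrOutput:
    # Placeholder endpoint and model for any OpenAI-compatible vLLM server.
    client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    completion = await client.chat.completions.create(
        model="my-model",
        messages=[{"role": "user", "content": "Return the answer as JSON."}],
        temperature=0.0,
        # vLLM extension: constrain decoding to this JSON schema.
        extra_body={"guided_json": StrOutput.model_json_schema()},
    )
    return StrOutput.model_validate_json(completion.choices[0].message.content)


print(asyncio.run(guided_json_demo()))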
@@ -265,27 +153,22 @@ class AsyncOperator:
                  )
              )

+             if user_prompt:
+                 messages.append(
+                     self._build_user_message(f"Consider this instruction {user_prompt}")
+                 )
+
              messages.append(self._build_user_message(prompt_configs["main_template"]))
              messages = formatter.format(messages)

              if resp_format == "vllm":
                  parsed, completion = await self._vllm_completion(
-                     messages,
-                     output_model,
-                     logprobs,
-                     top_logprobs,
-                     max_tokens,
+                     messages, output_model, temperature, logprobs, top_logprobs
                  )
              elif resp_format == "parse":
-                 parsed, completion = await self._parse_completion(
-                     messages,
-                     output_model,
-                     logprobs,
-                     top_logprobs,
-                     max_tokens,
+                 parsed, completion = await self._vllm_completion(
+                     messages, output_model, temperature, logprobs, top_logprobs
                  )
-             else:
-                 logger.error(f"Unknown resp_format: {resp_format}")

              # Ensure output_model has a `result` field
              if not hasattr(parsed, "result"):
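That ends the async_operator.py changes. The net effect on callers: every per-call knob (temperature, logprobs, user_prompt, output_lang) moves out of the constructor into `run`, all `run` arguments lose their defaults, and, as the hunk above shows, the `parse` branch now routes through `_vllm_completion` as well. A hedged usage sketch against the new signature; the import path, server details, prompt file name, and `StrOutput` model are assumptions for illustration:

import asyncio

from openai import AsyncOpenAI
from pydantic import BaseModel

# Module path assumed from the `texttools.tools.internals` import seen in the diff.
from texttools.tools.internals.async_operator import AsyncOperator


class StrOutput(BaseModel):
    result: str  # run() checks that the output model exposes a `result` field


async def main() -> None:
    client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    operator = AsyncOperator(client, model="my-model")  # temperature is no longer constructor state
    result = await operator.run(
        text="Some input text",
        with_analysis=False,
        output_lang=None,
        user_prompt=None,
        temperature=0.0,  # now supplied per call
        logprobs=False,
        top_logprobs=None,
        prompt_file="summarize.yaml",  # illustrative prompt file name
        output_model=StrOutput,
        resp_format="parse",  # in 1.0.7 this branch also calls _vllm_completion
        mode=None,
    )
    print(result)


asyncio.run(main())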
@@ -0,0 +1,85 @@
+ from typing import TypeVar, Type
+ import json
+ import re
+ import math
+
+ from pydantic import BaseModel
+ from openai import OpenAI, AsyncOpenAI
+
+ # Base Model type for output models
+ T = TypeVar("T", bound=BaseModel)
+
+
+ class BaseOperator:
+     def __init__(self, client: OpenAI | AsyncOpenAI, model: str):
+         self.client = client
+         self.model = model
+
+     def _build_user_message(self, prompt: str) -> dict[str, str]:
+         return {"role": "user", "content": prompt}
+
+     def _clean_json_response(self, response: str) -> str:
+         """
+         Clean JSON response by removing code block markers and whitespace.
+         Handles cases like:
+         - ```json{"result": "value"}```
+         """
+         stripped = response.strip()
+         cleaned = re.sub(r"^```(?:json)?\s*", "", stripped)
+         cleaned = re.sub(r"\s*```$", "", cleaned)
+
+         return cleaned.strip()
+
+     def _convert_to_output_model(
+         self, response_string: str, output_model: Type[T]
+     ) -> Type[T]:
+         """
+         Convert a JSON response string to output model.
+
+         Args:
+             response_string: The JSON string (may contain code block markers)
+             output_model: Your Pydantic output model class (e.g., StrOutput, ListStrOutput)
+
+         Returns:
+             Instance of your output model
+         """
+         # Clean the response string
+         cleaned_json = self._clean_json_response(response_string)
+
+         # Fix Python-style booleans
+         cleaned_json = cleaned_json.replace("False", "false").replace("True", "true")
+
+         # Convert string to Python dictionary
+         response_dict = json.loads(cleaned_json)
+
+         # Convert dictionary to output model
+         return output_model(**response_dict)
+
+     def _extract_logprobs(self, completion: dict):
+         logprobs_data = []
+         ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
+
+         for choice in completion.choices:
+             if not getattr(choice, "logprobs", None):
+                 continue
+
+             for logprob_item in choice.logprobs.content:
+                 if ignore_pattern.match(logprob_item.token):
+                     continue
+                 token_entry = {
+                     "token": logprob_item.token,
+                     "prob": round(math.exp(logprob_item.logprob), 8),
+                     "top_alternatives": [],
+                 }
+                 for alt in logprob_item.top_logprobs:
+                     if ignore_pattern.match(alt.token):
+                         continue
+                     token_entry["top_alternatives"].append(
+                         {
+                             "token": alt.token,
+                             "prob": round(math.exp(alt.logprob), 8),
+                         }
+                     )
+                 logprobs_data.append(token_entry)
+
+         return logprobs_data
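That ends the new base_operator.py. Its helpers are easy to check in isolation: the fence stripping is now regex-based (handling both ``` and ```json), and `_extract_logprobs` turns each token's logprob into a probability with `math.exp`. A small self-contained sketch of both, with the sample strings invented for illustration:

import json
import math
import re


def clean_json_response(response: str) -> str:
    # Same cleaning logic as BaseOperator._clean_json_response above.
    stripped = response.strip()
    cleaned = re.sub(r"^```(?:json)?\s*", "", stripped)
    cleaned = re.sub(r"\s*```$", "", cleaned)
    return cleaned.strip()


raw = '```json\n{"result": "value", "flag": True}\n```'
cleaned = clean_json_response(raw)
# The naive boolean fix-up also rewrites True/False inside string values.
cleaned = cleaned.replace("False", "false").replace("True", "true")
print(json.loads(cleaned))  # {'result': 'value', 'flag': True}

# Logprob-to-probability conversion used by _extract_logprobs:
print(round(math.exp(-0.10536052), 8))  # ≈ 0.9

One design note: because `replace` runs over the whole string, a legitimate "True" inside a result value would also be rewritten; the guided-JSON path largely sidesteps this by constraining the model to emit valid JSON in the first place.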