hamtaa-texttools 1.0.5__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hamtaa-texttools might be problematic.

Files changed (25)
  1. {hamtaa_texttools-1.0.5.dist-info → hamtaa_texttools-1.0.7.dist-info}/METADATA +15 -15
  2. hamtaa_texttools-1.0.7.dist-info/RECORD +31 -0
  3. texttools/batch/batch_manager.py +7 -18
  4. texttools/batch/batch_runner.py +96 -45
  5. texttools/prompts/README.md +4 -0
  6. texttools/prompts/{keyword_extractor.yaml → extract_keywords.yaml} +6 -6
  7. texttools/prompts/{question_merger.yaml → merge_questions.yaml} +5 -5
  8. texttools/tools/async_the_tool.py +204 -143
  9. texttools/tools/internals/async_operator.py +98 -204
  10. texttools/tools/internals/base_operator.py +85 -0
  11. texttools/tools/internals/operator.py +27 -130
  12. texttools/tools/internals/prompt_loader.py +12 -22
  13. texttools/tools/the_tool.py +162 -225
  14. hamtaa_texttools-1.0.5.dist-info/RECORD +0 -30
  15. {hamtaa_texttools-1.0.5.dist-info → hamtaa_texttools-1.0.7.dist-info}/WHEEL +0 -0
  16. {hamtaa_texttools-1.0.5.dist-info → hamtaa_texttools-1.0.7.dist-info}/licenses/LICENSE +0 -0
  17. {hamtaa_texttools-1.0.5.dist-info → hamtaa_texttools-1.0.7.dist-info}/top_level.txt +0 -0
  18. /texttools/prompts/{ner_extractor.yaml → extract_entities.yaml} +0 -0
  19. /texttools/prompts/{question_detector.yaml → is_question.yaml} +0 -0
  20. /texttools/prompts/{rewriter.yaml → rewrite.yaml} +0 -0
  21. /texttools/prompts/{custom_tool.yaml → run_custom.yaml} +0 -0
  22. /texttools/prompts/{subject_question_generator.yaml → subject_to_question.yaml} +0 -0
  23. /texttools/prompts/{summarizer.yaml → summarize.yaml} +0 -0
  24. /texttools/prompts/{question_generator.yaml → text_to_question.yaml} +0 -0
  25. /texttools/prompts/{translator.yaml → translate.yaml} +0 -0
texttools/tools/internals/operator.py

@@ -1,14 +1,12 @@
  from __future__ import annotations

- import math
- import re
- from typing import Any, TypeVar, Type, Literal, Optional
- import json
+ from typing import Any, TypeVar, Type, Literal
  import logging

  from openai import OpenAI
  from pydantic import BaseModel

+ from texttools.tools.internals.base_operator import BaseOperator
  from texttools.formatters.user_merge_formatter import (
      UserMergeFormatter,
  )
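The new `texttools/tools/internals/base_operator.py` (+85 lines, item 10 above) is not shown in this diff. Judging from the helpers deleted from `Operator` further down (`_build_user_message`, `_clean_json_response`, `_convert_to_output_model`, `_extract_logprobs`), it plausibly centralizes those; a hedged reconstruction of part of it, with signatures inferred rather than confirmed:

```python
# Hypothetical sketch of base_operator.py, inferred from the methods removed
# from Operator in this diff. Not the package's actual file.
import json
import re
from typing import Type, TypeVar

T = TypeVar("T")


class BaseOperator:
    def _build_user_message(self, prompt: str) -> dict[str, str]:
        return {"role": "user", "content": prompt}

    def _clean_json_response(self, response: str) -> str:
        # Strip ```json ... ``` fences, as the 1.0.5 Operator did.
        stripped = response.strip()
        cleaned = re.sub(r"^```(?:json)?\s*", "", stripped)
        cleaned = re.sub(r"\s*```$", "", cleaned)
        return cleaned.strip()

    def _convert_to_output_model(self, response_string: str, output_model: Type[T]) -> T:
        cleaned = self._clean_json_response(response_string)
        # Fix Python-style booleans, as the removed code did.
        cleaned = cleaned.replace("False", "false").replace("True", "true")
        return output_model(**json.loads(cleaned))
```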
@@ -22,7 +20,7 @@ logger = logging.getLogger("operator")
  logger.setLevel(logging.INFO)


- class Operator:
+ class Operator(BaseOperator):
      """
      Core engine for running text-processing operations with an LLM.

@@ -30,71 +28,46 @@ class Operator:
      - `PromptLoader` → loads YAML prompt templates.
      - `UserMergeFormatter` → applies formatting to messages (e.g., merging).
      - OpenAI client → executes completions/parsed completions.
-
-     Workflow inside `run()`:
-     1. Load prompt templates (`main_template` [+ `analyze_template` if enabled]).
-     2. Optionally generate an "analysis" step via `_analyze()`.
-     3. Build messages for the LLM.
-     4. Call `.beta.chat.completions.parse()` to parse the result into the
-        configured `OUTPUT_MODEL` (a Pydantic schema).
-     5. Return results as a dict (always `{"result": ...}`, plus `analysis`
-        if analysis was enabled).
-
-     Attributes configured dynamically by `TheTool`:
-     - PROMPT_FILE: str → YAML filename
-     - OUTPUT_MODEL: Pydantic model class
-     - WITH_ANALYSIS: bool → whether to run an analysis phase first
-     - USE_MODES: bool → whether to select prompts by mode
-     - MODE: str → which mode to use if modes are enabled
-     - RESP_FORMAT: str → "vllm" or "parse"
      """

-     def __init__(self, client: OpenAI):
+     def __init__(self, client: OpenAI, model: str):
          self.client: OpenAI = client
-
-     def _build_user_message(self, prompt: str) -> dict[str, str]:
-         return {"role": "user", "content": prompt}
+         self.model = model

      def _analysis_completion(
          self,
          analyze_message: list[dict[str, str]],
-         model: str,
          temperature: float,
      ) -> str:
          completion = self.client.chat.completions.create(
-             model=model,
+             model=self.model,
              messages=analyze_message,
              temperature=temperature,
          )
          analysis = completion.choices[0].message.content.strip()
          return analysis

-     def _analyze(
-         self,
-         prompt_configs: dict[str, str],
-         model: str,
-         temperature: float,
-     ) -> str:
+     def _analyze(self, prompt_configs: dict[str, str], temperature: float) -> str:
          analyze_prompt = prompt_configs["analyze_template"]
          analyze_message = [self._build_user_message(analyze_prompt)]
-         analysis = self._analysis_completion(analyze_message, model, temperature)
+         analysis = self._analysis_completion(analyze_message, temperature)
          return analysis

      def _parse_completion(
          self,
          message: list[dict[str, str]],
          output_model: Type[T],
-         model: str,
          temperature: float,
          logprobs: bool = False,
          top_logprobs: int = 3,
      ) -> tuple[Type[T], Any]:
          request_kwargs = {
-             "model": model,
+             "model": self.model,
              "messages": message,
              "response_format": output_model,
              "temperature": temperature,
          }
+
          if logprobs:
              request_kwargs["logprobs"] = True
              request_kwargs["top_logprobs"] = top_logprobs
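Net effect of this hunk: the model name is bound once at construction rather than threaded through every call. A minimal sketch of the new wiring (the model name is a placeholder, not a package default):

```python
from openai import OpenAI

from texttools.tools.internals.operator import Operator

# 1.0.5 passed `model` into each method; 1.0.7 binds it once here.
operator = Operator(client=OpenAI(), model="my-model")  # placeholder model name
# All private helpers (_analyze, _parse_completion, _vllm_completion) now read
# self.model, so a single Operator instance is pinned to a single model.
```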
@@ -103,57 +76,20 @@ class Operator:
          parsed = completion.choices[0].message.parsed
          return parsed, completion

-     def _clean_json_response(self, response: str) -> str:
-         """
-         Clean JSON response by removing code block markers and whitespace.
-         Handles cases like:
-         - ```json{"result": "value"}```
-         """
-         stripped = response.strip()
-         cleaned = re.sub(r"^```(?:json)?\s*", "", stripped)
-         cleaned = re.sub(r"\s*```$", "", cleaned)
-
-         return cleaned.strip()
-
-     def _convert_to_output_model(
-         self, response_string: str, output_model: Type[T]
-     ) -> Type[T]:
-         """
-         Convert a JSON response string to output model.
-
-         Args:
-             response_string: The JSON string (may contain code block markers)
-             output_model: Your Pydantic output model class (e.g., StrOutput, ListStrOutput)
-
-         Returns:
-             Instance of your output model
-         """
-         # Clean the response string
-         cleaned_json = self._clean_json_response(response_string)
-
-         # Fix Python-style booleans
-         cleaned_json = cleaned_json.replace("False", "false").replace("True", "true")
-
-         # Convert string to Python dictionary
-         response_dict = json.loads(cleaned_json)
-
-         # Convert dictionary to output model
-         return output_model(**response_dict)
-
      def _vllm_completion(
          self,
          message: list[dict[str, str]],
          output_model: Type[T],
-         model: str,
          temperature: float,
          logprobs: bool = False,
          top_logprobs: int = 3,
+         max_tokens: int | None = None,
      ) -> tuple[Type[T], Any]:
          json_schema = output_model.model_json_schema()

          # Build kwargs dynamically
          request_kwargs = {
-             "model": model,
+             "model": self.model,
              "messages": message,
              "extra_body": {"guided_json": json_schema},
              "temperature": temperature,
@@ -170,63 +106,25 @@ class Operator:
          parsed = self._convert_to_output_model(response, output_model)
          return parsed, completion

-     def _extract_logprobs(self, completion: dict):
-         logprobs_data = []
-         ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
-
-         for choice in completion.choices:
-             if not getattr(choice, "logprobs", None):
-                 logger.info("No logprobs found.")
-                 continue
-
-             for logprob_item in choice.logprobs.content:
-                 if ignore_pattern.match(logprob_item.token):
-                     continue
-                 token_entry = {
-                     "token": logprob_item.token,
-                     "prob": round(math.exp(logprob_item.logprob), 8),
-                     "top_alternatives": [],
-                 }
-                 for alt in logprob_item.top_logprobs:
-                     if ignore_pattern.match(alt.token):
-                         continue
-                     token_entry["top_alternatives"].append(
-                         {
-                             "token": alt.token,
-                             "prob": round(math.exp(alt.logprob), 8),
-                         }
-                     )
-                 logprobs_data.append(token_entry)
-
-         return logprobs_data
-
      def run(
          self,
-         text: str,
          # User parameters
-         model: str,
+         text: str,
          with_analysis: bool,
+         output_lang: str | None,
+         user_prompt: str | None,
          temperature: float,
          logprobs: bool,
-         top_logprobs: int,
-         user_prompt: str | None,
-         output_lang: str | None,
-         # Each tool's parameters
+         top_logprobs: int | None,
+         # Internal parameters
          prompt_file: str,
          output_model: Type[T],
-         resp_format: Literal["vllm", "parse"] = "parse",
-         mode: str | None = None,
+         resp_format: Literal["vllm", "parse"],
+         mode: str | None,
          **extra_kwargs,
      ) -> dict[str, Any]:
          """
          Execute the LLM pipeline with the given input text.
-
-         Args:
-             text: The text to process (will be stripped of whitespace)
-             **extra_kwargs: Additional variables to inject into prompt templates
-
-         Returns:
-             Dictionary containing the parsed result and optional analysis
          """
          prompt_loader = PromptLoader()
          formatter = UserMergeFormatter()
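With the defaults dropped, every `run()` parameter is now explicit and the model comes from the constructor. A hedged call sketch — `StrOutput`, the endpoint, and all argument values are illustrative, while `prompt_file` uses the 1.0.7 rename of `summarizer.yaml`:

```python
from openai import OpenAI
from pydantic import BaseModel

from texttools.tools.internals.operator import Operator


class StrOutput(BaseModel):
    result: str  # run() requires the output model to define `result`


operator = Operator(client=OpenAI(), model="my-model")  # placeholders
output = operator.run(
    text="Some input text to summarize.",
    with_analysis=False,
    output_lang=None,
    user_prompt=None,
    temperature=0.0,
    logprobs=False,
    top_logprobs=None,
    prompt_file="summarize.yaml",  # renamed from summarizer.yaml in 1.0.7
    output_model=StrOutput,
    resp_format="parse",
    mode=None,
)
print(output["result"])
```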
@@ -244,7 +142,7 @@ class Operator:
          messages: list[dict[str, str]] = []

          if with_analysis:
-             analysis = self._analyze(prompt_configs, model, temperature)
+             analysis = self._analyze(prompt_configs, temperature)
              messages.append(
                  self._build_user_message(f"Based on this analysis: {analysis}")
              )
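With `with_analysis=True`, the pipeline makes two LLM calls: first the analyze template on its own, then the main template prefixed with the analysis. Roughly, the second call's message list looks like this before formatting (content strings are illustrative):

```python
# Sketch of the messages built when with_analysis=True, before
# UserMergeFormatter merges consecutive user turns into one.
messages = [
    {"role": "user", "content": "Based on this analysis: <output of analyze_template>"},
    {"role": "user", "content": "<rendered main_template>"},
]
```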
@@ -262,16 +160,15 @@ class Operator:
              )

          messages.append(self._build_user_message(prompt_configs["main_template"]))
-
          messages = formatter.format(messages)

          if resp_format == "vllm":
              parsed, completion = self._vllm_completion(
-                 messages, output_model, model, temperature, logprobs, top_logprobs
+                 messages, output_model, temperature, logprobs, top_logprobs
              )
          elif resp_format == "parse":
              parsed, completion = self._parse_completion(
-                 messages, output_model, model, temperature, logprobs, top_logprobs
+                 messages, output_model, temperature, logprobs, top_logprobs
              )

          # Ensure output_model has a `result` field
@@ -280,16 +177,16 @@ class Operator:
                  "The provided output_model must define a field named 'result'"
              )

-         results = {"result": parsed.result}
+         result = {"result": parsed.result}

          if logprobs:
-             results["logprobs"] = self._extract_logprobs(completion)
+             result["logprobs"] = self._extract_logprobs(completion)

          if with_analysis:
-             results["analysis"] = analysis
+             result["analysis"] = analysis

-         return results
+         return result

      except Exception as e:
-         logger.error(f"Operation failed: {e}")
+         logger.error(f"TheTool failed: {e}")
          return {"Error": str(e), "result": ""}
texttools/tools/internals/prompt_loader.py

@@ -1,4 +1,4 @@
- from typing import Optional
+ from functools import lru_cache
  from pathlib import Path
  import yaml

@@ -7,10 +7,6 @@ class PromptLoader:
      """
      Utility for loading and formatting YAML prompt templates.

-     Each YAML file under `prompts/` must define at least a `main_template`,
-     and optionally an `analyze_template`. These can either be a single string
-     or a dictionary keyed by mode names (if `use_modes=True`).
-
      Responsibilities:
      - Load and parse YAML prompt definitions.
      - Select the right template (by mode, if applicable).
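The deleted docstring still describes the on-disk contract: each prompt YAML defines `main_template` and optionally `analyze_template`, either as plain strings or as mode-keyed mappings. A hypothetical file under that contract, shown via what `yaml.safe_load` would return (file name, modes, and template text are invented for illustration):

```python
import yaml

# Hypothetical mode-keyed prompt file, e.g. a translate.yaml variant.
data = yaml.safe_load("""
main_template:
  formal: "Translate this text into formal {output_lang}:\\n{text}"
  casual: "Translate this casually into {output_lang}:\\n{text}"
analyze_template:
  formal: "List ambiguous phrases in:\\n{text}"
  casual: "List ambiguous phrases in:\\n{text}"
""")

assert data["main_template"]["formal"].startswith("Translate")
```

Note that when a mode is set, `_load_templates` (below) indexes `analyze_template` by mode as well, so a mode-keyed file needs both templates mode-keyed; the `data.get(...)` fallback only protects the no-mode path.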
@@ -22,31 +18,30 @@ class PromptLoader:
      }
      """

+     def __init__(self):
+         self.base_dir = Path(__file__).parent.parent.parent / Path("prompts")
+
      MAIN_TEMPLATE: str = "main_template"
      ANALYZE_TEMPLATE: str = "analyze_template"

-     def _load_templates(
-         self,
-         prompts_dir: str,
-         prompt_file: str,
-         mode: str | None,
-     ) -> dict[str, str]:
-         prompt_path = Path(__file__).parent.parent.parent / prompts_dir / prompt_file
+     # Use lru_cache to load each file once
+     @lru_cache(maxsize=32)
+     def _load_templates(self, prompt_file: str, mode: str | None) -> dict[str, str]:
+         prompt_path = self.base_dir / prompt_file

          if not prompt_path.exists():
              raise FileNotFoundError(f"Prompt file not found: {prompt_path}")

          try:
-             # Load the data
              data = yaml.safe_load(prompt_path.read_text(encoding="utf-8"))
          except yaml.YAMLError as e:
              raise ValueError(f"Invalid YAML in {prompt_path}: {e}")

          return {
-             "main_template": data[self.MAIN_TEMPLATE][mode]
+             self.MAIN_TEMPLATE: data[self.MAIN_TEMPLATE][mode]
              if mode
              else data[self.MAIN_TEMPLATE],
-             "analyze_template": data.get(self.ANALYZE_TEMPLATE)[mode]
+             self.ANALYZE_TEMPLATE: data.get(self.ANALYZE_TEMPLATE)[mode]
              if mode
              else data.get(self.ANALYZE_TEMPLATE),
          }
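One caveat about the new caching: `functools.lru_cache` on an instance method lives on the shared method object and keys entries on `self` as well as the arguments, so it also keeps loader instances alive. And since `Operator.run()` constructs a fresh `PromptLoader()` on every call, each run misses the cache under its new `self`. A minimal demonstration of that behavior:

```python
from functools import lru_cache


class Loader:
    @lru_cache(maxsize=32)
    def load(self, name: str) -> str:
        print(f"disk read: {name}")
        return name.upper()


a, b = Loader(), Loader()
a.load("x.yaml")  # prints "disk read" — cache miss
a.load("x.yaml")  # silent — cache hit (same self, same args)
b.load("x.yaml")  # prints again — a different self is a different cache key
```

A module-level cache (or caching on a `staticmethod`) would be needed for the "load each file once" intent to hold across `run()` calls.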
@@ -59,14 +54,9 @@ class PromptLoader:
          return format_args

      def load(
-         self,
-         prompt_file: str,
-         text: str,
-         mode: str,
-         prompts_dir: str = "prompts",
-         **extra_kwargs,
+         self, prompt_file: str, text: str, mode: str, **extra_kwargs
      ) -> dict[str, str]:
-         template_configs = self._load_templates(prompts_dir, prompt_file, mode)
+         template_configs = self._load_templates(prompt_file, mode)
          format_args = self._build_format_args(text, **extra_kwargs)

          # Inject variables inside each template
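Putting the loader changes together, a hedged usage sketch against the renamed prompt files (file name and text are illustrative):

```python
from texttools.tools.internals.prompt_loader import PromptLoader

loader = PromptLoader()
# prompts_dir is gone: files now resolve against loader.base_dir (the
# package's prompts/ directory), under the new 1.0.7 names.
configs = loader.load(
    prompt_file="summarize.yaml",
    text="Some input text to summarize.",
    mode=None,  # or a mode name if the YAML is mode-keyed
)
print(configs["main_template"])  # template with {text} etc. already injected
```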