hamtaa-texttools 1.0.5__py3-none-any.whl → 1.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hamtaa_texttools-1.1.16.dist-info/METADATA +255 -0
- hamtaa_texttools-1.1.16.dist-info/RECORD +31 -0
- texttools/__init__.py +6 -8
- texttools/batch/batch_config.py +26 -0
- texttools/batch/batch_runner.py +144 -139
- texttools/batch/{batch_manager.py → internals/batch_manager.py} +42 -54
- texttools/batch/internals/utils.py +16 -0
- texttools/prompts/README.md +8 -4
- texttools/prompts/categorize.yaml +77 -0
- texttools/prompts/detect_entity.yaml +22 -0
- texttools/prompts/extract_keywords.yaml +68 -0
- texttools/prompts/{question_merger.yaml → merge_questions.yaml} +5 -5
- texttools/tools/async_tools.py +804 -0
- texttools/tools/internals/async_operator.py +139 -236
- texttools/tools/internals/formatters.py +24 -0
- texttools/tools/internals/models.py +183 -0
- texttools/tools/internals/operator_utils.py +54 -0
- texttools/tools/internals/prompt_loader.py +23 -43
- texttools/tools/internals/sync_operator.py +201 -0
- texttools/tools/sync_tools.py +804 -0
- hamtaa_texttools-1.0.5.dist-info/METADATA +0 -192
- hamtaa_texttools-1.0.5.dist-info/RECORD +0 -30
- texttools/batch/__init__.py +0 -4
- texttools/formatters/base_formatter.py +0 -33
- texttools/formatters/user_merge_formatter.py +0 -30
- texttools/prompts/categorizer.yaml +0 -28
- texttools/prompts/keyword_extractor.yaml +0 -18
- texttools/tools/__init__.py +0 -4
- texttools/tools/async_the_tool.py +0 -277
- texttools/tools/internals/operator.py +0 -295
- texttools/tools/internals/output_models.py +0 -52
- texttools/tools/the_tool.py +0 -501
- {hamtaa_texttools-1.0.5.dist-info → hamtaa_texttools-1.1.16.dist-info}/WHEEL +0 -0
- {hamtaa_texttools-1.0.5.dist-info → hamtaa_texttools-1.1.16.dist-info}/licenses/LICENSE +0 -0
- {hamtaa_texttools-1.0.5.dist-info → hamtaa_texttools-1.1.16.dist-info}/top_level.txt +0 -0
- /texttools/prompts/{ner_extractor.yaml → extract_entities.yaml} +0 -0
- /texttools/prompts/{question_detector.yaml → is_question.yaml} +0 -0
- /texttools/prompts/{rewriter.yaml → rewrite.yaml} +0 -0
- /texttools/prompts/{custom_tool.yaml → run_custom.yaml} +0 -0
- /texttools/prompts/{subject_question_generator.yaml → subject_to_question.yaml} +0 -0
- /texttools/prompts/{summarizer.yaml → summarize.yaml} +0 -0
- /texttools/prompts/{question_generator.yaml → text_to_question.yaml} +0 -0
- /texttools/prompts/{translator.yaml → translate.yaml} +0 -0
texttools/tools/internals/operator_utils.py (new file):

```diff
@@ -0,0 +1,54 @@
+import re
+import math
+import random
+
+
+class OperatorUtils:
+    @staticmethod
+    def build_user_message(prompt: str) -> dict[str, str]:
+        return {"role": "user", "content": prompt}
+
+    @staticmethod
+    def extract_logprobs(completion: dict) -> list[dict]:
+        """
+        Extracts and filters token probabilities from completion logprobs.
+        Skips punctuation and structural tokens, returns cleaned probability data.
+        """
+        logprobs_data = []
+
+        ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
+
+        for choice in completion.choices:
+            if not getattr(choice, "logprobs", None):
+                return []
+
+            for logprob_item in choice.logprobs.content:
+                if ignore_pattern.match(logprob_item.token):
+                    continue
+                token_entry = {
+                    "token": logprob_item.token,
+                    "prob": round(math.exp(logprob_item.logprob), 8),
+                    "top_alternatives": [],
+                }
+                for alt in logprob_item.top_logprobs:
+                    if ignore_pattern.match(alt.token):
+                        continue
+                    token_entry["top_alternatives"].append(
+                        {
+                            "token": alt.token,
+                            "prob": round(math.exp(alt.logprob), 8),
+                        }
+                    )
+                logprobs_data.append(token_entry)
+
+        return logprobs_data
+
+    @staticmethod
+    def get_retry_temp(base_temp: float) -> float:
+        """
+        Calculate temperature for retry attempts.
+        """
+        delta_temp = random.choice([-1, 1]) * random.uniform(0.1, 0.9)
+        new_temp = base_temp + delta_temp
+
+        return max(0.0, min(new_temp, 1.5))
```
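For orientation, a minimal sketch of the new helpers in use. It is illustrative only, not part of the diff; the import path comes from the file list above, and the assertions merely restate the behavior visible in the hunk (the chat-message shape, and the ±0.1–0.9 jitter clamped to [0.0, 1.5]).

```python
# Illustrative sketch, not part of the diff.
from texttools.tools.internals.operator_utils import OperatorUtils

# build_user_message wraps a prompt in the OpenAI chat-message shape.
msg = OperatorUtils.build_user_message("Summarize this paragraph.")
assert msg == {"role": "user", "content": "Summarize this paragraph."}

# get_retry_temp jitters the base temperature by a random +/- 0.1-0.9
# and clamps the result to the [0.0, 1.5] range.
for _ in range(5):
    assert 0.0 <= OperatorUtils.get_retry_temp(0.7) <= 1.5
```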
texttools/tools/internals/prompt_loader.py:

```diff
@@ -1,4 +1,4 @@
-from
+from functools import lru_cache
 from pathlib import Path
 import yaml
 
@@ -7,66 +7,46 @@ class PromptLoader:
     """
     Utility for loading and formatting YAML prompt templates.
 
-    Each YAML file under `prompts/` must define at least a `main_template`,
-    and optionally an `analyze_template`. These can either be a single string
-    or a dictionary keyed by mode names (if `use_modes=True`).
-
     Responsibilities:
     - Load and parse YAML prompt definitions.
     - Select the right template (by mode, if applicable).
     - Inject variables (`{input}`, plus any extra kwargs) into the templates.
-    - Return a dict with:
-        {
-            "main_template": "...",
-            "analyze_template": "..." | None
-        }
     """
 
-    MAIN_TEMPLATE
-    ANALYZE_TEMPLATE
-
-    def _load_templates(
-        self,
-        prompts_dir: str,
-        prompt_file: str,
-        mode: str | None,
-    ) -> dict[str, str]:
-        prompt_path = Path(__file__).parent.parent.parent / prompts_dir / prompt_file
+    MAIN_TEMPLATE = "main_template"
+    ANALYZE_TEMPLATE = "analyze_template"
 
-
-
+    @staticmethod
+    def _build_format_args(text: str, **extra_kwargs) -> dict[str, str]:
+        # Base formatting args
+        format_args = {"input": text}
+        # Merge extras
+        format_args.update(extra_kwargs)
+        return format_args
 
-
-
-
-
-
+    # Use lru_cache to load each file once
+    @lru_cache(maxsize=32)
+    def _load_templates(self, prompt_file: str, mode: str | None) -> dict[str, str]:
+        """
+        Loads prompt templates from YAML file with optional mode selection.
+        """
+        base_dir = Path(__file__).parent.parent.parent / Path("prompts")
+        prompt_path = base_dir / prompt_file
+        data = yaml.safe_load(prompt_path.read_text(encoding="utf-8"))
 
         return {
-
+            self.MAIN_TEMPLATE: data[self.MAIN_TEMPLATE][mode]
             if mode
             else data[self.MAIN_TEMPLATE],
-
+            self.ANALYZE_TEMPLATE: data.get(self.ANALYZE_TEMPLATE)[mode]
             if mode
             else data.get(self.ANALYZE_TEMPLATE),
         }
 
-    def _build_format_args(self, text: str, **extra_kwargs) -> dict[str, str]:
-        # Base formatting args
-        format_args = {"input": text}
-        # Merge extras
-        format_args.update(extra_kwargs)
-        return format_args
-
     def load(
-        self,
-        prompt_file: str,
-        text: str,
-        mode: str,
-        prompts_dir: str = "prompts",
-        **extra_kwargs,
+        self, prompt_file: str, text: str, mode: str, **extra_kwargs
     ) -> dict[str, str]:
-        template_configs = self._load_templates(
+        template_configs = self._load_templates(prompt_file, mode)
         format_args = self._build_format_args(text, **extra_kwargs)
 
         # Inject variables inside each template
```
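A hedged sketch of how the reworked loader appears to be called. `summarize.yaml` is borrowed from the renamed prompt files listed above; the returned keys follow the `MAIN_TEMPLATE`/`ANALYZE_TEMPLATE` constants in the hunk, and `{input}` injection is implied by the trailing `# Inject variables` comment.

```python
# Illustrative sketch, not part of the diff.
from texttools.tools.internals.prompt_loader import PromptLoader

loader = PromptLoader()
configs = loader.load(
    prompt_file="summarize.yaml",  # name taken from the prompts/ listing above
    text="Text to summarize ...",
    mode=None,  # modeless prompts store main_template as a plain string
)
main = configs["main_template"]        # template with {input} injected
analyze = configs["analyze_template"]  # may be None if the YAML omits it
```

One caveat worth noting: `@lru_cache` on an instance method includes `self` in the cache key, so the per-file cache only pays off when a `PromptLoader` instance is reused; `Operator.run` in the next hunk constructs a fresh loader on every call.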
texttools/tools/internals/sync_operator.py (new file):

```diff
@@ -0,0 +1,201 @@
+from typing import Any, TypeVar, Type
+from collections.abc import Callable
+import logging
+
+from openai import OpenAI
+from pydantic import BaseModel
+
+from texttools.tools.internals.models import ToolOutput
+from texttools.tools.internals.operator_utils import OperatorUtils
+from texttools.tools.internals.formatters import Formatter
+from texttools.tools.internals.prompt_loader import PromptLoader
+
+# Base Model type for output models
+T = TypeVar("T", bound=BaseModel)
+
+logger = logging.getLogger("texttools.operator")
+
+
+class Operator:
+    """
+    Core engine for running text-processing operations with an LLM (Sync).
+
+    It wires together:
+    - `PromptLoader` → loads YAML prompt templates.
+    - `UserMergeFormatter` → applies formatting to messages (e.g., merging).
+    - OpenAI client → executes completions/parsed completions.
+    """
+
+    def __init__(self, client: OpenAI, model: str):
+        self._client = client
+        self._model = model
+
+    def _analyze(self, prompt_configs: dict[str, str], temperature: float) -> str:
+        """
+        Calls OpenAI API for analysis using the configured prompt template.
+        Returns the analyzed content as a string.
+        """
+        analyze_prompt = prompt_configs["analyze_template"]
+        analyze_message = [OperatorUtils.build_user_message(analyze_prompt)]
+        completion = self._client.chat.completions.create(
+            model=self._model,
+            messages=analyze_message,
+            temperature=temperature,
+        )
+        analysis = completion.choices[0].message.content.strip()
+        return analysis
+
+    def _parse_completion(
+        self,
+        message: list[dict[str, str]],
+        output_model: Type[T],
+        temperature: float,
+        logprobs: bool = False,
+        top_logprobs: int = 3,
+        priority: int | None = 0,
+    ) -> tuple[T, Any]:
+        """
+        Parses a chat completion using OpenAI's structured output format.
+        Returns both the parsed object and the raw completion for logprobs.
+        """
+        request_kwargs = {
+            "model": self._model,
+            "messages": message,
+            "response_format": output_model,
+            "temperature": temperature,
+        }
+
+        if logprobs:
+            request_kwargs["logprobs"] = True
+            request_kwargs["top_logprobs"] = top_logprobs
+
+        if priority:
+            request_kwargs["extra_body"] = {"priority": priority}
+
+        completion = self._client.beta.chat.completions.parse(**request_kwargs)
+        parsed = completion.choices[0].message.parsed
+        return parsed, completion
+
+    def run(
+        self,
+        # User parameters
+        text: str,
+        with_analysis: bool,
+        output_lang: str | None,
+        user_prompt: str | None,
+        temperature: float,
+        logprobs: bool,
+        top_logprobs: int | None,
+        validator: Callable[[Any], bool] | None,
+        max_validation_retries: int | None,
+        # Internal parameters
+        prompt_file: str,
+        output_model: Type[T],
+        mode: str | None,
+        priority: int | None = 0,
+        **extra_kwargs,
+    ) -> ToolOutput:
+        """
+        Execute the LLM pipeline with the given input text.
+        """
+        prompt_loader = PromptLoader()
+        formatter = Formatter()
+        output = ToolOutput()
+        try:
+            # Prompt configs contain two keys: main_template and analyze_template, both strings
+            prompt_configs = prompt_loader.load(
+                prompt_file=prompt_file,
+                text=text.strip(),
+                mode=mode,
+                **extra_kwargs,
+            )
+
+            messages = []
+
+            if with_analysis:
+                analysis = self._analyze(prompt_configs, temperature)
+                messages.append(
+                    OperatorUtils.build_user_message(
+                        f"Based on this analysis: {analysis}"
+                    )
+                )
+
+            if output_lang:
+                messages.append(
+                    OperatorUtils.build_user_message(
+                        f"Respond only in the {output_lang} language."
+                    )
+                )
+
+            if user_prompt:
+                messages.append(
+                    OperatorUtils.build_user_message(
+                        f"Consider this instruction {user_prompt}"
+                    )
+                )
+
+            messages.append(
+                OperatorUtils.build_user_message(prompt_configs["main_template"])
+            )
+
+            messages = formatter.user_merge_format(messages)
+
+            parsed, completion = self._parse_completion(
+                messages, output_model, temperature, logprobs, top_logprobs, priority
+            )
+
+            output.result = parsed.result
+
+            # Retry logic if validation fails
+            if validator and not validator(output.result):
+                for attempt in range(max_validation_retries):
+                    logger.warning(
+                        f"Validation failed, retrying for the {attempt + 1} time."
+                    )
+
+                    # Generate new temperature for retry
+                    retry_temperature = OperatorUtils.get_retry_temp(temperature)
+                    try:
+                        parsed, completion = self._parse_completion(
+                            messages,
+                            output_model,
+                            retry_temperature,
+                            logprobs,
+                            top_logprobs,
+                        )
+
+                        output.result = parsed.result
+
+                        # Check if retry was successful
+                        if validator(output.result):
+                            logger.info(
+                                f"Validation passed on retry attempt {attempt + 1}"
+                            )
+                            break
+                        else:
+                            logger.warning(
+                                f"Validation still failing after retry attempt {attempt + 1}"
+                            )
+
+                    except Exception as e:
+                        logger.error(f"Retry attempt {attempt + 1} failed: {e}")
+                        # Continue to next retry attempt if this one fails
+
+            # Final check after all retries
+            if validator and not validator(output.result):
+                output.errors.append("Validation failed after all retry attempts")
+
+            if logprobs:
+                output.logprobs = OperatorUtils.extract_logprobs(completion)
+
+            if with_analysis:
+                output.analysis = analysis
+
+            output.process = prompt_file[:-5]
+
+            return output
+
+        except Exception as e:
+            logger.error(f"TheTool failed: {e}")
+            output.errors.append(str(e))
+            return output
```
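To make the new sync pipeline concrete, an end-to-end sketch. `Operator`, the `run(...)` signature, and the `ToolOutput` fields (`result`, `errors`, `analysis`, `logprobs`, `process`) all come from this diff; the `Summary` model, the model name, and the validator are illustrative assumptions (since `run` reads `parsed.result`, the output model is assumed to expose a `result` field).

```python
# Illustrative sketch, not part of the diff.
from openai import OpenAI
from pydantic import BaseModel

from texttools.tools.internals.sync_operator import Operator


class Summary(BaseModel):
    result: str  # run() reads parsed.result, so a `result` field is assumed


client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
op = Operator(client=client, model="gpt-4o-mini")  # model name is an assumption

output = op.run(
    text="Large language models are ...",
    with_analysis=False,
    output_lang="English",
    user_prompt=None,
    temperature=0.2,
    logprobs=False,
    top_logprobs=None,
    validator=lambda r: bool(r and r.strip()),  # retried with jittered temperature on failure
    max_validation_retries=2,
    prompt_file="summarize.yaml",  # name taken from the prompts/ listing above
    output_model=Summary,
    mode=None,
)

print(output.result)   # parsed summary on success; output.errors collects failures
print(output.process)  # "summarize" — prompt_file with the ".yaml" suffix stripped
```

Note that `run` never raises: both validation failures and transport errors are folded into `output.errors`, so callers are expected to inspect the returned `ToolOutput` rather than wrap the call in try/except.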