hamtaa-texttools 1.3.2__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hamtaa_texttools-1.3.2.dist-info → hamtaa_texttools-2.1.0.dist-info}/METADATA +40 -47
- hamtaa_texttools-2.1.0.dist-info/RECORD +30 -0
- {hamtaa_texttools-1.3.2.dist-info → hamtaa_texttools-2.1.0.dist-info}/WHEEL +1 -1
- {hamtaa_texttools-1.3.2.dist-info → hamtaa_texttools-2.1.0.dist-info}/licenses/LICENSE +1 -1
- texttools/__init__.py +1 -1
- texttools/core/internal_models.py +16 -7
- texttools/core/operators/async_operator.py +10 -16
- texttools/core/operators/sync_operator.py +10 -16
- texttools/core/utils.py +260 -0
- texttools/models.py +77 -22
- texttools/prompts/{rewrite.yaml → augment.yaml} +3 -3
- texttools/prompts/categorize.yaml +7 -8
- texttools/prompts/extract_entities.yaml +2 -2
- texttools/prompts/extract_keywords.yaml +4 -2
- texttools/prompts/{check_fact.yaml → is_fact.yaml} +5 -4
- texttools/prompts/is_question.yaml +1 -1
- texttools/prompts/merge_questions.yaml +8 -6
- texttools/prompts/propositionize.yaml +11 -7
- texttools/prompts/run_custom.yaml +3 -1
- texttools/prompts/summarize.yaml +3 -3
- texttools/prompts/to_question.yaml +60 -0
- texttools/prompts/translate.yaml +4 -4
- texttools/tools/async_tools.py +152 -169
- texttools/tools/sync_tools.py +138 -150
- hamtaa_texttools-1.3.2.dist-info/RECORD +0 -31
- texttools/core/engine.py +0 -262
- texttools/prompts/subject_to_question.yaml +0 -26
- texttools/prompts/text_to_question.yaml +0 -26
- {hamtaa_texttools-1.3.2.dist-info → hamtaa_texttools-2.1.0.dist-info}/top_level.txt +0 -0
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
hamtaa_texttools-1.3.2.dist-info/licenses/LICENSE,sha256=Hb2YOBKy2MJQLnyLrX37B4ZVuac8eaIcE71SvVIMOLg,1082
|
|
2
|
-
texttools/__init__.py,sha256=RK1GAU6pq2lGwFtHdrCX5JkPRHmOLGcmGH67hd_7VAQ,175
|
|
3
|
-
texttools/models.py,sha256=5eT2cSrFq8Xa38kANznV7gbi7lwB2PoDxciLKTpsd6c,2516
|
|
4
|
-
texttools/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
-
texttools/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
-
texttools/core/engine.py,sha256=AjifrcJl6PeRu1W6nu9zcxySn-1439Ef2La4d7GpNKY,9481
|
|
7
|
-
texttools/core/exceptions.py,sha256=6SDjUL1rmd3ngzD3ytF4LyTRj3bQMSFR9ECrLoqXXHw,395
|
|
8
|
-
texttools/core/internal_models.py,sha256=J1qGEO8V0OoX6_-1yxbSmZSR79tJF0ExAIG1QuvH0L0,1734
|
|
9
|
-
texttools/core/operators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
-
texttools/core/operators/async_operator.py,sha256=-72YQEGFkbk2uYW6PHkLT4wGxhj2p6Uqy3sJtVa9-rk,6386
|
|
11
|
-
texttools/core/operators/sync_operator.py,sha256=mfXtEOlIAhHo4SHaHRKjGb0Z1T894clv-toUzUcbfpo,6291
|
|
12
|
-
texttools/prompts/categorize.yaml,sha256=42Rp3SgVHaDLKrJ27_uK788LiQud0pOXJthz4r0a40Y,1214
|
|
13
|
-
texttools/prompts/check_fact.yaml,sha256=zWFQDRhEE1ij9wSeeenS9YSTM-bY5zzUaG390zUgmcs,714
|
|
14
|
-
texttools/prompts/extract_entities.yaml,sha256=_zYKHNJDIzVDI_-TnwFCKyMs-XLM5igvmWhvSTc3INQ,637
|
|
15
|
-
texttools/prompts/extract_keywords.yaml,sha256=1o4u3uwzapNtB1BUpNIRL5qtrwjW0Yhvyq0TZJiafdg,3272
|
|
16
|
-
texttools/prompts/is_question.yaml,sha256=jnPARd2ZiulLzHW_r4WAsz3sOryfz6Gy5-yYXp-2hd0,496
|
|
17
|
-
texttools/prompts/merge_questions.yaml,sha256=l9Q2OEjPp3SDkxbq3zZCj2ZmXacWSnmYMpUr3l6r5yE,1816
|
|
18
|
-
texttools/prompts/propositionize.yaml,sha256=nbGAfbm1-2Hoc0JLtqZi-S7VHQfnMmuTKI7dZeBxQW0,1403
|
|
19
|
-
texttools/prompts/rewrite.yaml,sha256=klEm8MqXK-Bo8RsS5R9KLMT0zlD-BKo_G6tz9lpAcEY,5420
|
|
20
|
-
texttools/prompts/run_custom.yaml,sha256=IETY9H0wPGWIIzcnupfbwwKQblwZrbYAxB754W9MhgU,125
|
|
21
|
-
texttools/prompts/subject_to_question.yaml,sha256=AK16pZW9HUppIF8JBSEenbUNOU3aqeVV781_WUXnLqk,1160
|
|
22
|
-
texttools/prompts/summarize.yaml,sha256=rPh060Bx_yI1W2JNg-nr83LUk9itatYLKM8ciH2pOvg,486
|
|
23
|
-
texttools/prompts/text_to_question.yaml,sha256=pUwPgK9l5f8S4E5fCht9JY7PFVK2aY1InPfASr7R5o4,1017
|
|
24
|
-
texttools/prompts/translate.yaml,sha256=Dd5bs3O8SI-FlVSwHMYGeEjMmdOWeRlcfBHkhixCx7c,665
|
|
25
|
-
texttools/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
-
texttools/tools/async_tools.py,sha256=2suwx8N0aRnowaSOpV6C57AqPlmQe5Z0Yx4E5QIMkmU,46939
|
|
27
|
-
texttools/tools/sync_tools.py,sha256=mEuL-nlbxVW30dPE3hGkAUnYXbul-3gN2Le4CMVFCgU,42528
|
|
28
|
-
hamtaa_texttools-1.3.2.dist-info/METADATA,sha256=LjhXLwovneW5Ii1DvAYhFT4JR64ar23UyptCvCO6Hpc,7448
|
|
29
|
-
hamtaa_texttools-1.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
30
|
-
hamtaa_texttools-1.3.2.dist-info/top_level.txt,sha256=5Mh0jIxxZ5rOXHGJ6Mp-JPKviywwN0MYuH0xk5bEWqE,10
|
|
31
|
-
hamtaa_texttools-1.3.2.dist-info/RECORD,,
|
texttools/core/engine.py
DELETED
|
@@ -1,262 +0,0 @@
|
|
|
1
|
-
import asyncio
|
|
2
|
-
import math
|
|
3
|
-
import random
|
|
4
|
-
import re
|
|
5
|
-
from functools import lru_cache
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
from typing import Any
|
|
8
|
-
|
|
9
|
-
import yaml
|
|
10
|
-
|
|
11
|
-
from .exceptions import PromptError
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class PromptLoader:
|
|
15
|
-
"""
|
|
16
|
-
Utility for loading and formatting YAML prompt templates.
|
|
17
|
-
"""
|
|
18
|
-
|
|
19
|
-
MAIN_TEMPLATE = "main_template"
|
|
20
|
-
ANALYZE_TEMPLATE = "analyze_template"
|
|
21
|
-
|
|
22
|
-
@lru_cache(maxsize=32)
|
|
23
|
-
def _load_templates(self, prompt_file: str, mode: str | None) -> dict[str, str]:
|
|
24
|
-
try:
|
|
25
|
-
base_dir = Path(__file__).parent.parent / Path("prompts")
|
|
26
|
-
prompt_path = base_dir / prompt_file
|
|
27
|
-
|
|
28
|
-
if not prompt_path.exists():
|
|
29
|
-
raise PromptError(f"Prompt file not found: {prompt_file}")
|
|
30
|
-
|
|
31
|
-
data = yaml.safe_load(prompt_path.read_text(encoding="utf-8"))
|
|
32
|
-
|
|
33
|
-
if self.MAIN_TEMPLATE not in data:
|
|
34
|
-
raise PromptError(f"Missing 'main_template' in {prompt_file}")
|
|
35
|
-
|
|
36
|
-
if self.ANALYZE_TEMPLATE not in data:
|
|
37
|
-
raise PromptError(f"Missing 'analyze_template' in {prompt_file}")
|
|
38
|
-
|
|
39
|
-
if mode and mode not in data.get(self.MAIN_TEMPLATE, {}):
|
|
40
|
-
raise PromptError(f"Mode '{mode}' not found in {prompt_file}")
|
|
41
|
-
|
|
42
|
-
main_template = (
|
|
43
|
-
data[self.MAIN_TEMPLATE][mode]
|
|
44
|
-
if mode and isinstance(data[self.MAIN_TEMPLATE], dict)
|
|
45
|
-
else data[self.MAIN_TEMPLATE]
|
|
46
|
-
)
|
|
47
|
-
|
|
48
|
-
analyze_template = (
|
|
49
|
-
data[self.ANALYZE_TEMPLATE][mode]
|
|
50
|
-
if mode and isinstance(data[self.ANALYZE_TEMPLATE], dict)
|
|
51
|
-
else data[self.ANALYZE_TEMPLATE]
|
|
52
|
-
)
|
|
53
|
-
|
|
54
|
-
if not main_template or not main_template.strip():
|
|
55
|
-
raise PromptError(
|
|
56
|
-
f"Empty main_template in {prompt_file}"
|
|
57
|
-
+ (f" for mode '{mode}'" if mode else "")
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
return {
|
|
61
|
-
self.MAIN_TEMPLATE: main_template,
|
|
62
|
-
self.ANALYZE_TEMPLATE: analyze_template,
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
except yaml.YAMLError as e:
|
|
66
|
-
raise PromptError(f"Invalid YAML in {prompt_file}: {e}")
|
|
67
|
-
except Exception as e:
|
|
68
|
-
raise PromptError(f"Failed to load prompt {prompt_file}: {e}")
|
|
69
|
-
|
|
70
|
-
def load(
|
|
71
|
-
self, prompt_file: str, text: str, mode: str, **extra_kwargs
|
|
72
|
-
) -> dict[str, str]:
|
|
73
|
-
try:
|
|
74
|
-
format_args = {"text": text}
|
|
75
|
-
format_args.update(extra_kwargs)
|
|
76
|
-
|
|
77
|
-
template_configs = self._load_templates(prompt_file, mode)
|
|
78
|
-
for key, value in template_configs.items():
|
|
79
|
-
template_configs[key] = value.format(**format_args)
|
|
80
|
-
|
|
81
|
-
return template_configs
|
|
82
|
-
|
|
83
|
-
except KeyError as e:
|
|
84
|
-
raise PromptError(f"Missing template variable: {e}")
|
|
85
|
-
except Exception as e:
|
|
86
|
-
raise PromptError(f"Failed to format prompt: {e}")
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
class OperatorUtils:
|
|
90
|
-
@staticmethod
|
|
91
|
-
def build_main_prompt(
|
|
92
|
-
main_template: str,
|
|
93
|
-
analysis: str | None,
|
|
94
|
-
output_lang: str | None,
|
|
95
|
-
user_prompt: str | None,
|
|
96
|
-
) -> str:
|
|
97
|
-
parts = []
|
|
98
|
-
|
|
99
|
-
if analysis:
|
|
100
|
-
parts.append(f"Based on this analysis: {analysis}")
|
|
101
|
-
if output_lang:
|
|
102
|
-
parts.append(f"Respond only in the {output_lang} language.")
|
|
103
|
-
if user_prompt:
|
|
104
|
-
parts.append(f"Consider this instruction: {user_prompt}")
|
|
105
|
-
|
|
106
|
-
parts.append(main_template)
|
|
107
|
-
return "\n".join(parts)
|
|
108
|
-
|
|
109
|
-
@staticmethod
|
|
110
|
-
def build_message(prompt: str) -> list[dict[str, str]]:
|
|
111
|
-
return [{"role": "user", "content": prompt}]
|
|
112
|
-
|
|
113
|
-
@staticmethod
|
|
114
|
-
def extract_logprobs(completion: Any) -> list[dict]:
|
|
115
|
-
"""
|
|
116
|
-
Extracts and filters logprobs from completion.
|
|
117
|
-
Skips punctuation and structural tokens.
|
|
118
|
-
"""
|
|
119
|
-
logprobs_data = []
|
|
120
|
-
|
|
121
|
-
ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
|
|
122
|
-
|
|
123
|
-
for choice in completion.choices:
|
|
124
|
-
if not getattr(choice, "logprobs", None):
|
|
125
|
-
raise ValueError("Your model does not support logprobs")
|
|
126
|
-
|
|
127
|
-
for logprob_item in choice.logprobs.content:
|
|
128
|
-
if ignore_pattern.match(logprob_item.token):
|
|
129
|
-
continue
|
|
130
|
-
token_entry = {
|
|
131
|
-
"token": logprob_item.token,
|
|
132
|
-
"prob": round(math.exp(logprob_item.logprob), 8),
|
|
133
|
-
"top_alternatives": [],
|
|
134
|
-
}
|
|
135
|
-
for alt in logprob_item.top_logprobs:
|
|
136
|
-
if ignore_pattern.match(alt.token):
|
|
137
|
-
continue
|
|
138
|
-
token_entry["top_alternatives"].append(
|
|
139
|
-
{
|
|
140
|
-
"token": alt.token,
|
|
141
|
-
"prob": round(math.exp(alt.logprob), 8),
|
|
142
|
-
}
|
|
143
|
-
)
|
|
144
|
-
logprobs_data.append(token_entry)
|
|
145
|
-
|
|
146
|
-
return logprobs_data
|
|
147
|
-
|
|
148
|
-
@staticmethod
|
|
149
|
-
def get_retry_temp(base_temp: float) -> float:
|
|
150
|
-
new_temp = base_temp + random.choice([-1, 1]) * random.uniform(0.1, 0.9)
|
|
151
|
-
return max(0.0, min(new_temp, 1.5))
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
def text_to_chunks(text: str, size: int, overlap: int) -> list[str]:
|
|
155
|
-
"""
|
|
156
|
-
Utility for chunking large texts. Used for translation tool
|
|
157
|
-
"""
|
|
158
|
-
separators = ["\n\n", "\n", " ", ""]
|
|
159
|
-
is_separator_regex = False
|
|
160
|
-
keep_separator = True
|
|
161
|
-
length_function = len
|
|
162
|
-
strip_whitespace = True
|
|
163
|
-
chunk_size = size
|
|
164
|
-
chunk_overlap = overlap
|
|
165
|
-
|
|
166
|
-
def _split_text_with_regex(
|
|
167
|
-
text: str, separator: str, keep_separator: bool
|
|
168
|
-
) -> list[str]:
|
|
169
|
-
if not separator:
|
|
170
|
-
return [text]
|
|
171
|
-
if not keep_separator:
|
|
172
|
-
return re.split(separator, text)
|
|
173
|
-
_splits = re.split(f"({separator})", text)
|
|
174
|
-
splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
|
|
175
|
-
if len(_splits) % 2 == 0:
|
|
176
|
-
splits += [_splits[-1]]
|
|
177
|
-
return [_splits[0]] + splits if _splits[0] else splits
|
|
178
|
-
|
|
179
|
-
def _join_docs(docs: list[str], separator: str) -> str | None:
|
|
180
|
-
text = separator.join(docs)
|
|
181
|
-
if strip_whitespace:
|
|
182
|
-
text = text.strip()
|
|
183
|
-
return text if text else None
|
|
184
|
-
|
|
185
|
-
def _merge_splits(splits: list[str], separator: str) -> list[str]:
|
|
186
|
-
separator_len = length_function(separator)
|
|
187
|
-
docs = []
|
|
188
|
-
current_doc = []
|
|
189
|
-
total = 0
|
|
190
|
-
for d in splits:
|
|
191
|
-
len_ = length_function(d)
|
|
192
|
-
if total + len_ + (separator_len if current_doc else 0) > chunk_size:
|
|
193
|
-
if total > chunk_size:
|
|
194
|
-
pass
|
|
195
|
-
if current_doc:
|
|
196
|
-
doc = _join_docs(current_doc, separator)
|
|
197
|
-
if doc is not None:
|
|
198
|
-
docs.append(doc)
|
|
199
|
-
while total > chunk_overlap or (
|
|
200
|
-
total + len_ + (separator_len if current_doc else 0)
|
|
201
|
-
> chunk_size
|
|
202
|
-
and total > 0
|
|
203
|
-
):
|
|
204
|
-
total -= length_function(current_doc[0]) + (
|
|
205
|
-
separator_len if len(current_doc) > 1 else 0
|
|
206
|
-
)
|
|
207
|
-
current_doc = current_doc[1:]
|
|
208
|
-
current_doc.append(d)
|
|
209
|
-
total += len_ + (separator_len if len(current_doc) > 1 else 0)
|
|
210
|
-
doc = _join_docs(current_doc, separator)
|
|
211
|
-
if doc is not None:
|
|
212
|
-
docs.append(doc)
|
|
213
|
-
return docs
|
|
214
|
-
|
|
215
|
-
def _split_text(text: str, separators: list[str]) -> list[str]:
|
|
216
|
-
final_chunks = []
|
|
217
|
-
separator = separators[-1]
|
|
218
|
-
new_separators = []
|
|
219
|
-
for i, _s in enumerate(separators):
|
|
220
|
-
separator_ = _s if is_separator_regex else re.escape(_s)
|
|
221
|
-
if not _s:
|
|
222
|
-
separator = _s
|
|
223
|
-
break
|
|
224
|
-
if re.search(separator_, text):
|
|
225
|
-
separator = _s
|
|
226
|
-
new_separators = separators[i + 1 :]
|
|
227
|
-
break
|
|
228
|
-
separator_ = separator if is_separator_regex else re.escape(separator)
|
|
229
|
-
splits = _split_text_with_regex(text, separator_, keep_separator)
|
|
230
|
-
_separator = "" if keep_separator else separator
|
|
231
|
-
good_splits = []
|
|
232
|
-
for s in splits:
|
|
233
|
-
if length_function(s) < chunk_size:
|
|
234
|
-
good_splits.append(s)
|
|
235
|
-
else:
|
|
236
|
-
if good_splits:
|
|
237
|
-
merged_text = _merge_splits(good_splits, _separator)
|
|
238
|
-
final_chunks.extend(merged_text)
|
|
239
|
-
good_splits = []
|
|
240
|
-
if not new_separators:
|
|
241
|
-
final_chunks.append(s)
|
|
242
|
-
else:
|
|
243
|
-
other_info = _split_text(s, new_separators)
|
|
244
|
-
final_chunks.extend(other_info)
|
|
245
|
-
if good_splits:
|
|
246
|
-
merged_text = _merge_splits(good_splits, _separator)
|
|
247
|
-
final_chunks.extend(merged_text)
|
|
248
|
-
return final_chunks
|
|
249
|
-
|
|
250
|
-
return _split_text(text, separators)
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
async def run_with_timeout(coro, timeout: float | None):
|
|
254
|
-
"""
|
|
255
|
-
Utility for timeout logic defined in AsyncTheTool
|
|
256
|
-
"""
|
|
257
|
-
if timeout is None:
|
|
258
|
-
return await coro
|
|
259
|
-
try:
|
|
260
|
-
return await asyncio.wait_for(coro, timeout=timeout)
|
|
261
|
-
except asyncio.TimeoutError:
|
|
262
|
-
raise TimeoutError(f"Operation exceeded timeout of {timeout} seconds")
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
main_template: |
|
|
2
|
-
You are a question from subject generator.
|
|
3
|
-
Given the following subject, generate {number_of_questions} appropriate questions that this subject would directly respond to.
|
|
4
|
-
The generated subject should be independently meaningful,
|
|
5
|
-
and it must not mention any verbs like, this, that, he or she and etc. in the question.
|
|
6
|
-
|
|
7
|
-
There is a `reason` key, fill that up with a summerized version of your thoughts.
|
|
8
|
-
The `reason` must be less than 20 words.
|
|
9
|
-
Don't forget to fill the reason.
|
|
10
|
-
|
|
11
|
-
Respond only in JSON format:
|
|
12
|
-
{{"result": ["question1", "question2", ...], "reason": "string"}}
|
|
13
|
-
|
|
14
|
-
Here is the subject:
|
|
15
|
-
{text}
|
|
16
|
-
|
|
17
|
-
analyze_template: |
|
|
18
|
-
Our goal is to generate questions from the given subject.
|
|
19
|
-
The questions must be meaningfull, some of them should be specific and some should be general.
|
|
20
|
-
But first, in this step we want to analyze the subject that I asked to generate questions for it.
|
|
21
|
-
We need a summerized analysis of the subject.
|
|
22
|
-
What is the subject about?
|
|
23
|
-
What point of views can we see and generate questoins from it? (Questions that real users might have.)
|
|
24
|
-
|
|
25
|
-
Here is the subject:
|
|
26
|
-
{text}
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
main_template: |
|
|
2
|
-
You are a question generator.
|
|
3
|
-
Given the following answer, generate {number_of_questions} appropriate questions that this answer would directly respond to.
|
|
4
|
-
The generated answer should be independently meaningful,
|
|
5
|
-
and not mentioning any verbs like, this, that, he or she on the question.
|
|
6
|
-
|
|
7
|
-
There is a `reason` key, fill that up with a summerized version of your thoughts.
|
|
8
|
-
The `reason` must be less than 20 words.
|
|
9
|
-
Don't forget to fill the reason.
|
|
10
|
-
|
|
11
|
-
Respond only in JSON format:
|
|
12
|
-
{{"result": ["question1", "question2", ...], "reason": "string"}}
|
|
13
|
-
|
|
14
|
-
Here is the answer:
|
|
15
|
-
{text}
|
|
16
|
-
|
|
17
|
-
analyze_template: |
|
|
18
|
-
Analyze the following answer to identify its key facts,
|
|
19
|
-
main subject, and what kind of information it provides.
|
|
20
|
-
Provide a brief, summarized understanding of the answer's content that will
|
|
21
|
-
help in formulating relevant and direct questions.
|
|
22
|
-
Just mention the keypoints that was provided in the answer
|
|
23
|
-
|
|
24
|
-
Here is the answer:
|
|
25
|
-
{text}
|
|
26
|
-
|
|
File without changes
|