hamtaa-texttools 1.3.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. {hamtaa_texttools-1.3.1.dist-info → hamtaa_texttools-2.0.0.dist-info}/METADATA +42 -48
  2. hamtaa_texttools-2.0.0.dist-info/RECORD +30 -0
  3. {hamtaa_texttools-1.3.1.dist-info → hamtaa_texttools-2.0.0.dist-info}/WHEEL +1 -1
  4. {hamtaa_texttools-1.3.1.dist-info → hamtaa_texttools-2.0.0.dist-info}/licenses/LICENSE +1 -1
  5. texttools/__init__.py +1 -1
  6. texttools/core/internal_models.py +21 -8
  7. texttools/core/operators/__init__.py +0 -0
  8. texttools/core/operators/async_operator.py +11 -19
  9. texttools/core/operators/sync_operator.py +11 -19
  10. texttools/core/utils.py +260 -0
  11. texttools/models.py +77 -22
  12. texttools/prompts/{rewrite.yaml → augment.yaml} +3 -3
  13. texttools/prompts/categorize.yaml +7 -8
  14. texttools/prompts/extract_entities.yaml +2 -2
  15. texttools/prompts/extract_keywords.yaml +4 -2
  16. texttools/prompts/{check_fact.yaml → is_fact.yaml} +5 -4
  17. texttools/prompts/is_question.yaml +1 -1
  18. texttools/prompts/merge_questions.yaml +8 -6
  19. texttools/prompts/propositionize.yaml +11 -7
  20. texttools/prompts/run_custom.yaml +3 -1
  21. texttools/prompts/summarize.yaml +3 -3
  22. texttools/prompts/to_question.yaml +60 -0
  23. texttools/prompts/translate.yaml +4 -4
  24. texttools/tools/async_tools.py +90 -169
  25. texttools/tools/sync_tools.py +76 -150
  26. hamtaa_texttools-1.3.1.dist-info/RECORD +0 -30
  27. texttools/core/engine.py +0 -264
  28. texttools/prompts/subject_to_question.yaml +0 -26
  29. texttools/prompts/text_to_question.yaml +0 -26
  30. {hamtaa_texttools-1.3.1.dist-info → hamtaa_texttools-2.0.0.dist-info}/top_level.txt +0 -0
texttools/core/engine.py DELETED
@@ -1,264 +0,0 @@
1
- import asyncio
2
- import math
3
- import random
4
- import re
5
- from functools import lru_cache
6
- from pathlib import Path
7
-
8
- import yaml
9
-
10
- from .exceptions import PromptError
11
-
12
-
13
class PromptLoader:
    """
    Utility for loading and formatting YAML prompt templates.

    Templates live under ``<package>/prompts/*.yaml``. Each file must define
    a ``main_template`` and an ``analyze_template``; either may be a plain
    string or a mapping of mode name -> template string.
    """

    MAIN_TEMPLATE = "main_template"
    ANALYZE_TEMPLATE = "analyze_template"

    @staticmethod
    @lru_cache(maxsize=32)
    def _load_templates(prompt_file: str, mode: str | None) -> dict[str, str]:
        """
        Load prompt templates from a YAML file with optional mode selection.

        Declared as a staticmethod so the lru_cache keys only on
        (prompt_file, mode) instead of also keying on — and keeping alive —
        PromptLoader instances (flake8-bugbear B019).

        Raises:
            PromptError: on a missing file, invalid YAML, missing template
                keys, an unknown mode, or an empty main template.
        """
        try:
            base_dir = Path(__file__).parent.parent / Path("prompts")
            prompt_path = base_dir / prompt_file

            if not prompt_path.exists():
                raise PromptError(f"Prompt file not found: {prompt_file}")

            data = yaml.safe_load(prompt_path.read_text(encoding="utf-8"))

            # An empty or scalar YAML document cannot contain templates.
            if not isinstance(data, dict):
                raise PromptError(f"Invalid prompt file structure in {prompt_file}")

            if PromptLoader.MAIN_TEMPLATE not in data:
                raise PromptError(f"Missing 'main_template' in {prompt_file}")

            if PromptLoader.ANALYZE_TEMPLATE not in data:
                raise PromptError(f"Missing 'analyze_template' in {prompt_file}")

            main_section = data[PromptLoader.MAIN_TEMPLATE]
            analyze_section = data[PromptLoader.ANALYZE_TEMPLATE]

            # Only validate the mode against mapping-style sections. The old
            # check (`mode not in data.get(MAIN_TEMPLATE, {})`) degenerated to
            # a substring test when the section was a plain string.
            if mode and isinstance(main_section, dict) and mode not in main_section:
                raise PromptError(f"Mode '{mode}' not found in {prompt_file}")

            main_template = (
                main_section[mode]
                if mode and isinstance(main_section, dict)
                else main_section
            )

            analyze_template = (
                analyze_section[mode]
                if mode and isinstance(analyze_section, dict)
                else analyze_section
            )

            if not main_template or not main_template.strip():
                raise PromptError(
                    f"Empty main_template in {prompt_file}"
                    + (f" for mode '{mode}'" if mode else "")
                )

            return {
                PromptLoader.MAIN_TEMPLATE: main_template,
                PromptLoader.ANALYZE_TEMPLATE: analyze_template,
            }

        except PromptError:
            # Domain errors raised above are already descriptive; re-raise
            # as-is instead of letting the generic handler double-wrap them.
            raise
        except yaml.YAMLError as e:
            raise PromptError(f"Invalid YAML in {prompt_file}: {e}") from e
        except Exception as e:
            raise PromptError(f"Failed to load prompt {prompt_file}: {e}") from e

    def load(
        self, prompt_file: str, text: str, mode: str, **extra_kwargs
    ) -> dict[str, str]:
        """
        Load the templates for *prompt_file* and substitute ``{text}`` plus
        any extra keyword variables into each of them.

        Raises:
            PromptError: if a template variable is missing or formatting fails.
        """
        try:
            # Copy the cached dict before mutating it: lru_cache returns the
            # same object on every hit, so formatting in place would corrupt
            # the cache and make the second call for the same (file, mode)
            # format an already-formatted template.
            template_configs = dict(self._load_templates(prompt_file, mode))
            format_args = {"text": text}
            format_args.update(extra_kwargs)

            # Inject variables inside each template
            for key in template_configs.keys():
                template_configs[key] = template_configs[key].format(**format_args)

            return template_configs

        except PromptError:
            raise
        except KeyError as e:
            raise PromptError(f"Missing template variable: {e}") from e
        except Exception as e:
            raise PromptError(f"Failed to format prompt: {e}") from e
90
-
91
-
92
- class OperatorUtils:
93
- @staticmethod
94
- def build_main_prompt(
95
- main_template: str,
96
- analysis: str | None,
97
- output_lang: str | None,
98
- user_prompt: str | None,
99
- ) -> str:
100
- main_prompt = ""
101
-
102
- if analysis:
103
- main_prompt += f"Based on this analysis:\n{analysis}\n"
104
-
105
- if output_lang:
106
- main_prompt += f"Respond only in the {output_lang} language.\n"
107
-
108
- if user_prompt:
109
- main_prompt += f"Consider this instruction {user_prompt}\n"
110
-
111
- main_prompt += main_template
112
-
113
- return main_prompt
114
-
115
- @staticmethod
116
- def build_message(prompt: str) -> list[dict[str, str]]:
117
- return [{"role": "user", "content": prompt}]
118
-
119
- @staticmethod
120
- def extract_logprobs(completion: dict) -> list[dict]:
121
- """
122
- Extracts and filters token probabilities from completion logprobs.
123
- Skips punctuation and structural tokens, returns cleaned probability data.
124
- """
125
- logprobs_data = []
126
-
127
- ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
128
-
129
- for choice in completion.choices:
130
- if not getattr(choice, "logprobs", None):
131
- raise ValueError("Your model does not support logprobs")
132
-
133
- for logprob_item in choice.logprobs.content:
134
- if ignore_pattern.match(logprob_item.token):
135
- continue
136
- token_entry = {
137
- "token": logprob_item.token,
138
- "prob": round(math.exp(logprob_item.logprob), 8),
139
- "top_alternatives": [],
140
- }
141
- for alt in logprob_item.top_logprobs:
142
- if ignore_pattern.match(alt.token):
143
- continue
144
- token_entry["top_alternatives"].append(
145
- {
146
- "token": alt.token,
147
- "prob": round(math.exp(alt.logprob), 8),
148
- }
149
- )
150
- logprobs_data.append(token_entry)
151
-
152
- return logprobs_data
153
-
154
- @staticmethod
155
- def get_retry_temp(base_temp: float) -> float:
156
- delta_temp = random.choice([-1, 1]) * random.uniform(0.1, 0.9)
157
- new_temp = base_temp + delta_temp
158
-
159
- return max(0.0, min(new_temp, 1.5))
160
-
161
-
162
def text_to_chunks(text: str, size: int, overlap: int) -> list[str]:
    """
    Recursively split *text* into chunks of at most *size* characters with
    roughly *overlap* characters of overlap between consecutive chunks.

    Splits on progressively finer separators (paragraph, line, word,
    character) and merges the pieces back together under the size budget.
    NOTE(review): this appears to be a port of LangChain's
    RecursiveCharacterTextSplitter with keep_separator="start" — confirm
    against the upstream implementation before altering the accounting below.
    """
    # Separator priority: paragraph break, newline, space, then
    # character-by-character ("") as the last resort.
    separators = ["\n\n", "\n", " ", ""]
    is_separator_regex = False
    keep_separator = True  # Equivalent to 'start'
    length_function = len
    strip_whitespace = True
    chunk_size = size
    chunk_overlap = overlap

    def _split_text_with_regex(
        text: str, separator: str, keep_separator: bool
    ) -> list[str]:
        # Split on the separator; when keep_separator is true, re-attach each
        # separator to the *start* of the following fragment.
        if not separator:
            return [text]
        if not keep_separator:
            return re.split(separator, text)
        # Capturing group makes re.split interleave separators with fragments.
        _splits = re.split(f"({separator})", text)
        splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
        if len(_splits) % 2 == 0:
            splits += [_splits[-1]]
        return [_splits[0]] + splits if _splits[0] else splits

    def _join_docs(docs: list[str], separator: str) -> str | None:
        # Join fragments back into one chunk; None signals "nothing left"
        # after whitespace stripping, so the caller can drop it.
        text = separator.join(docs)
        if strip_whitespace:
            text = text.strip()
        return text if text else None

    def _merge_splits(splits: list[str], separator: str) -> list[str]:
        # Greedily pack small fragments into chunks <= chunk_size, keeping a
        # sliding window of trailing fragments to realize chunk_overlap.
        separator_len = length_function(separator)
        docs = []
        current_doc = []
        total = 0
        for d in splits:
            len_ = length_function(d)
            if total + len_ + (separator_len if current_doc else 0) > chunk_size:
                if total > chunk_size:
                    # A single fragment exceeded chunk_size; the upstream
                    # implementation logs a warning here — intentionally a no-op.
                    pass
                if current_doc:
                    doc = _join_docs(current_doc, separator)
                    if doc is not None:
                        docs.append(doc)
                    # Evict leading fragments until the retained tail fits the
                    # overlap budget and leaves room for the incoming fragment.
                    while total > chunk_overlap or (
                        total + len_ + (separator_len if current_doc else 0)
                        > chunk_size
                        and total > 0
                    ):
                        total -= length_function(current_doc[0]) + (
                            separator_len if len(current_doc) > 1 else 0
                        )
                        current_doc = current_doc[1:]
            current_doc.append(d)
            total += len_ + (separator_len if len(current_doc) > 1 else 0)
        # Flush whatever remains in the window as the final chunk.
        doc = _join_docs(current_doc, separator)
        if doc is not None:
            docs.append(doc)
        return docs

    def _split_text(text: str, separators: list[str]) -> list[str]:
        # Recursive driver: pick the coarsest separator present in the text,
        # split on it, and recurse with finer separators on oversized pieces.
        final_chunks = []
        separator = separators[-1]
        new_separators = []
        for i, _s in enumerate(separators):
            separator_ = _s if is_separator_regex else re.escape(_s)
            if not _s:
                # "" always matches — character-level split, no finer level.
                separator = _s
                break
            if re.search(separator_, text):
                separator = _s
                new_separators = separators[i + 1 :]
                break
        separator_ = separator if is_separator_regex else re.escape(separator)
        splits = _split_text_with_regex(text, separator_, keep_separator)
        # Separators are already attached to fragments when kept, so merge
        # with an empty joiner in that case.
        _separator = "" if keep_separator else separator
        good_splits = []
        for s in splits:
            if length_function(s) < chunk_size:
                good_splits.append(s)
            else:
                # Flush the accumulated small fragments before handling the
                # oversized one, preserving output order.
                if good_splits:
                    merged_text = _merge_splits(good_splits, _separator)
                    final_chunks.extend(merged_text)
                    good_splits = []
                if not new_separators:
                    # No finer separator available: emit oversized as-is.
                    final_chunks.append(s)
                else:
                    other_info = _split_text(s, new_separators)
                    final_chunks.extend(other_info)
        if good_splits:
            merged_text = _merge_splits(good_splits, _separator)
            final_chunks.extend(merged_text)
        return final_chunks

    return _split_text(text, separators)
256
-
257
-
258
- async def run_with_timeout(coro, timeout: float | None):
259
- if timeout is None:
260
- return await coro
261
- try:
262
- return await asyncio.wait_for(coro, timeout=timeout)
263
- except asyncio.TimeoutError:
264
- raise TimeoutError(f"Operation exceeded timeout of {timeout} seconds")
@@ -1,26 +0,0 @@
1
- main_template: |
2
- You are a question from subject generator.
3
- Given the following subject, generate {number_of_questions} appropriate questions that this subject would directly respond to.
4
- The generated subject should be independently meaningful,
5
- and it must not mention any pronouns like this, that, he or she, etc. in the question.
6
-
7
- There is a `reason` key, fill that up with a summarized version of your thoughts.
8
- The `reason` must be less than 20 words.
9
- Don't forget to fill the reason.
10
-
11
- Respond only in JSON format:
12
- {{"result": ["question1", "question2", ...], "reason": "string"}}
13
-
14
- Here is the subject:
15
- {text}
16
-
17
- analyze_template: |
18
- Our goal is to generate questions from the given subject.
19
- The questions must be meaningful, some of them should be specific and some should be general.
20
- But first, in this step we want to analyze the subject that I asked to generate questions for it.
21
- We need a summarized analysis of the subject.
22
- What is the subject about?
23
- What points of view can we see and generate questions from? (Questions that real users might have.)
24
-
25
- Here is the subject:
26
- {text}
@@ -1,26 +0,0 @@
1
- main_template: |
2
- You are a question generator.
3
- Given the following answer, generate {number_of_questions} appropriate questions that this answer would directly respond to.
4
- The generated answer should be independently meaningful,
5
- and not mention any pronouns like this, that, he or she in the question.
6
-
7
- There is a `reason` key, fill that up with a summarized version of your thoughts.
8
- The `reason` must be less than 20 words.
9
- Don't forget to fill the reason.
10
-
11
- Respond only in JSON format:
12
- {{"result": ["question1", "question2", ...], "reason": "string"}}
13
-
14
- Here is the answer:
15
- {text}
16
-
17
- analyze_template: |
18
- Analyze the following answer to identify its key facts,
19
- main subject, and what kind of information it provides.
20
- Provide a brief, summarized understanding of the answer's content that will
21
- help in formulating relevant and direct questions.
22
- Just mention the key points that were provided in the answer
23
-
24
- Here is the answer:
25
- {text}
26
-