hamtaa-texttools 1.3.2__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,260 @@
+ import asyncio
+ import math
+ import random
+ import re
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Any
+
+ import yaml
+
+ from .exceptions import PromptError
+
+
+ class OperatorUtils:
+     """
+     Collection of utilities used in operators
+     """
+
+     @staticmethod
+     @lru_cache(maxsize=32)
+     def _load_prompt_yaml(prompt_file: str) -> dict:
+         base_dir = Path(__file__).parent.parent / "prompts"
+         prompt_path = base_dir / prompt_file
+
+         if not prompt_path.exists():
+             raise PromptError(f"Prompt file not found: {prompt_file}")
+
+         try:
+             return yaml.safe_load(prompt_path.read_text(encoding="utf-8"))
+         except yaml.YAMLError as e:
+             raise PromptError(f"Invalid YAML in {prompt_file}: {e}")
+
+     @staticmethod
+     def load_prompt(
+         prompt_file: str, text: str, mode: str, **extra_kwargs
+     ) -> dict[str, str]:
+         try:
+             data = OperatorUtils._load_prompt_yaml(prompt_file)
+
+             if "main_template" not in data:
+                 raise PromptError(f"Missing 'main_template' in {prompt_file}")
+
+             if "analyze_template" not in data:
+                 raise PromptError(f"Missing 'analyze_template' in {prompt_file}")
+
+             if mode and mode not in data.get("main_template", {}):
+                 raise PromptError(f"Mode '{mode}' not found in {prompt_file}")
+
+             main_template = (
+                 data["main_template"][mode]
+                 if mode and isinstance(data["main_template"], dict)
+                 else data["main_template"]
+             )
+
+             analyze_template = (
+                 data["analyze_template"][mode]
+                 if mode and isinstance(data["analyze_template"], dict)
+                 else data["analyze_template"]
+             )
+
+             if not main_template or not main_template.strip():
+                 raise PromptError(
+                     f"Empty main_template in {prompt_file}"
+                     + (f" for mode '{mode}'" if mode else "")
+                 )
+
+             template_configs = {
+                 "main_template": main_template,
+                 "analyze_template": analyze_template,
+             }
+
+             format_args = {"text": text}
+             format_args.update(extra_kwargs)
+
+             # Inject variables into the templates
+             for key, value in template_configs.items():
+                 template_configs[key] = value.format(**format_args)
+
+             return template_configs
+
+         except yaml.YAMLError as e:
+             raise PromptError(f"Invalid YAML in {prompt_file}: {e}")
+         except KeyError as e:
+             raise PromptError(f"Missing template variable: {e}")
+         except Exception as e:
+             raise PromptError(f"Failed to load prompt {prompt_file}: {e}")
+
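For orientation, a minimal usage sketch of the loader follows. The module path and the YAML file name are assumptions for illustration; neither is named in this diff. Placeholders beyond {text} (e.g. {category_list}) are supplied through **extra_kwargs and injected with str.format:

    # Hypothetical usage; module path and file name are assumptions.
    from texttools.utils import OperatorUtils

    templates = OperatorUtils.load_prompt(
        "summarize.yaml",   # assumed name, resolved against <package>/prompts/
        text="Text to summarize.",
        mode="",            # falsy mode: un-keyed templates are used as-is
    )
    print(templates["main_template"])    # formatted prompt body
    print(templates["analyze_template"])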
+     @staticmethod
+     def build_main_prompt(
+         main_template: str,
+         analysis: str | None,
+         output_lang: str | None,
+         user_prompt: str | None,
+     ) -> str:
+         parts = []
+
+         if analysis:
+             parts.append(f"Based on this analysis: {analysis}")
+         if output_lang:
+             parts.append(f"Respond only in the {output_lang} language.")
+         if user_prompt:
+             parts.append(f"Consider this instruction: {user_prompt}")
+
+         parts.append(main_template)
+         return "\n".join(parts)
+
+     @staticmethod
+     def build_message(prompt: str) -> list[dict[str, str]]:
+         return [{"role": "user", "content": prompt}]
+
+ @staticmethod
112
+ def extract_logprobs(completion: Any) -> list[dict]:
113
+ """
114
+ Extracts and filters logprobs from completion.
115
+ Skips punctuation and structural tokens.
116
+ """
117
+ logprobs_data = []
118
+
119
+ ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
120
+
121
+ for choice in completion.choices:
122
+ if not getattr(choice, "logprobs", None):
123
+ raise ValueError("Your model does not support logprobs")
124
+
125
+ for logprob_item in choice.logprobs.content:
126
+ if ignore_pattern.match(logprob_item.token):
127
+ continue
128
+ token_entry = {
129
+ "token": logprob_item.token,
130
+ "prob": round(math.exp(logprob_item.logprob), 8),
131
+ "top_alternatives": [],
132
+ }
133
+ for alt in logprob_item.top_logprobs:
134
+ if ignore_pattern.match(alt.token):
135
+ continue
136
+ token_entry["top_alternatives"].append(
137
+ {
138
+ "token": alt.token,
139
+ "prob": round(math.exp(alt.logprob), 8),
140
+ }
141
+ )
142
+ logprobs_data.append(token_entry)
143
+
144
+ return logprobs_data
145
+
146
+ @staticmethod
147
+ def get_retry_temp(base_temp: float) -> float:
148
+ new_temp = base_temp + random.choice([-1, 1]) * random.uniform(0.1, 0.9)
149
+ return max(0.0, min(new_temp, 1.5))
150
+
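extract_logprobs assumes an OpenAI-style chat completion with logprobs enabled, i.e. choices[i].logprobs.content[j] objects exposing token, logprob, and top_logprobs. A self-contained sketch with a stand-in object of that shape:

    from types import SimpleNamespace

    # Stand-in for a completion; the attribute shape is an assumption
    # matching OpenAI-style responses with logprobs enabled.
    item = SimpleNamespace(
        token="cat",
        logprob=-0.105,
        top_logprobs=[SimpleNamespace(token="dog", logprob=-2.3)],
    )
    completion = SimpleNamespace(
        choices=[SimpleNamespace(logprobs=SimpleNamespace(content=[item]))]
    )
    print(OperatorUtils.extract_logprobs(completion))
    # [{'token': 'cat', 'prob': ~0.9003, 'top_alternatives':
    #   [{'token': 'dog', 'prob': ~0.1003}]}]

get_retry_temp pairs with retry loops: it jitters the base temperature by a random offset between 0.1 and 0.9 in either direction and clamps the result into [0.0, 1.5].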
+
+ class TheToolUtils:
+     """
+     Collection of utilities used in TheTool's tools
+     """
+
+     @staticmethod
+     def to_chunks(text: str, size: int, overlap: int) -> list[str]:
+         separators = ["\n\n", "\n", " ", ""]
+         is_separator_regex = False
+         keep_separator = True
+         length_function = len
+         strip_whitespace = True
+         chunk_size = size
+         chunk_overlap = overlap
+
+         def _split_text_with_regex(
+             text: str, separator: str, keep_separator: bool
+         ) -> list[str]:
+             if not separator:
+                 return [text]
+             if not keep_separator:
+                 return re.split(separator, text)
+             _splits = re.split(f"({separator})", text)
+             splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
+             if len(_splits) % 2 == 0:
+                 splits += [_splits[-1]]
+             return [_splits[0]] + splits if _splits[0] else splits
+
+         def _join_docs(docs: list[str], separator: str) -> str | None:
+             text = separator.join(docs)
+             if strip_whitespace:
+                 text = text.strip()
+             return text if text else None
+
+         def _merge_splits(splits: list[str], separator: str) -> list[str]:
+             separator_len = length_function(separator)
+             docs = []
+             current_doc = []
+             total = 0
+             for d in splits:
+                 len_ = length_function(d)
+                 if total + len_ + (separator_len if current_doc else 0) > chunk_size:
+                     if total > chunk_size:
+                         pass
+                     if current_doc:
+                         doc = _join_docs(current_doc, separator)
+                         if doc is not None:
+                             docs.append(doc)
+                         while total > chunk_overlap or (
+                             total + len_ + (separator_len if current_doc else 0)
+                             > chunk_size
+                             and total > 0
+                         ):
+                             total -= length_function(current_doc[0]) + (
+                                 separator_len if len(current_doc) > 1 else 0
+                             )
+                             current_doc = current_doc[1:]
+                 current_doc.append(d)
+                 total += len_ + (separator_len if len(current_doc) > 1 else 0)
+             doc = _join_docs(current_doc, separator)
+             if doc is not None:
+                 docs.append(doc)
+             return docs
+
+         def _split_text(text: str, separators: list[str]) -> list[str]:
+             final_chunks = []
+             separator = separators[-1]
+             new_separators = []
+             for i, _s in enumerate(separators):
+                 separator_ = _s if is_separator_regex else re.escape(_s)
+                 if not _s:
+                     separator = _s
+                     break
+                 if re.search(separator_, text):
+                     separator = _s
+                     new_separators = separators[i + 1 :]
+                     break
+             separator_ = separator if is_separator_regex else re.escape(separator)
+             splits = _split_text_with_regex(text, separator_, keep_separator)
+             _separator = "" if keep_separator else separator
+             good_splits = []
+             for s in splits:
+                 if length_function(s) < chunk_size:
+                     good_splits.append(s)
+                 else:
+                     if good_splits:
+                         merged_text = _merge_splits(good_splits, _separator)
+                         final_chunks.extend(merged_text)
+                         good_splits = []
+                     if not new_separators:
+                         final_chunks.append(s)
+                     else:
+                         other_info = _split_text(s, new_separators)
+                         final_chunks.extend(other_info)
+             if good_splits:
+                 merged_text = _merge_splits(good_splits, _separator)
+                 final_chunks.extend(merged_text)
+             return final_chunks
+
+         return _split_text(text, separators)
+
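to_chunks mirrors the recursive character-splitting strategy familiar from LangChain's RecursiveCharacterTextSplitter: walk the separators from coarsest ("\n\n") to finest (""), split on the first one present, recurse into oversized pieces, then merge neighbors into windows of at most `size` characters while carrying roughly `overlap` characters between windows. A quick sketch:

    text = "\n\n".join(
        f"Paragraph {i}: " + "lorem ipsum dolor sit amet " * 10 for i in range(4)
    )
    chunks = TheToolUtils.to_chunks(text, size=200, overlap=40)
    print(len(chunks), [len(c) for c in chunks])  # several chunks, each <= 200 here

One caveat visible in the code: the empty-string separator is a last resort, and _split_text_with_regex returns the text unsplit for "", so an unbreakable run longer than `size` is emitted as one oversized chunk rather than cut mid-character.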
+     @staticmethod
+     async def run_with_timeout(coro: Any, timeout: float | None) -> Any:
+         if timeout is None:
+             return await coro
+         try:
+             return await asyncio.wait_for(coro, timeout=timeout)
+         except asyncio.TimeoutError:
+             raise TimeoutError(f"Operation exceeded timeout of {timeout} seconds")
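The timeout wrapper leans on asyncio.wait_for, which cancels the awaited coroutine when the deadline passes; the helper re-raises that as the built-in TimeoutError. A minimal sketch:

    import asyncio

    async def slow() -> str:
        await asyncio.sleep(2)
        return "done"

    async def main() -> None:
        print(await TheToolUtils.run_with_timeout(slow(), timeout=None))  # no deadline
        try:
            await TheToolUtils.run_with_timeout(slow(), timeout=0.5)
        except TimeoutError as e:
            print(e)  # Operation exceeded timeout of 0.5 seconds

    asyncio.run(main())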
texttools/models.py CHANGED
@@ -3,12 +3,12 @@ from __future__ import annotations
  from datetime import datetime
  from typing import Any
 
- from pydantic import BaseModel
+ from pydantic import BaseModel, Field
 
 
  class ToolOutputMetadata(BaseModel):
      tool_name: str
-     processed_at: datetime = datetime.now()
+     processed_at: datetime = Field(default_factory=datetime.now)
      execution_time: float | None = None
 
 
@@ -19,22 +19,26 @@ class ToolOutput(BaseModel):
      errors: list[str] = []
      metadata: ToolOutputMetadata | None = None
 
-     def __repr__(self) -> str:
-         return f"ToolOutput({self.model_dump_json(indent=2)})"
+     def is_successful(self) -> bool:
+         return not self.errors and self.result is not None
 
+     def to_dict(self, exclude_none: bool = False) -> dict:
+         return self.model_dump(exclude_none=exclude_none)
 
- class Node:
-     def __init__(self, name: str, description: str, level: int, parent: Node | None):
-         self.name = name
-         self.description = description
-         self.level = level
-         self.parent = parent
-         self.children = {}
+     def to_json(self, indent: int = 2, exclude_none: bool = False) -> str:
+         return self.model_dump_json(indent=indent, exclude_none=exclude_none)
+
+
+ class Node(BaseModel):
+     name: str
+     description: str | None
+     level: int
+     children: dict[str, Node] | None = Field(default_factory=dict)
 
 
  class CategoryTree:
      def __init__(self):
-         self._root = Node(name="root", description="root", level=0, parent=None)
+         self._root = Node(name="root", description="root", level=0)
          self._all_nodes = {"root": self._root}
 
      def get_all_nodes(self) -> dict[str, Node]:
@@ -56,33 +60,84 @@ class CategoryTree:
              raise ValueError(f"Cannot add {name} category twice")
 
          parent = self.get_node(parent_name)
-
          if not parent:
-             raise ValueError(f"Parent category '{parent_name}' not found")
+             raise ValueError(f"Parent category {parent_name} not found")
 
          node_data = {
              "name": name,
              "description": description if description else "No description provided",
              "level": parent.level + 1,
-             "parent": parent,
          }
 
          new_node = Node(**node_data)
          parent.children[name] = new_node
          self._all_nodes[name] = new_node
 
-     def remove_node(self, name: str) -> None:
+     def _find_parent(self, name: str) -> Node | None:
+         def traverse(node: Node) -> Node | None:
+             if name in node.children:
+                 return node
+             for child in node.children.values():
+                 found = traverse(child)
+                 if found:
+                     return found
+             return None
+
+         if name == "root":
+             return None
+
+         return traverse(self._root)
+
+     def remove_node(self, name: str, remove_children: bool = True) -> None:
          if name == "root":
              raise ValueError("Cannot remove the root node")
 
          node = self.get_node(name)
          if not node:
-             raise ValueError(f"Category: '{name}' not found")
+             raise ValueError(f"Category: {name} not found")
+
+         parent = self._find_parent(name)
+         if not parent and name != "root":
+             raise ValueError("Parent not found, tree inconsistent")
+
+         if remove_children:
+             # Recursively remove children
+             for child_name in list(node.children.keys()):
+                 self.remove_node(child_name, remove_children=True)
+         else:
+             # Move children to parent (grandparent for the children)
+             for child_name, child in list(node.children.items()):
+                 if child_name in parent.children:
+                     raise ValueError(f"Name conflict when moving child {child_name}")
+                 parent.children[child_name] = child
+
+                 # Update levels for moved subtree
+                 def update_levels(n: Node, new_level: int):
+                     n.level = new_level
+                     for c in n.children.values():
+                         update_levels(c, new_level + 1)
+
+                 update_levels(child, parent.level + 1)
+
+         del parent.children[name]
+         del self._all_nodes[name]
 
-         for child_name in list(node.children.keys()):
-             self.remove_node(child_name)
+     def dump_tree(self) -> dict:
+         return self._root.model_dump()
 
-         if node.parent:
-             del node.parent.children[name]
+     def _index_subtree(self, node: Node):
+         if node.name in self._all_nodes:
+             raise ValueError(f"Duplicate node name: {node.name}")
 
-         del self._all_nodes[name]
+         self._all_nodes[node.name] = node
+
+         for child in node.children.values():
+             self._index_subtree(child)
+
+     @classmethod
+     def from_dict(cls, root: dict) -> CategoryTree:
+         tree = cls()
+         tree._root = Node.model_validate(root)
+         tree._all_nodes = {}
+         tree._index_subtree(tree._root)
+         return tree
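A hedged round-trip sketch of the reworked tree API. add_node's full signature is not visible in this hunk; the keyword arguments below are inferred from the context lines (name, description, parent_name) and should be treated as assumptions:

    from texttools.models import CategoryTree

    tree = CategoryTree()
    # add_node signature assumed from the context lines above.
    tree.add_node(name="tech", description="Technology topics", parent_name="root")
    tree.add_node(name="ai", description="Artificial intelligence", parent_name="tech")

    data = tree.dump_tree()               # plain dict via Node.model_dump()
    clone = CategoryTree.from_dict(data)  # re-indexes _all_nodes via _index_subtree

    clone.remove_node("tech", remove_children=False)
    assert clone.get_node("ai").level == 1  # "ai" re-parented under root, level fixed up

Since Node no longer stores a parent reference, removal locates the parent by traversal (_find_parent); dropping the back-reference is also what makes the model serializable without cycles.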
@@ -15,7 +15,7 @@ main_template:
      - Avoid Minor Changes: Do not just add/remove a few words or swap names. Create a fundamentally different sentence.
 
      Respond only in JSON format:
-     {{"result": "str"}}
+     {{"result": "rewriteen_text"}}
 
      Anchor Text:
      "{text}"
@@ -32,7 +32,7 @@ main_template:
      - Maintain Similar Length: The generated sentence should be of roughly the same length and level of detail as the Anchor.
 
      Respond only in JSON format:
-     {{"result": "str"}}
+     {{"result": "rewriteen_text"}}
 
      Anchor Text:
      "{text}"
@@ -53,7 +53,7 @@ main_template:
      - Maintain Similar Length: The generated sentence should be of roughly the same length and level of detail as the Anchor.
 
      Respond only in JSON format:
-     {{"result": "str"}}
+     {{"result": "rewriteen_text"}}
 
      Anchor Text:
      "{text}"
@@ -14,23 +14,22 @@ main_template: |
    - If descriptions are missing or empty, rely on the category name.
    - If the correct answer cannot be determined with certainty, choose the most likely one.
 
-   Output format:
+   Respond only in JSON format:
    {{
-     "reason": "Explanation of why the input belongs to the category"
-     "result": "<category_name_only>"
+     "reason": "explanation",
+     "result": "category_name",
    }}
 
-   Available categories with their descriptions:
+   Available categories:
    {category_list}
 
    Here is the text:
    {text}
 
  analyze_template: |
-   We want to categorize the given text.
-   To improve categorization, we need an analysis of the text.
-   Analyze the given text and write its main idea and a short analysis of that.
-   Analysis should be very short.
+   The task is to categorize the given text.
+   To improve categorization, you must write an analysis of the text.
+   Analyze the given text and write its main idea and a short analysis of it.
 
    Here is the text:
    {text}
@@ -7,8 +7,8 @@ main_template: |
    {{
      "result": [
        {{
-         "text": "string",
-         "type": "string",
+         "text": "original_text",
+         "type": "ne_of_text",
        }}
      ]
    }}
@@ -12,7 +12,7 @@ main_template:
      - Output between 3 and 7 keywords based on the input length.
 
      Respond only in JSON format:
-     {{"result": ["keyword1", "keyword2", etc.]}}
+     {{"result": ["keyword1", "keyword2", ...]}}
 
      Here is the text:
      {text}
@@ -34,7 +34,7 @@ main_template:
      - Long texts (more than 4 paragraphs): 6–7 keywords
 
      Respond only in JSON format:
-     {{"result": ["keyword1", "keyword2", etc.]}}
+     {{"result": ["keyword1", "keyword2", ...]}}
 
      Here is the text:
      {text}
@@ -57,7 +57,9 @@ main_template:
      Here is the text:
      {text}
 
+
  analyze_template:
+
    auto: |
      Analyze the following text to identify its main topics, concepts, and important terms.
      Provide a concise summary of your findings that will help in extracting relevant keywords.
@@ -13,12 +13,13 @@ main_template: |
    {source_text}
 
  analyze_template: |
-   You should analyze a statement and a source text and provide a brief,
-   summarized analysis that could help in determining that can the statement
-   be concluded from the source or not.
+   You must analyze a statement and a source text and provide a brief,
+   summarized analysis that could help in determining whether the statement
+   can be concluded from the source or not.
 
    The statement is:
    {text}
 
    The source text is:
-   {source_text}
+   {source_text}
+
@@ -1,6 +1,6 @@
  main_template: |
    You are a question detector.
-   Determine that if the given text contains any question or not.
+   Determine whether the given text contains any question or not.
 
    Respond only in JSON format (Output should be a boolean):
    {{"result": True/False}}
@@ -1,6 +1,6 @@
  main_template:
 
-   default: |
+   simple: |
      You are a language expert.
      I will give you a list of questions that are semantically similar.
      Your task is to merge them into one unified question.
@@ -12,27 +12,29 @@ main_template:
      - Does not omit any unique idea from the originals.
 
      Respond only in JSON format:
-     {{"result": "string"}}
+     {{"result": "merged_question"}}
 
      Here is the questions:
      {text}
 
-   reason: |
+   stepwise: |
      You are an AI assistant helping to unify semantically similar questions.
      First, briefly extract the unique intent or content from each input question.
      Then, write one merged question that combines all their content clearly and naturally, without redundancy.
+
      Step 1: Extract key ideas.
      Step 2: Write the final merged question.
 
      Respond only in JSON format:
-     {{"result": "string"}}
+     {{"result": "merged_question"}}
 
      Here is the questions:
      {text}
 
+
  analyze_template:
 
-   default: |
+   simple: |
      You are a language expert.
      Analyze the following questions to identify their core intent, key concepts,
      and the specific information they are seeking.
@@ -42,7 +44,7 @@ analyze_template:
      Here is the question:
      {text}
 
-   reason: |
+   stepwise: |
      Analyze the following questions to identify their exact wording, phrasing,
      and the literal meaning it conveys.
      Provide a brief, summarized analysis of their linguistic structure and current meaning,
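The default → simple and reason → stepwise renames are breaking for callers: OperatorUtils.load_prompt looks the mode key up verbatim, so a 1.3.2 caller passing mode="reason" now raises PromptError. A sketch (the YAML file name is an assumption, not shown in the diff):

    templates = OperatorUtils.load_prompt(
        "merge_questions.yaml",           # assumed file name
        text="- What is X?\n- Define X.",
        mode="stepwise",                  # formerly "reason"
    )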
@@ -6,19 +6,23 @@ main_template: |
    A single, self-contained statement of fact that is concise and verifiable.
 
    Strict Guidelines:
-   1. Remove Meta-Data: STRICTLY EXCLUDE all citations, references, URLs, source attributions (e.g., "Source: makarem.ir"), and conversational fillers (e.g., "Based on the documents...", "In conclusion...").
-   2. Resolve Context: Replace pronouns ("it", "this", "they") with the specific nouns they refer to. Each proposition must make sense in isolation.
-   3. Preserve Logic: Keep conditions attached to their facts. Do not split a rule from its condition (e.g., "If X, then Y" should be one proposition).
-   4. No Redundancy: Do not extract summary statements that merely repeat facts already listed.
+   - Remove Meta-Data: STRICTLY EXCLUDE all citations, references, URLs, source attributions (e.g., "Source: makarem.ir"), and conversational fillers (e.g., "Based on the documents...", "In conclusion...").
+   - Resolve Context: Replace pronouns ("it", "this", "they") with the specific nouns they refer to. Each proposition must make sense in isolation.
+   - Preserve Logic: Keep conditions attached to their facts. Do not split a rule from its condition (e.g., "If X, then Y" should be one proposition).
+   - No Redundancy: Do not extract summary statements that merely repeat facts already listed.
 
-   Extract the atomic propositions from the following text:
+   Respond only in JSON format:
+   {{"result": ["text1", "text2", ...]}}
+
+   Here is the text:
    {text}
 
  analyze_template: |
-   We want to analyze this text snippet and think about where we can split sentence to atomic meaningful propositions.
+   You must analyze this text snippet and think about where we can split sentence to atomic meaningful propositions.
    An atomic proposition is a single, self-contained fact that is concise,
    verifiable, and does not rely on external context.
    You just have to think around the possible propositions in the text and how a proposition can be made.
 
    Here is the text:
-   {text}
+   {text}
+
@@ -1,7 +1,9 @@
  main_template: |
    {text}
+
    Respond only in JSON format:
    {output_model_str}
 
  analyze_template: |
-   {analyze_template}
+   {analyze_template}
+
@@ -1,11 +1,11 @@
  main_template: |
-   You are a summarizer.
+   You are an expert summarizer.
    You must summarize the given text, preserving its meaning.
 
    Respond only in JSON format:
-   {{"result": "string"}}
+   {{"result": "summary"}}
 
-   Provide a concise summary of the following text:
+   Here is the text:
    {text}
 