hamtaa-texttools 1.3.2__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,260 @@
+ import asyncio
+ import math
+ import random
+ import re
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Any
+
+ import yaml
+
+ from .exceptions import PromptError
+
+
+ class OperatorUtils:
+     """
+     Collection of utilities used in operators
+     """
+
+     @staticmethod
+     @lru_cache(maxsize=32)
+     def _load_prompt_yaml(prompt_file: str) -> dict:
+         base_dir = Path(__file__).parent.parent / "prompts"
+         prompt_path = base_dir / prompt_file
+
+         if not prompt_path.exists():
+             raise PromptError(f"Prompt file not found: {prompt_file}")
+
+         try:
+             return yaml.safe_load(prompt_path.read_text(encoding="utf-8"))
+         except yaml.YAMLError as e:
+             raise PromptError(f"Invalid YAML in {prompt_file}: {e}")
+
+     @staticmethod
+     def load_prompt(
+         prompt_file: str, text: str, mode: str, **extra_kwargs
+     ) -> dict[str, str]:
+         try:
+             data = OperatorUtils._load_prompt_yaml(prompt_file)
+
+             if "main_template" not in data:
+                 raise PromptError(f"Missing 'main_template' in {prompt_file}")
+
+             if "analyze_template" not in data:
+                 raise PromptError(f"Missing 'analyze_template' in {prompt_file}")
+
+             if mode and mode not in data.get("main_template", {}):
+                 raise PromptError(f"Mode '{mode}' not found in {prompt_file}")
+
+             main_template = (
+                 data["main_template"][mode]
+                 if mode and isinstance(data["main_template"], dict)
+                 else data["main_template"]
+             )
+
+             analyze_template = (
+                 data["analyze_template"][mode]
+                 if mode and isinstance(data["analyze_template"], dict)
+                 else data["analyze_template"]
+             )
+
+             if not main_template or not main_template.strip():
+                 raise PromptError(
+                     f"Empty main_template in {prompt_file}"
+                     + (f" for mode '{mode}'" if mode else "")
+                 )
+
+             template_configs = {
+                 "main_template": main_template,
+                 "analyze_template": analyze_template,
+             }
+
+             format_args = {"text": text}
+             format_args.update(extra_kwargs)
+
+             # Inject variables into the templates
+             for key, value in template_configs.items():
+                 template_configs[key] = value.format(**format_args)
+
+             return template_configs
+
+         except yaml.YAMLError as e:
+             raise PromptError(f"Invalid YAML in {prompt_file}: {e}")
+         except KeyError as e:
+             raise PromptError(f"Missing template variable: {e}")
+         except Exception as e:
+             raise PromptError(f"Failed to load prompt {prompt_file}: {e}")
+
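For orientation, a minimal usage sketch of the loader follows. The module path and the YAML file name are assumptions for illustration; neither is named in this diff. Placeholders beyond {text} (e.g. {category_list}) are supplied through **extra_kwargs and injected with str.format:

    # Hypothetical usage; module path and file name are assumptions.
    from texttools.utils import OperatorUtils

    templates = OperatorUtils.load_prompt(
        "summarize.yaml",   # assumed name, resolved against <package>/prompts/
        text="Text to summarize.",
        mode="",            # falsy mode: un-keyed templates are used as-is
    )
    print(templates["main_template"])    # formatted prompt body
    print(templates["analyze_template"])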
+     @staticmethod
+     def build_main_prompt(
+         main_template: str,
+         analysis: str | None,
+         output_lang: str | None,
+         user_prompt: str | None,
+     ) -> str:
+         parts = []
+
+         if analysis:
+             parts.append(f"Based on this analysis: {analysis}")
+         if output_lang:
+             parts.append(f"Respond only in the {output_lang} language.")
+         if user_prompt:
+             parts.append(f"Consider this instruction: {user_prompt}")
+
+         parts.append(main_template)
+         return "\n".join(parts)
+
+     @staticmethod
+     def build_message(prompt: str) -> list[dict[str, str]]:
+         return [{"role": "user", "content": prompt}]
+
+ @staticmethod
112
+ def extract_logprobs(completion: Any) -> list[dict]:
113
+ """
114
+ Extracts and filters logprobs from completion.
115
+ Skips punctuation and structural tokens.
116
+ """
117
+ logprobs_data = []
118
+
119
+ ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
120
+
121
+ for choice in completion.choices:
122
+ if not getattr(choice, "logprobs", None):
123
+ raise ValueError("Your model does not support logprobs")
124
+
125
+ for logprob_item in choice.logprobs.content:
126
+ if ignore_pattern.match(logprob_item.token):
127
+ continue
128
+ token_entry = {
129
+ "token": logprob_item.token,
130
+ "prob": round(math.exp(logprob_item.logprob), 8),
131
+ "top_alternatives": [],
132
+ }
133
+ for alt in logprob_item.top_logprobs:
134
+ if ignore_pattern.match(alt.token):
135
+ continue
136
+ token_entry["top_alternatives"].append(
137
+ {
138
+ "token": alt.token,
139
+ "prob": round(math.exp(alt.logprob), 8),
140
+ }
141
+ )
142
+ logprobs_data.append(token_entry)
143
+
144
+ return logprobs_data
145
+
146
+ @staticmethod
147
+ def get_retry_temp(base_temp: float) -> float:
148
+ new_temp = base_temp + random.choice([-1, 1]) * random.uniform(0.1, 0.9)
149
+ return max(0.0, min(new_temp, 1.5))
150
+
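extract_logprobs assumes an OpenAI-style chat completion with logprobs enabled, i.e. choices[i].logprobs.content[j] objects exposing token, logprob, and top_logprobs. A self-contained sketch with a stand-in object of that shape:

    from types import SimpleNamespace

    # Stand-in for a completion; the attribute shape is an assumption
    # matching OpenAI-style responses with logprobs enabled.
    item = SimpleNamespace(
        token="cat",
        logprob=-0.105,
        top_logprobs=[SimpleNamespace(token="dog", logprob=-2.3)],
    )
    completion = SimpleNamespace(
        choices=[SimpleNamespace(logprobs=SimpleNamespace(content=[item]))]
    )
    print(OperatorUtils.extract_logprobs(completion))
    # [{'token': 'cat', 'prob': ~0.9003, 'top_alternatives':
    #   [{'token': 'dog', 'prob': ~0.1003}]}]

get_retry_temp pairs with retry loops: it jitters the base temperature by a random offset between 0.1 and 0.9 in either direction and clamps the result into [0.0, 1.5].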
+
+ class TheToolUtils:
+     """
+     Collection of utilities used in TheTool's tools
+     """
+
+     @staticmethod
+     def to_chunks(text: str, size: int, overlap: int) -> list[str]:
+         separators = ["\n\n", "\n", " ", ""]
+         is_separator_regex = False
+         keep_separator = True
+         length_function = len
+         strip_whitespace = True
+         chunk_size = size
+         chunk_overlap = overlap
+
+         def _split_text_with_regex(
+             text: str, separator: str, keep_separator: bool
+         ) -> list[str]:
+             if not separator:
+                 return [text]
+             if not keep_separator:
+                 return re.split(separator, text)
+             _splits = re.split(f"({separator})", text)
+             splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
+             if len(_splits) % 2 == 0:
+                 splits += [_splits[-1]]
+             return [_splits[0]] + splits if _splits[0] else splits
+
+         def _join_docs(docs: list[str], separator: str) -> str | None:
+             text = separator.join(docs)
+             if strip_whitespace:
+                 text = text.strip()
+             return text if text else None
+
+         def _merge_splits(splits: list[str], separator: str) -> list[str]:
+             separator_len = length_function(separator)
+             docs = []
+             current_doc = []
+             total = 0
+             for d in splits:
+                 len_ = length_function(d)
+                 if total + len_ + (separator_len if current_doc else 0) > chunk_size:
+                     if total > chunk_size:
+                         pass
+                     if current_doc:
+                         doc = _join_docs(current_doc, separator)
+                         if doc is not None:
+                             docs.append(doc)
+                         while total > chunk_overlap or (
+                             total + len_ + (separator_len if current_doc else 0)
+                             > chunk_size
+                             and total > 0
+                         ):
+                             total -= length_function(current_doc[0]) + (
+                                 separator_len if len(current_doc) > 1 else 0
+                             )
+                             current_doc = current_doc[1:]
+                 current_doc.append(d)
+                 total += len_ + (separator_len if len(current_doc) > 1 else 0)
+             doc = _join_docs(current_doc, separator)
+             if doc is not None:
+                 docs.append(doc)
+             return docs
+
+         def _split_text(text: str, separators: list[str]) -> list[str]:
+             final_chunks = []
+             separator = separators[-1]
+             new_separators = []
+             for i, _s in enumerate(separators):
+                 separator_ = _s if is_separator_regex else re.escape(_s)
+                 if not _s:
+                     separator = _s
+                     break
+                 if re.search(separator_, text):
+                     separator = _s
+                     new_separators = separators[i + 1 :]
+                     break
+             separator_ = separator if is_separator_regex else re.escape(separator)
+             splits = _split_text_with_regex(text, separator_, keep_separator)
+             _separator = "" if keep_separator else separator
+             good_splits = []
+             for s in splits:
+                 if length_function(s) < chunk_size:
+                     good_splits.append(s)
+                 else:
+                     if good_splits:
+                         merged_text = _merge_splits(good_splits, _separator)
+                         final_chunks.extend(merged_text)
+                         good_splits = []
+                     if not new_separators:
+                         final_chunks.append(s)
+                     else:
+                         other_info = _split_text(s, new_separators)
+                         final_chunks.extend(other_info)
+             if good_splits:
+                 merged_text = _merge_splits(good_splits, _separator)
+                 final_chunks.extend(merged_text)
+             return final_chunks
+
+         return _split_text(text, separators)
+
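to_chunks mirrors the recursive character-splitting strategy familiar from LangChain's RecursiveCharacterTextSplitter: walk the separators from coarsest ("\n\n") to finest (""), split on the first one present, recurse into oversized pieces, then merge neighbors into windows of at most `size` characters while carrying roughly `overlap` characters between windows. A quick sketch:

    text = "\n\n".join(
        f"Paragraph {i}: " + "lorem ipsum dolor sit amet " * 10 for i in range(4)
    )
    chunks = TheToolUtils.to_chunks(text, size=200, overlap=40)
    print(len(chunks), [len(c) for c in chunks])  # several chunks, each <= 200 here

One caveat visible in the code: the empty-string separator is a last resort, and _split_text_with_regex returns the text unsplit for "", so an unbreakable run longer than `size` is emitted as one oversized chunk rather than cut mid-character.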
+     @staticmethod
+     async def run_with_timeout(coro: Any, timeout: float | None) -> Any:
+         if timeout is None:
+             return await coro
+         try:
+             return await asyncio.wait_for(coro, timeout=timeout)
+         except asyncio.TimeoutError:
+             raise TimeoutError(f"Operation exceeded timeout of {timeout} seconds")
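The timeout wrapper leans on asyncio.wait_for, which cancels the awaited coroutine when the deadline passes; the helper re-raises that as the built-in TimeoutError. A minimal sketch:

    import asyncio

    async def slow() -> str:
        await asyncio.sleep(2)
        return "done"

    async def main() -> None:
        print(await TheToolUtils.run_with_timeout(slow(), timeout=None))  # no deadline
        try:
            await TheToolUtils.run_with_timeout(slow(), timeout=0.5)
        except TimeoutError as e:
            print(e)  # Operation exceeded timeout of 0.5 seconds

    asyncio.run(main())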
texttools/models.py CHANGED
@@ -3,12 +3,12 @@ from __future__ import annotations
  from datetime import datetime
  from typing import Any
 
- from pydantic import BaseModel
+ from pydantic import BaseModel, Field
 
 
  class ToolOutputMetadata(BaseModel):
      tool_name: str
-     processed_at: datetime = datetime.now()
+     processed_at: datetime = Field(default_factory=datetime.now)
      execution_time: float | None = None
 
 
@@ -19,22 +19,26 @@ class ToolOutput(BaseModel):
      errors: list[str] = []
      metadata: ToolOutputMetadata | None = None
 
-     def __repr__(self) -> str:
-         return f"ToolOutput({self.model_dump_json(indent=2)})"
+     def is_successful(self) -> bool:
+         return not self.errors and self.result is not None
 
+     def to_dict(self, exclude_none: bool = False) -> dict:
+         return self.model_dump(exclude_none=exclude_none)
 
- class Node:
-     def __init__(self, name: str, description: str, level: int, parent: Node | None):
-         self.name = name
-         self.description = description
-         self.level = level
-         self.parent = parent
-         self.children = {}
+     def to_json(self, indent: int = 2, exclude_none: bool = False) -> str:
+         return self.model_dump_json(indent=indent, exclude_none=exclude_none)
+
+
+ class Node(BaseModel):
+     name: str
+     description: str | None
+     level: int
+     children: dict[str, Node] | None = Field(default_factory=dict)
 
 
  class CategoryTree:
      def __init__(self):
-         self._root = Node(name="root", description="root", level=0, parent=None)
+         self._root = Node(name="root", description="root", level=0)
          self._all_nodes = {"root": self._root}
 
      def get_all_nodes(self) -> dict[str, Node]:
@@ -56,33 +60,84 @@ class CategoryTree:
              raise ValueError(f"Cannot add {name} category twice")
 
          parent = self.get_node(parent_name)
-
          if not parent:
-             raise ValueError(f"Parent category '{parent_name}' not found")
+             raise ValueError(f"Parent category {parent_name} not found")
 
          node_data = {
              "name": name,
              "description": description if description else "No description provided",
              "level": parent.level + 1,
-             "parent": parent,
          }
 
          new_node = Node(**node_data)
          parent.children[name] = new_node
          self._all_nodes[name] = new_node
 
-     def remove_node(self, name: str) -> None:
+     def _find_parent(self, name: str) -> Node | None:
+         def traverse(node: Node) -> Node | None:
+             if name in node.children:
+                 return node
+             for child in node.children.values():
+                 found = traverse(child)
+                 if found:
+                     return found
+             return None
+
+         if name == "root":
+             return None
+
+         return traverse(self._root)
+
+     def remove_node(self, name: str, remove_children: bool = True) -> None:
          if name == "root":
              raise ValueError("Cannot remove the root node")
 
          node = self.get_node(name)
          if not node:
-             raise ValueError(f"Category: '{name}' not found")
+             raise ValueError(f"Category: {name} not found")
+
+         parent = self._find_parent(name)
+         if not parent and name != "root":
+             raise ValueError("Parent not found, tree inconsistent")
+
+         if remove_children:
+             # Recursively remove children
+             for child_name in list(node.children.keys()):
+                 self.remove_node(child_name, remove_children=True)
+         else:
+             # Move children to parent (grandparent for the children)
+             for child_name, child in list(node.children.items()):
+                 if child_name in parent.children:
+                     raise ValueError(f"Name conflict when moving child {child_name}")
+                 parent.children[child_name] = child
+
+                 # Update levels for moved subtree
+                 def update_levels(n: Node, new_level: int):
+                     n.level = new_level
+                     for c in n.children.values():
+                         update_levels(c, new_level + 1)
+
+                 update_levels(child, parent.level + 1)
+
+         del parent.children[name]
+         del self._all_nodes[name]
 
-         for child_name in list(node.children.keys()):
-             self.remove_node(child_name)
+     def dump_tree(self) -> dict:
+         return self._root.model_dump()
 
-         if node.parent:
-             del node.parent.children[name]
+     def _index_subtree(self, node: Node):
+         if node.name in self._all_nodes:
+             raise ValueError(f"Duplicate node name: {node.name}")
 
-         del self._all_nodes[name]
+         self._all_nodes[node.name] = node
+
+         for child in node.children.values():
+             self._index_subtree(child)
+
+     @classmethod
+     def from_dict(cls, root: dict) -> CategoryTree:
+         tree = cls()
+         tree._root = Node.model_validate(root)
+         tree._all_nodes = {}
+         tree._index_subtree(tree._root)
+         return tree
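A hedged round-trip sketch of the reworked tree API. add_node's full signature is not visible in this hunk; the keyword arguments below are inferred from the context lines (name, description, parent_name) and should be treated as assumptions:

    from texttools.models import CategoryTree

    tree = CategoryTree()
    # add_node signature assumed from the context lines above.
    tree.add_node(name="tech", description="Technology topics", parent_name="root")
    tree.add_node(name="ai", description="Artificial intelligence", parent_name="tech")

    data = tree.dump_tree()               # plain dict via Node.model_dump()
    clone = CategoryTree.from_dict(data)  # re-indexes _all_nodes via _index_subtree

    clone.remove_node("tech", remove_children=False)
    assert clone.get_node("ai").level == 1  # "ai" re-parented under root, level fixed up

Since Node no longer stores a parent reference, removal locates the parent by traversal (_find_parent); dropping the back-reference is also what makes the model serializable without cycles.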
@@ -15,7 +15,7 @@ main_template:
      - Avoid Minor Changes: Do not just add/remove a few words or swap names. Create a fundamentally different sentence.
 
      Respond only in JSON format:
-     {{"result": "str"}}
+     {{"result": "rewriteen_text"}}
 
      Anchor Text:
      "{text}"
@@ -32,7 +32,7 @@ main_template:
      - Maintain Similar Length: The generated sentence should be of roughly the same length and level of detail as the Anchor.
 
      Respond only in JSON format:
-     {{"result": "str"}}
+     {{"result": "rewriteen_text"}}
 
      Anchor Text:
      "{text}"
@@ -53,7 +53,7 @@ main_template:
      - Maintain Similar Length: The generated sentence should be of roughly the same length and level of detail as the Anchor.
 
      Respond only in JSON format:
-     {{"result": "str"}}
+     {{"result": "rewriteen_text"}}
 
      Anchor Text:
      "{text}"
@@ -14,23 +14,22 @@ main_template: |
    - If descriptions are missing or empty, rely on the category name.
    - If the correct answer cannot be determined with certainty, choose the most likely one.
 
-   Output format:
+   Respond only in JSON format:
    {{
-     "reason": "Explanation of why the input belongs to the category"
-     "result": "<category_name_only>"
+     "reason": "explanation",
+     "result": "category_name",
    }}
 
-   Available categories with their descriptions:
+   Available categories:
    {category_list}
 
    Here is the text:
    {text}
 
  analyze_template: |
-   We want to categorize the given text.
-   To improve categorization, we need an analysis of the text.
-   Analyze the given text and write its main idea and a short analysis of that.
-   Analysis should be very short.
+   The task is to categorize the given text.
+   To improve categorization, you must write an analysis of the text.
+   Analyze the given text and write its main idea and a short analysis of it.
 
    Here is the text:
    {text}
@@ -7,8 +7,8 @@ main_template: |
    {{
      "result": [
        {{
-         "text": "string",
-         "type": "string",
+         "text": "original_text",
+         "type": "ne_of_text",
        }}
      ]
    }}
@@ -12,7 +12,7 @@ main_template:
      - Output between 3 and 7 keywords based on the input length.
 
      Respond only in JSON format:
-     {{"result": ["keyword1", "keyword2", etc.]}}
+     {{"result": ["keyword1", "keyword2", ...]}}
 
      Here is the text:
      {text}
@@ -34,7 +34,7 @@ main_template:
      - Long texts (more than 4 paragraphs): 6–7 keywords
 
      Respond only in JSON format:
-     {{"result": ["keyword1", "keyword2", etc.]}}
+     {{"result": ["keyword1", "keyword2", ...]}}
 
      Here is the text:
      {text}
@@ -57,7 +57,9 @@ main_template:
      Here is the text:
      {text}
 
+
  analyze_template:
+
    auto: |
      Analyze the following text to identify its main topics, concepts, and important terms.
      Provide a concise summary of your findings that will help in extracting relevant keywords.
@@ -13,12 +13,13 @@ main_template: |
    {source_text}
 
  analyze_template: |
-   You should analyze a statement and a source text and provide a brief,
-   summarized analysis that could help in determining that can the statement
-   be concluded from the source or not.
+   You must analyze a statement and a source text and provide a brief,
+   summarized analysis that could help in determining whether the statement
+   can be concluded from the source or not.
 
    The statement is:
    {text}
 
    The source text is:
-   {source_text}
+   {source_text}
+
@@ -1,6 +1,6 @@
  main_template: |
    You are a question detector.
-   Determine that if the given text contains any question or not.
+   Determine whether the given text contains any question or not.
 
    Respond only in JSON format (Output should be a boolean):
    {{"result": True/False}}
@@ -1,6 +1,6 @@
  main_template:
 
-   default: |
+   simple: |
      You are a language expert.
      I will give you a list of questions that are semantically similar.
      Your task is to merge them into one unified question.
@@ -12,27 +12,29 @@ main_template:
      - Does not omit any unique idea from the originals.
 
      Respond only in JSON format:
-     {{"result": "string"}}
+     {{"result": "merged_question"}}
 
      Here is the questions:
      {text}
 
-   reason: |
+   stepwise: |
      You are an AI assistant helping to unify semantically similar questions.
      First, briefly extract the unique intent or content from each input question.
      Then, write one merged question that combines all their content clearly and naturally, without redundancy.
+
      Step 1: Extract key ideas.
      Step 2: Write the final merged question.
 
      Respond only in JSON format:
-     {{"result": "string"}}
+     {{"result": "merged_question"}}
 
      Here is the questions:
      {text}
 
+
  analyze_template:
 
-   default: |
+   simple: |
      You are a language expert.
      Analyze the following questions to identify their core intent, key concepts,
      and the specific information they are seeking.
@@ -42,7 +44,7 @@ analyze_template:
      Here is the question:
      {text}
 
-   reason: |
+   stepwise: |
      Analyze the following questions to identify their exact wording, phrasing,
      and the literal meaning it conveys.
      Provide a brief, summarized analysis of their linguistic structure and current meaning,
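The default → simple and reason → stepwise renames are breaking for callers: OperatorUtils.load_prompt looks the mode key up verbatim, so a 1.3.2 caller passing mode="reason" now raises PromptError. A sketch (the YAML file name is an assumption, not shown in the diff):

    templates = OperatorUtils.load_prompt(
        "merge_questions.yaml",           # assumed file name
        text="- What is X?\n- Define X.",
        mode="stepwise",                  # formerly "reason"
    )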
@@ -6,19 +6,23 @@ main_template: |
    A single, self-contained statement of fact that is concise and verifiable.
 
    Strict Guidelines:
-   1. Remove Meta-Data: STRICTLY EXCLUDE all citations, references, URLs, source attributions (e.g., "Source: makarem.ir"), and conversational fillers (e.g., "Based on the documents...", "In conclusion...").
-   2. Resolve Context: Replace pronouns ("it", "this", "they") with the specific nouns they refer to. Each proposition must make sense in isolation.
-   3. Preserve Logic: Keep conditions attached to their facts. Do not split a rule from its condition (e.g., "If X, then Y" should be one proposition).
-   4. No Redundancy: Do not extract summary statements that merely repeat facts already listed.
+   - Remove Meta-Data: STRICTLY EXCLUDE all citations, references, URLs, source attributions (e.g., "Source: makarem.ir"), and conversational fillers (e.g., "Based on the documents...", "In conclusion...").
+   - Resolve Context: Replace pronouns ("it", "this", "they") with the specific nouns they refer to. Each proposition must make sense in isolation.
+   - Preserve Logic: Keep conditions attached to their facts. Do not split a rule from its condition (e.g., "If X, then Y" should be one proposition).
+   - No Redundancy: Do not extract summary statements that merely repeat facts already listed.
 
-   Extract the atomic propositions from the following text:
+   Respond only in JSON format:
+   {{"result": ["text1", "text2", ...]}}
+
+   Here is the text:
    {text}
 
  analyze_template: |
-   We want to analyze this text snippet and think about where we can split sentence to atomic meaningful propositions.
+   You must analyze this text snippet and think about where we can split sentence to atomic meaningful propositions.
    An atomic proposition is a single, self-contained fact that is concise,
    verifiable, and does not rely on external context.
    You just have to think around the possible propositions in the text and how a proposition can be made.
 
    Here is the text:
-   {text}
+   {text}
+
@@ -1,7 +1,9 @@
  main_template: |
    {text}
+
    Respond only in JSON format:
    {output_model_str}
 
  analyze_template: |
-   {analyze_template}
+   {analyze_template}
+
@@ -1,11 +1,11 @@
  main_template: |
-   You are a summarizer.
+   You are an expert summarizer.
    You must summarize the given text, preserving its meaning.
 
    Respond only in JSON format:
-   {{"result": "string"}}
+   {{"result": "summary"}}
 
-   Provide a concise summary of the following text:
+   Here is the text:
    {text}
 