hamtaa-texttools 1.0.5-py3-none-any.whl → 1.1.16-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hamtaa_texttools-1.1.16.dist-info/METADATA +255 -0
- hamtaa_texttools-1.1.16.dist-info/RECORD +31 -0
- texttools/__init__.py +6 -8
- texttools/batch/batch_config.py +26 -0
- texttools/batch/batch_runner.py +144 -139
- texttools/batch/{batch_manager.py → internals/batch_manager.py} +42 -54
- texttools/batch/internals/utils.py +16 -0
- texttools/prompts/README.md +8 -4
- texttools/prompts/categorize.yaml +77 -0
- texttools/prompts/detect_entity.yaml +22 -0
- texttools/prompts/extract_keywords.yaml +68 -0
- texttools/prompts/{question_merger.yaml → merge_questions.yaml} +5 -5
- texttools/tools/async_tools.py +804 -0
- texttools/tools/internals/async_operator.py +139 -236
- texttools/tools/internals/formatters.py +24 -0
- texttools/tools/internals/models.py +183 -0
- texttools/tools/internals/operator_utils.py +54 -0
- texttools/tools/internals/prompt_loader.py +23 -43
- texttools/tools/internals/sync_operator.py +201 -0
- texttools/tools/sync_tools.py +804 -0
- hamtaa_texttools-1.0.5.dist-info/METADATA +0 -192
- hamtaa_texttools-1.0.5.dist-info/RECORD +0 -30
- texttools/batch/__init__.py +0 -4
- texttools/formatters/base_formatter.py +0 -33
- texttools/formatters/user_merge_formatter.py +0 -30
- texttools/prompts/categorizer.yaml +0 -28
- texttools/prompts/keyword_extractor.yaml +0 -18
- texttools/tools/__init__.py +0 -4
- texttools/tools/async_the_tool.py +0 -277
- texttools/tools/internals/operator.py +0 -295
- texttools/tools/internals/output_models.py +0 -52
- texttools/tools/the_tool.py +0 -501
- {hamtaa_texttools-1.0.5.dist-info → hamtaa_texttools-1.1.16.dist-info}/WHEEL +0 -0
- {hamtaa_texttools-1.0.5.dist-info → hamtaa_texttools-1.1.16.dist-info}/licenses/LICENSE +0 -0
- {hamtaa_texttools-1.0.5.dist-info → hamtaa_texttools-1.1.16.dist-info}/top_level.txt +0 -0
- /texttools/prompts/{ner_extractor.yaml → extract_entities.yaml} +0 -0
- /texttools/prompts/{question_detector.yaml → is_question.yaml} +0 -0
- /texttools/prompts/{rewriter.yaml → rewrite.yaml} +0 -0
- /texttools/prompts/{custom_tool.yaml → run_custom.yaml} +0 -0
- /texttools/prompts/{subject_question_generator.yaml → subject_to_question.yaml} +0 -0
- /texttools/prompts/{summarizer.yaml → summarize.yaml} +0 -0
- /texttools/prompts/{question_generator.yaml → text_to_question.yaml} +0 -0
- /texttools/prompts/{translator.yaml → translate.yaml} +0 -0
texttools/tools/internals/async_operator.py
@@ -1,297 +1,200 @@
-from
-
-import
-import math
-import re
-from typing import Any, Literal, Optional, TypeVar
+from typing import Any, TypeVar, Type
+from collections.abc import Callable
+import logging

 from openai import AsyncOpenAI
 from pydantic import BaseModel

-from texttools.
-
-
+from texttools.tools.internals.models import ToolOutput
+from texttools.tools.internals.operator_utils import OperatorUtils
+from texttools.tools.internals.formatters import Formatter
 from texttools.tools.internals.prompt_loader import PromptLoader

 # Base Model type for output models
 T = TypeVar("T", bound=BaseModel)

+logger = logging.getLogger("texttools.async_operator")
+

 class AsyncOperator:
     """
-
+    Core engine for running text-processing operations with an LLM (Async).

-
+    It wires together:
+    - `PromptLoader` → loads YAML prompt templates.
+    - `UserMergeFormatter` → applies formatting to messages (e.g., merging).
+    - AsyncOpenAI client → executes completions/parsed completions.
     """

-    def __init__(
-        self
-
-        *,
-        model: str,
-        temperature: float = 0.0,
-        **client_kwargs: Any,
-    ):
-        self.client: AsyncOpenAI = client
-        self.model = model
-        self.temperature = temperature
-        self.client_kwargs = client_kwargs
-
-    def _build_user_message(self, prompt: str) -> dict[str, str]:
-        return {"role": "user", "content": prompt}
-
-    async def _analysis_completion(self, analyze_message: list[dict[str, str]]) -> str:
-        try:
-            completion = await self.client.chat.completions.create(
-                model=self.model,
-                messages=analyze_message,
-                temperature=self.temperature,
-                **self.client_kwargs,
-            )
-            analysis = completion.choices[0].message.content.strip()
-            return analysis
+    def __init__(self, client: AsyncOpenAI, model: str):
+        self._client = client
+        self._model = model

-
-
-
-
-
+    async def _analyze(self, prompt_configs: dict[str, str], temperature: float) -> str:
+        """
+        Calls OpenAI API for analysis using the configured prompt template.
+        Returns the analyzed content as a string.
+        """
         analyze_prompt = prompt_configs["analyze_template"]
-        analyze_message = [
-
-
+        analyze_message = [OperatorUtils.build_user_message(analyze_prompt)]
+        completion = await self._client.chat.completions.create(
+            model=self._model,
+            messages=analyze_message,
+            temperature=temperature,
+        )
+        analysis = completion.choices[0].message.content.strip()
         return analysis

     async def _parse_completion(
         self,
         message: list[dict[str, str]],
-        output_model: T,
+        output_model: Type[T],
+        temperature: float,
         logprobs: bool = False,
         top_logprobs: int = 3,
-
+        priority: int | None = 0,
     ) -> tuple[T, Any]:
-        try:
-            request_kwargs = {
-                "model": self.model,
-                "messages": message,
-                "response_format": output_model,
-                "temperature": self.temperature,
-                **self.client_kwargs,
-            }
-
-            if max_tokens is not None:
-                request_kwargs["max_tokens"] = max_tokens
-
-            if logprobs:
-                request_kwargs["logprobs"] = True
-                request_kwargs["top_logprobs"] = top_logprobs
-
-            completion = await self.client.beta.chat.completions.parse(**request_kwargs)
-            parsed = completion.choices[0].message.parsed
-            return parsed, completion
-
-        except Exception as e:
-            print(f"[ERROR] Failed to parse completion: {e}")
-            raise
-
-    def _clean_json_response(self, response: str) -> str:
         """
-
-
-        - ```json{"result": "value"}```
+        Parses a chat completion using OpenAI's structured output format.
+        Returns both the parsed object and the raw completion for logprobs.
         """
-
-
-
-
-
-
-
-        if
-
-
-
-
-
-
-
-
-        Args:
-            response_string: The JSON string (may contain code block markers)
-            output_model: Your Pydantic output model class (e.g., StrOutput, ListStrOutput)
-
-        Returns:
-            Instance of your output model
-        """
-        try:
-            # Clean the response string
-            cleaned_json = self._clean_json_response(response_string)
-
-            # Fix Python-style booleans
-            cleaned_json = cleaned_json.replace("False", "false").replace(
-                "True", "true"
-            )
-
-            # Convert string to Python dictionary
-            response_dict = json.loads(cleaned_json)
-
-            # Convert dictionary to output model
-            return output_model(**response_dict)
-
-        except json.JSONDecodeError as e:
-            raise ValueError(
-                f"Failed to parse JSON response: {e}\nResponse: {response_string}"
-            )
-        except Exception as e:
-            raise ValueError(f"Failed to convert to output model: {e}")
-
-    async def _vllm_completion(
-        self,
-        message: list[dict[str, str]],
-        output_model: T,
-        logprobs: bool = False,
-        top_logprobs: int = 3,
-        max_tokens: int | None = None,
-    ) -> tuple[T, Any]:
-        try:
-            json_schema = output_model.model_json_schema()
-
-            # Build kwargs dynamically
-            request_kwargs = {
-                "model": self.model,
-                "messages": message,
-                "extra_body": {"guided_json": json_schema},
-                "temperature": self.temperature,
-                **self.client_kwargs,
-            }
-
-            if max_tokens is not None:
-                request_kwargs["max_tokens"] = max_tokens
-
-            if logprobs:
-                request_kwargs["logprobs"] = True
-                request_kwargs["top_logprobs"] = top_logprobs
-
-            completion = await self.client.chat.completions.create(**request_kwargs)
-            response = completion.choices[0].message.content
-
-            # Convert the string response to output model
-            parsed = self._convert_to_output_model(response, output_model)
-
-            return parsed, completion
-
-        except Exception as e:
-            print(f"[ERROR] Failed to get vLLM structured output: {e}")
-            raise
-
-    def _extract_logprobs(self, completion: dict):
-        logprobs_data = []
-        ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
-
-        for choice in completion.choices:
-            if not getattr(choice, "logprobs", None):
-                continue
-
-            for logprob_item in choice.logprobs.content:
-                if ignore_pattern.match(logprob_item.token):
-                    continue
-                token_entry = {
-                    "token": logprob_item.token,
-                    "prob": round(math.exp(logprob_item.logprob), 8),
-                    "top_alternatives": [],
-                }
-                for alt in logprob_item.top_logprobs:
-                    if ignore_pattern.match(alt.token):
-                        continue
-                    token_entry["top_alternatives"].append(
-                        {
-                            "token": alt.token,
-                            "prob": round(math.exp(alt.logprob), 8),
-                        }
-                    )
-                logprobs_data.append(token_entry)
-
-        return logprobs_data
+        request_kwargs = {
+            "model": self._model,
+            "messages": message,
+            "response_format": output_model,
+            "temperature": temperature,
+        }
+
+        if logprobs:
+            request_kwargs["logprobs"] = True
+            request_kwargs["top_logprobs"] = top_logprobs
+        if priority:
+            request_kwargs["extra_body"] = {"priority": priority}
+        completion = await self._client.beta.chat.completions.parse(**request_kwargs)
+        parsed = completion.choices[0].message.parsed
+        return parsed, completion

     async def run(
         self,
-
+        # User parameters
+        text: str,
+        with_analysis: bool,
+        output_lang: str | None,
+        user_prompt: str | None,
+        temperature: float,
+        logprobs: bool,
+        top_logprobs: int | None,
+        validator: Callable[[Any], bool] | None,
+        max_validation_retries: int | None,
+        # Internal parameters
         prompt_file: str,
-        output_model: T,
-
-
-        mode: str = "",
-        resp_format: Literal["vllm", "parse"] = "parse",
-        output_lang: str | None = None,
-        logprobs: bool = False,
-        top_logprobs: int = 3,
-        max_tokens: int | None = None,
+        output_model: Type[T],
+        mode: str | None,
+        priority: int | None = 0,
         **extra_kwargs,
-    ) ->
+    ) -> ToolOutput:
         """
-        Execute the async LLM pipeline with the given input text.
+        Execute the async LLM pipeline with the given input text. (Async)
         """
         prompt_loader = PromptLoader()
-        formatter =
+        formatter = Formatter()
+        output = ToolOutput()

         try:
-
-
-            # FIXED: Correct parameter order for load
+            # Prompt configs contain two keys: main_template and analyze template, both are string
             prompt_configs = prompt_loader.load(
-                prompt_file=prompt_file,
-                text=
-                mode=mode
+                prompt_file=prompt_file,
+                text=text.strip(),
+                mode=mode,
                 **extra_kwargs,
             )

-            messages
+            messages = []

             if with_analysis:
-                analysis = await self._analyze(prompt_configs)
+                analysis = await self._analyze(prompt_configs, temperature)
                 messages.append(
-
+                    OperatorUtils.build_user_message(
+                        f"Based on this analysis: {analysis}"
+                    )
                 )

             if output_lang:
                 messages.append(
-
+                    OperatorUtils.build_user_message(
                         f"Respond only in the {output_lang} language."
                     )
                 )

-
-
-
-
-
-                messages,
-                output_model,
-                logprobs,
-                top_logprobs,
-                max_tokens,  # Pass max_tokens
-            )
-            elif resp_format == "parse":
-                parsed, completion = await self._parse_completion(
-                    messages,
-                    output_model,
-                    logprobs,
-                    top_logprobs,
-                    max_tokens,  # Pass max_tokens
+            if user_prompt:
+                messages.append(
+                    OperatorUtils.build_user_message(
+                        f"Consider this instruction {user_prompt}"
+                    )
                 )
-            else:
-                raise ValueError(f"Unknown resp_format: {resp_format}")

-
+            messages.append(
+                OperatorUtils.build_user_message(prompt_configs["main_template"])
+            )
+
+            messages = formatter.user_merge_format(messages)
+
+            parsed, completion = await self._parse_completion(
+                messages, output_model, temperature, logprobs, top_logprobs, priority
+            )
+
+            output.result = parsed.result
+
+            # Retry logic if validation fails
+            if validator and not validator(output.result):
+                for attempt in range(max_validation_retries):
+                    logger.warning(
+                        f"Validation failed, retrying for the {attempt + 1} time."
+                    )
+
+                    # Generate new temperature for retry
+                    retry_temperature = OperatorUtils.get_retry_temp(temperature)
+                    try:
+                        parsed, completion = await self._parse_completion(
+                            messages,
+                            output_model,
+                            retry_temperature,
+                            logprobs,
+                            top_logprobs,
+                        )
+
+                        output.result = parsed.result
+
+                        # Check if retry was successful
+                        if validator(output.result):
+                            logger.info(
+                                f"Validation passed on retry attempt {attempt + 1}"
+                            )
+                            break
+                        else:
+                            logger.warning(
+                                f"Validation still failing after retry attempt {attempt + 1}"
+                            )
+
+                    except Exception as e:
+                        logger.error(f"Retry attempt {attempt + 1} failed: {e}")
+                        # Continue to next retry attempt if this one fails
+
+            # Final check after all retries
+            if validator and not validator(output.result):
+                output.errors.append("Validation failed after all retry attempts")

             if logprobs:
-
+                output.logprobs = OperatorUtils.extract_logprobs(completion)

             if with_analysis:
-
+                output.analysis = analysis
+
+            output.process = prompt_file[:-5]

-            return
+            return output

         except Exception as e:
-
-
+            logger.error(f"AsyncTheTool failed: {e}")
+            output.errors.append(str(e))
+            return output
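The rewritten run drops the old resp_format/max_tokens branches in favor of a single structured-output call with a per-call temperature, an optional validator with retries, and a priority hint, always returning a ToolOutput. Below is a minimal sketch of a direct call, inferred from the signatures above; the endpoint, API key, model name, and input text are placeholder assumptions, and in the released package the operator is presumably driven through the public wrappers in texttools/tools/async_tools.py rather than called directly.

import asyncio

from openai import AsyncOpenAI

from texttools.tools.internals.async_operator import AsyncOperator
from texttools.tools.internals.models import StrOutput


async def main() -> None:
    # Placeholder endpoint/credentials: any OpenAI-compatible server should do.
    client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    operator = AsyncOperator(client=client, model="my-model")

    output = await operator.run(
        text="Large language models are ...",
        with_analysis=False,                    # skip the extra analysis round-trip
        output_lang="English",
        user_prompt=None,
        temperature=0.0,
        logprobs=False,
        top_logprobs=None,
        validator=lambda result: bool(result),  # retry on empty results
        max_validation_retries=2,
        prompt_file="summarize.yaml",           # one of the renamed prompt files above
        output_model=StrOutput,
        mode=None,
    )
    # output.process is the prompt file name minus its ".yaml" suffix.
    print(output.process, output.result, output.errors)


asyncio.run(main())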
texttools/tools/internals/formatters.py
@@ -0,0 +1,24 @@
+class Formatter:
+    @staticmethod
+    def user_merge_format(messages: list[dict[str, str]]) -> list[dict[str, str]]:
+        """
+        Merges consecutive user messages into a single message, separated by newlines.
+
+        This is useful for condensing a multi-turn user input into a single
+        message for the LLM. Assistant and system messages are left unchanged and
+        act as separators between user message groups.
+        """
+        merged: list[dict[str, str]] = []
+
+        for message in messages:
+            role, content = message["role"], message["content"].strip()
+
+            # Merge with previous user turn
+            if merged and role == "user" and merged[-1]["role"] == "user":
+                merged[-1]["content"] += "\n" + content
+
+            # Otherwise, start a new turn
+            else:
+                merged.append({"role": role, "content": content})
+
+        return merged
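user_merge_format is what lets run build its prompt as several separate user messages (analysis, language, instruction, main template) and still send the model a single user turn. Its behavior is easy to verify in isolation; this check uses only the code added above, with illustrative message contents:

from texttools.tools.internals.formatters import Formatter

messages = [
    {"role": "user", "content": "Based on this analysis: ..."},
    {"role": "user", "content": "Respond only in the English language."},
    {"role": "user", "content": "Summarize the following text: ..."},
]

# Three consecutive user turns collapse into one newline-joined message.
merged = Formatter.user_merge_format(messages)
assert len(merged) == 1
assert merged[0]["content"].count("\n") == 2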
texttools/tools/internals/models.py
@@ -0,0 +1,183 @@
+from datetime import datetime
+from typing import Type, Any, Literal
+
+from pydantic import BaseModel, Field, create_model
+
+
+class ToolOutput(BaseModel):
+    result: Any = None
+    analysis: str = ""
+    logprobs: list[dict[str, Any]] = []
+    process: str = ""
+    processed_at: datetime = datetime.now()
+    execution_time: float = -1.0
+    errors: list[str] = []
+
+    def __repr__(self) -> str:
+        return f"ToolOutput(process='{self.process}', result_type='{type(self.result)}', result='{self.result}', analysis='{self.analysis}', logprobs='{self.logprobs}', errors='{self.errors}', processed_at='{self.processed_at}', execution_time='{self.execution_time}'"
+
+
+class StrOutput(BaseModel):
+    result: str = Field(..., description="The output string")
+
+
+class BoolOutput(BaseModel):
+    result: bool = Field(
+        ..., description="Boolean indicating the output state", example=True
+    )
+
+
+class ListStrOutput(BaseModel):
+    result: list[str] = Field(
+        ..., description="The output list of strings", example=["text_1", "text_2"]
+    )
+
+
+class ListDictStrStrOutput(BaseModel):
+    result: list[dict[str, str]] = Field(
+        ...,
+        description="List of dictionaries containing string key-value pairs",
+        example=[{"text": "Mohammad", "type": "PER"}],
+    )
+
+
+class ReasonListStrOutput(BaseModel):
+    reason: str = Field(..., description="Thinking process that led to the output")
+    result: list[str] = Field(..., description="The output list of strings")
+
+
+class Node(BaseModel):
+    node_id: int
+    name: str
+    level: int
+    parent_id: int | None
+    description: str = "No description provided"
+
+
+class CategoryTree:
+    def __init__(self, tree_name):
+        self.root = Node(node_id=0, name=tree_name, level=0, parent_id=None)
+        self.all_nodes: list[Node] = [self.root]
+        self.new_id = 1
+
+    def add_node(
+        self,
+        node_name: str,
+        parent_name: str | None = None,
+        description: str | None = None,
+    ) -> None:
+        if self.find_node(node_name):
+            raise ValueError(f"{node_name} has been chosen for another category before")
+
+        if parent_name:
+            parent_node = self.find_node(parent_name)
+            if parent_node is None:
+                raise ValueError(f"Parent category '{parent_name}' not found")
+            parent_id = parent_node.node_id
+            level = parent_node.level + 1
+        else:
+            level = 1
+            parent_id = 0
+
+        node_data = {
+            "node_id": self.new_id,
+            "name": node_name,
+            "level": level,
+            "parent_id": parent_id,
+        }
+
+        if description is not None:
+            node_data["description"] = description
+
+        self.all_nodes.append(Node(**node_data))
+        self.new_id += 1
+
+    def get_nodes(self) -> list[Node]:
+        return self.all_nodes
+
+    def get_level_count(self) -> int:
+        return max([item.level for item in self.all_nodes])
+
+    def find_node(self, identifier: int | str) -> Node | None:
+        if isinstance(identifier, str):
+            for node in self.get_nodes():
+                if node.name == identifier:
+                    return node
+            return None
+        elif isinstance(identifier, int):
+            for node in self.get_nodes():
+                if node.node_id == identifier:
+                    return node
+            return None
+        else:
+            return None
+
+    def find_children(self, parent_node: Node) -> list[Node] | None:
+        children = [
+            node for node in self.get_nodes() if parent_node.node_id == node.parent_id
+        ]
+        return children if children else None
+
+    def remove_node(self, identifier: int | str) -> None:
+        node = self.find_node(identifier)
+
+        if node is not None:
+            # Remove node's children recursively
+            children = self.find_children(node)
+
+            # Ending condition
+            if children is None:
+                self.all_nodes.remove(node)
+                return
+
+            for child in children:
+                self.remove_node(child.name)
+
+            # Remove the node from tree
+            self.all_nodes.remove(node)
+        else:
+            raise ValueError(f"Node with identifier: '{identifier}' not found.")
+
+    def dump_tree(self) -> dict:
+        def build_dict(node: Node) -> dict:
+            children = [
+                build_dict(child)
+                for child in self.all_nodes
+                if child.parent_id == node.node_id
+            ]
+            return {
+                "node_id": node.node_id,
+                "name": node.name,
+                "level": node.level,
+                "parent_id": node.parent_id,
+                "children": children,
+            }
+
+        return {"category_tree": build_dict(self.root)["children"]}
+
+
+# This function is needed to create CategorizerOutput with dynamic categories
+def create_dynamic_model(allowed_values: list[str]) -> Type[BaseModel]:
+    literal_type = Literal[*allowed_values]
+
+    CategorizerOutput = create_model(
+        "CategorizerOutput",
+        reason=(
+            str,
+            Field(
+                ..., description="Explanation of why the input belongs to the category"
+            ),
+        ),
+        result=(literal_type, Field(..., description="Predicted category label")),
+    )
+
+    return CategorizerOutput
+
+
+class Entity(BaseModel):
+    text: str = Field(description="The exact text of the entity")
+    entity_type: str = Field(description="The type of the entity")
+
+
+class EntityDetectorOutput(BaseModel):
+    result: list[Entity] = Field(description="List of all extracted entities")