hamtaa-texttools 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of hamtaa-texttools might be problematic.

Files changed (30)
  1. {hamtaa_texttools-1.0.2.dist-info → hamtaa_texttools-1.0.4.dist-info}/METADATA +18 -6
  2. hamtaa_texttools-1.0.4.dist-info/RECORD +29 -0
  3. texttools/__init__.py +3 -3
  4. texttools/{utils/batch_manager → batch}/batch_runner.py +1 -1
  5. texttools/formatters/user_merge_formatter/user_merge_formatter.py +0 -17
  6. texttools/prompts/README.md +5 -5
  7. texttools/prompts/categorizer.yaml +16 -10
  8. texttools/prompts/keyword_extractor.yaml +4 -1
  9. texttools/prompts/ner_extractor.yaml +4 -1
  10. texttools/prompts/question_detector.yaml +5 -2
  11. texttools/prompts/question_generator.yaml +4 -3
  12. texttools/prompts/question_merger.yaml +6 -4
  13. texttools/prompts/question_rewriter.yaml +6 -4
  14. texttools/prompts/subject_question_generator.yaml +3 -4
  15. texttools/prompts/summarizer.yaml +1 -0
  16. texttools/prompts/translator.yaml +1 -0
  17. texttools/tools/__init__.py +2 -1
  18. texttools/tools/async_the_tool.py +263 -0
  19. texttools/tools/internals/async_operator.py +288 -0
  20. texttools/tools/{operator.py → internals/operator.py} +133 -63
  21. texttools/tools/{output_models.py → internals/output_models.py} +8 -0
  22. texttools/tools/{prompt_loader.py → internals/prompt_loader.py} +16 -18
  23. texttools/tools/the_tool.py +181 -72
  24. hamtaa_texttools-1.0.2.dist-info/RECORD +0 -28
  25. texttools/utils/__init__.py +0 -4
  26. {hamtaa_texttools-1.0.2.dist-info → hamtaa_texttools-1.0.4.dist-info}/WHEEL +0 -0
  27. {hamtaa_texttools-1.0.2.dist-info → hamtaa_texttools-1.0.4.dist-info}/licenses/LICENSE +0 -0
  28. {hamtaa_texttools-1.0.2.dist-info → hamtaa_texttools-1.0.4.dist-info}/top_level.txt +0 -0
  29. /texttools/{utils/batch_manager → batch}/__init__.py +0 -0
  30. /texttools/{utils/batch_manager → batch}/batch_manager.py +0 -0
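
Note on the layout change: the batch helpers move from texttools/utils/batch_manager to texttools/batch, and the operator, output-model, and prompt-loader modules move under texttools/tools/internals, so code importing the old paths needs updating. A minimal sketch of the before/after imports, assuming these modules are imported directly (the BatchManager class name is an assumption; Operator and AsyncOperator appear in the diffs below):

# 1.0.2-style imports (old paths, shown for contrast)
# from texttools.utils.batch_manager.batch_manager import BatchManager   # hypothetical class name
# from texttools.tools.operator import Operator

# 1.0.4-style imports (new paths from the file list above)
from texttools.batch.batch_manager import BatchManager               # hypothetical class name
from texttools.tools.internals.operator import Operator
from texttools.tools.internals.async_operator import AsyncOperator   # new in 1.0.4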
texttools/tools/internals/async_operator.py
@@ -0,0 +1,288 @@
+ from __future__ import annotations
+
+ import json
+ import math
+ import re
+ from typing import Any, Literal, Optional, TypeVar
+
+ from openai import AsyncOpenAI
+ from pydantic import BaseModel
+
+ from texttools.formatters.user_merge_formatter.user_merge_formatter import (
+     UserMergeFormatter,
+ )
+ from texttools.tools.internals.prompt_loader import PromptLoader
+
+ # Base Model type for output models
+ T = TypeVar("T", bound=BaseModel)
+
+
+ class AsyncOperator:
+     """
+     Async version of Operator.
+
+     Behaves like the synchronous Operator but uses AsyncOpenAI and async/await.
+     """
+
+     def __init__(
+         self,
+         client: AsyncOpenAI,
+         *,
+         model: str,
+         temperature: float = 0.0,
+         **client_kwargs: Any,
+     ):
+         self.client: AsyncOpenAI = client
+         self.model = model
+         self.temperature = temperature
+         self.client_kwargs = client_kwargs
+
+     def _build_user_message(self, prompt: str) -> dict[str, str]:
+         return {"role": "user", "content": prompt}
+
+     async def _analysis_completion(self, analyze_message: list[dict[str, str]]) -> str:
+         try:
+             completion = await self.client.chat.completions.create(
+                 model=self.model,
+                 messages=analyze_message,
+                 temperature=self.temperature,
+                 **self.client_kwargs,
+             )
+             analysis = completion.choices[0].message.content.strip()
+             return analysis
+
+         except Exception as e:
+             print(f"[ERROR] Analysis failed: {e}")
+             raise
+
+     async def _analyze(self, prompt_configs: dict[str, str]) -> str:
+         analyze_prompt = prompt_configs["analyze_template"]
+         analyze_message = [self._build_user_message(analyze_prompt)]
+         analysis = await self._analysis_completion(analyze_message)
+
+         return analysis
+
+     async def _parse_completion(
+         self,
+         message: list[dict[str, str]],
+         output_model: T,
+         logprobs: bool = False,
+         top_logprobs: int = 3,
+     ) -> tuple[T, Any]:
+         try:
+             request_kwargs = {
+                 "model": self.model,
+                 "messages": message,
+                 "response_format": output_model,
+                 "temperature": self.temperature,
+                 **self.client_kwargs,
+             }
+             if logprobs:
+                 request_kwargs["logprobs"] = True
+                 request_kwargs["top_logprobs"] = top_logprobs
+
+             completion = await self.client.beta.chat.completions.parse(**request_kwargs)
+             parsed = completion.choices[0].message.parsed
+             return parsed, completion
+
+         except Exception as e:
+             print(f"[ERROR] Failed to parse completion: {e}")
+             raise
+
+     def _clean_json_response(self, response: str) -> str:
+         """
+         Clean JSON response by removing code block markers and whitespace.
+         Handles cases like:
+         - ```json{"result": "value"}```
+         """
+         cleaned = response.strip()
+
+         # Remove ```json marker
+         if cleaned.startswith("```json"):
+             cleaned = cleaned[7:]
+
+         # Remove trailing ```
+         if cleaned.endswith("```"):
+             cleaned = cleaned[:-3]
+
+         return cleaned.strip()
+
+     def _convert_to_output_model(self, response_string: str, output_model: T) -> T:
+         """
+         Convert a JSON response string to output model.
+
+         Args:
+             response_string: The JSON string (may contain code block markers)
+             output_model: Your Pydantic output model class (e.g., StrOutput, ListStrOutput)
+
+         Returns:
+             Instance of your output model
+         """
+         try:
+             # Clean the response string
+             cleaned_json = self._clean_json_response(response_string)
+
+             # Fix Python-style booleans
+             cleaned_json = cleaned_json.replace("False", "false").replace(
+                 "True", "true"
+             )
+
+             # Convert string to Python dictionary
+             response_dict = json.loads(cleaned_json)
+
+             # Convert dictionary to output model
+             return output_model(**response_dict)
+
+         except json.JSONDecodeError as e:
+             raise ValueError(
+                 f"Failed to parse JSON response: {e}\nResponse: {response_string}"
+             )
+         except Exception as e:
+             raise ValueError(f"Failed to convert to output model: {e}")
+
+     async def _vllm_completion(
+         self,
+         message: list[dict[str, str]],
+         output_model: T,
+         logprobs: bool = False,
+         top_logprobs: int = 3,
+     ) -> tuple[T, Any]:
+         try:
+             json_schema = output_model.model_json_schema()
+
+             # Build kwargs dynamically
+             request_kwargs = {
+                 "model": self.model,
+                 "messages": message,
+                 "extra_body": {"guided_json": json_schema},
+                 "temperature": self.temperature,
+                 **self.client_kwargs,
+             }
+
+             if logprobs:
+                 request_kwargs["logprobs"] = True
+                 request_kwargs["top_logprobs"] = top_logprobs
+
+             completion = await self.client.chat.completions.create(**request_kwargs)
+             response = completion.choices[0].message.content
+
+             # Convert the string response to output model
+             parsed = self._convert_to_output_model(response, output_model)
+
+             return parsed, completion
+
+         except Exception as e:
+             print(f"[ERROR] Failed to get vLLM structured output: {e}")
+             raise
+
+     def _extract_logprobs(self, completion: dict):
+         logprobs_data = []
+         ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
+
+         for choice in completion.choices:
+             if not getattr(choice, "logprobs", None):
+                 continue
+
+             for logprob_item in choice.logprobs.content:
+                 if ignore_pattern.match(logprob_item.token):
+                     continue
+                 token_entry = {
+                     "token": logprob_item.token,
+                     "prob": round(math.exp(logprob_item.logprob), 8),
+                     "top_alternatives": [],
+                 }
+                 for alt in logprob_item.top_logprobs:
+                     if ignore_pattern.match(alt.token):
+                         continue
+                     token_entry["top_alternatives"].append(
+                         {
+                             "token": alt.token,
+                             "prob": round(math.exp(alt.logprob), 8),
+                         }
+                     )
+                 logprobs_data.append(token_entry)
+
+         return logprobs_data
+
+     async def run(
+         self,
+         input_text: str,
+         prompt_file: str,
+         output_model: T,
+         with_analysis: bool = False,
+         use_modes: bool = False,
+         mode: str = "",
+         resp_format: Literal["vllm", "parse"] = "parse",
+         output_lang: Optional[str] = None,
+         logprobs: bool = False,
+         top_logprobs: int = 3,
+         **extra_kwargs,
+     ) -> dict[str, Any]:
+         """
+         Execute the async LLM pipeline with the given input text.
+
+         Args:
+             input_text: The text to process (will be stripped of whitespace)
+             **extra_kwargs: Additional variables to inject into prompt templates
+
+         Returns:
+             Dictionary containing the parsed result and optional analysis
+         """
+         prompt_loader = PromptLoader()
+         formatter = UserMergeFormatter()
+
+         try:
+             cleaned_text = input_text.strip()
+
+             prompt_configs = prompt_loader.load_prompts(
+                 prompt_file,
+                 use_modes,
+                 mode,
+                 cleaned_text,
+                 **extra_kwargs,
+             )
+
+             messages: list[dict[str, str]] = []
+
+             if with_analysis:
+                 analysis = await self._analyze(prompt_configs)
+                 messages.append(
+                     self._build_user_message(f"Based on this analysis: {analysis}")
+                 )
+
+             if output_lang:
+                 messages.append(
+                     self._build_user_message(
+                         f"Respond only in the {output_lang} language."
+                     )
+                 )
+
+             messages.append(self._build_user_message(prompt_configs["main_template"]))
+
+             messages = formatter.format(messages)
+
+             if resp_format == "vllm":
+                 parsed, completion = await self._vllm_completion(
+                     messages, output_model, logprobs, top_logprobs
+                 )
+             elif resp_format == "parse":
+                 parsed, completion = await self._parse_completion(
+                     messages, output_model, logprobs, top_logprobs
+                 )
+             else:
+                 raise ValueError(f"Unknown resp_format: {resp_format}")
+
+             results = {"result": parsed.result}
+
+             if logprobs:
+                 results["logprobs"] = self._extract_logprobs(completion)
+
+             if with_analysis:
+                 results["analysis"] = analysis
+
+             return results
+
+         except Exception as e:
+             # Print error clearly and re-raise for the caller to handle
+             print(f"[ERROR] Async operation failed: {e}")
+             raise
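
For orientation, a minimal usage sketch of the AsyncOperator added above. The constructor and run() signature come from the diff; the model name, the prompt file choice (summarizer.yaml from texttools/prompts/), and the StrOutput import path are illustrative assumptions:

import asyncio

from openai import AsyncOpenAI

from texttools.tools.internals.async_operator import AsyncOperator
from texttools.tools.internals.output_models import StrOutput  # path assumed from the file list


async def main() -> None:
    client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment
    op = AsyncOperator(client, model="gpt-4o-mini", temperature=0.0)  # model name is illustrative

    # run() loads the YAML prompt, optionally prepends an analysis step and an
    # output-language hint, then parses the completion into the Pydantic model.
    result = await op.run(
        "A long passage to summarize ...",
        prompt_file="summarizer.yaml",   # file name taken from texttools/prompts/
        output_model=StrOutput,
        resp_format="parse",             # "vllm" switches to guided_json decoding
        logprobs=True,                   # adds a "logprobs" key to the returned dict
    )
    print(result["result"])
    print(result.get("logprobs"))


asyncio.run(main())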
texttools/tools/{operator.py → internals/operator.py}
@@ -1,6 +1,8 @@
  from __future__ import annotations

- from typing import Any, TypeVar, Type, Literal
+ import math
+ import re
+ from typing import Any, TypeVar, Literal, Optional
  import json

  from openai import OpenAI
@@ -9,7 +11,7 @@ from pydantic import BaseModel
  from texttools.formatters.user_merge_formatter.user_merge_formatter import (
      UserMergeFormatter,
  )
- from texttools.tools.prompt_loader import PromptLoader
+ from texttools.tools.internals.prompt_loader import PromptLoader

  # Base Model type for output models
  T = TypeVar("T", bound=BaseModel)
@@ -42,13 +44,6 @@ class Operator:
      - RESP_FORMAT: str → "vllm" or "parse"
      """

-     PROMPT_FILE: str
-     OUTPUT_MODEL: Type[T]
-     WITH_ANALYSIS: bool = False
-     USE_MODES: bool
-     MODE: str = ""
-     RESP_FORMAT: Literal["vllm", "parse"] = "vllm"
-
      def __init__(
          self,
          client: OpenAI,
@@ -59,17 +54,12 @@ class Operator:
      ):
          self.client: OpenAI = client
          self.model = model
-         self.prompt_loader = PromptLoader()
-         self.formatter = UserMergeFormatter()
          self.temperature = temperature
          self.client_kwargs = client_kwargs

      def _build_user_message(self, prompt: str) -> dict[str, str]:
          return {"role": "user", "content": prompt}

-     def _apply_formatter(self, messages: list[dict[str, str]]) -> list[dict[str, str]]:
-         return self.formatter.format(messages)
-
      def _analysis_completion(self, analyze_message: list[dict[str, str]]) -> str:
          try:
              completion = self.client.chat.completions.create(
@@ -85,30 +75,35 @@ class Operator:
              print(f"[ERROR] Analysis failed: {e}")
              raise

-     def _analyze(self) -> str:
-         analyze_prompt = self.prompt_configs["analyze_template"]
+     def _analyze(self, prompt_configs: dict[str, str]) -> str:
+         analyze_prompt = prompt_configs["analyze_template"]
          analyze_message = [self._build_user_message(analyze_prompt)]
          analysis = self._analysis_completion(analyze_message)

          return analysis

-     def _build_main_message(self) -> list[dict[str, str]]:
-         main_prompt = self.prompt_configs["main_template"]
-         main_message = self._build_user_message(main_prompt)
-
-         return main_message
-
-     def _parse_completion(self, message: list[dict[str, str]]) -> T:
+     def _parse_completion(
+         self,
+         message: list[dict[str, str]],
+         output_model: T,
+         logprobs: bool = False,
+         top_logprobs: int = 3,
+     ) -> tuple[T, Any]:
          try:
-             completion = self.client.beta.chat.completions.parse(
-                 model=self.model,
-                 messages=message,
-                 response_format=self.OUTPUT_MODEL,
-                 temperature=self.temperature,
+             request_kwargs = {
+                 "model": self.model,
+                 "messages": message,
+                 "response_format": output_model,
+                 "temperature": self.temperature,
                  **self.client_kwargs,
-             )
+             }
+             if logprobs:
+                 request_kwargs["logprobs"] = True
+                 request_kwargs["top_logprobs"] = top_logprobs
+
+             completion = self.client.beta.chat.completions.parse(**request_kwargs)
              parsed = completion.choices[0].message.parsed
-             return parsed
+             return parsed, completion

          except Exception as e:
              print(f"[ERROR] Failed to parse completion: {e}")
@@ -119,24 +114,20 @@
          Clean JSON response by removing code block markers and whitespace.
          Handles cases like:
          - ```json{"result": "value"}```
-         - ```{"result": "value"}```
          """
-         # Remove code block markers
          cleaned = response.strip()

-         # Remove ```json and ``` markers
+         # Remove ```json marker
          if cleaned.startswith("```json"):
-             cleaned = cleaned[7:]  # Remove ```json
-         elif cleaned.startswith("```"):
-             cleaned = cleaned[3:]  # Remove ```
+             cleaned = cleaned[7:]

-         # Remove trailing ``` or '''
+         # Remove trailing ```
          if cleaned.endswith("```"):
              cleaned = cleaned[:-3]

          return cleaned.strip()

-     def _convert_to_output_model(self, response_string: str) -> T:
+     def _convert_to_output_model(self, response_string: str, output_model: T) -> T:
          """
          Convert a JSON response string to output model.

@@ -151,11 +142,16 @@
              # Clean the response string
              cleaned_json = self._clean_json_response(response_string)

+             # Fix Python-style booleans
+             cleaned_json = cleaned_json.replace("False", "false").replace(
+                 "True", "true"
+             )
+
              # Convert string to Python dictionary
              response_dict = json.loads(cleaned_json)

              # Convert dictionary to output model
-             return self.OUTPUT_MODEL(**response_dict)
+             return output_model(**response_dict)

          except json.JSONDecodeError as e:
              raise ValueError(
@@ -164,28 +160,84 @@
          except Exception as e:
              raise ValueError(f"Failed to convert to output model: {e}")

-     def _vllm_completion(self, message: list[dict[str, str]]) -> T:
+     def _vllm_completion(
+         self,
+         message: list[dict[str, str]],
+         output_model: T,
+         logprobs: bool = False,
+         top_logprobs: int = 3,
+     ) -> tuple[T, Any]:
          try:
-             json_schema = self.OUTPUT_MODEL.model_json_schema()
-             completion = self.client.chat.completions.create(
-                 model=self.model,
-                 messages=message,
-                 extra_body={"guided_json": json_schema},
-                 temperature=self.temperature,
+             json_schema = output_model.model_json_schema()
+
+             # Build kwargs dynamically
+             request_kwargs = {
+                 "model": self.model,
+                 "messages": message,
+                 "extra_body": {"guided_json": json_schema},
+                 "temperature": self.temperature,
                  **self.client_kwargs,
-             )
+             }
+
+             if logprobs:
+                 request_kwargs["logprobs"] = True
+                 request_kwargs["top_logprobs"] = top_logprobs
+
+             completion = self.client.chat.completions.create(**request_kwargs)
              response = completion.choices[0].message.content

              # Convert the string response to output model
-             parsed_response = self._convert_to_output_model(response)
+             parsed = self._convert_to_output_model(response, output_model)

-             return parsed_response
+             return parsed, completion

          except Exception as e:
              print(f"[ERROR] Failed to get vLLM structured output: {e}")
              raise

-     def run(self, input_text: str, **extra_kwargs) -> dict[str, Any]:
+     def _extract_logprobs(self, completion: dict):
+         logprobs_data = []
+         ignore_pattern = re.compile(r'^(result|[\s\[\]\{\}",:]+)$')
+
+         for choice in completion.choices:
+             if not getattr(choice, "logprobs", None):
+                 continue
+
+             for logprob_item in choice.logprobs.content:
+                 if ignore_pattern.match(logprob_item.token):
+                     continue
+                 token_entry = {
+                     "token": logprob_item.token,
+                     "prob": round(math.exp(logprob_item.logprob), 8),
+                     "top_alternatives": [],
+                 }
+                 for alt in logprob_item.top_logprobs:
+                     if ignore_pattern.match(alt.token):
+                         continue
+                     token_entry["top_alternatives"].append(
+                         {
+                             "token": alt.token,
+                             "prob": round(math.exp(alt.logprob), 8),
+                         }
+                     )
+                 logprobs_data.append(token_entry)
+
+         return logprobs_data
+
+     def run(
+         self,
+         input_text: str,
+         prompt_file: str,
+         output_model: T,
+         with_analysis: bool = False,
+         use_modes: bool = False,
+         mode: str = "",
+         resp_format: Literal["vllm", "parse"] = "parse",
+         output_lang: Optional[str] = None,
+         logprobs: bool = False,
+         top_logprobs: int = 3,
+         **extra_kwargs,
+     ) -> dict[str, Any]:
          """
          Execute the LLM pipeline with the given input text.

@@ -196,36 +248,54 @@ class Operator:
          Returns:
              Dictionary containing the parsed result and optional analysis
          """
+         prompt_loader = PromptLoader()
+         formatter = UserMergeFormatter()
+
          try:
              cleaned_text = input_text.strip()

-             self.prompt_configs = self.prompt_loader.load_prompts(
-                 self.PROMPT_FILE,
-                 self.USE_MODES,
-                 self.MODE,
+             prompt_configs = prompt_loader.load_prompts(
+                 prompt_file,
+                 use_modes,
+                 mode,
                  cleaned_text,
                  **extra_kwargs,
              )

              messages: list[dict[str, str]] = []

-             if self.WITH_ANALYSIS:
-                 analysis = self._analyze()
+             if with_analysis:
+                 analysis = self._analyze(prompt_configs)
                  messages.append(
                      self._build_user_message(f"Based on this analysis: {analysis}")
                  )

-             messages.append(self._build_main_message())
-             messages = self.formatter.format(messages)
+             if output_lang:
+                 messages.append(
+                     self._build_user_message(
+                         f"Respond only in the {output_lang} language."
+                     )
+                 )
+
+             messages.append(self._build_user_message(prompt_configs["main_template"]))
+
+             messages = formatter.format(messages)

-             if self.RESP_FORMAT == "vllm":
-                 parsed = self._vllm_completion(messages)
-             elif self.RESP_FORMAT == "parse":
-                 parsed = self._parse_completion(messages)
+             if resp_format == "vllm":
+                 parsed, completion = self._vllm_completion(
+                     messages, output_model, logprobs, top_logprobs
+                 )
+             elif resp_format == "parse":
+                 parsed, completion = self._parse_completion(
+                     messages, output_model, logprobs, top_logprobs
+                 )

              results = {"result": parsed.result}

-             if self.WITH_ANALYSIS:
+             if logprobs:
+                 results["logprobs"] = self._extract_logprobs(completion)
+
+             if with_analysis:
                  results["analysis"] = analysis

              return results
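
The net effect of the Operator rework above: the per-tool subclass attributes (PROMPT_FILE, OUTPUT_MODEL, WITH_ANALYSIS, USE_MODES, MODE, RESP_FORMAT) are gone, and run() now takes the same options as arguments, plus the new output_lang and logprobs switches. A rough before/after sketch of a call site (SummarizeOperator is a hypothetical 1.0.2-style subclass; the model name and prompt pairing are illustrative):

from openai import OpenAI

from texttools.tools.internals.operator import Operator
from texttools.tools.internals.output_models import StrOutput  # path assumed from the file list

# 1.0.2 style (removed): configuration lived on a subclass
# class SummarizeOperator(Operator):
#     PROMPT_FILE = "summarizer.yaml"
#     OUTPUT_MODEL = StrOutput
#     RESP_FORMAT = "parse"
# result = SummarizeOperator(OpenAI(), model="...").run("text to summarize")

# 1.0.4 style: one generic Operator, everything passed per call
op = Operator(OpenAI(), model="gpt-4o-mini")  # model name is illustrative
result = op.run(
    "text to summarize",
    prompt_file="summarizer.yaml",
    output_model=StrOutput,
    resp_format="parse",
    output_lang="English",   # new: appends a "respond only in ..." instruction
    logprobs=True,           # new: token probabilities returned under "logprobs"
)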
texttools/tools/{output_models.py → internals/output_models.py}
@@ -11,6 +11,14 @@ class StrOutput(BaseModel):
      result: str


+ class BoolOutput(BaseModel):
+     """
+     Output model for a single boolean result.
+     """
+
+     result: bool
+
+
  class ListStrOutput(BaseModel):
      """
      Output model for a list of strings result.
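
The new BoolOutput model above gives tools a structured {"result": <bool>} response shape. A short, hypothetical sketch of using it through the reworked Operator (pairing it with question_detector.yaml is an assumption; any yes/no-style prompt in texttools/prompts/ would be used the same way):

from openai import OpenAI

from texttools.tools.internals.operator import Operator
from texttools.tools.internals.output_models import BoolOutput  # path assumed from the file list

op = Operator(OpenAI(), model="gpt-4o-mini")  # model name is illustrative

verdict = op.run(
    "Is this sentence a question?",
    prompt_file="question_detector.yaml",  # pairing is an assumption
    output_model=BoolOutput,
)
print(verdict["result"])  # True or False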
texttools/tools/{prompt_loader.py → internals/prompt_loader.py}
@@ -1,4 +1,3 @@
- from typing import Optional
  from pathlib import Path
  import yaml

@@ -25,16 +24,17 @@ class PromptLoader:
      MAIN_TEMPLATE: str = "main_template"
      ANALYZE_TEMPLATE: str = "analyze_template"

-     def __init__(self, prompts_dir: Optional[str] = None):
-         self.PROMPTS_DIR = prompts_dir or "prompts"
-
-     def _get_prompt_path(self, prompt_file: str) -> Path:
-         return Path(__file__).parent.parent / self.PROMPTS_DIR / prompt_file
+     def _get_prompt_path(self, prompt_file: str, prompts_dir: str) -> Path:
+         return Path(__file__).parent.parent.parent / prompts_dir / prompt_file

      def _load_templates(
-         self, prompt_file: str, use_modes: bool, mode: str
+         self,
+         prompts_dir: str,
+         prompt_file: str,
+         use_modes: bool,
+         mode: str,
      ) -> dict[str, str]:
-         prompt_path = self._get_prompt_path(prompt_file)
+         prompt_path = self._get_prompt_path(prompt_file, prompts_dir)

          if not prompt_path.exists():
              raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
@@ -45,18 +45,13 @@ class PromptLoader:
          except yaml.YAMLError as e:
              raise ValueError(f"Invalid YAML in {prompt_path}: {e}")

-         if self.MAIN_TEMPLATE not in data:
-             raise ValueError(
-                 f"Missing required '{self.MAIN_TEMPLATE}' in {prompt_file}"
-             )
-
          return {
-             self.MAIN_TEMPLATE: data[self.MAIN_TEMPLATE][mode]
+             "main_template": data["main_template"][mode]
              if use_modes
-             else data[self.MAIN_TEMPLATE],
-             self.ANALYZE_TEMPLATE: data.get(self.ANALYZE_TEMPLATE)[mode]
+             else data["main_template"],
+             "analyze_template": data.get("analyze_template")[mode]
              if use_modes
-             else data.get(self.ANALYZE_TEMPLATE),
+             else data.get("analyze_template"),
          }

      def _build_format_args(self, input_text: str, **extra_kwargs) -> dict[str, str]:
@@ -72,9 +67,12 @@ class PromptLoader:
          use_modes: bool,
          mode: str,
          input_text: str,
+         prompts_dir: str = "prompts",
          **extra_kwargs,
      ) -> dict[str, str]:
-         template_configs = self._load_templates(prompt_file, use_modes, mode)
+         template_configs = self._load_templates(
+             prompts_dir, prompt_file, use_modes, mode
+         )
          format_args = self._build_format_args(input_text, **extra_kwargs)

          # Inject variables inside each template
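
Finally, a hedged sketch of calling the reworked PromptLoader directly: prompts_dir is no longer a constructor argument but a per-call keyword that defaults to "prompts", and _get_prompt_path gains an extra .parent because the module now lives one level deeper under tools/internals/. The prompt file name and input text below are illustrative:

from texttools.tools.internals.prompt_loader import PromptLoader

loader = PromptLoader()  # no prompts_dir in the constructor any more

configs = loader.load_prompts(
    "categorizer.yaml",       # file name taken from texttools/prompts/
    False,                    # use_modes
    "",                       # mode (ignored when use_modes is False)
    "text to categorize",     # input_text, injected into the templates
)
print(configs["main_template"])
print(configs["analyze_template"])  # may be None if the YAML defines no analyze_template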