PyPI - hamtaa-texttools - Versions diffs - 1.1.1__py3-none-any.whl → 1.1.16__py3-none-any.whl - Mend

hamtaa-texttools 1.1.1__py3-none-any.whl → 1.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

{hamtaa_texttools-1.1.1.dist-info → hamtaa_texttools-1.1.16.dist-info}/METADATA +98 -26
hamtaa_texttools-1.1.16.dist-info/RECORD +31 -0
texttools/__init__.py +6 -8
texttools/batch/batch_config.py +26 -0
texttools/batch/batch_runner.py +105 -151
texttools/batch/{batch_manager.py → internals/batch_manager.py} +39 -40
texttools/batch/internals/utils.py +16 -0
texttools/prompts/README.md +4 -4
texttools/prompts/categorize.yaml +77 -0
texttools/prompts/detect_entity.yaml +22 -0
texttools/prompts/extract_keywords.yaml +68 -18
texttools/tools/async_tools.py +804 -0
texttools/tools/internals/async_operator.py +90 -69
texttools/tools/internals/models.py +183 -0
texttools/tools/internals/operator_utils.py +54 -0
texttools/tools/internals/prompt_loader.py +13 -14
texttools/tools/internals/sync_operator.py +201 -0
texttools/tools/sync_tools.py +804 -0
hamtaa_texttools-1.1.1.dist-info/RECORD +0 -30
texttools/batch/__init__.py +0 -4
texttools/prompts/categorizer.yaml +0 -28
texttools/tools/__init__.py +0 -4
texttools/tools/async_the_tool.py +0 -414
texttools/tools/internals/base_operator.py +0 -91
texttools/tools/internals/operator.py +0 -179
texttools/tools/internals/output_models.py +0 -59
texttools/tools/the_tool.py +0 -412
{hamtaa_texttools-1.1.1.dist-info → hamtaa_texttools-1.1.16.dist-info}/WHEEL +0 -0
{hamtaa_texttools-1.1.1.dist-info → hamtaa_texttools-1.1.16.dist-info}/licenses/LICENSE +0 -0
{hamtaa_texttools-1.1.1.dist-info → hamtaa_texttools-1.1.16.dist-info}/top_level.txt +0 -0

texttools/batch/{batch_manager.py → internals/batch_manager.py} RENAMED Viewed

@@ -1,19 +1,20 @@
 import json
 import uuid
 from pathlib import Path
-from typing import Any, Type
+from typing import Any, Type, TypeVar
 import logging
 from pydantic import BaseModel
 from openai import OpenAI
 from openai.lib._pydantic import to_strict_json_schema
-# Configure logger
-logger = logging.getLogger("batch_runner")
-logger.setLevel(logging.INFO)
+# Base Model type for output models
+T = TypeVar("T", bound=BaseModel)
+logger = logging.getLogger("texttools.batch_manager")
-class SimpleBatchManager:
+class BatchManager:
     """
     Manages batch processing jobs for OpenAI's chat completions with structured outputs.
@@ -26,30 +27,29 @@ class SimpleBatchManager:
         self,
         client: OpenAI,
         model: str,
-        output_model: Type[BaseModel],
+        output_model: Type[T],
         prompt_template: str,
-        handlers: list[Any] | None = None,
         state_dir: Path = Path(".batch_jobs"),
         custom_json_schema_obj_str: dict | None = None,
         **client_kwargs: Any,
     ):
-        self.client = client
-        self.model = model
-        self.output_model = output_model
-        self.prompt_template = prompt_template
-        self.handlers = handlers or []
-        self.state_dir = state_dir
-        self.state_dir.mkdir(parents=True, exist_ok=True)
-        self.custom_json_schema_obj_str = custom_json_schema_obj_str
-        self.client_kwargs = client_kwargs
-        self.dict_input = False
-        if self.custom_json_schema_obj_str:
-            if self.custom_json_schema_obj_str is not dict:
-                raise ValueError("schema should be a dict")
+        self._client = client
+        self._model = model
+        self._output_model = output_model
+        self._prompt_template = prompt_template
+        self._state_dir = state_dir
+        self._custom_json_schema_obj_str = custom_json_schema_obj_str
+        self._client_kwargs = client_kwargs
+        self._dict_input = False
+        self._state_dir.mkdir(parents=True, exist_ok=True)
+        if custom_json_schema_obj_str and not isinstance(
+            custom_json_schema_obj_str, dict
+        ):
+            raise ValueError("Schema should be a dict")
     def _state_file(self, job_name: str) -> Path:
-        return self.state_dir / f"{job_name}.json"
+        return self._state_dir / f"{job_name}.json"
     def _load_state(self, job_name: str) -> list[dict[str, Any]]:
         """
@@ -83,17 +83,17 @@ class SimpleBatchManager:
         """
         response_format_config: dict[str, Any]
-        if self.custom_json_schema_obj_str:
+        if self._custom_json_schema_obj_str:
             response_format_config = {
                 "type": "json_schema",
-                "json_schema": self.custom_json_schema_obj_str,
+                "json_schema": self._custom_json_schema_obj_str,
             }
         else:
-            raw_schema = to_strict_json_schema(self.output_model)
+            raw_schema = to_strict_json_schema(self._output_model)
             response_format_config = {
                 "type": "json_schema",
                 "json_schema": {
-                    "name": self.output_model.__name__,
+                    "name": self._output_model.__name__,
                     "schema": raw_schema,
                 },
             }
@@ -105,11 +105,11 @@ class SimpleBatchManager:
             "body": {
                 "model": self.model,
                 "messages": [
-                    {"role": "system", "content": self.prompt_template},
+                    {"role": "system", "content": self._prompt_template},
                     {"role": "user", "content": text},
                 ],
                 "response_format": response_format_config,
-                **self.client_kwargs,
+                **self._client_kwargs,
             },
         }
@@ -127,10 +127,10 @@ class SimpleBatchManager:
         else:
             raise TypeError(
-                "The input must be either a list of texts or a dictionary in the form {'id': str, 'text': str}."
+                "The input must be either a list of texts or a dictionary in the form {'id': str, 'text': str}"
             )
-        file_path = self.state_dir / f"batch_{uuid.uuid4().hex}.jsonl"
+        file_path = self._state_dir / f"batch_{uuid.uuid4().hex}.jsonl"
         with open(file_path, "w", encoding="utf-8") as f:
             for task in tasks:
                 f.write(json.dumps(task) + "\n")
@@ -143,9 +143,10 @@ class SimpleBatchManager:
         """
         if self._load_state(job_name):
             return
         path = self._prepare_file(payload)
-        upload = self.client.files.create(file=open(path, "rb"), purpose="batch")
-        job = self.client.batches.create(
+        upload = self._client.files.create(file=open(path, "rb"), purpose="batch")
+        job = self._client.batches.create(
             input_file_id=upload.id,
             endpoint="/v1/chat/completions",
             completion_window="24h",
@@ -161,7 +162,7 @@ class SimpleBatchManager:
         if not job:
             return "completed"
-        info = self.client.batches.retrieve(job["id"])
+        info = self._client.batches.retrieve(job["id"])
         job = info.to_dict()
         self._save_state(job_name, [job])
         logger.info("Batch job status: %s", job)
@@ -179,18 +180,18 @@ class SimpleBatchManager:
             return {}
         batch_id = job["id"]
-        info = self.client.batches.retrieve(batch_id)
+        info = self._client.batches.retrieve(batch_id)
         out_file_id = info.output_file_id
         if not out_file_id:
             error_file_id = info.error_file_id
             if error_file_id:
                 err_content = (
-                    self.client.files.content(error_file_id).read().decode("utf-8")
+                    self._client.files.content(error_file_id).read().decode("utf-8")
                 )
-                logger.info("Error file content:", err_content)
+                logger.error("Error file content:", err_content)
             return {}
-        content = self.client.files.content(out_file_id).read().decode("utf-8")
+        content = self._client.files.content(out_file_id).read().decode("utf-8")
         lines = content.splitlines()
         results = {}
         log = []
@@ -201,7 +202,7 @@ class SimpleBatchManager:
                 content = result["response"]["body"]["choices"][0]["message"]["content"]
                 try:
                     parsed_content = json.loads(content)
-                    model_instance = self.output_model(**parsed_content)
+                    model_instance = self._output_model(**parsed_content)
                     results[custom_id] = model_instance.model_dump(mode="json")
                 except json.JSONDecodeError:
                     results[custom_id] = {"error": "Failed to parse content as JSON"}
@@ -221,8 +222,6 @@ class SimpleBatchManager:
                 error_d = {custom_id: results[custom_id]}
                 log.append(error_d)
-        for handler in self.handlers:
-            handler.handle(results)
         if remove_cache:
             self._clear_state(job_name)

texttools/batch/internals/utils.py ADDED Viewed

@@ -0,0 +1,16 @@
+from typing import Any
+def export_data(data) -> list[dict[str, str]]:
+    """
+    Produces a structure of the following form from an initial data structure:
+    [{"id": str, "text": str},...]
+    """
+    return data
+def import_data(data) -> Any:
+    """
+    Takes the output and adds and aggregates it to the original structure.
+    """
+    return data

texttools/prompts/README.md CHANGED Viewed

@@ -14,15 +14,15 @@ This folder contains YAML files for all prompts used in the project. Each file r
 ### Example YAML Structure
 ```yaml
 main_template:
-  default: |
+  mode_1: |
     Your main instructions here with placeholders like {input}.
-  reason: |
+  mode_2: |
     Optional reasoning instructions here.
 analyze_template:
-  default: |
+  mode_1: |
     Analyze and summarize the input.
-  reason: |
+  mode_2: |
     Optional detailed analysis template.
 ```

texttools/prompts/categorize.yaml ADDED Viewed

@@ -0,0 +1,77 @@
+main_template:
+  category_list: |
+    You are an expert classification agent.
+    You receive a list of categories.
+    Your task:
+    - Read all provided categories carefully.
+    - Consider the user query, intent, and task explanation.
+    - Select exactly one category name from the list that best matches the user’s intent.
+    - Return only the category name, nothing else.
+    Rules:
+    - Never invent categories that are not in the list.
+    - If multiple categories seem possible, choose the closest match based on the description and user intent.
+    - If descriptions are missing or empty, rely on the category name.
+    - If the correct answer cannot be determined with certainty, choose the most likely one.
+    Output format:
+    {{
+    "reason": "Explanation of why the input belongs to the category"
+    "result": "<category_name_only>"
+    }}
+    Available categories with their descriptions:
+    {category_list}
+    The text that has to be categorized:
+    {input}
+  category_tree: |
+    You are an expert classification agent.
+    You receive a list of categories at the current level of a hierarchical category tree.
+    Your task:
+    - Read all provided categories carefully.
+    - Consider the user query, intent, and task explanation.
+    - Select exactly one category name from the list that best matches the user’s intent.
+    - Return only the category name, nothing else.
+    Rules:
+    - Never invent categories that are not in the list.
+    - If multiple categories seem possible, choose the closest match based on the description and user intent.
+    - If descriptions are missing or empty, rely on the category name.
+    - If the correct answer cannot be determined with certainty, choose the most likely one.
+    Output format:
+    {{
+    "reason": "Explanation of why the input belongs to the category"
+    "result": "<category_name_only>"
+    }}
+    Available categories with their descriptions at this level:
+    {category_list}
+    Do not include category descriptions at all. Only write the raw category.
+    The text that has to be categorized:
+    {input}
+analyze_template:
+  category_list: |
+    We want to categorize the given text.
+    To improve categorization, we need an analysis of the text.
+    Analyze the given text and write its main idea and a short analysis of that.
+    Analysis should be very short.
+    Text:
+    {input}
+  category_tree: |
+    We want to categorize the given text.
+    To improve categorization, we need an analysis of the text.
+    Analyze the given text and write its main idea and a short analysis of that.
+    Analysis should be very short.
+    Text:
+    {input}

texttools/prompts/detect_entity.yaml ADDED Viewed

@@ -0,0 +1,22 @@
+main_template: |
+  You are an expert Named Entity Recognition (NER) system. Extract entities from the text.
+  The output must strictly follow the provided Pydantic schema.
+  Mapping Rule:
+  - Person: شخص
+  - Location: مکان
+  - Time: زمان
+  - Living Beings: موجود زنده
+  - Organization: سازمان
+  - Concept: مفهوم
+  CRITICAL:
+  1. The final output structure must be a complete JSON object matching the Pydantic schema (List[Entity]).
+  2. Both the extracted text and the type must be in Persian, using the exact mapping provided above.
+  Here is the text: {input}
+analyze_template: |
+  Analyze the following text to identify all potential named entities and their categories (Person, Location, Time, Living Beings, Organization, Concept).
+  Provide a brief summary of the entities identified that will help the main process to extract them accurately and apply the correct Persian type label.
+  Here is the text: {input}

texttools/prompts/extract_keywords.yaml CHANGED Viewed

@@ -1,18 +1,68 @@
-main_template: |
-  You are an expert keyword extractor.
-  Extract the most relevant keywords from the given text.
-  Guidelines:
-  - Keywords must represent the main concepts of the text.
-  - If two words have overlapping meanings, choose only one.
-  - Do not include generic or unrelated words.
-  - Keywords must be single, self-contained words (no phrases).
-  - Output between 3 and 7 keywords based on the input length.
-  - Respond only in JSON format:
-  {{"result": ["keyword1", "keyword2", etc.]}}
-  Here is the text:
-  {input}
-analyze_template: |
-  Analyze the following text to identify its main topics, concepts, and important terms.
-  Provide a concise summary of your findings that will help in extracting relevant keywords.
-  {input}
+main_template:
+  auto: |
+    You are an expert keyword extractor.
+    Extract the most relevant keywords from the given text.
+    Guidelines:
+    - Keywords must represent the main concepts of the text.
+    - If two words have overlapping meanings, choose only one.
+    - Do not include generic or unrelated words.
+    - Keywords must be single, self-contained words (no phrases).
+    - Output between 3 and 7 keywords based on the input length.
+    - Respond only in JSON format:
+    {{"result": ["keyword1", "keyword2", etc.]}}
+    Here is the text:
+    {input}
+  threshold: |
+    You are an expert keyword extractor specialized in fine-grained concept identification.
+    Extract the most specific, content-bearing keywords from the text.
+    Requirements:
+    - Choose fine-grained conceptual terms, not general domain labels.
+    - Avoid words that only describe the broad topic (e.g., Islam, religion, philosophy, history).
+    - Prefer specific names, concepts, doctrines, events, arguments, or terminology.
+    - Do not select words only because they appear frequently. A keyword must represent a central conceptual idea, not a repeated surface term.
+    - If multiple words express overlapping meaning, select the more specific one.
+    - Keywords must be single words (no multi-word expressions).
+    - Extract N keywords depending on input length:
+      - Short texts (a few sentences): 3 keywords
+      - Medium texts (1–4 paragraphs): 4–5 keywords
+      - Long texts (more than 4 paragraphs): 6–7 keywords
+    - Respond only in JSON format:
+    {{"result": ["keyword1", "keyword2", etc.]}}
+    Here is the text:
+    {input}
+  count: |
+    You are an expert keyword extractor with precise output requirements.
+    Extract exactly {number_of_keywords} keywords from the given text.
+    Requirements:
+    - Extract exactly {number_of_keywords} keywords, no more, no less.
+    - Select the {number_of_keywords} most relevant and specific keywords that represent core concepts.
+    - Prefer specific terms, names, and concepts over general topic labels.
+    - If the text doesn't contain enough distinct keywords, include the most relevant ones even if some are less specific.
+    - Keywords must be single words (no multi-word expressions).
+    - Order keywords by relevance (most relevant first).
+    - Respond only in JSON format:
+    {{"result": ["keyword1", "keyword2", "keyword3", ...]}}
+    Here is the text:
+    {input}
+analyze_template:
+  auto: |
+    Analyze the following text to identify its main topics, concepts, and important terms.
+    Provide a concise summary of your findings that will help in extracting relevant keywords.
+    {input}
+  threshold: |
+    Analyze the following text to identify its main topics, concepts, and important terms.
+    Provide a concise summary of your findings that will help in extracting relevant keywords.
+    {input}
+  count: |
+    Analyze the following text to identify its main topics, concepts, and important terms.
+    Provide a concise summary of your findings that will help in extracting relevant keywords.
+    {input}

hamtaa-texttools 1.1.1__py3-none-any.whl → 1.1.16__py3-none-any.whl

hamtaa-texttools 1.1.1__py3-none-any.whl → 1.1.16__py3-none-any.whl