hamtaa-texttools 0.1.48__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of hamtaa-texttools might be problematic.

Files changed (86)
  1. hamtaa_texttools-1.1.7.dist-info/METADATA +228 -0
  2. hamtaa_texttools-1.1.7.dist-info/RECORD +30 -0
  3. hamtaa_texttools-1.1.7.dist-info/licenses/LICENSE +21 -0
  4. texttools/__init__.py +4 -26
  5. texttools/batch/__init__.py +3 -0
  6. texttools/{utils/batch_manager → batch}/batch_manager.py +226 -241
  7. texttools/batch/batch_runner.py +254 -0
  8. texttools/prompts/README.md +35 -0
  9. texttools/prompts/categorizer.yaml +28 -0
  10. texttools/prompts/extract_entities.yaml +20 -0
  11. texttools/prompts/extract_keywords.yaml +18 -0
  12. texttools/prompts/is_question.yaml +14 -0
  13. texttools/prompts/merge_questions.yaml +46 -0
  14. texttools/prompts/rewrite.yaml +111 -0
  15. texttools/prompts/run_custom.yaml +7 -0
  16. texttools/prompts/subject_to_question.yaml +22 -0
  17. texttools/prompts/summarize.yaml +14 -0
  18. texttools/prompts/text_to_question.yaml +20 -0
  19. texttools/prompts/translate.yaml +15 -0
  20. texttools/tools/__init__.py +4 -33
  21. texttools/tools/async_the_tool.py +435 -0
  22. texttools/tools/internals/async_operator.py +242 -0
  23. texttools/tools/internals/base_operator.py +100 -0
  24. texttools/tools/internals/formatters.py +24 -0
  25. texttools/tools/internals/operator.py +242 -0
  26. texttools/tools/internals/output_models.py +62 -0
  27. texttools/tools/internals/prompt_loader.py +60 -0
  28. texttools/tools/the_tool.py +433 -0
  29. hamtaa_texttools-0.1.48.dist-info/METADATA +0 -60
  30. hamtaa_texttools-0.1.48.dist-info/RECORD +0 -61
  31. texttools/base/__init__.py +0 -3
  32. texttools/base/base_categorizer.py +0 -40
  33. texttools/base/base_keyword_extractor.py +0 -35
  34. texttools/base/base_ner_extractor.py +0 -61
  35. texttools/base/base_question_detector.py +0 -35
  36. texttools/base/base_question_generator.py +0 -99
  37. texttools/base/base_question_merger.py +0 -59
  38. texttools/base/base_question_rewriter.py +0 -61
  39. texttools/base/base_router.py +0 -33
  40. texttools/base/base_summarizer.py +0 -55
  41. texttools/base/base_task_performer.py +0 -53
  42. texttools/base/base_translator.py +0 -38
  43. texttools/formatter/__init__.py +0 -1
  44. texttools/formatter/base.py +0 -26
  45. texttools/formatter/gemma3_formatter.py +0 -54
  46. texttools/handlers/__init__.py +0 -6
  47. texttools/handlers/categorizer/__init__.py +0 -6
  48. texttools/handlers/categorizer/categorizer.py +0 -61
  49. texttools/handlers/handlers.py +0 -88
  50. texttools/tools/categorizer/__init__.py +0 -2
  51. texttools/tools/categorizer/encoder_model/__init__.py +0 -1
  52. texttools/tools/categorizer/encoder_model/encoder_vectorizer.py +0 -51
  53. texttools/tools/categorizer/llm/__init__.py +0 -2
  54. texttools/tools/categorizer/llm/gemma_categorizer.py +0 -169
  55. texttools/tools/categorizer/llm/openai_categorizer.py +0 -80
  56. texttools/tools/keyword_extractor/__init__.py +0 -1
  57. texttools/tools/keyword_extractor/gemma_extractor.py +0 -138
  58. texttools/tools/merger/__init__.py +0 -2
  59. texttools/tools/merger/gemma_question_merger.py +0 -214
  60. texttools/tools/ner/__init__.py +0 -1
  61. texttools/tools/ner/gemma_ner_extractor.py +0 -157
  62. texttools/tools/question_detector/__init__.py +0 -2
  63. texttools/tools/question_detector/gemma_detector.py +0 -114
  64. texttools/tools/question_detector/llm_detector.py +0 -112
  65. texttools/tools/question_generator/__init__.py +0 -1
  66. texttools/tools/question_generator/gemma_question_generator.py +0 -198
  67. texttools/tools/reranker/__init__.py +0 -3
  68. texttools/tools/reranker/reranker.py +0 -137
  69. texttools/tools/reranker/scorer.py +0 -216
  70. texttools/tools/reranker/sorter.py +0 -278
  71. texttools/tools/rewriter/__init__.py +0 -2
  72. texttools/tools/rewriter/gemma_question_rewriter.py +0 -213
  73. texttools/tools/router/__init__.py +0 -0
  74. texttools/tools/router/gemma_router.py +0 -169
  75. texttools/tools/subject_to_question/__init__.py +0 -1
  76. texttools/tools/subject_to_question/gemma_question_generator.py +0 -224
  77. texttools/tools/summarizer/__init__.py +0 -2
  78. texttools/tools/summarizer/gemma_summarizer.py +0 -140
  79. texttools/tools/summarizer/llm_summerizer.py +0 -108
  80. texttools/tools/translator/__init__.py +0 -1
  81. texttools/tools/translator/gemma_translator.py +0 -189
  82. texttools/utils/batch_manager/__init__.py +0 -2
  83. texttools/utils/batch_manager/batch_runner.py +0 -207
  84. texttools/utils/flex_processor.py +0 -78
  85. {hamtaa_texttools-0.1.48.dist-info → hamtaa_texttools-1.1.7.dist-info}/WHEEL +0 -0
  86. {hamtaa_texttools-0.1.48.dist-info → hamtaa_texttools-1.1.7.dist-info}/top_level.txt +0 -0
texttools/batch/batch_runner.py
@@ -0,0 +1,254 @@
+ import json
+ import os
+ import time
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Callable, Type, TypeVar
+ import logging
+
+ from dotenv import load_dotenv
+ from openai import OpenAI
+ from pydantic import BaseModel
+
+ from texttools.batch.batch_manager import BatchManager
+ from texttools.tools.internals.output_models import StrOutput
+
+ # Base model type for output models
+ T = TypeVar("T", bound=BaseModel)
+
+ logger = logging.getLogger("texttools.batch_runner")
+
+
+ def export_data(data) -> list[dict[str, str]]:
+     """
+     Produces a structure of the form [{"id": str, "content": str}, ...]
+     from the initial data structure.
+     """
+     return data
+
+
+ def import_data(data) -> Any:
+     """
+     Merges the batch output back into the original data structure.
+     """
+     return data
+
+
+ @dataclass
+ class BatchConfig:
+     """
+     Configuration for the batch job runner.
+     """
+
+     system_prompt: str = ""
+     job_name: str = ""
+     input_data_path: str = ""
+     output_data_filename: str = ""
+     model: str = "gpt-4.1-mini"
+     MAX_BATCH_SIZE: int = 100
+     MAX_TOTAL_TOKENS: int = 2_000_000
+     CHARS_PER_TOKEN: float = 2.7
+     PROMPT_TOKEN_MULTIPLIER: int = 1_000
+     BASE_OUTPUT_DIR: str = "Data/batch_entity_result"
+     import_function: Callable = import_data
+     export_function: Callable = export_data
+     poll_interval_seconds: int = 30
+     max_retries: int = 3
+
+
+ class BatchJobRunner:
+     """
+     Handles running batch jobs using a batch manager and configuration.
+     """
+
+     def __init__(
+         self, config: BatchConfig = BatchConfig(), output_model: Type[T] = StrOutput
+     ):
+         self.config = config
+         self.system_prompt = config.system_prompt
+         self.job_name = config.job_name
+         self.input_data_path = config.input_data_path
+         self.output_data_filename = config.output_data_filename
+         self.model = config.model
+         self.output_model = output_model
+         self.manager = self._init_manager()
+         self.data = self._load_data()
+         self.parts: list[list[dict[str, Any]]] = []
+         self._partition_data()
+         Path(self.config.BASE_OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
+         # Map part index to job name
+         self.part_idx_to_job_name: dict[int, str] = {}
+         # Track retry attempts per part
+         self.part_attempts: dict[int, int] = {}
+
+     def _init_manager(self) -> BatchManager:
+         load_dotenv()
+         api_key = os.getenv("OPENAI_API_KEY")
+         client = OpenAI(api_key=api_key)
+         return BatchManager(
+             client=client,
+             model=self.model,
+             prompt_template=self.system_prompt,
+             output_model=self.output_model,
+         )
+
+     def _load_data(self):
+         with open(self.input_data_path, "r", encoding="utf-8") as f:
+             data = json.load(f)
+         data = self.config.export_function(data)
+
+         # Ensure data is a list of dicts with 'id' and 'content' as strings
+         if not isinstance(data, list):
+             raise ValueError(
+                 "Exported data must be a list of dicts with 'id' and 'content' keys"
+             )
+         for item in data:
+             if not (isinstance(item, dict) and "id" in item and "content" in item):
+                 raise ValueError(
+                     f"Item must be a dict with 'id' and 'content' keys. Got: {type(item)}"
+                 )
+             if not (isinstance(item["id"], str) and isinstance(item["content"], str)):
+                 raise ValueError("'id' and 'content' must be strings.")
+         return data
+
+     def _partition_data(self):
+         total_length = sum(len(item["content"]) for item in self.data)
+         prompt_length = len(self.system_prompt)
+         total = total_length + (prompt_length * len(self.data))
+         calculation = total / self.config.CHARS_PER_TOKEN
+         logger.info(
+             f"Total chars: {total_length}, Prompt chars: {prompt_length}, Total: {total}, Tokens: {calculation}"
+         )
+         if calculation < self.config.MAX_TOTAL_TOKENS:
+             self.parts = [self.data]
+         else:
+             # Partition into chunks of MAX_BATCH_SIZE
+             self.parts = [
+                 self.data[i : i + self.config.MAX_BATCH_SIZE]
+                 for i in range(0, len(self.data), self.config.MAX_BATCH_SIZE)
+             ]
+         logger.info(f"Data split into {len(self.parts)} part(s)")
+
+     def _submit_all_jobs(self) -> None:
+         for idx, part in enumerate(self.parts):
+             if self._result_exists(idx):
+                 logger.info(f"Skipping part {idx + 1}: result already exists.")
+                 continue
+             part_job_name = (
+                 f"{self.job_name}_part_{idx + 1}"
+                 if len(self.parts) > 1
+                 else self.job_name
+             )
+             # If a job with this name already exists, register it and skip submitting
+             existing_job = self.manager._load_state(part_job_name)
+             if existing_job:
+                 logger.info(
+                     f"Skipping part {idx + 1}: job already exists ({part_job_name})."
+                 )
+                 self.part_idx_to_job_name[idx] = part_job_name
+                 self.part_attempts.setdefault(idx, 0)
+                 continue
+
+             payload = part
+             logger.info(
+                 f"Submitting job for part {idx + 1}/{len(self.parts)}: {part_job_name}"
+             )
+             self.manager.start(payload, job_name=part_job_name)
+             self.part_idx_to_job_name[idx] = part_job_name
+             self.part_attempts.setdefault(idx, 0)
+             # Give the file time to upload before starting the next part
+             logger.info("Uploading...")
+             time.sleep(30)
+
+     def _save_results(
+         self,
+         output_data: list[dict[str, Any]] | dict[str, Any],
+         log: list[Any],
+         part_idx: int,
+     ):
+         part_suffix = f"_part_{part_idx + 1}" if len(self.parts) > 1 else ""
+         result_path = (
+             Path(self.config.BASE_OUTPUT_DIR)
+             / f"{Path(self.output_data_filename).stem}{part_suffix}.json"
+         )
+         if not output_data:
+             logger.info("No output data to save. Skipping this part.")
+             return
+         with open(result_path, "w", encoding="utf-8") as f:
+             json.dump(output_data, f, ensure_ascii=False, indent=4)
+         if log:
+             log_path = (
+                 Path(self.config.BASE_OUTPUT_DIR)
+                 / f"{Path(self.output_data_filename).stem}{part_suffix}_log.json"
+             )
+             with open(log_path, "w", encoding="utf-8") as f:
+                 json.dump(log, f, ensure_ascii=False, indent=4)
+
+     def _result_exists(self, part_idx: int) -> bool:
+         part_suffix = f"_part_{part_idx + 1}" if len(self.parts) > 1 else ""
+         result_path = (
+             Path(self.config.BASE_OUTPUT_DIR)
+             / f"{Path(self.output_data_filename).stem}{part_suffix}.json"
+         )
+         return result_path.exists()
+
+     def run(self):
+         """
+         Execute the batch job processing pipeline.
+
+         Submits jobs, monitors progress, handles retries, and saves results.
+         """
+         # Submit all jobs up-front for concurrent execution
+         self._submit_all_jobs()
+         pending_parts: set[int] = set(self.part_idx_to_job_name.keys())
+         logger.info(f"Pending parts: {sorted(pending_parts)}")
+         # Polling loop
+         while pending_parts:
+             finished_this_round: list[int] = []
+             for part_idx in list(pending_parts):
+                 job_name = self.part_idx_to_job_name[part_idx]
+                 status = self.manager.check_status(job_name=job_name)
+                 logger.info(f"Status for {job_name}: {status}")
+                 if status == "completed":
+                     logger.info(
+                         f"Job completed. Fetching results for part {part_idx + 1}..."
+                     )
+                     output_data, log = self.manager.fetch_results(
+                         job_name=job_name, remove_cache=False
+                     )
+                     output_data = self.config.import_function(output_data)
+                     self._save_results(output_data, log, part_idx)
+                     logger.info(f"Fetched and saved results for part {part_idx + 1}.")
+                     finished_this_round.append(part_idx)
+                 elif status == "failed":
+                     attempt = self.part_attempts.get(part_idx, 0) + 1
+                     self.part_attempts[part_idx] = attempt
+                     if attempt <= self.config.max_retries:
+                         logger.info(
+                             f"Job {job_name} failed (attempt {attempt}). Retrying after short backoff..."
+                         )
+                         self.manager._clear_state(job_name)
+                         time.sleep(10)
+                         payload = self.parts[part_idx]
+                         new_job_name = (
+                             f"{self.job_name}_part_{part_idx + 1}_retry_{attempt}"
+                         )
+                         self.manager.start(payload, job_name=new_job_name)
+                         self.part_idx_to_job_name[part_idx] = new_job_name
+                     else:
+                         logger.info(
+                             f"Job {job_name} failed after {attempt - 1} retries. Marking as failed."
+                         )
+                         finished_this_round.append(part_idx)
+                 else:
+                     # Still running or queued
+                     continue
+             # Remove finished parts
+             for part_idx in finished_this_round:
+                 pending_parts.discard(part_idx)
+             if pending_parts:
+                 logger.info(
+                     f"Waiting {self.config.poll_interval_seconds}s before next status check for parts: {sorted(pending_parts)}"
+                 )
+                 time.sleep(self.config.poll_interval_seconds)
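The new `texttools/batch/batch_runner.py` above is configuration-driven: a `BatchConfig` points at a JSON input file of `{"id", "content"}` records, optional `export_function`/`import_function` hooks reshape data on the way in and out, and `run()` submits, polls, retries, and saves. A minimal usage sketch under those assumptions; the paths, prompt, and job name below are illustrative, not part of the package:

```python
# Hypothetical usage sketch for BatchJobRunner; paths, prompt, and job
# name are illustrative examples, not values shipped with the package.
from texttools.batch.batch_runner import BatchConfig, BatchJobRunner
from texttools.tools.internals.output_models import StrOutput

config = BatchConfig(
    system_prompt="Summarize the given text.",    # prompt applied to every item
    job_name="summarize_corpus",
    input_data_path="Data/corpus.json",           # JSON list of {"id", "content"} dicts
    output_data_filename="corpus_summaries.json",
    model="gpt-4.1-mini",
    poll_interval_seconds=30,
    max_retries=3,
)

# Requires OPENAI_API_KEY in the environment (loaded via dotenv).
runner = BatchJobRunner(config, output_model=StrOutput)
runner.run()  # submits all parts, polls until completed/failed, saves JSON results
```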
texttools/prompts/README.md
@@ -0,0 +1,35 @@
+ # Prompts
+
+ ## Overview
+ This folder contains YAML files for all prompts used in the project. Each file represents a separate prompt template, which can be loaded by tools or scripts that require structured prompts for AI models.
+
+ ---
+
+ ## Structure
+ - **prompt_file.yaml**: Each YAML file represents a single prompt template.
+ - **main_template**: The main instruction template for the model.
+ - **analyze_template** (optional): A secondary reasoning template used before generating the final response.
+ - **Modes** (optional): Some prompts may have multiple modes (e.g., `default`, `reason`) to allow different behaviors.
+
+ ### Example YAML Structure
+ ```yaml
+ main_template:
+   default: |
+     Your main instructions here with placeholders like {input}.
+   reason: |
+     Optional reasoning instructions here.
+
+ analyze_template:
+   default: |
+     Analyze and summarize the input.
+   reason: |
+     Optional detailed analysis template.
+ ```
+
+ ---
+
+ ## Guidelines
+ 1. **Naming**: Use descriptive names for each YAML file corresponding to the tool or task it serves.
+ 2. **Placeholders**: Use `{input}` or other relevant placeholders to dynamically inject data.
+ 3. **Modes**: If using modes, ensure both `main_template` and `analyze_template` contain the corresponding keys.
+ 4. **Consistency**: Keep formatting consistent across files for easier parsing by scripts.
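The loader that consumes these files (`texttools/tools/internals/prompt_loader.py`) is added in this release but its body is not shown in this diff. The sketch below only illustrates the mode-keyed layout the README describes, assuming PyYAML and the `merge_questions.yaml` file added later in this diff; it is not the package's actual loader:

```python
# Minimal sketch of reading a mode-keyed prompt template, assuming PyYAML.
# Illustrative only; the package's own prompt_loader.py is not shown here.
import yaml

with open("texttools/prompts/merge_questions.yaml", encoding="utf-8") as f:
    prompt = yaml.safe_load(f)

mode = "reason"  # or "default"
analysis_template = prompt["analyze_template"][mode]
main_template = prompt["main_template"][mode]

questions = "1. Who wrote this book? 2. Who is the book's author?"
print(analysis_template.format(input=questions))  # analysis pass first
print(main_template.format(input=questions))      # then the main instruction
```

Note that literal JSON braces in the templates are escaped as `{{ }}`, so `str.format` can inject `{input}` safely.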
texttools/prompts/categorizer.yaml (translated from Persian)
@@ -0,0 +1,28 @@
+ main_template: |
+   You are an expert in religious studies.
+   I will give you a text, and you must
+   classify that text into one of the following categories.
+   Categories:
+   "Religious beliefs",
+   "Islamic ethics",
+   "Rulings and jurisprudence",
+   "Islamic history and figures",
+   "Religious sources",
+   "Religion and society/politics",
+   "Mysticism and spirituality",
+   "None of the above",
+   Respond only in this JSON format:
+   {{
+   "reason": "<briefly state the reason for your choice>",
+   "result": "<one of the categories>"
+   }}
+   Here is the text to classify:
+   {input}
+
+ analyze_template: |
+   We want to classify the text that is provided.
+   To improve the classification, we need an analysis of the text.
+   Analyze the given text and write its main idea and a short analysis of it.
+   The analysis must be very concise,
+   at most 20 words.
+   {input}
texttools/prompts/extract_entities.yaml
@@ -0,0 +1,20 @@
+ main_template: |
+   You are a Named Entity Recognition (NER) extractor.
+   Identify and extract all named entities (e.g., PER, ORG, LOC, DAT, etc.) from the given text.
+   For each entity, provide its text and a clear type.
+   Respond only in JSON format:
+   {{
+     "result": [
+       {{
+         "text": "string",
+         "type": "string"
+       }}
+     ]
+   }}
+   Here is the text:
+   {input}
+
+ analyze_template: |
+   Read the following text and identify any proper nouns, key concepts, or specific mentions that might represent named entities.
+   Provide a brief, summarized analysis that could help in categorizing these entities.
+   {input}
texttools/prompts/extract_keywords.yaml
@@ -0,0 +1,18 @@
+ main_template: |
+   You are an expert keyword extractor.
+   Extract the most relevant keywords from the given text.
+   Guidelines:
+   - Keywords must represent the main concepts of the text.
+   - If two words have overlapping meanings, choose only one.
+   - Do not include generic or unrelated words.
+   - Keywords must be single, self-contained words (no phrases).
+   - Output between 3 and 7 keywords based on the input length.
+   - Respond only in JSON format:
+   {{"result": ["keyword1", "keyword2", etc.]}}
+   Here is the text:
+   {input}
+
+ analyze_template: |
+   Analyze the following text to identify its main topics, concepts, and important terms.
+   Provide a concise summary of your findings that will help in extracting relevant keywords.
+   {input}
texttools/prompts/is_question.yaml
@@ -0,0 +1,14 @@
+ main_template: |
+   You are a question detector.
+   Determine whether the given text contains a question or not.
+   Respond only in JSON format (the output should be a boolean):
+   {{"result": true/false}}
+   Here is the text:
+   {input}
+
+ analyze_template: |
+   We want to analyze this text snippet to see whether it contains a question or request of some kind.
+   Read the text, and reason about whether it is a request or not.
+   Give a summarized, short answer.
+   {input}
+
texttools/prompts/merge_questions.yaml
@@ -0,0 +1,46 @@
+ main_template:
+
+   default: |
+     You are a language expert.
+     I will give you a list of questions that are semantically similar.
+     Your task is to merge them into one unified question.
+     Guidelines:
+     - Preserve all the information and intent from the original questions.
+     - Make the merged question sound natural, fluent, and concise.
+     - Avoid redundancy or unnecessary repetition.
+     - Do not omit any unique idea from the originals.
+     - Respond only in JSON format:
+     {{"result": "string"}}
+     Here are the questions:
+     {input}
+
+   reason: |
+     You are an AI assistant helping to unify semantically similar questions.
+     First, briefly extract the unique intent or content from each input question.
+     Then, write one merged question that combines all their content clearly and naturally, without redundancy.
+     Step 1: Extract key ideas.
+     Step 2: Write the final merged question.
+     Respond only in JSON format:
+     {{"result": "string"}}
+     Here are the questions:
+     {input}
+
+ analyze_template:
+
+   default: |
+     You are a language expert.
+     Analyze the following questions to identify their core intent, key concepts,
+     and the specific information they are seeking.
+     Provide a brief, summarized understanding of the questions' meaning that
+     will help in merging and rephrasing them accurately without changing their intent.
+     Here are the questions:
+     {input}
+
+   reason: |
+     Analyze the following questions to identify their exact wording, phrasing,
+     and the literal meaning they convey.
+     Provide a brief, summarized analysis of their linguistic structure and current meaning,
+     which will then be used to create a new question containing all of their contents.
+     Here are the questions:
+     {input}
+
texttools/prompts/rewrite.yaml
@@ -0,0 +1,111 @@
+ main_template:
+
+   positive: |
+     You are an AI assistant designed to generate high-quality training data for semantic text embedding models.
+     Your task is to create a positive pair for a given "Anchor" text.
+
+     A high-quality positive pair consists of two sentences that are semantically equivalent or highly similar in meaning, but differ in wording, syntax, and sentence structure.
+     They should be paraphrases of each other.
+
+     Instructions:
+     - Preserve Core Meaning: The generated sentence must convey the same key information, intent, and context as the Anchor.
+     - Vary Lexicon: Use different words and phrases (synonyms, related terms).
+     - Vary Syntax: Change the sentence structure (e.g., active to passive voice, change clause order, combine or split sentences).
+     - Maintain Similar Length: The generated sentence should be of roughly the same length and level of detail as the Anchor.
+     - Avoid Minor Changes: Do not just add/remove a few words or swap names. Create a fundamentally different sentence.
+
+     Respond only in JSON format:
+     {{"result": "str"}}
+
+     Anchor Text:
+     "{input}"
+
+   negative: |
+     You are an AI assistant designed to generate high-quality training data for semantic text embedding models.
+     Your task is to create a negative sample for a given "Anchor" text.
+
+     A high-quality negative sample is a sentence that is semantically unrelated to the Anchor's specific question, while staying within the same general domain (religious topics).
+
+     Instructions:
+     - Stay in Domain: The sentence must be about the text's topics, but on a different subject.
+     - Ensure Clear Distinction: The topic should be clearly different from the anchor's specific focus.
+     - Maintain Similar Length: The generated sentence should be of roughly the same length and level of detail as the Anchor.
+
+     Respond only in JSON format:
+     {{"result": "str"}}
+
+     Anchor Text:
+     "{input}"
+
+   hard_negative: |
+     You are an AI assistant designed to generate high-quality training data for semantic text embedding models.
+     Your task is to create a hard-negative sample for a given "Anchor" text.
+
+     A high-quality hard-negative sample is a sentence that is topically related but semantically distinct from the Anchor.
+     It should share some context (e.g., same domain, same entities) but differ in a crucial piece of information, action, conclusion, or specific detail.
+
+     Instructions:
+     - Stay in General Domain: Remain in the same broad domain (e.g., religious topics), but choose a completely different subject matter.
+     - Maintain Topical Overlap: Keep the same domain, subject, or entities (e.g., people, products, concepts) as the Anchor.
+     - Alter a Key Semantic Element: Change a key word, condition, place, or proper name so that the meaning of the sentence is completely reversed.
+     - Avoid Being a Paraphrase: The sentence must NOT be semantically equivalent. The core factual claim or intent must be different.
+     - Make it Challenging: The difference should be subtle enough that it requires a deep understanding of the text to identify, not just a simple keyword mismatch.
+     - Maintain Similar Length: The generated sentence should be of roughly the same length and level of detail as the Anchor.
+
+     Respond only in JSON format:
+     {{"result": "str"}}
+
+     Anchor Text:
+     "{input}"
+
+
+ analyze_template:
+
+   positive: |
+     Analyze the following text to understand its CORE SEMANTIC MEANING for creating a high-quality POSITIVE sample.
+
+     Focus on:
+     - Core Intent: What is the fundamental question or statement being made?
+     - Key Entities/Concepts: What are the main subjects, objects, and concepts?
+     - Semantic Relationships: How are the entities related?
+     - Context & Domain: What is the broader context and domain?
+
+     Your analysis should capture the ESSENTIAL MEANING that must be preserved in any paraphrase.
+
+     Text:
+     {input}
+
+   negative: |
+     Analyze the following text to identify its SPECIFIC TOPIC and DOMAIN for creating a high-quality NEGATIVE sample.
+
+     Focus on:
+     - Specific Topic: What exact subject is this text about?
+     - Domain Context: What broader domain does this belong to?
+     - Key Elements to AVOID: What concepts, entities, or phrases must NOT appear in the negative sample?
+     - Alternative Topics: What are related but DISTINCT topics within the same domain?
+
+     The goal is to find topics that are in the same domain but semantically unrelated to this specific text.
+
+     Text:
+     {input}
+
+   hard_negative: |
+     Analyze this text to identify EXACTLY ONE ELEMENT that can be changed to create a hard-negative sample.
+
+     CRITICAL: The hard-negative must keep the SAME TOPIC and MOST WORDS identical.
+
+     Identify ONE change from these options:
+     - Change a quantity/order word (first→last, one→many)
+     - Change a key location/entity to a related one (paradise→hell, heaven→earth)
+     - Change the question focus slightly (who→what, what→how)
+     - Change a key action verb to a related action (enter→exit, give→take)
+
+     PRESERVE:
+     - Main topic and subject
+     - Sentence structure
+     - 80-90% of the vocabulary
+
+     Text:
+     {input}
+
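The three modes above correspond to the anchor/positive/negative (and hard-negative) samples commonly used to train embedding models with contrastive objectives. A purely illustrative example of the record shape this prompt family is meant to produce; the sentences are invented, not package output:

```python
# Illustrative anchor/positive/negative triplet for embedding-model training.
# The sentences are made-up examples in the same religious domain the prompts target.
triplet = {
    "anchor": "Is fasting obligatory during the month of Ramadan?",
    "positive": "Does Ramadan require one to fast?",             # paraphrase: same meaning, new wording
    "negative": "What are the rules for distributing zakat?",    # same domain, unrelated subject
    "hard_negative": "Is fasting obligatory during the month of Shawwal?",  # one key element changed
}
```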
texttools/prompts/run_custom.yaml
@@ -0,0 +1,7 @@
+ main_template: |
+   {input}
+   Respond only in JSON format:
+   {output_model_str}
+
+ analyze_template: |
+
texttools/prompts/subject_to_question.yaml
@@ -0,0 +1,22 @@
+ main_template: |
+   You are a generator of questions from subjects.
+   Given the following subject, generate {number_of_questions} appropriate questions that this subject would directly answer.
+   Each generated question should be independently meaningful,
+   and it must not use pronouns such as this, that, he, or she in the question.
+   There is a `reason` key; fill it with a summarized version of your thoughts.
+   The `reason` must be less than 20 words.
+   Don't forget to fill in the reason.
+   Respond only in JSON format:
+   {{"result": ["question1", "question2", ...], "reason": "string"}}
+   Here is the text:
+   {input}
+
+ analyze_template: |
+   Our goal is to generate questions from the given subject.
+   The questions must be meaningful; some of them should be specific and some should be general.
+   But first, in this step we want to analyze the subject we were asked to generate questions for.
+   We need a summarized analysis of the subject.
+   What is the subject about?
+   What points of view can we consider and generate questions from? (Questions that real users might have.)
+   Here is the subject:
+   {input}
texttools/prompts/summarize.yaml
@@ -0,0 +1,14 @@
+ main_template: |
+   You are a summarizer.
+   You must summarize the given text, preserving its meaning.
+   Respond only in JSON format:
+   {{"result": "string"}}
+   Provide a concise summary of the following text:
+   {input}
+
+ analyze_template: |
+   Read the following text and identify its main points, key arguments, and overall purpose.
+   Provide a brief, summarized analysis that will help in generating an accurate and concise summary.
+   {input}
+
texttools/prompts/text_to_question.yaml
@@ -0,0 +1,20 @@
+ main_template: |
+   You are a question generator.
+   Given the following answer, generate one
+   appropriate question that this answer would directly respond to.
+   The generated question should be independently meaningful,
+   and must not use pronouns such as this, that, he, or she in the question.
+   Respond only in JSON format:
+   {{"result": "string"}}
+   Here is the answer:
+   {input}
+
+ analyze_template: |
+   Analyze the following answer to identify its key facts,
+   main subject, and what kind of information it provides.
+   Provide a brief, summarized understanding of the answer's content that will
+   help in formulating a relevant and direct question.
+   Mention only the key points that were provided in the answer.
+   Here is the answer:
+   {input}
+
texttools/prompts/translate.yaml
@@ -0,0 +1,15 @@
+ main_template: |
+   You are a {target_language} translator.
+   Output only the translated text.
+   Respond only in JSON format:
+   {{"result": "string"}}
+   Don't translate proper names; only transliterate them into {target_language}.
+   Translate the following text to {target_language}:
+   {input}
+
+ analyze_template: |
+   Analyze the following text and identify important linguistic considerations for translation.
+   Point out any idioms, cultural references, or complex structures that need special attention.
+   Also, list all proper nouns that should not be translated. Write your analysis in {target_language}.
+   {input}
+
texttools/tools/__init__.py
@@ -1,33 +1,4 @@
- from .categorizer import EmbeddingCategorizer, GemmaCategorizer, LLMCategorizer
- from .keyword_extractor import GemmaKeywordExtractor
- from .ner import GemmaNERExtractor
- from .question_detector import GemmaQuestionDetector, LLMQuestionDetector
- from .question_generator import GemmaQuestionGenerator
- from .reranker import GemmaReranker, GemmaScorer, GemmaSorter
- from .rewriter import GemmaQuestionRewriter, RewriteMode
- from .merger import GemmaQuestionMerger, MergingMode
- from .subject_to_question import GemmaQuestionGeneratorFromSubject
- from .summarizer import GemmaSummarizer, LLMSummarizer
- from .translator import GemmaTranslator
-
- __all__ = [
-     "EmbeddingCategorizer",
-     "GemmaCategorizer",
-     "LLMCategorizer",
-     "GemmaTranslator",
-     "GemmaSummarizer",
-     "LLMSummarizer",
-     "GemmaNERExtractor",
-     "GemmaQuestionDetector",
-     "LLMQuestionDetector",
-     "GemmaQuestionGenerator",
-     "GemmaScorer",
-     "GemmaSorter",
-     "GemmaReranker",
-     "GemmaQuestionRewriter",
-     "RewriteMode",
-     "GemmaKeywordExtractor",
-     "GemmaQuestionGeneratorFromSubject",
-     "GemmaQuestionMerger",
-     "MergingMode",
- ]
+ from .async_the_tool import AsyncTheTool
+ from .the_tool import TheTool
+
+ __all__ = ["TheTool", "AsyncTheTool"]