hamtaa-texttools 0.1.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hamtaa-texttools might be problematic.
- hamtaa_texttools-0.1.43.dist-info/METADATA +60 -0
- hamtaa_texttools-0.1.43.dist-info/RECORD +60 -0
- hamtaa_texttools-0.1.43.dist-info/WHEEL +5 -0
- hamtaa_texttools-0.1.43.dist-info/top_level.txt +1 -0
- texttools/__init__.py +26 -0
- texttools/base/__init__.py +3 -0
- texttools/base/base_categorizer.py +40 -0
- texttools/base/base_keyword_extractor.py +35 -0
- texttools/base/base_ner_extractor.py +61 -0
- texttools/base/base_question_detector.py +35 -0
- texttools/base/base_question_generator.py +99 -0
- texttools/base/base_question_merger.py +59 -0
- texttools/base/base_question_rewriter.py +61 -0
- texttools/base/base_router.py +33 -0
- texttools/base/base_summarizer.py +55 -0
- texttools/base/base_task_performer.py +53 -0
- texttools/base/base_translator.py +38 -0
- texttools/batch_manager/__init__.py +2 -0
- texttools/batch_manager/batch_manager.py +241 -0
- texttools/batch_manager/batch_runner.py +207 -0
- texttools/formatter/__init__.py +1 -0
- texttools/formatter/base.py +26 -0
- texttools/formatter/gemma3_formatter.py +51 -0
- texttools/handlers/__init__.py +6 -0
- texttools/handlers/categorizer/__init__.py +6 -0
- texttools/handlers/categorizer/categorizer.py +61 -0
- texttools/handlers/handlers.py +88 -0
- texttools/tools/__init__.py +33 -0
- texttools/tools/categorizer/__init__.py +2 -0
- texttools/tools/categorizer/encoder_model/__init__.py +1 -0
- texttools/tools/categorizer/encoder_model/encoder_vectorizer.py +51 -0
- texttools/tools/categorizer/llm/__init__.py +2 -0
- texttools/tools/categorizer/llm/gemma_categorizer.py +169 -0
- texttools/tools/categorizer/llm/openai_categorizer.py +80 -0
- texttools/tools/keyword_extractor/__init__.py +1 -0
- texttools/tools/keyword_extractor/gemma_extractor.py +138 -0
- texttools/tools/merger/__init__.py +2 -0
- texttools/tools/merger/gemma_question_merger.py +214 -0
- texttools/tools/ner/__init__.py +1 -0
- texttools/tools/ner/gemma_ner_extractor.py +157 -0
- texttools/tools/question_detector/__init__.py +2 -0
- texttools/tools/question_detector/gemma_detector.py +130 -0
- texttools/tools/question_detector/llm_detector.py +112 -0
- texttools/tools/question_generator/__init__.py +1 -0
- texttools/tools/question_generator/gemma_question_generator.py +198 -0
- texttools/tools/reranker/__init__.py +3 -0
- texttools/tools/reranker/reranker.py +137 -0
- texttools/tools/reranker/scorer.py +216 -0
- texttools/tools/reranker/sorter.py +278 -0
- texttools/tools/rewriter/__init__.py +2 -0
- texttools/tools/rewriter/gemma_question_rewriter.py +213 -0
- texttools/tools/router/__init__.py +0 -0
- texttools/tools/router/gemma_router.py +169 -0
- texttools/tools/subject_to_question/__init__.py +1 -0
- texttools/tools/subject_to_question/gemma_question_generator.py +224 -0
- texttools/tools/summarizer/__init__.py +2 -0
- texttools/tools/summarizer/gemma_summarizer.py +140 -0
- texttools/tools/summarizer/llm_summerizer.py +108 -0
- texttools/tools/translator/__init__.py +1 -0
- texttools/tools/translator/gemma_translator.py +202 -0
@@ -0,0 +1,214 @@
from typing import Any, Optional

from openai import OpenAI
from texttools.base.base_question_merger import BaseQuestionsMerger, MergingMode
from texttools.formatter import Gemma3Formatter

# class QuestionGeneration(BaseModel):
#     generated_question: str


class GemmaQuestionMerger(BaseQuestionsMerger):
    """
    Question merger for Gemma-style models with one mode for now:
    1. Merge the provided questions, preserving all the main points.
    Outputs JSON with a single string field: {"merged_question": "..."}.

    Allows optional extra instructions via `prompt_template`.
    """

    def __init__(
        self,
        client: OpenAI,
        *,
        model: str,
        chat_formatter: Optional[Any] = None,
        use_reason: bool = False,
        temperature: float = 0.5,
        prompt_template: Optional[str] = None,
        handlers: Optional[list[Any]] = None,
        **client_kwargs: Any,
    ):
        super().__init__(handlers)
        self.client = client
        self.model = model
        self.temperature = temperature
        self.client_kwargs = client_kwargs

        self.chat_formatter = chat_formatter or Gemma3Formatter()

        self.use_reason = use_reason
        self.reason_summary = None
        self.prompt_template = prompt_template

        self.json_schema = {"merged_question": "string"}

    def _build_messages(
        self,
        questions: list[str],
        mode: MergingMode,
    ) -> list[dict[str, str]]:
        """
        Builds the message list for the LLM API call for question merging,
        adapting the prompt based on the chosen mode.
        """
        clean_questions = self.preprocess(questions)
        messages: list[dict[str, str]] = []

        if self.prompt_template:
            messages.append({"role": "user", "content": self.prompt_template})

        if self.reason_summary:
            messages.append(
                {
                    "role": "user",
                    "content": f"Based on this analysis: {self.reason_summary}",
                }
            )

        if mode == MergingMode.DEFAULT_MODE:
            instruction = (
                "You are a language expert. "
                "I will give you a list of questions that are semantically similar. "
                "Your task is to merge them into one unified question that: "
                "- Preserves all the information and intent from the original questions. "
                "- Sounds natural, fluent, and concise. "
                "- Avoids redundancy or unnecessary repetition. "
                "- Does not omit any unique idea from the originals. "
                "**Output only the merged question.**"
            )
        elif mode == MergingMode.REASON_MODE:
            instruction = (
                "You are an AI assistant helping to unify semantically similar questions. "
                "First, briefly extract the unique intent or content from each input question. "
                "Then, write one merged question that combines all their content clearly and naturally, without redundancy. "
                "Step 1: Extract key ideas. "
                "Step 2: Write the final merged question."
            )
        else:
            raise ValueError(f"Unsupported merging mode: {mode}")

        messages.append({"role": "user", "content": instruction})
        messages.append(
            {"role": "user", "content": f"Here are the questions: {clean_questions}"}
        )

        # schema_instr = f"Respond only in JSON format: {json.dumps(self.json_schema)}"
        messages.append(
            {
                "role": "user",
                "content": """
                Respond only with the new generated question, without any additional information.
                **The generated question must be in the language of the user's input.**
                """,
            }
        )

        # messages.append({"role": "assistant", "content": "{"})
        # deprecated method for structured output

        # These lines restructure the messages based on the provided formatter;
        # some models require custom message layouts.
        restructured = self.chat_formatter.format(messages=messages)

        return restructured

    def _reason(self, questions: list[str], mode: MergingMode) -> None:
        """
        Internal reasoning step to help the model understand the core meaning
        or structure of the questions depending on the mode.
        """
        if mode == MergingMode.DEFAULT_MODE:
            reason_prompt = """
            Analyze the following questions to identify their core intent, key concepts,
            and the specific information they are seeking.
            Provide a brief, summarized understanding of the questions' meaning that
            will help in merging and rephrasing them accurately without changing their intent.

            **Respond in the language of the questions.**
            """
        elif mode == MergingMode.REASON_MODE:
            reason_prompt = """
            Analyze the following questions to identify their exact wording, phrasing,
            and the literal meaning they convey.
            Provide a brief, summarized analysis of their linguistic structure and current meaning,
            which will then be used to create a new question containing all of their contents.
            **Respond in the language of the questions.**
            """
        else:
            raise ValueError(f"Unsupported merging mode for reason: {mode}")

        messages = [
            {"role": "user", "content": reason_prompt},
            {"role": "user", "content": f"Here are the questions: {questions}"},
        ]

        restructured = self.chat_formatter.format(messages=messages)

        resp = self.client.chat.completions.create(
            model=self.model,
            messages=restructured,
            temperature=self.temperature,
            **self.client_kwargs,
        )

        reason_summary = resp.choices[0].message.content.strip()
        self.reason_summary = reason_summary

    def rewrite_questions(
        self,
        questions: list[str],
        mode: MergingMode = MergingMode.DEFAULT_MODE,
        reason_summary: Optional[str] = None,
    ) -> str:
        """
        Merges the input `questions` based on the specified `mode`.
        Optionally uses an internal reasoning step for better accuracy.
        """

        if self.use_reason and not reason_summary:
            self._reason(questions, mode)
        elif reason_summary:
            self.reason_summary = reason_summary

        messages = self._build_messages(questions, mode)

        # The commented block below used structured output parsing;
        # the current code tries to get the result without structured output.
        # completion = self.client.beta.chat.completions.parse(
        #     model=self.model,
        #     messages=messages,
        #     response_format=QuestionGeneration,
        #     temperature=self.temperature,
        #     extra_body=dict(guided_decoding_backend="outlines"),
        #     **self.client_kwargs,
        # )
        # message = completion.choices[0].message
        # if message.parsed:
        #     result = message.parsed.generated_question
        # else:
        #     raise ValueError(f"Failed to parse the response. Raw content: {message.content}")

        resp = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=self.temperature,
            **self.client_kwargs,
        )

        result = resp.choices[0].message.content.strip()

        # dispatch and return
        self._dispatch(
            {
                "original_questions": questions,
                "merged_question": result,
                "mode": mode.value,
            }
        )
        return result

    def get_reason(self):
        return self.reason_summary
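For context, here is a minimal usage sketch for the `GemmaQuestionMerger` defined above (not part of the package itself). The base URL, API key, and model name are placeholders for whatever OpenAI-compatible server hosts a Gemma-style model; the constructor arguments, `MergingMode` import, and method names are taken from the code in this diff.

```python
# Hypothetical usage sketch; endpoint and model name are placeholders.
from openai import OpenAI

from texttools.base.base_question_merger import MergingMode
from texttools.tools.merger.gemma_question_merger import GemmaQuestionMerger

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # placeholder endpoint

merger = GemmaQuestionMerger(
    client,
    model="google/gemma-3-4b-it",  # placeholder model name
    use_reason=True,               # run the extra _reason() pass before merging
    temperature=0.5,
)

merged = merger.rewrite_questions(
    ["What time does the store open?", "When can I visit the store?"],
    mode=MergingMode.DEFAULT_MODE,
)
print(merged)
print(merger.get_reason())  # reasoning summary, populated when use_reason=True
```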
@@ -0,0 +1 @@
from texttools.tools.ner.gemma_ner_extractor import GemmaNERExtractor
@@ -0,0 +1,157 @@
import json
from typing import Any, Optional

from openai import OpenAI

from texttools.base.base_ner_extractor import BaseNERExtractor


class GemmaNERExtractor(BaseNERExtractor):
    """
    Named Entity Recognition (NER) system for Gemma-style models with an optional reasoning step.
    Outputs JSON with a single array field: {"entities": [{"text": "...", "type": "..."}, ...]}.

    Allows optional extra instructions via `prompt_template`.
    """

    def __init__(
        self,
        client: OpenAI,
        *,
        model: str,
        use_reason: bool = False,
        temperature: float = 0.0,
        prompt_template: Optional[str] = None,
        # Handlers can be any type that implements a .handle method
        handlers: Optional[list[Any]] = None,
        **client_kwargs: Any,
    ):
        super().__init__(handlers)
        self.client = client
        self.model = model
        self.temperature = temperature
        self.client_kwargs = client_kwargs

        self.use_reason = use_reason
        self.prompt_template = prompt_template

        # Define the JSON schema for NER output:
        # an array of objects, where each object has 'text' (string) and 'type' (string).
        self.json_schema = {
            "entities": [
                {
                    "text": "string",
                    "type": "string",
                }
            ]
        }

    def _build_messages(
        self, text: str, reason: Optional[str] = None
    ) -> list[dict[str, str]]:
        """
        Builds the message list for the LLM API call for entity extraction.
        """
        clean_text = self.preprocess(text)

        messages: list[dict[str, str]] = []

        if self.prompt_template:
            messages.append({"role": "user", "content": self.prompt_template})

        if reason:
            messages.append(
                {"role": "user", "content": f"Based on this analysis: {reason}"}
            )

        messages.append(
            {
                "role": "user",
                "content": "Identify and extract all named entities (e.g., PER, ORG, LOC, DAT, etc.) from the following text. For each entity, provide its text and a clear type. Respond as a JSON array of objects.",
            }
        )
        messages.append({"role": "user", "content": clean_text})

        # Ensure the schema is dumped as a valid JSON string for the LLM
        schema_instr = f"Respond only in JSON format: {json.dumps(self.json_schema)}"
        messages.append({"role": "user", "content": schema_instr})

        messages.append(
            {"role": "assistant", "content": "{"}
        )  # Hint to start JSON output
        return messages

    def _reason(self, text: str) -> str:
        """
        Internal reasoning step to help the model identify potential entities and their context.
        """
        messages = [
            {
                "role": "user",
                "content": """
                Read the following text and identify any proper nouns, key concepts, or specific mentions that might represent named entities.
                Provide a brief, summarized analysis that could help in categorizing these entities.
                """,
            },
            {
                "role": "user",
                "content": f"""
                {text}
                """,
            },
        ]

        resp = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=self.temperature,
            **self.client_kwargs,
        )

        reason_summary = resp.choices[0].message.content.strip()
        return reason_summary

    def extract_entities(self, text: str) -> list[dict[str, str]]:
        """
        Extracts named entities from `text`.
        Optionally uses an internal reasoning step for better accuracy.
        """
        reason_summary = None
        if self.use_reason:
            reason_summary = self._reason(text)

        messages = self._build_messages(text, reason_summary)
        resp = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=self.temperature,
            **self.client_kwargs,
        )
        raw = resp.choices[0].message.content.strip()

        # Because of the assistant-message hint, the reply may omit the opening brace;
        # add it back before parsing the JSON.
        if not raw.startswith("{"):
            raw = "{" + raw
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError as e:
            raise ValueError(f"Failed to parse JSON for NER: {e}\nRaw output: {raw}")

        entities = parsed.get("entities")

        # Validate that 'entities' is a list of dictionaries with 'text' and 'type'
        if not isinstance(entities, list) or not all(
            isinstance(item, dict)
            and "text" in item
            and "type" in item
            and isinstance(item["text"], str)
            and isinstance(item["type"], str)
            for item in entities
        ):
            raise ValueError(
                f"Invalid response schema for NER. Expected 'entities' as a list of dicts with 'text' and 'type', got: {parsed}"
            )

        # dispatch and return
        self._dispatch(entities=entities, original_text=text)
        return entities
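A minimal usage sketch for `GemmaNERExtractor` (not part of the package); the endpoint and model name are placeholders, and the expected output shape follows the docstring and validation logic above.

```python
# Hypothetical usage sketch; endpoint and model name are placeholders.
from openai import OpenAI

from texttools.tools.ner import GemmaNERExtractor

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # placeholder endpoint

ner = GemmaNERExtractor(
    client,
    model="google/gemma-3-4b-it",  # placeholder model name
    use_reason=False,
)

entities = ner.extract_entities(
    "Ada Lovelace worked with Charles Babbage in London in 1843."
)
# Expected shape (actual values depend on the model):
# [{"text": "Ada Lovelace", "type": "PER"}, {"text": "London", "type": "LOC"}, ...]
for ent in entities:
    print(ent["text"], "->", ent["type"])
```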
@@ -0,0 +1,130 @@
from typing import Any, Optional

from openai import OpenAI
from pydantic import BaseModel

from texttools.base.base_question_detector import BaseQuestionDetector
from texttools.formatter import Gemma3Formatter


class QuestionDetection(BaseModel):
    is_question: bool


class GemmaQuestionDetector(BaseQuestionDetector):
    """
    Simplified binary question detector for Gemma-style models without system prompts.
    Outputs JSON with a single boolean field: {"is_question": true|false}.

    Allows optional extra instructions via `prompt_template`.
    """

    def __init__(
        self,
        client: OpenAI,
        *,
        model: str,
        chat_formatter: Optional[Any] = None,
        use_reason: bool = False,
        temperature: float = 0.0,
        prompt_template: Optional[str] = None,
        handlers: Optional[list[Any]] = None,
        **client_kwargs: Any,
    ):
        super().__init__(handlers)
        self.client = client
        self.model = model
        self.temperature = temperature
        self.client_kwargs = client_kwargs

        self.chat_formatter = chat_formatter or Gemma3Formatter()

        self.use_reason = use_reason
        self.prompt_template = prompt_template

        self.json_schema = {"is_question": bool}

    def _build_messages(
        self, text: str, reason: Optional[str] = None
    ) -> list[dict[str, str]]:
        clean = self.preprocess(text)
        schema_instr = f"Respond only in JSON format: {self.json_schema}"
        messages: list[dict[str, str]] = []

        if reason:
            messages.append({"role": "user", "content": reason})

        messages.append({"role": "user", "content": schema_instr})
        if self.prompt_template:
            messages.append({"role": "user", "content": self.prompt_template})
        messages.append({"role": "user", "content": clean})

        # These lines restructure the messages based on the provided formatter;
        # some models require custom message layouts.
        restructured = self.chat_formatter.format(messages=messages)

        return restructured

    def _reason(self, text: str) -> str:
        messages = [
            {
                "role": "user",
                "content": """
                We want to analyze this text snippet to see whether it contains a question
                or a request of some kind.
                Read the text and reason about whether it is a request or not.
                Give a summarized, short answer.
                """,
            },
            {
                "role": "user",
                "content": f"""
                {text}
                """,
            },
        ]

        restructured = self.chat_formatter.format(messages=messages)

        resp = self.client.chat.completions.create(
            model=self.model,
            messages=restructured,
            temperature=self.temperature,
            **self.client_kwargs,
        )

        reason = resp.choices[0].message.content.strip()
        return reason

    def detect(self, text: str) -> bool:
        """
        Returns True if `text` is a question, False otherwise.
        Optionally uses an internal reasoning step for better accuracy.
        """
        reason_summary = None
        if self.use_reason:
            reason_summary = self._reason(text)

        # print(reason_summary)

        messages = self._build_messages(text, reason_summary)

        completion = self.client.beta.chat.completions.parse(
            model=self.model,
            messages=messages,
            response_format=QuestionDetection,
            temperature=self.temperature,
            extra_body=dict(guided_decoding_backend="auto"),
            **self.client_kwargs,
        )
        message = completion.choices[0].message
        if message.parsed:
            result = message.parsed.is_question
        else:
            raise ValueError(
                f"Failed to parse the response. Raw content: {message.content}"
            )

        # dispatch and return
        self._dispatch({"question": text, "result": result})
        return result
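A minimal usage sketch for `GemmaQuestionDetector` (not part of the package). The `extra_body=dict(guided_decoding_backend=...)` argument above suggests an OpenAI-compatible inference server along the lines of vLLM, though the diff does not say so explicitly; the base URL and model name below are placeholders.

```python
# Hypothetical usage sketch; endpoint and model name are placeholders.
from openai import OpenAI

from texttools.tools.question_detector.gemma_detector import GemmaQuestionDetector

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # placeholder endpoint

detector = GemmaQuestionDetector(
    client,
    model="google/gemma-3-4b-it",  # placeholder model name
    use_reason=True,  # adds a free-form reasoning turn before the structured call
)

print(detector.detect("Where is the nearest train station?"))   # expected: True
print(detector.detect("The train station is two blocks away.")) # expected: False
```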
@@ -0,0 +1,112 @@
from typing import Any, Optional

from openai import OpenAI
from pydantic import BaseModel, create_model

from texttools.base.base_question_detector import BaseQuestionDetector


class LLMQuestionDetector(BaseQuestionDetector):
    """
    LLM-based binary question detector that wraps OpenAI's structured output parsing.

    Usage:
    ```python
    from openai import OpenAI
    from texttools import LLMQuestionDetector

    # Instantiate an OpenAI client (ensure you've set OPENAI_API_KEY)
    client = OpenAI()

    # Create detector
    detector = LLMQuestionDetector(
        client=client,
        model="gpt-4o-2024-08-06",
        temperature=0.0,  # deterministic outputs
        prompt_template=(
            "You are a binary classifier. "
            "Answer only with `true` or `false` depending on the input."
        ),
        handlers=[my_handler],  # optional callbacks on each detection
        max_tokens=10,  # any other OpenAI client kwargs
    )

    # Detect whether a string is a question
    is_question = detector.detect("How are you today?")
    # is_question == True
    ```

    Parameters:
        client (OpenAI):
            Instantiated OpenAI client. Make sure your API key is configured.
        model (str):
            Model name to use (e.g. "gpt-4", "gpt-4o-2024-08-06").
        temperature (float, default=0.0):
            Sampling temperature; 0.0 yields deterministic outputs.
        prompt_template (str, optional):
            System-level instructions guiding the classification.
        handlers (list, optional):
            Handlers dispatched with {"question": text, "result": bool} after each detect().
        client_kwargs (Any):
            Additional parameters passed directly to OpenAI (e.g., max_tokens, top_p).

    Internals:
        - Wraps your input in system/user messages.
        - Uses Pydantic to enforce that the API returns a boolean.
        - Dispatches the result to any registered handlers.
    """

    def __init__(
        self,
        client: OpenAI,
        *,
        model: str,
        temperature: float = 0.0,
        prompt_template: Optional[str] = None,
        handlers: Optional[list[Any]] = None,
        **client_kwargs: Any,
    ):
        """
        :param client: an instantiated OpenAI client
        :param model: the model name (e.g. "gpt-4o-2024-08-06")
        :param temperature: sampling temperature
        :param prompt_template: override default prompt instructions
        :param handlers: optional list of result handlers
        :param client_kwargs: any other OpenAI kwargs (e.g. `max_tokens`, `top_p`, etc.)
        """
        super().__init__(handlers)
        self.client = client
        self.model = model
        self.temperature = temperature
        self.client_kwargs = client_kwargs

        self.prompt_template = prompt_template or (
            "You are a binary classifier. "
            "Answer only with `true` or `false` depending on the input."
        )

        # Dynamically built Pydantic model enforcing a single boolean `result` field
        self._OutputModel = create_model(
            "DetectionOutput",
            result=(bool, ...),
        )

    def _build_messages(self, text: str) -> list[dict[str, str]]:
        clean = self.preprocess(text)
        return [
            {"role": "system", "content": self.prompt_template},
            {"role": "user", "content": clean},
        ]

    def detect(self, text: str) -> bool:
        msgs = self._build_messages(text)
        resp = self.client.responses.parse(
            model=self.model,
            input=msgs,
            text_format=self._OutputModel,
            temperature=self.temperature,
            **self.client_kwargs,
        )
        output: BaseModel = resp.output_parsed
        self._dispatch({"question": text, "result": output.result})
        return output.result
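For reference, a small standalone sketch of the dynamic output model the detector builds with `pydantic.create_model` (the class name mirrors the code above; this snippet is illustrative, not package code, and assumes Pydantic v2):

```python
# Sketch of the dynamically created output model used by LLMQuestionDetector.
from pydantic import create_model

DetectionOutput = create_model("DetectionOutput", result=(bool, ...))

# Equivalent to:
#   class DetectionOutput(BaseModel):
#       result: bool
print(DetectionOutput(result=True).model_dump())  # {'result': True}
```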
@@ -0,0 +1 @@
from .gemma_question_generator import GemmaQuestionGenerator