unique_toolkit 1.42.9__py3-none-any.whl → 1.43.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unique_toolkit/_common/experimental/write_up_agent/README.md +848 -0
- unique_toolkit/_common/experimental/write_up_agent/__init__.py +22 -0
- unique_toolkit/_common/experimental/write_up_agent/agent.py +170 -0
- unique_toolkit/_common/experimental/write_up_agent/config.py +42 -0
- unique_toolkit/_common/experimental/write_up_agent/examples/data.csv +13 -0
- unique_toolkit/_common/experimental/write_up_agent/examples/example_usage.py +78 -0
- unique_toolkit/_common/experimental/write_up_agent/examples/report.md +154 -0
- unique_toolkit/_common/experimental/write_up_agent/schemas.py +36 -0
- unique_toolkit/_common/experimental/write_up_agent/services/__init__.py +13 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/__init__.py +19 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/exceptions.py +29 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/service.py +150 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/utils.py +130 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/__init__.py +27 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/config.py +56 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/exceptions.py +79 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/prompts/config.py +34 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/prompts/system_prompt.j2 +15 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/prompts/user_prompt.j2 +21 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/service.py +369 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/__init__.py +29 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/default_template.j2 +37 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/exceptions.py +39 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/service.py +191 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/utils.py +182 -0
- unique_toolkit/_common/experimental/write_up_agent/utils.py +24 -0
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.0.dist-info}/METADATA +4 -1
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.0.dist-info}/RECORD +30 -4
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.0.dist-info}/LICENSE +0 -0
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""DataFrame handler service."""
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from unique_toolkit._common.experimental.write_up_agent.schemas import GroupData
|
|
6
|
+
from unique_toolkit._common.experimental.write_up_agent.services.dataframe_handler.exceptions import (
|
|
7
|
+
DataFrameGroupingError,
|
|
8
|
+
DataFrameProcessingError,
|
|
9
|
+
DataFrameValidationError,
|
|
10
|
+
)
|
|
11
|
+
from unique_toolkit._common.experimental.write_up_agent.services.dataframe_handler.utils import (
|
|
12
|
+
dataframe_to_dict_records,
|
|
13
|
+
normalize_column_names,
|
|
14
|
+
to_snake_case,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class DataFrameHandler:
    """
    Handles all DataFrame operations.

    This handler automatically converts all column names to snake_case to ensure
    compatibility with Jinja template syntax. For example:
    - "My Column" becomes "my_column"
    - "UserName" becomes "user_name"
    - "column-name" becomes "column_name"

    This normalization happens automatically during validation and grouping operations.

    Responsibilities:
    - Normalize column names to snake_case
    - Validate DataFrame has required columns
    - Create groups from DataFrame
    """

    def validate_columns(
        self, df: pd.DataFrame, grouping_column: str, selected_columns: list[str]
    ) -> None:
        """
        Validate DataFrame has required columns.

        NOTE: Column names are automatically converted to snake_case before validation.
        Ensure your template uses snake_case column references (e.g., {{ row.my_column }}).

        Args:
            df: pandas DataFrame to validate
            grouping_column: Column to group by (should be in snake_case)
            selected_columns: Columns that should exist (should be in snake_case)

        Raises:
            DataFrameValidationError: If columns are missing after normalization

        Example:
            >>> df = pd.DataFrame({"My Section": [1], "My Question": [2]})
            >>> handler.validate_columns(df, "my_section", ["my_question"])
            # Validation passes because "My Section" -> "my_section"
        """
        # Normalize DataFrame columns to snake_case
        normalized_df = normalize_column_names(df)

        required_columns = {grouping_column} | set(selected_columns)
        missing_columns = required_columns - set(normalized_df.columns)

        if missing_columns:
            raise DataFrameValidationError(
                f"DataFrame missing required columns after snake_case normalization: {sorted(missing_columns)}. "
                f"Available columns: {sorted(normalized_df.columns)}",
                missing_columns=sorted(missing_columns),
            )

    def create_groups(
        self, df: pd.DataFrame, grouping_column: str, selected_columns: list[str]
    ) -> list[GroupData]:
        """
        Create groups from DataFrame.

        NOTE: Column names are automatically converted to snake_case.
        Group values (group_key) are ALSO normalized to snake_case so they can be
        referenced with Jinja template syntax (see example below).

        The returned GroupData instances will have:
        - snake_case column names in their rows
        - group_key values normalized to snake_case

        IMPORTANT: Groups are returned in the order of their first appearance in the DataFrame,
        NOT sorted alphabetically. This preserves the logical flow of your data.

        Args:
            df: pandas DataFrame to group
            grouping_column: Column to group by (should be in snake_case)
            selected_columns: Columns to include in rows (should be in snake_case)

        Returns:
            List of GroupData instances in order of first appearance, each containing
            group_key (in snake_case) and rows with snake_case columns

        Raises:
            DataFrameGroupingError: If grouping fails
            DataFrameProcessingError: If data processing fails

        Example:
            >>> df = pd.DataFrame({
            ...     "My Section": ["Intro", "Methods", "Results", "Intro"],
            ...     "My Question": ["Q1", "Q2", "Q3", "Q4"],
            ... })
            >>> groups = handler.create_groups(df, "my_section", ["my_question"])
            >>> [g.group_key for g in groups]
            ['intro', 'methods', 'results'] # Values normalized to snake_case, order preserved
        """
        # Normalize column names to snake_case
        normalized_df = normalize_column_names(df)

        if grouping_column not in normalized_df.columns:
            raise DataFrameGroupingError(
                f"Grouping column '{grouping_column}' not found in normalized DataFrame. "
                f"Available columns: {sorted(normalized_df.columns)}",
                grouping_column=grouping_column,
            )

        try:
            # Use sort=False to preserve the order of first appearance in the DataFrame
            grouped = normalized_df.groupby(grouping_column, sort=False)
        except Exception as e:
            raise DataFrameGroupingError(
                f"Failed to group DataFrame by '{grouping_column}': {e}",
                grouping_column=grouping_column,
            ) from e

        results = []

        try:
            for group_key, group_df in grouped:
                # Filter columns if specified
                if selected_columns:
                    cols_to_use = [c for c in selected_columns if c in group_df.columns]
                    limited_df = group_df.loc[:, cols_to_use]
                else:
                    limited_df = group_df

                # Convert to dict records
                rows = dataframe_to_dict_records(limited_df)

                # Normalize group_key value to snake_case for consistency with template syntax
                normalized_group_key = to_snake_case(str(group_key))

                # Create GroupData instance with proper typing
                results.append(GroupData(group_key=normalized_group_key, rows=rows))
        except Exception as e:
            raise DataFrameProcessingError(f"Error processing grouped data: {e}") from e

        return results
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Utility functions for DataFrame operations."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def to_snake_case(text: str) -> str:
    """
    Convert a string to snake_case.

    This ensures column names are compatible with Jinja template syntax.

    Examples:
        >>> to_snake_case("MyColumn")
        'my_column'
        >>> to_snake_case("my_column")
        'my_column'
        >>> to_snake_case("My Column Name")
        'my_column_name'
        >>> to_snake_case("column-name")
        'column_name'
        >>> to_snake_case("Column_123")
        'column_123'

    Args:
        text: String to convert

    Returns:
        snake_case version of the string
    """
    # Unify word separators (spaces, hyphens) into underscores.
    underscored = re.sub(r"[ \-]", "_", text)

    # Break camelCase/PascalCase boundaries by inserting an underscore
    # before every uppercase letter (except at the start of the string).
    underscored = re.sub(r"(?<!^)(?=[A-Z])", "_", underscored)

    # Lowercase everything, collapse runs of underscores, and trim the ends.
    collapsed = re.sub(r"_{2,}", "_", underscored.lower())
    return collapsed.strip("_")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def from_snake_case_to_display_name(text: str) -> str:
    """
    Convert snake_case text back to Title Case for display.

    Args:
        text: snake_case text to convert

    Returns:
        Title Case version of the text

    Example:
        >>> from_snake_case_to_display_name("executive_summary")
        'Executive Summary'
        >>> from_snake_case_to_display_name("my_column_name")
        'My Column Name'
        >>> from_snake_case_to_display_name("api_design")
        'Api Design'
    """
    # Split on underscores and capitalize each word.
    # NOTE: str.capitalize lowercases the rest of each word, so acronyms are
    # not preserved ("api" -> "Api"); this is intentional and documented above.
    words = text.split("_")
    return " ".join(word.capitalize() for word in words)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert all DataFrame column names to snake_case.

    This normalization ensures column names are compatible with Jinja template
    syntax (e.g., {{ row.my_column }} works, but {{ row.My Column }} doesn't).

    Examples:
        >>> df = pd.DataFrame({"My Column": [1], "AnotherColumn": [2]})
        >>> normalized = normalize_column_names(df)
        >>> list(normalized.columns)
        ['my_column', 'another_column']

    Args:
        df: Input DataFrame

    Returns:
        New DataFrame with normalized column names
    """
    ## TODO [UN-16142]: Normalization may lead to duplicate column names, we should handle this case
    # ``rename`` accepts a callable mapper, which it applies to every label.
    return df.rename(columns=to_snake_case)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def limit_dataframe_rows(df: pd.DataFrame, max_rows: int) -> pd.DataFrame:
    """
    Limit DataFrame to first N rows.

    Always returns a defensive copy, so mutating the result never touches
    the caller's DataFrame.

    Args:
        df: DataFrame to limit
        max_rows: Maximum number of rows

    Returns:
        DataFrame with at most max_rows rows
    """
    truncated = df if len(df) <= max_rows else df.head(max_rows)
    return truncated.copy()
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def dataframe_to_dict_records(
|
|
114
|
+
df: pd.DataFrame, columns: list[str] | None = None
|
|
115
|
+
) -> list[dict]:
|
|
116
|
+
"""
|
|
117
|
+
Convert DataFrame to list of dict records.
|
|
118
|
+
|
|
119
|
+
Args:
|
|
120
|
+
df: DataFrame to convert
|
|
121
|
+
columns: Optional list of columns to include
|
|
122
|
+
|
|
123
|
+
Returns:
|
|
124
|
+
List of dict records
|
|
125
|
+
"""
|
|
126
|
+
if columns:
|
|
127
|
+
df = df.loc[:, columns]
|
|
128
|
+
|
|
129
|
+
# Replace NaN with None for better serialization
|
|
130
|
+
return df.where(pd.notna(df), None).to_dict(orient="records")
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Generation handler module."""
|
|
2
|
+
|
|
3
|
+
from unique_toolkit._common.experimental.write_up_agent.services.generation_handler.config import (
|
|
4
|
+
GenerationHandlerConfig,
|
|
5
|
+
)
|
|
6
|
+
from unique_toolkit._common.experimental.write_up_agent.services.generation_handler.exceptions import (
|
|
7
|
+
AggregationError,
|
|
8
|
+
BatchCreationError,
|
|
9
|
+
GenerationHandlerError,
|
|
10
|
+
LLMCallError,
|
|
11
|
+
PromptBuildError,
|
|
12
|
+
TokenLimitError,
|
|
13
|
+
)
|
|
14
|
+
from unique_toolkit._common.experimental.write_up_agent.services.generation_handler.service import (
|
|
15
|
+
GenerationHandler,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
# Public surface of the generation handler package: the handler itself, its
# configuration model, and the exception hierarchy it raises.
__all__ = [
    "GenerationHandler",
    "GenerationHandlerConfig",
    "GenerationHandlerError",
    "BatchCreationError",
    "PromptBuildError",
    "LLMCallError",
    "AggregationError",
    "TokenLimitError",
]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Configuration for generation handler."""
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from unique_toolkit._common.experimental.write_up_agent.services.generation_handler.prompts.config import (
|
|
6
|
+
GenerationHandlerPromptsConfig,
|
|
7
|
+
)
|
|
8
|
+
from unique_toolkit._common.pydantic_helpers import get_configuration_dict
|
|
9
|
+
from unique_toolkit._common.validators import LMI, get_LMI_default_field
|
|
10
|
+
from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class GenerationHandlerConfig(BaseModel):
    """Configuration for generation handler.

    This configuration controls how groups are batched, how prompts are built,
    and how the LLM is called for generating summaries.
    """

    model_config = get_configuration_dict()

    language_model: LMI = get_LMI_default_field(
        DEFAULT_GPT_4o,
        description="The language model to use for generating summaries.",
    )

    # Instruction shared by every group; group-specific instructions below
    # are applied in addition to (not instead of) this one.
    common_instruction: str = Field(
        default="You are a technical writer. Summarize the provided content concisely and clearly.",
        description="Common instruction applied to all groups",
    )

    # TODO [UN-16142]: Add default instructions for each group
    group_specific_instructions: dict[str, str] = Field(
        default_factory=dict,
        description=(
            "Custom instructions per group. "
            "Keys should be formatted as 'column:value' (e.g., 'section:Introduction')"
        ),
    )

    # Primary batching limit: token budget per LLM call.
    max_tokens_per_batch: int = Field(
        default=4000,
        ge=100,
        description="Maximum tokens per batch for LLM input (affects batching strategy)",
    )

    # Secondary batching limit, applied on top of the token budget.
    max_rows_per_batch: int = Field(
        default=20,
        ge=1,
        description="Maximum rows per batch (secondary limit to tokens)",
    )

    prompts_config: GenerationHandlerPromptsConfig = Field(
        default_factory=GenerationHandlerPromptsConfig,
        description="Configuration for the prompts.",
    )
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Exceptions for generation handler operations."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class GenerationHandlerError(Exception):
    """Base exception for all generation handler errors.

    Catch this type to handle any error raised by the generation handler,
    regardless of the specific failure mode.
    """
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BatchCreationError(GenerationHandlerError):
    """Raised when batch creation fails.

    Attributes:
        group_key: Key of the group whose batching failed, if known.
        row_count: Number of rows involved in the failure, if known.
    """

    def __init__(
        self,
        message: str,
        group_key: str | None = None,
        row_count: int | None = None,
    ):
        super().__init__(message)
        # Preserve failure context so callers can log or report it.
        self.row_count = row_count
        self.group_key = group_key
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class PromptBuildError(GenerationHandlerError):
    """Raised when prompt building fails.

    Attributes:
        prompt_type: Which prompt failed to build (e.g. system/user), if known.
        context: Template context in use at failure time (never None; falsy
            input is normalized to an empty dict).
    """

    def __init__(
        self,
        message: str,
        prompt_type: str | None = None,
        context: dict | None = None,
    ):
        super().__init__(message)
        self.prompt_type = prompt_type
        # Guarantee a dict so callers can iterate without a None check.
        self.context = context if context else {}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class LLMCallError(GenerationHandlerError):
    """Raised when LLM call fails.

    Attributes:
        group_key: Group being generated when the call failed, if known.
        batch_index: Index of the failing batch, if known.
        error_details: Raw error details from the underlying call, if any.
    """

    def __init__(
        self,
        message: str,
        group_key: str | None = None,
        batch_index: int | None = None,
        error_details: str | None = None,
    ):
        super().__init__(message)
        # Keep the full failure context for diagnostics.
        self.error_details = error_details
        self.batch_index = batch_index
        self.group_key = group_key
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class AggregationError(GenerationHandlerError):
    """Raised when aggregating batch results fails.

    Attributes:
        group_key: Group whose batch results could not be aggregated, if known.
        batch_count: Number of batches involved in the aggregation, if known.
    """

    def __init__(
        self,
        message: str,
        group_key: str | None = None,
        batch_count: int | None = None,
    ):
        super().__init__(message)
        # Preserve aggregation context for error reporting.
        self.batch_count = batch_count
        self.group_key = group_key
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class TokenLimitError(GenerationHandlerError):
    """Raised when token counting or limit validation fails.

    Attributes:
        estimated_tokens: Estimated token count that triggered the error, if known.
        max_tokens: Configured token limit that was exceeded, if known.
    """

    def __init__(
        self,
        message: str,
        estimated_tokens: int | None = None,
        max_tokens: int | None = None,
    ):
        super().__init__(message)
        # Expose both sides of the limit check for diagnostics.
        self.max_tokens = max_tokens
        self.estimated_tokens = estimated_tokens
|
unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/prompts/config.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Annotated
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, Field
|
|
5
|
+
|
|
6
|
+
from unique_toolkit._common.experimental.write_up_agent.utils import template_loader
|
|
7
|
+
from unique_toolkit._common.pydantic.rjsf_tags import RJSFMetaTag
|
|
8
|
+
from unique_toolkit._common.pydantic_helpers import get_configuration_dict
|
|
9
|
+
|
|
10
|
+
# Directory containing this module; the *.j2 prompt templates live beside it.
PARENT_DIR = Path(__file__).parent

# Load the prompt template texts once at import time so they can serve as
# the string defaults of the config fields below.
_SYSTEM_PROMPT_TEMPLATE = template_loader(PARENT_DIR, "system_prompt.j2")
_USER_PROMPT_TEMPLATE = template_loader(PARENT_DIR, "user_prompt.j2")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class GenerationHandlerPromptsConfig(BaseModel):
    """Prompt templates used by the generation handler.

    Defaults are loaded from the ``system_prompt.j2`` / ``user_prompt.j2``
    files shipped next to this module; both can be overridden via configuration.
    """

    model_config = get_configuration_dict()

    system_prompt_template: Annotated[
        str,
        # Size the textarea widget to the default template's line count so the
        # whole template is visible when editing.
        RJSFMetaTag.StringWidget.textarea(
            rows=len(_SYSTEM_PROMPT_TEMPLATE.split("\n"))
        ),
    ] = Field(
        default=_SYSTEM_PROMPT_TEMPLATE,
        # FIX: descriptions previously said "for the source selection" — a
        # copy-paste remnant; this config belongs to the generation handler.
        description="The system prompt for the summary generation.",
    )
    user_prompt_template: Annotated[
        str,
        RJSFMetaTag.StringWidget.textarea(rows=len(_USER_PROMPT_TEMPLATE.split("\n"))),
    ] = Field(
        default=_USER_PROMPT_TEMPLATE,
        description="The user prompt for the summary generation.",
    )
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{# System prompt with common instructions #}
|
|
2
|
+
You are an expert technical writer tasked with creating clear, concise, and well-organized content summaries.
|
|
3
|
+
|
|
4
|
+
## Your Task
|
|
5
|
+
Generate a comprehensive summary for a specific section based on the provided content. The section header is already defined - focus on creating the body content.
|
|
6
|
+
|
|
7
|
+
## Guidelines
|
|
8
|
+
- Synthesize the provided information into a coherent narrative
|
|
9
|
+
- Organize content logically, using sub-sections if it improves clarity
|
|
10
|
+
- Maintain a professional and informative tone
|
|
11
|
+
- Be concise while preserving key information
|
|
12
|
+
- If section-specific instructions are provided, follow them carefully
|
|
13
|
+
|
|
14
|
+
{{ common_instruction }}
|
|
15
|
+
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
{# User prompt with section context and optional previous summary #}
|
|
2
|
+
## Section Context
|
|
3
|
+
You are writing content for the section titled: **"{{ section_name }}"**
|
|
4
|
+
|
|
5
|
+
**IMPORTANT**: The section header is already provided in the final report. Do NOT include the section title (# {{ section_name }}) in your response. Start directly with the content.
|
|
6
|
+
|
|
7
|
+
{% if group_instruction %}
|
|
8
|
+
## Section-Specific Instructions
|
|
9
|
+
{{ group_instruction }}
|
|
10
|
+
|
|
11
|
+
{% endif %}
|
|
12
|
+
{% if previous_summary %}
|
|
13
|
+
## Context from Previous Batch
|
|
14
|
+
The following is a summary from the previous batch of content for this section. Build upon this context when generating your summary:
|
|
15
|
+
|
|
16
|
+
{{ previous_summary }}
|
|
17
|
+
|
|
18
|
+
{% endif %}
|
|
19
|
+
## Content to Summarize
|
|
20
|
+
{{ content }}
|
|
21
|
+
|