unique_toolkit 1.42.9__py3-none-any.whl → 1.43.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unique_toolkit/_common/experimental/write_up_agent/README.md +848 -0
- unique_toolkit/_common/experimental/write_up_agent/__init__.py +22 -0
- unique_toolkit/_common/experimental/write_up_agent/agent.py +170 -0
- unique_toolkit/_common/experimental/write_up_agent/config.py +42 -0
- unique_toolkit/_common/experimental/write_up_agent/examples/data.csv +13 -0
- unique_toolkit/_common/experimental/write_up_agent/examples/example_usage.py +78 -0
- unique_toolkit/_common/experimental/write_up_agent/schemas.py +36 -0
- unique_toolkit/_common/experimental/write_up_agent/services/__init__.py +13 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/__init__.py +19 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/exceptions.py +29 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/service.py +150 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/utils.py +130 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/__init__.py +27 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/config.py +56 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/exceptions.py +79 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/prompts/config.py +34 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/prompts/system_prompt.j2 +15 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/prompts/user_prompt.j2 +21 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/service.py +369 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/__init__.py +29 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/default_template.j2 +37 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/exceptions.py +39 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/service.py +191 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/utils.py +182 -0
- unique_toolkit/_common/experimental/write_up_agent/utils.py +24 -0
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.1.dist-info}/METADATA +7 -1
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.1.dist-info}/RECORD +29 -4
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.1.dist-info}/LICENSE +0 -0
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
"""Generation handler service for LLM-based summarization."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any, Callable
|
|
5
|
+
|
|
6
|
+
from jinja2 import Template
|
|
7
|
+
from tiktoken import get_encoding
|
|
8
|
+
|
|
9
|
+
from unique_toolkit._common.experimental.write_up_agent.schemas import (
|
|
10
|
+
GroupData,
|
|
11
|
+
ProcessedGroup,
|
|
12
|
+
)
|
|
13
|
+
from unique_toolkit._common.experimental.write_up_agent.services.dataframe_handler.utils import (
|
|
14
|
+
from_snake_case_to_display_name,
|
|
15
|
+
)
|
|
16
|
+
from unique_toolkit._common.experimental.write_up_agent.services.generation_handler.config import (
|
|
17
|
+
GenerationHandlerConfig,
|
|
18
|
+
)
|
|
19
|
+
from unique_toolkit._common.experimental.write_up_agent.services.generation_handler.exceptions import (
|
|
20
|
+
BatchCreationError,
|
|
21
|
+
LLMCallError,
|
|
22
|
+
PromptBuildError,
|
|
23
|
+
TokenLimitError,
|
|
24
|
+
)
|
|
25
|
+
from unique_toolkit.language_model import LanguageModelService
|
|
26
|
+
from unique_toolkit.language_model.builder import MessagesBuilder
|
|
27
|
+
|
|
28
|
+
_LOGGER = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class GenerationHandler:
    """
    Handles LLM-based generation with adaptive batching and iterative aggregation.

    This service:
    - Splits groups into batches based on token/row limits
    - Builds prompts from Jinja templates
    - Calls LLM for each batch
    - Aggregates results iteratively with context
    """

    def __init__(
        self,
        config: GenerationHandlerConfig,
        renderer: Callable[[GroupData], str],
    ):
        """
        Initialize generation handler.

        Args:
            config: Configuration for generation
            renderer: Function to render group content (injected from template handler)
                Signature: renderer(group_data: GroupData) -> str
        """
        self._config = config
        self._renderer = renderer

        # TODO [UN-16142]: Use token counter from toolkit
        try:
            encoder = get_encoding(self._config.language_model.encoder_name)
        except Exception as e:
            _LOGGER.warning(
                f"Failed to get encoder for model {self._config.language_model.name}: {e}"
            )
            # Fall back to a well-known encoding so token counting keeps
            # working even when the configured encoder name is unavailable.
            encoder = get_encoding("cl100k_base")

        def token_counter(text: str) -> int:
            return len(encoder.encode(text))

        # Token counter bound once to the resolved encoder (configured or
        # fallback), so the encoder is not re-created per call.
        self._token_counter = token_counter

    def process_groups(
        self,
        groups: list[GroupData],
        grouping_column: str,
        llm_service: LanguageModelService,
    ) -> list[ProcessedGroup]:
        """
        Process all groups with LLM generation.

        Args:
            groups: List of GroupData instances
            grouping_column: The column name used for grouping (e.g., 'section')
            llm_service: LanguageModelService instance to use for LLM calls

        Returns:
            List of ProcessedGroup instances with llm_response added

        Raises:
            GenerationHandlerError: If generation fails
        """
        processed_groups: list[ProcessedGroup] = []

        for group in groups:
            group_key_string = group.group_key

            _LOGGER.info(f"Processing group: {group_key_string}")

            # Get group-specific instruction using the documented format: "column:value"
            # e.g., "section:introduction" for a group with key "introduction" in column "section"
            lookup_key = f"{grouping_column}:{group_key_string}"
            group_instruction = self._config.group_specific_instructions.get(lookup_key)

            try:
                # Process group with batching
                llm_response = self._process_group_with_batching(
                    group, group_instruction, llm_service
                )

                # Create ProcessedGroup with proper typing
                processed_group = ProcessedGroup(
                    group_key=group.group_key,
                    rows=group.rows,
                    llm_response=llm_response,
                )
                processed_groups.append(processed_group)

                _LOGGER.info(
                    f"Successfully processed group: {group_key_string} "
                    f"(response length: {self._token_counter(llm_response)} tokens)"
                )

            except Exception as e:
                _LOGGER.error(f"Error processing group {group_key_string}: {e}")
                # Re-raise to allow caller to handle
                raise

        return processed_groups

    def _process_group_with_batching(
        self,
        group: GroupData,
        group_instruction: str | None,
        llm_service: LanguageModelService,
    ) -> str:
        """
        Process a single group with adaptive batching.

        Args:
            group: GroupData instance
            group_instruction: Optional group-specific instruction
            llm_service: LanguageModelService instance to use for LLM calls

        Returns:
            Final LLM response (aggregated if multiple batches)

        Raises:
            BatchCreationError: If batching fails
            LLMCallError: If LLM call fails
            AggregationError: If aggregation fails
        """
        group_key = group.group_key
        rows = group.rows

        # Create batches adaptively
        try:
            batches = self._create_batches(rows, group_key)
            _LOGGER.info(
                f"Created {len(batches)} batches for group {group_key} "
                f"({len(rows)} total rows)"
            )
        except Exception as e:
            raise BatchCreationError(
                f"Failed to create batches for group {group_key}: {e}",
                group_key=str(group_key),
                row_count=len(rows),
            ) from e

        # Process each batch iteratively, keeping only one previous summary at a time
        previous_summary: str | None = None

        # TODO [UN-16142]: Improve error handling logic for LLMCallError
        for batch_index, batch_group in enumerate(batches, start=1):
            try:
                # Render content for this batch
                content = self._renderer(batch_group)

                # Convert snake_case group_key to Title Case for display in prompts
                display_section_name = from_snake_case_to_display_name(group_key)

                # Build prompts with section name and at most one previous summary
                system_prompt, user_prompt = self._build_prompts(
                    section_name=display_section_name,  # Use Title Case for display
                    content=content,
                    group_instruction=group_instruction,
                    previous_summary=previous_summary,
                )

                # Call LLM
                batch_summary = self._call_llm(system_prompt, user_prompt, llm_service)

                # Keep only this summary for the next iteration
                previous_summary = batch_summary

                _LOGGER.debug(
                    f"Batch {batch_index}/{len(batches)} processed "
                    f"(summary length: {self._token_counter(batch_summary)} tokens)"
                )

            except LLMCallError:
                raise
            except Exception as e:
                raise LLMCallError(
                    f"Error processing batch {batch_index} for group {group_key}: {e}",
                    group_key=str(group_key),
                    batch_index=batch_index,
                    error_details=str(e),
                ) from e

        # Return final summary (last batch's result); "" if no batch produced one
        return previous_summary or ""

    def _create_batches(
        self, rows: list[dict[str, Any]], group_key: str
    ) -> list[GroupData]:
        """
        Create batches adaptively based on token and row limits.

        Fits as many rows as possible per batch while staying under limits.
        A single row larger than the token limit still gets its own batch.

        Args:
            rows: List of row dicts
            group_key: Group identifier for creating GroupData instances

        Returns:
            List of GroupData instances (each representing a batch)

        Raises:
            TokenLimitError: If token counting fails
        """
        if not rows:
            return [GroupData(group_key=group_key, rows=[])]

        batches: list[GroupData] = []
        current_batch: list[dict[str, Any]] = []
        current_batch_tokens = 0

        for row in rows:
            # Estimate tokens for this row (rough approximation)
            try:
                row_str = str(row)
                row_tokens = self._token_counter(row_str)
            except Exception as e:
                raise TokenLimitError(
                    f"Failed to count tokens for row: {e}",
                    estimated_tokens=0,
                    max_tokens=self._config.max_tokens_per_batch,
                ) from e

            # Check if adding this row would exceed limits
            would_exceed_tokens = (
                current_batch_tokens + row_tokens > self._config.max_tokens_per_batch
            )
            would_exceed_rows = len(current_batch) >= self._config.max_rows_per_batch

            if current_batch and (would_exceed_tokens or would_exceed_rows):
                # Start new batch - create GroupData instance
                batches.append(GroupData(group_key=group_key, rows=current_batch))
                current_batch = [row]
                current_batch_tokens = row_tokens
            else:
                # Add to current batch
                current_batch.append(row)
                current_batch_tokens += row_tokens

        # Add final batch
        if current_batch:
            batches.append(GroupData(group_key=group_key, rows=current_batch))

        return batches

    def _build_prompts(
        self,
        section_name: str,
        content: str,
        group_instruction: str | None,
        previous_summary: str | None,
    ) -> tuple[str, str]:
        """
        Build system and user prompts from templates.

        Args:
            section_name: Name of the section being processed (group_key)
            content: Rendered content to summarize
            group_instruction: Optional group-specific instruction
            previous_summary: Optional previous batch summary for context

        Returns:
            Tuple of (system_prompt, user_prompt)

        Raises:
            PromptBuildError: If prompt building fails
        """
        try:
            # Build system prompt
            system_prompt = Template(
                self._config.prompts_config.system_prompt_template
            ).render(
                common_instruction=self._config.common_instruction,
            )

            # Build user prompt with section name
            user_prompt = Template(
                self._config.prompts_config.user_prompt_template
            ).render(
                section_name=section_name,
                content=content,
                group_instruction=group_instruction,
                previous_summary=previous_summary,
            )

            return system_prompt.strip(), user_prompt.strip()

        except Exception as e:
            raise PromptBuildError(
                f"Failed to build prompts: {e}",
                context={
                    "section_name": section_name,
                    "has_group_instruction": group_instruction is not None,
                    "has_previous_summary": previous_summary is not None,
                },
            ) from e

    def _call_llm(
        self, system_prompt: str, user_prompt: str, llm_service: LanguageModelService
    ) -> str:
        """
        Call LLM with prompts.

        Args:
            system_prompt: System prompt
            user_prompt: User prompt
            llm_service: LanguageModelService instance to use for LLM calls

        Returns:
            LLM response text

        Raises:
            LLMCallError: If LLM call fails
        """
        messages = (
            MessagesBuilder()
            .system_message_append(system_prompt)
            .user_message_append(user_prompt)
            .build()
        )
        try:
            # Call the language model using the configured LMI
            response = llm_service.complete(
                messages=messages,
                model_name=self._config.language_model.name,
            )

            response_text = response.choices[0].message.content

            if not isinstance(response_text, str):
                # Explicit runtime check (an ``assert`` would be stripped
                # under ``python -O``); wrapped into LLMCallError below.
                raise TypeError("Response must be a string")
            return response_text

        except Exception as e:
            raise LLMCallError(
                f"LLM call failed: {e}",
                error_details=str(e),
            ) from e
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Template handler module."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from unique_toolkit._common.experimental.write_up_agent.services.template_handler.exceptions import (
|
|
6
|
+
ColumnExtractionError,
|
|
7
|
+
TemplateHandlerError,
|
|
8
|
+
TemplateParsingError,
|
|
9
|
+
TemplateRenderingError,
|
|
10
|
+
TemplateStructureError,
|
|
11
|
+
)
|
|
12
|
+
from unique_toolkit._common.experimental.write_up_agent.services.template_handler.service import (
|
|
13
|
+
TemplateHandler,
|
|
14
|
+
)
|
|
15
|
+
from unique_toolkit._common.experimental.write_up_agent.utils import template_loader
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def default_jinja_template_loader():
    """Load the bundled ``default_template.j2`` that ships next to this module.

    Returns:
        Whatever :func:`template_loader` produces for the default template
        file — presumably the loaded template; verify against the
        ``template_loader`` implementation in ``write_up_agent.utils``.
    """
    return template_loader(Path(__file__).parent, "default_template.j2")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Explicit public API of the template handler package.
__all__ = [
    "TemplateHandler",
    "TemplateHandlerError",
    "TemplateParsingError",
    "TemplateStructureError",
    "TemplateRenderingError",
    "ColumnExtractionError",
]
|
unique_toolkit/_common/experimental/write_up_agent/services/template_handler/default_template.j2
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
{#
TODO [UN-16142]: Simplify template logic
Default Write-Up Agent Template

This template works in two phases:
1. PHASE 1 (LLM Input): When g.llm_response is not provided, renders all Q&A data
   - Used to send structured data to the LLM for summarization
2. PHASE 2 (Final Report): When g.llm_response is provided, renders the LLM output
   - Used to generate the final markdown report with LLM summaries

Template Variables:
- groups: List of data groups
- g.<column>: Access grouping columns (e.g., g.section)
- g.rows: List of rows in this group
- g.llm_response: (Optional) LLM-generated summary to replace row data
- row.<column>: Access data columns (e.g., row.question, row.answer)
#}
{% for g in groups %}
# {{ g.section }}

{% if g.llm_response %}
{# Phase 2: render the LLM-generated summary in place of the raw rows #}
{{ g.llm_response }}
{% else %}
{# Phase 1: render the detailed Q&A rows for the LLM to process #}
{% for row in g.rows %}
**Q: {{ row.question }}**

A: {{ row.answer }}

{% endfor %}
{% endif %}

---

{% endfor %}
|
|
37
|
+
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Exceptions for template handler operations."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class TemplateHandlerError(Exception):
    """Common root for every error raised by the template handler."""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TemplateParsingError(TemplateHandlerError):
    """Signals that the Jinja template text could not be parsed."""

    def __init__(self, message: str, template_snippet: str | None = None):
        super().__init__(message)
        # Short excerpt of the offending template, kept for diagnostics.
        self.template_snippet = template_snippet
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TemplateStructureError(TemplateHandlerError):
    """Signals that the template lacks the structure the handler requires."""

    def __init__(self, message: str, expected_structure: str | None = None):
        super().__init__(message)
        # Human-readable description of the structure that was expected.
        self.expected_structure = expected_structure
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TemplateRenderingError(TemplateHandlerError):
    """Signals that rendering the template raised an error."""

    def __init__(self, message: str, context_keys: list[str] | None = None):
        super().__init__(message)
        # Names of the context variables involved; empty list when unknown.
        self.context_keys = context_keys or []
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ColumnExtractionError(TemplateHandlerError):
    """Signals that column extraction from the template failed."""

    def __init__(self, message: str, detected_columns: list[str] | None = None):
        super().__init__(message)
        # Columns that were detected before the failure; empty when none.
        self.detected_columns = detected_columns or []
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""Template handler service."""
|
|
2
|
+
|
|
3
|
+
from jinja2 import Template, TemplateError
|
|
4
|
+
|
|
5
|
+
from unique_toolkit._common.experimental.write_up_agent.schemas import (
|
|
6
|
+
GroupData,
|
|
7
|
+
ProcessedGroup,
|
|
8
|
+
)
|
|
9
|
+
from unique_toolkit._common.experimental.write_up_agent.services.dataframe_handler.utils import (
|
|
10
|
+
from_snake_case_to_display_name,
|
|
11
|
+
)
|
|
12
|
+
from unique_toolkit._common.experimental.write_up_agent.services.template_handler.exceptions import (
|
|
13
|
+
ColumnExtractionError,
|
|
14
|
+
TemplateParsingError,
|
|
15
|
+
TemplateRenderingError,
|
|
16
|
+
TemplateStructureError,
|
|
17
|
+
)
|
|
18
|
+
from unique_toolkit._common.experimental.write_up_agent.services.template_handler.utils import (
|
|
19
|
+
parse_template,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# TODO [UN-16142]: Simplify template logic
class TemplateHandler:
    """
    Handles all template operations.

    Responsibilities:
    - Extract grouping column (single only)
    - Extract selected columns
    - Render template for groups
    """

    def __init__(self, template: str):
        """
        Initialize template handler.

        Args:
            template: Jinja template string

        Raises:
            TemplateParsingError: If template cannot be parsed
        """
        self._template = template

        try:
            self._jinja_template = Template(template, lstrip_blocks=True)
        except TemplateError as e:
            # Attach only a short excerpt of the template to the error.
            snippet = template if len(template) <= 100 else template[:100] + "..."
            raise TemplateParsingError(
                f"Failed to parse Jinja template: {e}", template_snippet=snippet
            ) from e

        # Structural info cache; filled lazily by _get_parsed_info().
        self._parsed_info = None

    def _get_parsed_info(self):
        """
        Parse the template structure on first use and cache the result.

        Raises:
            TemplateParsingError: If template structure cannot be parsed
        """
        if self._parsed_info is not None:
            return self._parsed_info

        try:
            self._parsed_info = parse_template(self._template)
        except Exception as e:
            raise TemplateParsingError(
                f"Failed to parse template structure: {e}"
            ) from e
        return self._parsed_info

    def get_grouping_column(self) -> str:
        """
        Extract the single grouping column.

        Returns:
            Column name to group by

        Raises:
            TemplateStructureError: If template structure is invalid
            ColumnExtractionError: If grouping column detection fails
        """
        info = self._get_parsed_info()

        # The template must iterate over "groups" for grouping to make sense.
        if not info.expects_groups:
            raise TemplateStructureError(
                "Template must use grouping pattern: {% for g in groups %}",
                expected_structure="{% for g in groups %}",
            )

        columns = info.grouping_columns
        if not columns:
            raise ColumnExtractionError(
                "No grouping column detected in template. Use {{ g.column_name }} to reference grouping columns."
            )

        if len(columns) > 1:
            raise ColumnExtractionError(
                f"Single grouping column required. Found {len(columns)}: {columns}",
                detected_columns=columns,
            )

        return columns[0]

    def get_selected_columns(self) -> list[str]:
        """
        Extract columns referenced in template.

        Returns:
            List of column names from {{ row.column }} patterns
        """
        return self._get_parsed_info().row_columns

    def render_group(
        self, group_data: GroupData, llm_response: str | None = None
    ) -> str:
        """
        Render template for a single group.

        This method supports two rendering modes:
        1. Without llm_response: Renders the full row data (for LLM input)
        2. With llm_response: Renders the LLM output instead of row data (for final report)

        Args:
            group_data: GroupData instance with group_key and rows
            llm_response: Optional LLM-generated output. If provided, the template
                will render this instead of the detailed row loop.

        Returns:
            Rendered template string

        Raises:
            TemplateRenderingError: If rendering fails
        """
        try:
            column_name = self.get_grouping_column()

            # Single-element "groups" list, since the template expects
            # {% for g in groups %}; the llm_response lives on the group item.
            group_item = {
                column_name: from_snake_case_to_display_name(group_data.group_key),
                "rows": group_data.rows,
                "llm_response": llm_response,
            }

            return self._jinja_template.render(groups=[group_item])
        except (TemplateError, KeyError) as e:
            raise TemplateRenderingError(
                f"Failed to render template: {e}",
                context_keys=["group_key", "rows", "llm_response"],
            ) from e

    def render_all_groups(self, processed_groups: list[ProcessedGroup]) -> str:
        """
        Render template for all groups at once.

        Takes advantage of the template's {% for g in groups %} loop
        to render all groups in a single pass.

        Args:
            processed_groups: List of ProcessedGroup instances with group_key, rows, and llm_response

        Returns:
            Rendered template string with all groups

        Raises:
            TemplateRenderingError: If rendering fails
        """
        try:
            column_name = self.get_grouping_column()

            # Build one template item per group; group keys are converted
            # from snake_case to Title Case for display.
            groups_data = [
                {
                    column_name: from_snake_case_to_display_name(pg.group_key),
                    "rows": pg.rows,
                    "llm_response": pg.llm_response,
                }
                for pg in processed_groups
            ]

            return self._jinja_template.render(groups=groups_data)

        except (TemplateError, KeyError) as e:
            raise TemplateRenderingError(f"Failed to render all groups: {e}") from e
|