sdg-hub 0.1.3-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +28 -1
- sdg_hub/_version.py +2 -2
- sdg_hub/core/__init__.py +22 -0
- sdg_hub/core/blocks/__init__.py +58 -0
- sdg_hub/core/blocks/base.py +313 -0
- sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
- sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
- sdg_hub/core/blocks/evaluation/__init__.py +9 -0
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
- sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
- sdg_hub/core/blocks/filtering/__init__.py +12 -0
- sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
- sdg_hub/core/blocks/llm/__init__.py +25 -0
- sdg_hub/core/blocks/llm/client_manager.py +398 -0
- sdg_hub/core/blocks/llm/config.py +336 -0
- sdg_hub/core/blocks/llm/error_handler.py +368 -0
- sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
- sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +310 -0
- sdg_hub/core/blocks/registry.py +331 -0
- sdg_hub/core/blocks/transform/__init__.py +23 -0
- sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
- sdg_hub/core/blocks/transform/melt_columns.py +126 -0
- sdg_hub/core/blocks/transform/rename_columns.py +69 -0
- sdg_hub/core/blocks/transform/text_concat.py +102 -0
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
- sdg_hub/core/flow/__init__.py +20 -0
- sdg_hub/core/flow/base.py +980 -0
- sdg_hub/core/flow/metadata.py +344 -0
- sdg_hub/core/flow/migration.py +187 -0
- sdg_hub/core/flow/registry.py +330 -0
- sdg_hub/core/flow/validation.py +265 -0
- sdg_hub/{utils → core/utils}/__init__.py +6 -4
- sdg_hub/{utils → core/utils}/datautils.py +1 -3
- sdg_hub/core/utils/error_handling.py +208 -0
- sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +191 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
- sdg_hub-0.2.0.dist-info/METADATA +218 -0
- sdg_hub-0.2.0.dist-info/RECORD +63 -0
- sdg_hub/blocks/__init__.py +0 -42
- sdg_hub/blocks/block.py +0 -96
- sdg_hub/blocks/llmblock.py +0 -375
- sdg_hub/blocks/openaichatblock.py +0 -556
- sdg_hub/blocks/utilblocks.py +0 -597
- sdg_hub/checkpointer.py +0 -139
- sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
- sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
- sdg_hub/configs/annotations/detailed_description.yaml +0 -10
- sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
- sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
- sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
- sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
- sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
- sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
- sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
- sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
- sdg_hub/configs/knowledge/router.yaml +0 -12
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +0 -48
- sdg_hub/configs/skills/annotation.yaml +0 -36
- sdg_hub/configs/skills/contexts.yaml +0 -28
- sdg_hub/configs/skills/critic.yaml +0 -60
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
- sdg_hub/configs/skills/freeform_questions.yaml +0 -34
- sdg_hub/configs/skills/freeform_responses.yaml +0 -39
- sdg_hub/configs/skills/grounded_questions.yaml +0 -38
- sdg_hub/configs/skills/grounded_responses.yaml +0 -59
- sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
- sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
- sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
- sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
- sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
- sdg_hub/configs/skills/judge.yaml +0 -53
- sdg_hub/configs/skills/planner.yaml +0 -67
- sdg_hub/configs/skills/respond.yaml +0 -8
- sdg_hub/configs/skills/revised_responder.yaml +0 -78
- sdg_hub/configs/skills/router.yaml +0 -59
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
- sdg_hub/flow.py +0 -477
- sdg_hub/flow_runner.py +0 -450
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -148
- sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
- sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
- sdg_hub/pipeline.py +0 -121
- sdg_hub/prompts.py +0 -74
- sdg_hub/registry.py +0 -122
- sdg_hub/sdg.py +0 -206
- sdg_hub/utils/config_validation.py +0 -91
- sdg_hub/utils/error_handling.py +0 -94
- sdg_hub/utils/validation_result.py +0 -10
- sdg_hub-0.1.3.dist-info/METADATA +0 -190
- sdg_hub-0.1.3.dist-info/RECORD +0 -89
- sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
- /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
- /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
- {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/WHEEL +0 -0
- {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/top_level.txt +0 -0
sdg_hub/core/blocks/llm/prompt_builder_block.py
@@ -0,0 +1,368 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Prompt builder block for formatting prompts into structured chat messages or plain text.
+
+This module provides the PromptBuilderBlock for handling LLM prompt formatting,
+including conversion to OpenAI Messages format and template rendering.
+"""
+
+# Standard
+from typing import Any, Literal, Optional
+
+# Third Party
+from datasets import Dataset
+from jinja2 import Template, meta
+from pydantic import BaseModel, Field, field_validator
+import yaml
+
+# Local
+from ...utils.error_handling import TemplateValidationError
+from ...utils.logger_config import setup_logger
+from ..base import BaseBlock
+from ..registry import BlockRegistry
+
+logger = setup_logger(__name__)
+
+
+class ChatMessage(BaseModel):
+    """Pydantic model for chat messages with proper validation."""
+
+    role: Literal["system", "user", "assistant", "tool"]
+    content: str
+
+    @field_validator("content")
+    @classmethod
+    def validate_content_not_empty(cls, v: str) -> str:
+        """Ensure content is not empty or just whitespace."""
+        if not v or not v.strip():
+            raise ValueError("Message content cannot be empty")
+        return v.strip()
+
+
+class MessageTemplate(BaseModel):
+    """Template for a chat message with Jinja2 template and original source."""
+
+    role: Literal["system", "user", "assistant", "tool"]
+    content_template: Template
+    original_source: str
+
+    model_config = {"arbitrary_types_allowed": True}
+
+
+class PromptTemplateConfig:
+    """Self-contained class for loading and validating YAML prompt configurations."""
+
+    def __init__(self, config_path: str):
+        """Initialize with path to YAML config file."""
+        self.config_path = config_path
+        self.message_templates: list[MessageTemplate] = []
+        self._load_and_validate()
+
+    def _load_and_validate(self) -> None:
+        """Load YAML config and validate format."""
+        try:
+            with open(self.config_path, encoding="utf-8") as config_file:
+                config = yaml.safe_load(config_file)
+
+            if not isinstance(config, list):
+                raise ValueError(
+                    "Template config must be a list of message objects"
+                )
+
+            if not config:
+                raise ValueError("Prompt configuration cannot be empty")
+
+            self._compile_templates(config)
+            self._validate_message_flow()
+
+        except FileNotFoundError:
+            logger.error(f"Configuration file not found: {self.config_path}")
+            raise
+        except yaml.YAMLError as e:
+            logger.error(f"Error parsing YAML from {self.config_path}: {e}")
+            raise
+        except Exception as e:
+            logger.error(
+                f"Unexpected error reading config file {self.config_path}: {e}"
+            )
+            raise
+
+    def _compile_templates(self, config: list[dict[str, Any]]) -> None:
+        """Compile Jinja templates for each message in the config."""
+        for i, message in enumerate(config):
+            if "role" not in message or "content" not in message:
+                raise ValueError(
+                    f"Message {i} must have 'role' and 'content' fields. Got: {message.keys()}"
+                )
+
+            try:
+                content_source = message["content"]
+                message_template = MessageTemplate(
+                    role=message["role"],
+                    content_template=Template(content_source),
+                    original_source=content_source,
+                )
+                self.message_templates.append(message_template)
+            except Exception as e:
+                raise ValueError(
+                    f"Failed to compile template for message {i}: {e}"
+                ) from e
+
+    def _validate_message_flow(self) -> None:
+        """Validate that message flow is appropriate for chat completion."""
+        user_messages = [msg for msg in self.message_templates if msg.role == "user"]
+        if not user_messages:
+            raise ValueError(
+                "Template must contain at least one message with role='user' for proper conversation flow."
+            )
+
+        if self.message_templates and self.message_templates[-1].role != "user":
+            raise ValueError(
+                f"The final message must have role='user' for proper chat completion. "
+                f"Got role='{self.message_templates[-1].role}' for the last message."
+            )
+
+    def get_message_templates(self) -> list[MessageTemplate]:
+        """Return the compiled message templates."""
+        return self.message_templates
+
+
+class PromptRenderer:
+    """Handles rendering of message templates with variable substitution."""
+
+    def __init__(self, message_templates: list[MessageTemplate]):
+        """Initialize with a list of message templates."""
+        self.message_templates = message_templates
+
+    def get_required_variables(self) -> set:
+        """Extract all required variables from message templates."""
+        required_vars = set()
+        for msg_template in self.message_templates:
+            # Parse the original source to find undeclared variables
+            # Use the template's existing environment to ensure consistency
+            ast = msg_template.content_template.environment.parse(
+                msg_template.original_source
+            )
+            required_vars.update(meta.find_undeclared_variables(ast))
+        return required_vars
+
+    def resolve_template_vars(
+        self, sample: dict[str, Any], input_cols
+    ) -> dict[str, Any]:
+        """Resolve template variables from dataset columns based on input_cols.
+
+        Parameters
+        ----------
+        sample : Dict[str, Any]
+            Input sample from dataset.
+        input_cols : Union[str, List[str], Dict[str, str]]
+            Input column specification - now maps dataset columns to template variables.
+
+        Returns
+        -------
+        Dict[str, Any]
+            Template variables mapped from dataset columns.
+        """
+        template_vars = {}
+
+        if isinstance(input_cols, dict):
+            # Map dataset columns to template variables
+            for dataset_col, template_var in input_cols.items():
+                if dataset_col in sample:
+                    template_vars[template_var] = sample[dataset_col]
+                else:
+                    logger.warning(
+                        f"Dataset column '{dataset_col}' not found in sample"
+                    )
+        else:
+            # Use column names directly as template variables
+            for col in input_cols:
+                if col in sample:
+                    template_vars[col] = sample[col]
+                else:
+                    logger.warning(f"Dataset column '{col}' not found in sample")
+
+        return template_vars
+
+    def render_messages(self, template_vars: dict[str, Any]) -> list[ChatMessage]:
+        """Render all message templates with the given variables.
+
+        Parameters
+        ----------
+        template_vars : Dict[str, Any]
+            Variables to substitute in templates.
+
+        Returns
+        -------
+        List[ChatMessage]
+            List of rendered and validated chat messages.
+        """
+        rendered_messages = []
+
+        for i, msg_template in enumerate(self.message_templates):
+            try:
+                rendered_content = msg_template.content_template.render(
+                    template_vars
+                ).strip()
+                if rendered_content:  # Only add non-empty messages
+                    chat_message = ChatMessage(
+                        role=msg_template.role, content=rendered_content
+                    )
+                    rendered_messages.append(chat_message)
+            except Exception as e:
+                logger.warning(f"Failed to render message {i}: {e}")
+                continue
+
+        return rendered_messages
+
+
+@BlockRegistry.register(
+    "PromptBuilderBlock",
+    "llm",
+    "Formats prompts into structured chat messages or plain text using Jinja templates",
+)
+class PromptBuilderBlock(BaseBlock):
+    """Block for formatting prompts into structured chat messages or plain text.
+
+    This block takes input from dataset columns, applies Jinja templates from a YAML config
+    containing a list of messages, and outputs either structured chat messages or formatted text.
+
+    Parameters
+    ----------
+    block_name : str
+        Name of the block.
+    input_cols : Union[str, List[str], Dict[str, str]]
+        Input column specification:
+        - str: Single column name
+        - List[str]: List of column names
+        - Dict[str, str]: Mapping from dataset column names to template variables
+    output_cols : str
+        Name of the output column where formatted content will be saved.
+    prompt_config_path : str
+        Path to YAML file containing list of message objects with 'role' and 'content' fields.
+    format_as_messages : bool, optional
+        Whether to format output as chat messages (default True).
+        If True, outputs List[Dict[str, str]] with 'role' and 'content' keys.
+        If False, outputs concatenated string with role prefixes.
+    """
+
+    prompt_config_path: str = Field(
+        ..., description="Path to YAML file containing the Jinja template configuration"
+    )
+    format_as_messages: bool = Field(
+        True, description="Whether to format output as chat messages"
+    )
+
+    # Internal fields for configuration and renderer
+    prompt_template_config: Optional[PromptTemplateConfig] = Field(
+        None, description="Loaded prompt template configuration", exclude=True
+    )
+    prompt_renderer: Optional[PromptRenderer] = Field(
+        None, description="Prompt renderer instance", exclude=True
+    )
+
+    @field_validator("output_cols", mode="after")
+    @classmethod
+    def validate_single_output_col(cls, v):
+        """Validate that exactly one output column is specified."""
+        if len(v) != 1:
+            raise ValueError(
+                f"PromptBuilderBlock expects exactly one output column, got {len(v)}: {v}"
+            )
+        return v
+
+    def model_post_init(self, __context: Any) -> None:
+        """Initialize the block after Pydantic validation."""
+        # Load and validate prompt configuration
+        self.prompt_template_config = PromptTemplateConfig(self.prompt_config_path)
+
+        # Initialize prompt renderer
+        message_templates = self.prompt_template_config.get_message_templates()
+        self.prompt_renderer = PromptRenderer(message_templates)
+
+    def _validate_custom(self, dataset: Dataset) -> None:
+        if len(dataset) > 0:
+            # Get required variables from all message templates
+            required_vars = self.prompt_renderer.get_required_variables()
+
+            sample = dataset[0]
+            template_vars = self.prompt_renderer.resolve_template_vars(
+                sample, self.input_cols
+            )
+            missing_vars = required_vars - set(template_vars.keys())
+
+            if missing_vars:
+                raise TemplateValidationError(
+                    block_name=self.block_name,
+                    missing_variables=list(missing_vars),
+                    available_variables=list(template_vars.keys()),
+                )
+
+    def _generate(self, sample: dict[str, Any]) -> dict[str, Any]:
+        """Generate formatted output for a single sample.
+
+        1. Resolve columns needed for prompt templating
+        2. Render each message template with the variables
+        3. Format as messages or concatenated string based on format_as_messages
+
+        Parameters
+        ----------
+        sample : Dict[str, Any]
+            Input sample from dataset.
+
+        Returns
+        -------
+        Dict[str, Any]
+            Sample with formatted output added to specified output column.
+        """
+        output_col = self.output_cols[0]
+
+        try:
+            # Step 1: Resolve template variables from dataset columns
+            template_vars = self.prompt_renderer.resolve_template_vars(
+                sample, self.input_cols
+            )
+
+            # Step 2: Render messages using the prompt renderer
+            rendered_messages = self.prompt_renderer.render_messages(template_vars)
+
+            # Step 3: Format output based on format_as_messages setting
+            if not rendered_messages:
+                logger.warning(f"No valid messages generated for sample: {sample}")
+                sample[output_col] = [] if self.format_as_messages else ""
+            elif self.format_as_messages:
+                # Convert to dict format for serialization
+                sample[output_col] = [msg.model_dump() for msg in rendered_messages]
+            else:
+                # Concatenate all messages into a single string
+                sample[output_col] = "\n\n".join(
+                    [f"{msg.role}: {msg.content}" for msg in rendered_messages]
+                )
+
+        except Exception as e:
+            logger.error(f"Failed to format sample: {e}")
+            sample[output_col] = [] if self.format_as_messages else ""
+
+        return sample
+
+    def generate(self, samples: Dataset, **_kwargs: Any) -> Dataset:
+        """Generate formatted output for all samples using dataset map.
+
+        Parameters
+        ----------
+        samples : Dataset
+            Input dataset containing samples to be formatted.
+        **kwargs : Dict[str, Any]
+            Additional keyword arguments (unused in this block).
+
+        Returns
+        -------
+        Dataset
+            Dataset with the formatted output added to the specified column.
+        """
+        logger.debug(f"Formatting prompts for {len(samples)} samples")
+
+        # Use dataset map for efficient processing
+        formatted_dataset = samples.map(self._generate)
+
+        logger.debug(f"Successfully formatted {len(formatted_dataset)} samples")
+        return formatted_dataset
sdg_hub/core/blocks/llm/text_parser_block.py
@@ -0,0 +1,310 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Text parser block for parsing and post-processing LLM outputs.
+
+This module provides the TextParserBlock for handling output parsing using
+start/end tags, custom regex patterns, and cleanup operations.
+"""
+
+# Standard
+from typing import Any, Optional
+import re
+
+# Third Party
+from datasets import Dataset
+from pydantic import Field, field_validator, model_validator
+
+# Local
+from ...utils.logger_config import setup_logger
+from ..base import BaseBlock
+from ..registry import BlockRegistry
+
+logger = setup_logger(__name__)
+
+
+@BlockRegistry.register(
+    "TextParserBlock",
+    "llm",
+    "Parses and post-processes LLM outputs using tags or regex patterns",
+)
+class TextParserBlock(BaseBlock):
+    """Block for parsing and post-processing LLM outputs.
+
+    This block handles output parsing using start/end tags, custom regex patterns,
+    and cleanup operations. It expects exactly one input column containing raw LLM output.
+
+    Attributes
+    ----------
+    block_name : str
+        Unique identifier for this block instance.
+    input_cols : Union[str, List[str], Dict[str, Any], None]
+        Input column name(s) containing raw LLM output. Must specify exactly one column.
+    output_cols : Union[str, List[str], Dict[str, Any], None]
+        Output column name(s) for parsed results.
+    start_tags : List[str]
+        List of start tags for tag-based parsing.
+    end_tags : List[str]
+        List of end tags for tag-based parsing.
+    parsing_pattern : Optional[str]
+        Regex pattern for custom parsing.
+    parser_cleanup_tags : Optional[List[str]]
+        List of tags to clean from parsed output.
+    """
+
+    start_tags: list[str] = Field(
+        default_factory=list, description="List of start tags for tag-based parsing"
+    )
+    end_tags: list[str] = Field(
+        default_factory=list, description="List of end tags for tag-based parsing"
+    )
+    parsing_pattern: Optional[str] = Field(
+        default=None, description="Regex pattern for custom parsing"
+    )
+    parser_cleanup_tags: Optional[list[str]] = Field(
+        default=None, description="List of tags to clean from parsed output"
+    )
+
+    @field_validator("start_tags", "end_tags", mode="before")
+    @classmethod
+    def normalize_tags(cls, v):
+        """Normalize tag lists to ensure they are always lists."""
+        if v is None:
+            return []
+        if isinstance(v, str):
+            return [v]
+        if isinstance(v, list):
+            return v
+        raise ValueError(f"Tags must be a string, list, or None, got {type(v)}")
+
+    @field_validator("parser_cleanup_tags", mode="before")
+    @classmethod
+    def normalize_cleanup_tags(cls, v):
+        """Normalize cleanup tags to ensure they are always lists when not None."""
+        if v is None:
+            return None
+        if isinstance(v, str):
+            return [v]
+        if isinstance(v, list):
+            return v
+        raise ValueError(f"Cleanup tags must be a string, list, or None, got {type(v)}")
+
+    @model_validator(mode="after")
+    def validate_parsing_configuration(self):
+        """Validate that parsing configuration is consistent."""
+        # Validate that at least one parsing method is configured
+        has_regex = self.parsing_pattern is not None
+        has_tags = bool(self.start_tags) or bool(self.end_tags)
+
+        if not has_regex and not has_tags:
+            raise ValueError(
+                "TextParserBlock requires at least one parsing method: "
+                "either 'parsing_pattern' (regex) or 'start_tags'/'end_tags' (tag-based parsing)"
+            )
+
+        # Validate tag parsing configuration
+        if has_tags:
+            if len(self.start_tags) != len(self.end_tags):
+                raise ValueError(
+                    f"start_tags and end_tags must have the same length. "
+                    f"Got {len(self.start_tags)} start_tags and {len(self.end_tags)} end_tags"
+                )
+
+        # We can't validate against output_cols here since they might not be normalized yet
+        # This validation will be moved to _validate_custom
+
+        return self
+
+    def _validate_custom(self, dataset: Dataset) -> None:
+        """Validate TextParserBlock specific requirements.
+
+        Parameters
+        ----------
+        dataset : Dataset
+            The dataset to validate.
+
+        Raises
+        ------
+        ValueError
+            If TextParserBlock requirements are not met.
+        """
+        # Validate that we have exactly one input column
+        if len(self.input_cols) == 0:
+            raise ValueError("TextParserBlock expects at least one input column")
+        if len(self.input_cols) > 1:
+            logger.warning(
+                f"TextParserBlock expects exactly one input column, but got {len(self.input_cols)}. "
+                f"Using the first column: {self.input_cols[0]}"
+            )
+
+        # Validate tag parsing against output columns (can only be done after model creation)
+        has_tags = bool(self.start_tags) or bool(self.end_tags)
+        if has_tags and len(self.start_tags) != len(self.output_cols):
+            raise ValueError(
+                f"When using tag-based parsing, the number of tag pairs must match output_cols. "
+                f"Got {len(self.start_tags)} tag pairs and {len(self.output_cols)} output columns"
+            )
+
+    def _extract_matches(
+        self, text: str, start_tag: Optional[str], end_tag: Optional[str]
+    ) -> list[str]:
+        if not text:
+            return []
+        if not start_tag and not end_tag:
+            return [text.strip()]
+
+        pattern = ""
+        if start_tag:
+            pattern += re.escape(start_tag)
+        pattern += r"(.*?)"
+        if end_tag:
+            pattern += re.escape(end_tag)
+        elif start_tag:
+            pattern += "$"
+
+        return [match.strip() for match in re.findall(pattern, text, re.DOTALL)]
+
+    def _parse(self, generated_string: str) -> dict[str, list[str]]:
+        if self.parsing_pattern is not None:
+            return self._parse_with_regex(generated_string)
+        return self._parse_with_tags(generated_string)
+
+    def _parse_with_regex(self, generated_string: str) -> dict[str, list[str]]:
+        """Parse using regex pattern."""
+        if self.parsing_pattern is None:
+            raise ValueError("parsing_pattern is required for regex parsing")
+        pattern = re.compile(self.parsing_pattern, re.DOTALL)
+        all_matches = pattern.findall(generated_string)
+        matches: dict[str, list[str]] = {
+            column_name: [] for column_name in self.output_cols
+        }
+
+        logger.debug(
+            f"Regex parsing found {len(all_matches)} matches with pattern: {self.parsing_pattern}"
+        )
+
+        if all_matches and isinstance(all_matches[0], tuple):
+            return self._process_tuple_matches(all_matches, matches)
+        return self._process_single_matches(all_matches, matches)
+
+    def _parse_with_tags(self, generated_string: str) -> dict[str, list[str]]:
+        """Parse using start/end tags."""
+        matches: dict[str, list[str]] = {
+            column_name: [] for column_name in self.output_cols
+        }
+
+        for start_tag, end_tag, output_col in zip(
+            self.start_tags, self.end_tags, self.output_cols
+        ):
+            extracted = self._extract_matches(generated_string, start_tag, end_tag)
+            matches[output_col] = extracted
+            logger.debug(
+                f"Tag parsing for '{output_col}' with tags '{start_tag}'/'{end_tag}' found {len(extracted)} matches"
+            )
+
+        return matches
+
+    def _process_tuple_matches(
+        self, all_matches: list, matches: dict[str, list[str]]
+    ) -> dict[str, list[str]]:
+        """Process regex matches that are tuples."""
+        for match in all_matches:
+            for column_name, value in zip(self.output_cols, match):
+                value = self._clean_value(value.strip())
+                matches[column_name].append(value)
+        return matches
+
+    def _process_single_matches(
+        self, all_matches: list, matches: dict[str, list[str]]
+    ) -> dict[str, list[str]]:
+        """Process regex matches that are single values."""
+        cleaned_matches = [self._clean_value(match.strip()) for match in all_matches]
+        matches[self.output_cols[0]] = cleaned_matches
+        return matches
+
+    def _clean_value(self, value: str) -> str:
+        """Clean value by removing cleanup tags."""
+        if self.parser_cleanup_tags:
+            for clean_tag in self.parser_cleanup_tags:
+                value = value.replace(clean_tag, "")
+        return value
+
+    def _generate(self, sample: dict) -> list[dict]:
+        input_column = self.input_cols[0]
+        raw_output = sample[input_column]
+
+        # Handle list inputs (e.g., from LLMChatBlock with n > 1)
+        if isinstance(raw_output, list):
+            if not raw_output:
+                logger.warning(f"Input column '{input_column}' contains empty list")
+                return []
+
+            all_results = []
+            for i, response in enumerate(raw_output):
+                if not response or not isinstance(response, str):
+                    logger.warning(
+                        f"List item {i} in column '{input_column}' contains invalid data "
+                        f"(empty or non-string): {type(response)}"
+                    )
+                    continue
+
+                parsed_outputs = self._parse(response)
+
+                if not parsed_outputs or not any(
+                    len(value) > 0 for value in parsed_outputs.values()
+                ):
+                    logger.warning(
+                        f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
+                        f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
+                    )
+                    continue
+
+                # Create output rows for this response
+                max_length = max(len(value) for value in parsed_outputs.values())
+                for values in zip(
+                    *(lst[:max_length] for lst in parsed_outputs.values())
+                ):
+                    all_results.append(
+                        {**sample, **dict(zip(parsed_outputs.keys(), values))}
+                    )
+
+            return all_results
+
+        # Handle string inputs (existing logic)
+        elif isinstance(raw_output, str):
+            if not raw_output:
+                logger.warning(f"Input column '{input_column}' contains empty string")
+                return []
+
+            parsed_outputs = self._parse(raw_output)
+
+            if not parsed_outputs or not any(
+                len(value) > 0 for value in parsed_outputs.values()
+            ):
+                logger.warning(
+                    f"Failed to parse any content from input. Raw output length: {len(raw_output)}, "
+                    f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
+                )
+                return []
+
+            result = []
+            max_length = max(len(value) for value in parsed_outputs.values())
+            for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())):
+                result.append({**sample, **dict(zip(parsed_outputs.keys(), values))})
+            return result
+
+        else:
+            logger.warning(
+                f"Input column '{input_column}' contains invalid data type: {type(raw_output)}. "
+                f"Expected str or List[str]"
+            )
+            return []
+
+    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
+        logger.debug(f"Parsing outputs for {len(samples)} samples")
+        if len(samples) == 0:
+            logger.warning("No samples to parse, returning empty dataset")
+            return Dataset.from_list([])
+
+        new_data = []
+        for sample in samples:
+            new_data.extend(self._generate(sample))
+        return Dataset.from_list(new_data)