sdg-hub 0.3.1__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +0 -2
- sdg_hub/_version.py +2 -2
- sdg_hub/core/__init__.py +1 -2
- sdg_hub/core/blocks/__init__.py +2 -4
- sdg_hub/core/blocks/base.py +61 -6
- sdg_hub/core/blocks/filtering/column_value_filter.py +3 -2
- sdg_hub/core/blocks/llm/__init__.py +2 -4
- sdg_hub/core/blocks/llm/llm_chat_block.py +251 -265
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +216 -98
- sdg_hub/core/blocks/llm/llm_parser_block.py +320 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +53 -152
- sdg_hub/core/flow/__init__.py +3 -4
- sdg_hub/core/flow/base.py +11 -73
- sdg_hub/core/flow/metadata.py +1 -68
- sdg_hub/core/flow/registry.py +0 -1
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +51 -12
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/doc_direct_qa/flow.yaml +158 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +51 -12
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +14 -3
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +147 -28
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/README.md +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/atomic_facts_ja.yaml +41 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/detailed_summary_ja.yaml +14 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/extractive_summary_ja.yaml +14 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/flow.yaml +303 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/multilingual/japanese/generate_questions_responses_ja.yaml +55 -0
- sdg_hub/flows/text_analysis/structured_insights/flow.yaml +28 -5
- {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/METADATA +2 -1
- {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/RECORD +34 -30
- sdg_hub/core/blocks/evaluation/__init__.py +0 -9
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +0 -323
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +0 -323
- sdg_hub/core/blocks/evaluation/verify_question_block.py +0 -329
- sdg_hub/core/blocks/llm/client_manager.py +0 -472
- sdg_hub/core/blocks/llm/config.py +0 -337
- {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/WHEEL +0 -0
- {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.3.1.dist-info → sdg_hub-0.4.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,320 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""LLM parser block for extracting fields from LLM response objects.
|
3
|
+
|
4
|
+
This module provides the LLMParserBlock for extracting specific fields
|
5
|
+
(content, reasoning_content, tool_calls) from chat completion response objects.
|
6
|
+
"""
|
7
|
+
|
8
|
+
# Standard
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
# Third Party
|
12
|
+
from datasets import Dataset
|
13
|
+
from pydantic import Field, model_validator
|
14
|
+
|
15
|
+
# Local
|
16
|
+
from ...utils.logger_config import setup_logger
|
17
|
+
from ..base import BaseBlock
|
18
|
+
from ..registry import BlockRegistry
|
19
|
+
|
20
|
+
logger = setup_logger(__name__)
|
21
|
+
|
22
|
+
|
23
|
+
@BlockRegistry.register(
    "LLMParserBlock",
    "llm",
    "Extracts specified fields from LLM response objects",
)
class LLMParserBlock(BaseBlock):
    """Block for extracting fields from LLM response objects.

    This block extracts specified fields from chat completion response objects.
    It expects exactly one input column containing response objects (dict or list of dicts).

    Attributes
    ----------
    block_name : str
        Unique identifier for this block instance.
    input_cols : Union[str, List[str], Dict[str, Any], None]
        Input column name(s) containing LLM response objects. Must specify exactly one column.
    output_cols : Union[str, List[str], Dict[str, Any], None]
        Output column name(s) for extracted fields. Note: this is overwritten by
        ``validate_extraction_configuration`` with the computed prefixed field names.
    extract_content : bool
        Whether to extract 'content' field from responses.
    extract_reasoning_content : bool
        Whether to extract 'reasoning_content' field from responses.
    extract_tool_calls : bool
        Whether to extract 'tool_calls' field from responses.
    expand_lists : bool
        Whether to expand list inputs into individual rows (True) or preserve lists (False).
        Default is True for backward compatibility.
    field_prefix : str
        Prefix to add to output field names. Default is empty string, in which case
        the block falls back to ``"<block_name>_"`` as the prefix (see
        ``validate_extraction_configuration``).
        Example: 'llm_' results in 'llm_content', 'llm_reasoning_content', 'llm_tool_calls'.
    """

    # Pydantic model fields controlling which response keys are extracted and
    # how list-valued inputs are laid out in the output dataset.
    extract_content: bool = Field(
        default=True,
        description="Whether to extract 'content' field from responses.",
    )
    extract_reasoning_content: bool = Field(
        default=False,
        description="Whether to extract 'reasoning_content' field from responses.",
    )
    extract_tool_calls: bool = Field(
        default=False,
        description="Whether to extract 'tool_calls' field from responses.",
    )
    expand_lists: bool = Field(
        default=True,
        description="Whether to expand list inputs into individual rows (True) or preserve lists (False).",
    )
    field_prefix: str = Field(
        default="",
        description="Prefix to add to output field names (e.g., 'llm_' results in 'llm_content', 'llm_reasoning_content').",
    )

    @model_validator(mode="after")
    def validate_extraction_configuration(self):
        """Validate that at least one extraction field is enabled and pre-compute field names."""
        if not any(
            [
                self.extract_content,
                self.extract_reasoning_content,
                self.extract_tool_calls,
            ]
        ):
            raise ValueError(
                "LLMParserBlock requires at least one extraction field to be enabled: "
                "extract_content, extract_reasoning_content, or extract_tool_calls"
            )

        # Pre-compute prefixed field names for efficiency
        # An empty field_prefix falls back to "<block_name>_" — presumably to keep
        # output columns unique across multiple parser blocks; confirm with callers.
        prefix = self.field_prefix
        if prefix == "":
            prefix = self.block_name + "_"
        self._content_field = f"{prefix}content"
        self._reasoning_content_field = f"{prefix}reasoning_content"
        self._tool_calls_field = f"{prefix}tool_calls"

        # Advertise output columns for standard collision checks
        self.output_cols = self._get_output_columns()

        return self

    def _validate_custom(self, dataset: Dataset) -> None:
        """Validate LLMParserBlock specific requirements.

        Requires at least one input column; extra columns beyond the first are
        tolerated with a warning (only ``input_cols[0]`` is ever read).

        Parameters
        ----------
        dataset : Dataset
            The dataset to validate.

        Raises
        ------
        ValueError
            If LLMParserBlock requirements are not met.
        """
        # Validate that we have exactly one input column
        if len(self.input_cols) == 0:
            raise ValueError("LLMParserBlock expects at least one input column")
        if len(self.input_cols) > 1:
            logger.warning(
                f"LLMParserBlock expects exactly one input column, but got {len(self.input_cols)}. "
                f"Using the first column: {self.input_cols[0]}"
            )

    def _extract_fields_from_response(self, response: dict) -> dict[str, Any]:
        """Extract specified fields from a single response object.

        Present-but-None fields are coerced to benign defaults ("" for text
        fields, [] for tool_calls); absent fields are skipped with a warning.

        Parameters
        ----------
        response : dict
            Response object from chat completion API

        Returns
        -------
        dict[str, Any]
            Dictionary with extracted fields using prefixed field names

        Raises
        ------
        ValueError
            If none of the requested fields are found in the response
        """
        extracted = {}
        missing_fields = []

        if self.extract_content:
            if "content" not in response:
                missing_fields.append("content")
            else:
                if response["content"] is None:
                    ## skip this field
                    logger.warning("Content field is None, using empty string instead")
                    extracted[self._content_field] = ""
                else:
                    extracted[self._content_field] = response["content"]

        if self.extract_reasoning_content:
            if "reasoning_content" not in response:
                missing_fields.append("reasoning_content")
            else:
                if response["reasoning_content"] is None:
                    ## skip this field
                    logger.warning(
                        "Reasoning content field is None, using empty string instead"
                    )
                    extracted[self._reasoning_content_field] = ""
                else:
                    extracted[self._reasoning_content_field] = response[
                        "reasoning_content"
                    ]

        if self.extract_tool_calls:
            if "tool_calls" not in response:
                missing_fields.append("tool_calls")
            else:
                if response["tool_calls"] is None:
                    ## skip this field
                    logger.warning("Tool calls field is None, using empty list instead")
                    extracted[self._tool_calls_field] = []
                else:
                    extracted[self._tool_calls_field] = response["tool_calls"]

        if missing_fields:
            logger.warning(
                f"Requested fields {missing_fields} not found in response. Available keys: {list(response.keys())}"
            )

        if not extracted:
            raise ValueError(
                f"No requested fields found in response. Available keys: {list(response.keys())}"
            )
        return extracted

    def _get_output_columns(self) -> list[str]:
        """Get the list of output columns based on extraction settings."""
        columns = []
        if self.extract_content:
            columns.append(self._content_field)
        if self.extract_reasoning_content:
            columns.append(self._reasoning_content_field)
        if self.extract_tool_calls:
            columns.append(self._tool_calls_field)
        return columns

    def _generate(self, sample: dict) -> list[dict]:
        # Dispatch a single dataset row to the list or dict handler based on
        # the runtime type of the input column's value. Returns zero or more
        # output rows (a row can fan out when expand_lists=True).
        input_column = self.input_cols[0]
        raw_output = sample[input_column]

        # Handle list inputs (e.g., from LLMChatBlock with n > 1)
        if isinstance(raw_output, list):
            return self._process_list_input(sample, raw_output, input_column)

        # Handle single dict input
        elif isinstance(raw_output, dict):
            return self._process_single_input(sample, raw_output)

        else:
            logger.warning(
                f"Input column '{input_column}' contains invalid data type: {type(raw_output)}. "
                f"Expected dict or list[dict]"
            )
            return []

    def _process_list_input(
        self, sample: dict, raw_output: list, input_column: str
    ) -> list[dict]:
        """Process list of response objects."""
        if not raw_output:
            logger.warning(f"Input column '{input_column}' contains empty list")
            return []

        if not self.expand_lists:
            # Preserve list structure - collect all extracted fields as lists
            return self._process_list_preserve_structure(
                sample, raw_output, input_column
            )
        else:
            # Expand lists - create individual rows for each response
            return self._process_list_expand_rows(sample, raw_output, input_column)

    def _process_list_preserve_structure(
        self, sample: dict, raw_output: list, input_column: str
    ) -> list[dict]:
        """Process list input while preserving list structure.

        Produces a single output row whose extracted columns hold one list
        entry per valid response. Non-dict or unparsable items are skipped
        with a warning.

        Raises
        ------
        ValueError
            If no item in the list yields any extracted field.
        """
        output_columns = self._get_output_columns()
        all_extracted = {col: [] for col in output_columns}
        valid_responses = 0

        for i, response in enumerate(raw_output):
            if not isinstance(response, dict):
                logger.warning(
                    f"List item {i} in column '{input_column}' is not a dict"
                )
                continue

            try:
                extracted = self._extract_fields_from_response(response)
                valid_responses += 1
                # NOTE(review): items missing a requested field contribute no
                # entry for that column, so per-column lists can end up with
                # different lengths — confirm downstream consumers tolerate this.
                for col in output_columns:
                    if col in extracted:
                        all_extracted[col].append(extracted[col])
            except ValueError as e:
                logger.warning(f"Failed to extract fields from list item {i}: {e}")
                continue

        if valid_responses == 0:
            raise ValueError(
                f"No valid responses found in list input for column '{input_column}'"
            )

        # Return single row with lists as values
        return [{**sample, **all_extracted}]

    def _process_list_expand_rows(
        self, sample: dict, raw_output: list, input_column: str
    ) -> list[dict]:
        """Process list input by expanding into individual rows.

        Each valid response becomes its own output row (original sample columns
        duplicated). Non-dict or unparsable items are skipped with a warning.

        Raises
        ------
        ValueError
            If no item in the list yields any extracted field.
        """
        all_results = []

        for i, response in enumerate(raw_output):
            if not isinstance(response, dict):
                logger.warning(
                    f"List item {i} in column '{input_column}' is not a dict"
                )
                continue

            try:
                extracted = self._extract_fields_from_response(response)
                # Create a row for this response
                result_row = {**sample, **extracted}
                all_results.append(result_row)
            except ValueError as e:
                logger.warning(f"Failed to extract fields from list item {i}: {e}")
                continue

        if not all_results:
            raise ValueError(
                f"No valid responses found in list input for column '{input_column}'"
            )

        return all_results

    def _process_single_input(self, sample: dict, raw_output: dict) -> list[dict]:
        """Process single response object."""
        # _extract_fields_from_response now raises ValueError if no fields found
        extracted = self._extract_fields_from_response(raw_output)
        return [{**sample, **extracted}]

    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
        # Map every input row through _generate; the output row count may
        # differ from the input (rows can be dropped or fanned out).
        logger.debug(f"Extracting fields from {len(samples)} samples")
        if len(samples) == 0:
            logger.warning("No samples to process, returning empty dataset")
            return Dataset.from_list([])

        new_data = []
        for sample in samples:
            new_data.extend(self._generate(sample))
        return Dataset.from_list(new_data)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# SPDX-License-Identifier: Apache-2.0
|
2
|
-
"""Text parser block for parsing and post-processing
|
2
|
+
"""Text parser block for parsing and post-processing text content.
|
3
3
|
|
4
|
-
This module provides the TextParserBlock for handling
|
4
|
+
This module provides the TextParserBlock for handling text parsing using
|
5
5
|
start/end tags, custom regex patterns, and cleanup operations.
|
6
6
|
"""
|
7
7
|
|
@@ -24,20 +24,21 @@ logger = setup_logger(__name__)
|
|
24
24
|
@BlockRegistry.register(
|
25
25
|
"TextParserBlock",
|
26
26
|
"llm",
|
27
|
-
"Parses and post-processes
|
27
|
+
"Parses and post-processes text content using tags or regex patterns",
|
28
28
|
)
|
29
29
|
class TextParserBlock(BaseBlock):
|
30
|
-
"""Block for parsing and post-processing
|
30
|
+
"""Block for parsing and post-processing text content.
|
31
31
|
|
32
|
-
This block handles
|
33
|
-
and cleanup operations. It expects exactly one input column containing
|
32
|
+
This block handles text parsing using start/end tags, custom regex patterns,
|
33
|
+
and cleanup operations. It expects exactly one input column containing text content
|
34
|
+
as either a string or a list of strings.
|
34
35
|
|
35
36
|
Attributes
|
36
37
|
----------
|
37
38
|
block_name : str
|
38
39
|
Unique identifier for this block instance.
|
39
40
|
input_cols : Union[str, List[str], Dict[str, Any], None]
|
40
|
-
Input column name(s) containing
|
41
|
+
Input column name(s) containing text content (str or List[str]). Must specify exactly one column.
|
41
42
|
output_cols : Union[str, List[str], Dict[str, Any], None]
|
42
43
|
Output column name(s) for parsed results.
|
43
44
|
start_tags : List[str]
|
@@ -51,10 +52,6 @@ class TextParserBlock(BaseBlock):
|
|
51
52
|
expand_lists : bool
|
52
53
|
Whether to expand list inputs into individual rows (True) or preserve lists (False).
|
53
54
|
Default is True for backward compatibility.
|
54
|
-
save_reasoning_content : bool
|
55
|
-
Whether to save the reasoning content to the output.
|
56
|
-
reasoning_content_field : Optional[str]
|
57
|
-
The field name of the reasoning content to save to the output.
|
58
55
|
"""
|
59
56
|
|
60
57
|
start_tags: list[str] = Field(
|
@@ -69,18 +66,6 @@ class TextParserBlock(BaseBlock):
|
|
69
66
|
parser_cleanup_tags: Optional[list[str]] = Field(
|
70
67
|
default=None, description="List of tags to clean from parsed output"
|
71
68
|
)
|
72
|
-
expand_lists: bool = Field(
|
73
|
-
default=True,
|
74
|
-
description="Whether to expand list inputs into individual rows (True) or preserve lists (False). ",
|
75
|
-
)
|
76
|
-
save_reasoning_content: bool = Field(
|
77
|
-
default=False,
|
78
|
-
description="Whether to save the reasoning content to the output.",
|
79
|
-
)
|
80
|
-
reasoning_content_field: Optional[str] = Field(
|
81
|
-
default="reasoning_content",
|
82
|
-
description="The field name of the reasoning content to save to the output.",
|
83
|
-
)
|
84
69
|
|
85
70
|
@field_validator("start_tags", "end_tags", mode="before")
|
86
71
|
@classmethod
|
@@ -246,147 +231,67 @@ class TextParserBlock(BaseBlock):
|
|
246
231
|
value = value.replace(clean_tag, "")
|
247
232
|
return value
|
248
233
|
|
249
|
-
def _handle_message(self, sample: dict) -> dict[str, list[str]]:
|
250
|
-
if "content" not in sample:
|
251
|
-
logger.warning(f"Content not found in sample: {sample}")
|
252
|
-
return {}
|
253
|
-
parsed_output = self._parse(sample["content"])
|
254
|
-
if self.save_reasoning_content:
|
255
|
-
parsed_output[self.reasoning_content_field] = [
|
256
|
-
self._get_reasoning_content(sample)
|
257
|
-
]
|
258
|
-
return parsed_output
|
259
|
-
|
260
|
-
def _get_reasoning_content(self, sample: dict) -> str:
|
261
|
-
if self.save_reasoning_content:
|
262
|
-
if self.reasoning_content_field in sample:
|
263
|
-
return sample[self.reasoning_content_field]
|
264
|
-
else:
|
265
|
-
logger.warning(
|
266
|
-
f"Reasoning content field '{self.reasoning_content_field}' not found in response"
|
267
|
-
)
|
268
|
-
return ""
|
269
|
-
|
270
234
|
def _generate(self, sample: dict) -> list[dict]:
|
271
235
|
input_column = self.input_cols[0]
|
272
236
|
raw_output = sample[input_column]
|
273
237
|
|
274
|
-
# Handle list inputs (e.g.,
|
238
|
+
# Handle list inputs (e.g., multiple text strings to process)
|
275
239
|
if isinstance(raw_output, list):
|
276
240
|
if not raw_output:
|
277
241
|
logger.warning(f"Input column '{input_column}' contains empty list")
|
278
242
|
return []
|
279
243
|
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
self.block_name + "_" + self.reasoning_content_field
|
315
|
-
not in all_parsed_outputs
|
316
|
-
):
|
317
|
-
all_parsed_outputs[
|
318
|
-
self.block_name + "_" + self.reasoning_content_field
|
319
|
-
] = []
|
320
|
-
all_parsed_outputs[
|
321
|
-
self.block_name + "_" + self.reasoning_content_field
|
322
|
-
].extend(reasoning_content)
|
323
|
-
|
324
|
-
if valid_responses == 0:
|
325
|
-
return []
|
326
|
-
|
327
|
-
# Return single row with lists as values
|
328
|
-
return [{**sample, **all_parsed_outputs}]
|
329
|
-
|
330
|
-
else:
|
331
|
-
# When expand_lists=True, use existing expanding behavior
|
332
|
-
all_results = []
|
333
|
-
for i, message in enumerate(raw_output):
|
334
|
-
if not message:
|
335
|
-
logger.warning(
|
336
|
-
f"List item {i} in column '{input_column}' is empty"
|
337
|
-
)
|
338
|
-
continue
|
339
|
-
|
340
|
-
parsed_outputs = self._handle_message(message)
|
341
|
-
if self.save_reasoning_content:
|
342
|
-
reasoning_content = parsed_outputs.pop(
|
343
|
-
self.reasoning_content_field
|
344
|
-
)
|
345
|
-
|
346
|
-
if not parsed_outputs or not any(
|
347
|
-
len(value) > 0 for value in parsed_outputs.values()
|
348
|
-
):
|
349
|
-
logger.warning(
|
350
|
-
f"Failed to parse content from list item {i}. Raw output length: {len(message)}, "
|
351
|
-
f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
|
352
|
-
)
|
353
|
-
continue
|
354
|
-
|
355
|
-
# Create output rows for this response
|
356
|
-
max_length = max(len(value) for value in parsed_outputs.values())
|
357
|
-
for values in zip(
|
358
|
-
*(lst[:max_length] for lst in parsed_outputs.values())
|
359
|
-
):
|
360
|
-
result_row = {
|
361
|
-
**sample,
|
362
|
-
**dict(zip(parsed_outputs.keys(), values)),
|
363
|
-
}
|
364
|
-
if self.save_reasoning_content:
|
365
|
-
result_row[
|
366
|
-
self.block_name + "_" + self.reasoning_content_field
|
367
|
-
] = reasoning_content[0]
|
368
|
-
all_results.append(result_row)
|
369
|
-
|
370
|
-
return all_results
|
371
|
-
|
372
|
-
# Handle dict inputs (existing logic)
|
373
|
-
elif isinstance(raw_output, dict) or isinstance(raw_output, str):
|
374
|
-
if not raw_output:
|
375
|
-
logger.warning(f"Input column '{input_column}' contains empty dict")
|
244
|
+
# Parse each text string in the list and collect results as lists
|
245
|
+
all_parsed_outputs = {col: [] for col in self.output_cols}
|
246
|
+
valid_responses = 0
|
247
|
+
|
248
|
+
for i, message in enumerate(raw_output):
|
249
|
+
# Ensure each item in the list is a string
|
250
|
+
if not isinstance(message, str):
|
251
|
+
logger.warning(
|
252
|
+
f"List item {i} in column '{input_column}' is not a string: {type(message)}. "
|
253
|
+
f"Expected List[str], skipping this item."
|
254
|
+
)
|
255
|
+
continue
|
256
|
+
|
257
|
+
if not message:
|
258
|
+
logger.warning(f"List item {i} in column '{input_column}' is empty")
|
259
|
+
continue
|
260
|
+
|
261
|
+
parsed_outputs = self._parse(message)
|
262
|
+
|
263
|
+
if not parsed_outputs or not any(
|
264
|
+
len(value) > 0 for value in parsed_outputs.values()
|
265
|
+
):
|
266
|
+
logger.warning(
|
267
|
+
f"Failed to parse content from list item {i}. Text length: {len(message)}, "
|
268
|
+
f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
|
269
|
+
)
|
270
|
+
continue
|
271
|
+
|
272
|
+
valid_responses += 1
|
273
|
+
# Collect all parsed values for each column as lists
|
274
|
+
for col in self.output_cols:
|
275
|
+
all_parsed_outputs[col].extend(parsed_outputs.get(col, []))
|
276
|
+
|
277
|
+
if valid_responses == 0:
|
376
278
|
return []
|
377
279
|
|
378
|
-
|
379
|
-
|
280
|
+
# Return single row with lists as values
|
281
|
+
return [{**sample, **all_parsed_outputs}]
|
282
|
+
# Handle string inputs
|
283
|
+
elif isinstance(raw_output, str):
|
284
|
+
if not raw_output:
|
285
|
+
logger.warning(f"Input column '{input_column}' contains empty string")
|
286
|
+
return []
|
380
287
|
|
381
|
-
parsed_outputs = self.
|
382
|
-
if self.save_reasoning_content:
|
383
|
-
reasoning_content = parsed_outputs.pop(self.reasoning_content_field)
|
288
|
+
parsed_outputs = self._parse(raw_output)
|
384
289
|
|
385
290
|
if not parsed_outputs or not any(
|
386
291
|
len(value) > 0 for value in parsed_outputs.values()
|
387
292
|
):
|
388
293
|
logger.warning(
|
389
|
-
f"Failed to parse any content from input.
|
294
|
+
f"Failed to parse any content from input. Text length: {len(raw_output)}, "
|
390
295
|
f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
|
391
296
|
)
|
392
297
|
return []
|
@@ -395,10 +300,6 @@ class TextParserBlock(BaseBlock):
|
|
395
300
|
max_length = max(len(value) for value in parsed_outputs.values())
|
396
301
|
for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())):
|
397
302
|
result_row = {**sample, **dict(zip(parsed_outputs.keys(), values))}
|
398
|
-
if self.save_reasoning_content:
|
399
|
-
result_row[self.block_name + "_" + self.reasoning_content_field] = (
|
400
|
-
reasoning_content[0]
|
401
|
-
)
|
402
303
|
result.append(result_row)
|
403
304
|
|
404
305
|
return result
|
@@ -406,7 +307,7 @@ class TextParserBlock(BaseBlock):
|
|
406
307
|
else:
|
407
308
|
logger.warning(
|
408
309
|
f"Input column '{input_column}' contains invalid data type: {type(raw_output)}. "
|
409
|
-
f"Expected
|
310
|
+
f"Expected str or List[str]"
|
410
311
|
)
|
411
312
|
return []
|
412
313
|
|
sdg_hub/core/flow/__init__.py
CHANGED
@@ -1,20 +1,19 @@
|
|
1
1
|
# SPDX-License-Identifier: Apache-2.0
|
2
2
|
"""New flow implementation for SDG Hub.
|
3
3
|
|
4
|
-
This module provides a redesigned Flow class with metadata support
|
5
|
-
dual initialization modes
|
4
|
+
This module provides a redesigned Flow class with metadata support
|
5
|
+
and dual initialization modes.
|
6
6
|
"""
|
7
7
|
|
8
8
|
# Local
|
9
9
|
from .base import Flow
|
10
|
-
from .metadata import FlowMetadata
|
10
|
+
from .metadata import FlowMetadata
|
11
11
|
from .registry import FlowRegistry
|
12
12
|
from .validation import FlowValidator
|
13
13
|
|
14
14
|
__all__ = [
|
15
15
|
"Flow",
|
16
16
|
"FlowMetadata",
|
17
|
-
"FlowParameter",
|
18
17
|
"FlowRegistry",
|
19
18
|
"FlowValidator",
|
20
19
|
]
|