sdg-hub 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- sdg_hub/_version.py +16 -3
- sdg_hub/core/blocks/deprecated_blocks/selector.py +1 -1
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +175 -416
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +174 -415
- sdg_hub/core/blocks/evaluation/verify_question_block.py +180 -415
- sdg_hub/core/blocks/llm/client_manager.py +92 -43
- sdg_hub/core/blocks/llm/config.py +1 -0
- sdg_hub/core/blocks/llm/llm_chat_block.py +74 -16
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +277 -115
- sdg_hub/core/blocks/llm/text_parser_block.py +88 -23
- sdg_hub/core/blocks/registry.py +48 -34
- sdg_hub/core/blocks/transform/__init__.py +2 -0
- sdg_hub/core/blocks/transform/index_based_mapper.py +1 -1
- sdg_hub/core/blocks/transform/json_structure_block.py +142 -0
- sdg_hub/core/flow/base.py +326 -62
- sdg_hub/core/utils/datautils.py +54 -0
- sdg_hub/core/utils/flow_metrics.py +261 -0
- sdg_hub/core/utils/logger_config.py +50 -9
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +11 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +159 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +65 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +161 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +15 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +21 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +44 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +104 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +61 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +0 -7
- sdg_hub/flows/text_analysis/__init__.py +2 -0
- sdg_hub/flows/text_analysis/structured_insights/__init__.py +6 -0
- sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +27 -0
- sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +38 -0
- sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +21 -0
- sdg_hub/flows/text_analysis/structured_insights/flow.yaml +153 -0
- sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +21 -0
- {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/METADATA +42 -15
- {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/RECORD +44 -22
- {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/WHEEL +0 -0
- {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/top_level.txt +0 -0
sdg_hub/core/blocks/llm/text_parser_block.py
CHANGED
@@ -51,6 +51,10 @@ class TextParserBlock(BaseBlock):
     expand_lists : bool
         Whether to expand list inputs into individual rows (True) or preserve lists (False).
         Default is True for backward compatibility.
+    save_reasoning_content : bool
+        Whether to save the reasoning content to the output.
+    reasoning_content_field : Optional[str]
+        The field name of the reasoning content to save to the output.
     """
 
     start_tags: list[str] = Field(
@@ -69,6 +73,14 @@ class TextParserBlock(BaseBlock):
         default=True,
         description="Whether to expand list inputs into individual rows (True) or preserve lists (False). ",
     )
+    save_reasoning_content: bool = Field(
+        default=False,
+        description="Whether to save the reasoning content to the output.",
+    )
+    reasoning_content_field: Optional[str] = Field(
+        default="reasoning_content",
+        description="The field name of the reasoning content to save to the output.",
+    )
 
     @field_validator("start_tags", "end_tags", mode="before")
     @classmethod
@@ -234,6 +246,27 @@ class TextParserBlock(BaseBlock):
            value = value.replace(clean_tag, "")
        return value
 
+    def _handle_message(self, sample: dict) -> dict[str, list[str]]:
+        if "content" not in sample:
+            logger.warning(f"Content not found in sample: {sample}")
+            return {}
+        parsed_output = self._parse(sample["content"])
+        if self.save_reasoning_content:
+            parsed_output[self.reasoning_content_field] = [
+                self._get_reasoning_content(sample)
+            ]
+        return parsed_output
+
+    def _get_reasoning_content(self, sample: dict) -> str:
+        if self.save_reasoning_content:
+            if self.reasoning_content_field in sample:
+                return sample[self.reasoning_content_field]
+            else:
+                logger.warning(
+                    f"Reasoning content field '{self.reasoning_content_field}' not found in response"
+                )
+        return ""
+
     def _generate(self, sample: dict) -> list[dict]:
         input_column = self.input_cols[0]
         raw_output = sample[input_column]
@@ -250,21 +283,24 @@ class TextParserBlock(BaseBlock):
                 all_parsed_outputs = {col: [] for col in self.output_cols}
                 valid_responses = 0
 
-                for i, response in enumerate(raw_output):
-                    if not response or not isinstance(response, str):
+                for i, message in enumerate(raw_output):
+                    if not message:
                         logger.warning(
-                            f"List item {i} in column '{input_column}' is not a valid string "
-                            f"(empty or non-string): {type(response)}"
+                            f"List item {i} in column '{input_column}' is empty"
                         )
                         continue
 
-                    parsed_outputs = self._parse(response)
+                    parsed_outputs = self._handle_message(message)
+                    if self.save_reasoning_content:
+                        reasoning_content = parsed_outputs.pop(
+                            self.reasoning_content_field
+                        )
 
                     if not parsed_outputs or not any(
                         len(value) > 0 for value in parsed_outputs.values()
                     ):
                         logger.warning(
-                            f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
+                            f"Failed to parse content from list item {i}. Raw output length: {len(message)}, "
                            f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
                        )
                        continue
@@ -273,33 +309,45 @@ class TextParserBlock(BaseBlock):
                    # Collect all parsed values for each column as lists
                    for col in self.output_cols:
                        all_parsed_outputs[col].extend(parsed_outputs.get(col, []))
+                    if self.save_reasoning_content:
+                        if (
+                            self.block_name + "_" + self.reasoning_content_field
+                            not in all_parsed_outputs
+                        ):
+                            all_parsed_outputs[
+                                self.block_name + "_" + self.reasoning_content_field
+                            ] = []
+                        all_parsed_outputs[
+                            self.block_name + "_" + self.reasoning_content_field
+                        ].extend(reasoning_content)
 
                if valid_responses == 0:
                    return []
 
                # Return single row with lists as values
-                # TODO: This breaks retry counting in LLMChatWithParsingRetryBlock until LLMChatWithParsingRetryBlock is re-based
-                # which expects one row per successful parse for counting
                return [{**sample, **all_parsed_outputs}]
 
            else:
                # When expand_lists=True, use existing expanding behavior
                all_results = []
-                for i, response in enumerate(raw_output):
-                    if not response or not isinstance(response, str):
+                for i, message in enumerate(raw_output):
+                    if not message:
                        logger.warning(
-                            f"List item {i} in column '{input_column}' is not a valid string "
-                            f"(empty or non-string): {type(response)}"
+                            f"List item {i} in column '{input_column}' is empty"
                        )
                        continue
 
-                    parsed_outputs = self._parse(response)
+                    parsed_outputs = self._handle_message(message)
+                    if self.save_reasoning_content:
+                        reasoning_content = parsed_outputs.pop(
+                            self.reasoning_content_field
+                        )
 
                    if not parsed_outputs or not any(
                        len(value) > 0 for value in parsed_outputs.values()
                    ):
                        logger.warning(
-                            f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
+                            f"Failed to parse content from list item {i}. Raw output length: {len(message)}, "
                            f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
                        )
                        continue
@@ -309,19 +357,30 @@ class TextParserBlock(BaseBlock):
                    for values in zip(
                        *(lst[:max_length] for lst in parsed_outputs.values())
                    ):
-                        all_results.append(
-                            {**sample, **dict(zip(parsed_outputs.keys(), values))}
-                        )
+                        result_row = {
+                            **sample,
+                            **dict(zip(parsed_outputs.keys(), values)),
+                        }
+                        if self.save_reasoning_content:
+                            result_row[
+                                self.block_name + "_" + self.reasoning_content_field
+                            ] = reasoning_content[0]
+                        all_results.append(result_row)
 
                return all_results
 
-        # Handle string inputs (existing logic)
-        elif isinstance(raw_output, str):
+        # Handle dict inputs (existing logic)
+        elif isinstance(raw_output, dict) or isinstance(raw_output, str):
            if not raw_output:
-                logger.warning(f"Input column '{input_column}' contains empty string")
+                logger.warning(f"Input column '{input_column}' contains empty dict")
                return []
 
-            parsed_outputs = self._parse(raw_output)
+            if isinstance(raw_output, str):
+                raw_output = {"content": raw_output}
+
+            parsed_outputs = self._handle_message(raw_output)
+            if self.save_reasoning_content:
+                reasoning_content = parsed_outputs.pop(self.reasoning_content_field)
 
            if not parsed_outputs or not any(
                len(value) > 0 for value in parsed_outputs.values()
@@ -335,13 +394,19 @@ class TextParserBlock(BaseBlock):
            result = []
            max_length = max(len(value) for value in parsed_outputs.values())
            for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())):
-                result.append({**sample, **dict(zip(parsed_outputs.keys(), values))})
+                result_row = {**sample, **dict(zip(parsed_outputs.keys(), values))}
+                if self.save_reasoning_content:
+                    result_row[self.block_name + "_" + self.reasoning_content_field] = (
+                        reasoning_content[0]
+                    )
+                result.append(result_row)
+
            return result
 
        else:
            logger.warning(
                f"Input column '{input_column}' contains invalid data type: {type(raw_output)}. "
-                f"Expected string or List[str]"
+                f"Expected dict or List[dict]"
            )
            return []
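The net effect of the TextParserBlock changes: the block now consumes chat-style message dicts (a "content" field plus, for reasoning models, an optional reasoning trace) instead of bare strings, and can carry the trace through to a `<block_name>_<reasoning_content_field>` output column. A minimal sketch of the new options follows; the block configuration and sample payload are illustrative, not taken from the package:

    from sdg_hub.core.blocks.llm.text_parser_block import TextParserBlock

    # Hypothetical parser: extract <answer>...</answer> spans and keep the
    # model's reasoning trace alongside the parsed answer.
    parser = TextParserBlock(
        block_name="qa_parser",
        input_cols=["raw_response"],
        output_cols=["answer"],
        start_tags=["<answer>"],
        end_tags=["</answer>"],
        save_reasoning_content=True,  # new in 0.3.0, defaults to False
        reasoning_content_field="reasoning_content",
    )

    # Message dicts are now the expected payload; plain strings are wrapped
    # as {"content": ...} for backward compatibility.
    sample = {
        "raw_response": {
            "content": "<answer>Paris</answer>",
            "reasoning_content": "The user asks for the capital of France.",
        }
    }
    rows = parser._generate(sample)
    # rows[0]["answer"] == "Paris"
    # rows[0]["qa_parser_reasoning_content"] == "The user asks for the capital of France."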
sdg_hub/core/blocks/registry.py
CHANGED
@@ -164,8 +164,10 @@ class BlockRegistry:
        ) from exc
 
    @classmethod
-    def get(cls, block_name: str) -> type:
-        """Get a block class with enhanced error handling.
+    def _get(cls, block_name: str) -> type:
+        """Internal method to get a block class with enhanced error handling.
+
+        This is a private method used by the framework internals (Flow system).
 
        Parameters
        ----------
@@ -216,29 +218,6 @@ class BlockRegistry:
 
        return metadata.block_class
 
-    @classmethod
-    def info(cls, block_name: str) -> BlockMetadata:
-        """Get metadata for a specific block.
-
-        Parameters
-        ----------
-        block_name : str
-            Name of the block.
-
-        Returns
-        -------
-        BlockMetadata
-            The block's metadata.
-
-        Raises
-        ------
-        KeyError
-            If the block is not found.
-        """
-        if block_name not in cls._metadata:
-            raise KeyError(f"Block '{block_name}' not found in registry.")
-        return cls._metadata[block_name]
-
    @classmethod
    def categories(cls) -> list[str]:
        """Get all available categories.
@@ -251,8 +230,8 @@ class BlockRegistry:
        return sorted(cls._categories.keys())
 
    @classmethod
-    def …
-        """Get all blocks in a specific category.
+    def _get_category_blocks(cls, category: str) -> list[str]:
+        """Get all blocks in a specific category (private method).
 
        Parameters
        ----------
@@ -278,17 +257,52 @@ class BlockRegistry:
        return sorted(cls._categories[category])
 
    @classmethod
-    def …
-        …
+    def list_blocks(
+        cls,
+        category: Optional[str] = None,
+        *,
+        grouped: bool = False,
+        include_deprecated: bool = True,
+    ) -> list[str] | dict[str, list[str]]:
+        """
+        List registered blocks, optionally filtered by category.
+
+        Args:
+            category: If provided, return only blocks in this category.
+            grouped: If True (and category is None), return a dict
+                mapping categories to lists of blocks.
+            include_deprecated: If True, return deprecated blocks.
 
        Returns
        -------
-        Dict[str, List[str]]
-            …
+        List[str] | Dict[str, List[str]]
+            If grouped is False, returns a list of block names.
+            If grouped is True, returns a dict mapping categories to lists of block names.
        """
-        …
-        …
-        …
+
+        def filter_deprecated(block_names: list[str]) -> list[str]:
+            if include_deprecated:
+                return block_names
+            return [name for name in block_names if not cls._metadata[name].deprecated]
+
+        if category:
+            block_names = cls._get_category_blocks(category)
+            return filter_deprecated(block_names)
+
+        if grouped:
+            result = {}
+            for cat, blocks in cls._categories.items():
+                filtered = filter_deprecated(sorted(blocks))
+                if filtered:
+                    result[cat] = filtered
+            return result
+
+        # Flat list of all block names (across all categories)
+        all_block_names = []
+        for blocks in cls._categories.values():
+            all_block_names.extend(blocks)
+        filtered = filter_deprecated(sorted(all_block_names))
+        return filtered
 
    @classmethod
    def discover_blocks(cls) -> None:
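Taken together, the registry changes shrink the public surface: `info` is removed, class lookup and per-category lookup become private (`_get`, `_get_category_blocks`), and `list_blocks` is the single public query method. A short sketch of the new call patterns (the block names in the comment are illustrative):

    from sdg_hub.core.blocks.registry import BlockRegistry

    # Flat list of every registered block name
    all_blocks = BlockRegistry.list_blocks()

    # Only blocks in one category
    transform_blocks = BlockRegistry.list_blocks("transform")

    # Grouped by category, with deprecated blocks filtered out
    by_category = BlockRegistry.list_blocks(grouped=True, include_deprecated=False)
    # e.g. {"transform": ["DuplicateColumnsBlock", "JSONStructureBlock", ...], ...}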
sdg_hub/core/blocks/transform/__init__.py
CHANGED
@@ -8,6 +8,7 @@ wide-to-long transformations, value selection, and majority value assignment.
 # Local
 from .duplicate_columns import DuplicateColumnsBlock
 from .index_based_mapper import IndexBasedMapperBlock
+from .json_structure_block import JSONStructureBlock
 from .melt_columns import MeltColumnsBlock
 from .rename_columns import RenameColumnsBlock
 from .text_concat import TextConcatBlock
@@ -16,6 +17,7 @@ from .uniform_col_val_setter import UniformColumnValueSetter
 __all__ = [
     "TextConcatBlock",
     "DuplicateColumnsBlock",
+    "JSONStructureBlock",
     "MeltColumnsBlock",
     "IndexBasedMapperBlock",
     "RenameColumnsBlock",
sdg_hub/core/blocks/transform/index_based_mapper.py
CHANGED
@@ -174,7 +174,7 @@ class IndexBasedMapperBlock(BaseBlock):
            sample[output_col] = sample[source_col]
        return sample
 
-    def generate(self, samples: Dataset) -> Dataset:
+    def generate(self, samples: Dataset, **kwargs) -> Dataset:
        """Generate a new dataset with selected values.
 
        Parameters
sdg_hub/core/blocks/transform/json_structure_block.py
ADDED
@@ -0,0 +1,142 @@
+# SPDX-License-Identifier: Apache-2.0
+"""JSON structure block for combining multiple columns into a structured JSON object.
+
+This module provides a block for combining multiple columns into a single column
+containing a structured JSON object with specified field names.
+"""
+
+# Standard
+from typing import Any, Dict
+import json
+
+# Third Party
+from datasets import Dataset
+from pydantic import Field, field_validator
+
+# Local
+from ...utils.logger_config import setup_logger
+from ..base import BaseBlock
+from ..registry import BlockRegistry
+
+logger = setup_logger(__name__)
+
+
+@BlockRegistry.register(
+    "JSONStructureBlock",
+    "transform",
+    "Combines multiple columns into a single column containing a structured JSON object",
+)
+class JSONStructureBlock(BaseBlock):
+    """Block for combining multiple columns into a structured JSON object.
+
+    This block takes values from multiple input columns and combines them into a single
+    output column containing a JSON object. The JSON field names match the input column names.
+
+    Attributes
+    ----------
+    block_name : str
+        Name of the block.
+    input_cols : List[str]
+        List of input column names to include in the JSON object.
+        Column names become the JSON field names.
+    output_cols : List[str]
+        List containing the single output column name.
+    ensure_json_serializable : bool
+        Whether to ensure all values are JSON serializable (default True).
+    pretty_print : bool
+        Whether to format JSON with indentation (default False).
+    """
+
+    ensure_json_serializable: bool = Field(
+        default=True, description="Whether to ensure all values are JSON serializable"
+    )
+    pretty_print: bool = Field(
+        default=False, description="Whether to format JSON with indentation"
+    )
+
+    @field_validator("output_cols", mode="after")
+    @classmethod
+    def validate_output_cols(cls, v):
+        """Validate that exactly one output column is specified."""
+        if not v or len(v) != 1:
+            raise ValueError("JSONStructureBlock requires exactly one output column")
+        return v
+
+    def _make_json_serializable(self, value: Any) -> Any:
+        """Convert value to JSON serializable format."""
+        if value is None:
+            return None
+
+        # Handle basic types that are already JSON serializable
+        if isinstance(value, (str, int, float, bool)):
+            return value
+
+        # Handle lists
+        if isinstance(value, (list, tuple)):
+            return [self._make_json_serializable(item) for item in value]
+
+        # Handle dictionaries
+        if isinstance(value, dict):
+            return {k: self._make_json_serializable(v) for k, v in value.items()}
+
+        # Convert other types to string
+        return str(value)
+
+    def _get_field_mapping(self) -> Dict[str, str]:
+        """Get the mapping of JSON field names to input column names."""
+        # Use column names as JSON field names (standard SDG Hub pattern)
+        if isinstance(self.input_cols, list):
+            return {col: col for col in self.input_cols}
+
+        raise ValueError("input_cols must be a list of column names")
+
+    def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
+        """Generate a dataset with JSON structured output.
+
+        Parameters
+        ----------
+        samples : Dataset
+            Input dataset to process.
+
+        Returns
+        -------
+        Dataset
+            Dataset with JSON structured output in the specified column.
+        """
+        if not self.output_cols:
+            raise ValueError("output_cols must be specified")
+
+        output_col = self.output_cols[0]
+        field_mapping = self._get_field_mapping()
+
+        def _create_json_structure(sample):
+            """Create JSON structure from input columns."""
+            json_obj = {}
+
+            # Build the JSON object using the field mapping
+            for json_field, col_name in field_mapping.items():
+                if col_name not in sample:
+                    logger.warning(f"Input column '{col_name}' not found in sample")
+                    json_obj[json_field] = None
+                else:
+                    value = sample[col_name]
+                    if self.ensure_json_serializable:
+                        value = self._make_json_serializable(value)
+                    json_obj[json_field] = value
+
+            # Convert to JSON string
+            try:
+                if self.pretty_print:
+                    json_string = json.dumps(json_obj, indent=2, ensure_ascii=False)
+                else:
+                    json_string = json.dumps(json_obj, ensure_ascii=False)
+                sample[output_col] = json_string
+            except (TypeError, ValueError) as e:
+                logger.error(f"Failed to serialize JSON object: {e}")
+                sample[output_col] = "{}"
+
+            return sample
+
+        # Apply the JSON structuring to all samples
+        result = samples.map(_create_json_structure)
+        return result
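Since json_structure_block.py is entirely new in this release, a usage sketch may help; the column names and data below are invented for illustration, assuming the standard BaseBlock constructor fields (block_name, input_cols, output_cols) used elsewhere in the package:

    from datasets import Dataset
    from sdg_hub.core.blocks.transform import JSONStructureBlock

    # Combine three analysis columns into one JSON-encoded "insights" column.
    block = JSONStructureBlock(
        block_name="build_insights",
        input_cols=["summary", "keywords", "sentiment"],
        output_cols=["insights"],
    )

    ds = Dataset.from_list([
        {
            "summary": "Quarterly revenue grew 12%.",
            "keywords": ["revenue", "growth"],
            "sentiment": "positive",
        }
    ])
    result = block.generate(ds)
    print(result[0]["insights"])
    # {"summary": "Quarterly revenue grew 12%.", "keywords": ["revenue", "growth"], "sentiment": "positive"}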