sdg-hub 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/core/blocks/llm/client_manager.py +63 -26
  3. sdg_hub/core/blocks/llm/llm_chat_block.py +12 -9
  4. sdg_hub/core/blocks/llm/text_parser_block.py +88 -21
  5. sdg_hub/core/blocks/transform/__init__.py +2 -0
  6. sdg_hub/core/blocks/transform/json_structure_block.py +142 -0
  7. sdg_hub/core/flow/base.py +199 -56
  8. sdg_hub/core/utils/datautils.py +45 -2
  9. sdg_hub/core/utils/flow_metrics.py +261 -0
  10. sdg_hub/core/utils/logger_config.py +50 -9
  11. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
  12. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
  13. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +11 -0
  14. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +159 -0
  15. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
  16. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +65 -0
  17. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +161 -0
  18. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +15 -0
  19. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +21 -0
  20. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +44 -0
  21. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
  22. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +104 -0
  23. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +61 -0
  24. sdg_hub/flows/text_analysis/__init__.py +2 -0
  25. sdg_hub/flows/text_analysis/structured_insights/__init__.py +6 -0
  26. sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +27 -0
  27. sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +38 -0
  28. sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +21 -0
  29. sdg_hub/flows/text_analysis/structured_insights/flow.yaml +153 -0
  30. sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +21 -0
  31. {sdg_hub-0.2.2.dist-info → sdg_hub-0.3.1.dist-info}/METADATA +3 -1
  32. {sdg_hub-0.2.2.dist-info → sdg_hub-0.3.1.dist-info}/RECORD +35 -13
  33. {sdg_hub-0.2.2.dist-info → sdg_hub-0.3.1.dist-info}/WHEEL +0 -0
  34. {sdg_hub-0.2.2.dist-info → sdg_hub-0.3.1.dist-info}/licenses/LICENSE +0 -0
  35. {sdg_hub-0.2.2.dist-info → sdg_hub-0.3.1.dist-info}/top_level.txt +0 -0
sdg_hub/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID

- __version__ = version = '0.2.2'
- __version_tuple__ = version_tuple = (0, 2, 2)
+ __version__ = version = '0.3.1'
+ __version_tuple__ = version_tuple = (0, 3, 1)

  __commit_id__ = commit_id = None
sdg_hub/core/blocks/llm/client_manager.py CHANGED
@@ -107,9 +107,18 @@ class LLMClientManager:
                  f"Could not validate setup for model '{self.config.model}': {e}"
              )

+     def _message_to_dict(self, message: Any) -> dict[str, Any]:
+         """Convert a message to a dict."""
+         if hasattr(message, "to_dict"):
+             return message.to_dict()
+         elif hasattr(message, "__dict__"):
+             return message.__dict__
+         else:
+             return dict(message)
+
      def create_completion(
          self, messages: list[dict[str, Any]], **overrides: Any
-     ) -> Union[str, list[str]]:
+     ) -> Union[dict, list[dict]]:
          """Create a completion using LiteLLM.

          Parameters
@@ -121,9 +130,9 @@ class LLMClientManager:

          Returns
          -------
-         Union[str, List[str]]
-             The completion text(s). Returns a single string when n=1 or n is None,
-             returns a list of strings when n>1.
+         Union[dict, List[dict]]
+             The completion response(s). Returns a single response when n=1 or n is None,
+             returns a list of responses when n>1. Response dicts contain 'content' and may contain 'reasoning_content'.

          Raises
          ------
@@ -151,28 +160,30 @@ class LLMClientManager:
          # Make the completion call
          response = completion_func(kwargs)

-         # Extract content from response
+         # Extract message objects from response
          # Check if n > 1 to determine return type
          n_value = final_config.n or 1
          if n_value > 1:
-             return [choice.message.content for choice in response.choices]
+             return [
+                 self._message_to_dict(choice.message) for choice in response.choices
+             ]
          else:
-             return response.choices[0].message.content
+             return self._message_to_dict(response.choices[0].message)

      async def acreate_completion(
          self,
          messages: Union[list[dict[str, Any]], list[list[dict[str, Any]]]],
          max_concurrency: Optional[int] = None,
          **overrides: Any,
-     ) -> Union[str, list[str], list[Union[str, list[str]]]]:
+     ) -> Union[dict, list[dict]] | list[Union[dict, list[dict]]]:
          """Create async completion(s) using LiteLLM with optional concurrency control.

          Parameters
          ----------
          messages : Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]
              Single message list or list of message lists.
-             - For single: List[Dict[str, Any]] - returns Union[str, List[str]]
-             - For multiple: List[List[Dict[str, Any]]] - returns List[Union[str, List[str]]]
+             - For single: List[Dict[str, Any]] - returns Union[Any, List[Any]]
+             - For multiple: List[List[Dict[str, Any]]] - returns List[Union[Any, List[Any]]]
          max_concurrency : Optional[int], optional
              Maximum number of concurrent requests when processing multiple messages.
              If None, all requests run concurrently.
@@ -181,9 +192,9 @@ class LLMClientManager:

          Returns
          -------
-         Union[str, List[str], List[Union[str, List[str]]]]
-             For single message: completion text (string when n=1, list when n>1)
-             For multiple messages: list of completion texts (each element can be str or List[str])
+         Union[dict, List[dict], List[Union[dict, List[dict]]]]
+             For single message: completion response (dict when n=1, List[dict] when n>1)
+             For multiple messages: list of completion responses (each element can be dict or List[dict])

          Raises
          ------
@@ -203,8 +214,33 @@ class LLMClientManager:
              messages_list = messages

          if max_concurrency is not None:
+             if max_concurrency < 1:
+                 raise ValueError(
+                     "max_concurrency must be greater than 0, got {max_concurrency}"
+                 )
+             # Adjust concurrency based on n parameter to avoid overwhelming API
+             # when n > 1 (multiple completions per request)
+             n_value = overrides.get("n") or self.config.n or 1
+             if n_value > 1:
+                 # Warn if max_concurrency is less than n
+                 if max_concurrency < n_value:
+                     logger.warning(
+                         f"max_concurrency ({max_concurrency}) is less than n ({n_value}). "
+                         f"This may result in very low concurrency. Consider increasing max_concurrency "
+                         f"or reducing n for better performance."
+                     )
+
+                 # Reduce concurrency when generating multiple completions per request
+                 adjusted_concurrency = max(1, max_concurrency // n_value)
+                 logger.debug(
+                     f"Adjusted max_concurrency from {max_concurrency} to {adjusted_concurrency} "
+                     f"for n={n_value} completions per request"
+                 )
+             else:
+                 adjusted_concurrency = max_concurrency
+
              # Use semaphore for concurrency control
-             semaphore = asyncio.Semaphore(max_concurrency)
+             semaphore = asyncio.Semaphore(adjusted_concurrency)

              async def _create_with_semaphore(msgs):
                  async with semaphore:
@@ -221,7 +257,7 @@ class LLMClientManager:

      async def _acreate_single(
          self, messages: list[dict[str, Any]], **overrides: Any
-     ) -> Union[str, list[str]]:
+     ) -> Union[dict, list[dict]]:
          """Create a single async completion using LiteLLM.

          Parameters
@@ -233,10 +269,9 @@ class LLMClientManager:

          Returns
          -------
-         Union[str, List[str]]
-             The completion text(s). Returns a single string when n=1 or n is None,
-             returns a list of strings when n>1.
-
+         Union[dict, List[dict]]
+             List of completion message objects. Each element is a dict when n=1 or n is None,
+             or a list of dicts when n>1. Message dicts contain 'content' and may contain 'reasoning_content'.
          Raises
          ------
          Exception
@@ -263,17 +298,19 @@ class LLMClientManager:
          # Make the async completion call
          response = await completion_func(kwargs)

-         # Extract content from response
+         # Extract message objects from response
          # Check if n > 1 to determine return type
          n_value = final_config.n or 1
          if n_value > 1:
-             return [choice.message.content for choice in response.choices]
+             return [
+                 self._message_to_dict(choice.message) for choice in response.choices
+             ]
          else:
-             return response.choices[0].message.content
+             return self._message_to_dict(response.choices[0].message)

      def create_completions_batch(
          self, messages_list: list[list[dict[str, Any]]], **overrides: Any
-     ) -> list[Union[str, list[str]]]:
+     ) -> list[Union[dict, list[dict]]]:
          """Create multiple completions in batch.

          Parameters
@@ -285,9 +322,9 @@ class LLMClientManager:

          Returns
          -------
-         List[Union[str, List[str]]]
-             List of completion texts. Each element is a single string when n=1 or n is None,
-             or a list of strings when n>1.
+         List[dict] | List[List[dict]]
+             List of completion responses. Each element is a dict when n=1 or n is None,
+             or a list of dicts when n>1. Response dicts contain 'content' and may contain 'reasoning_content'.
          """
          results = []
          for messages in messages_list:
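
Note (illustrative, not part of the diff): a minimal sketch of what the reworked LLMClientManager now hands back and how it throttles concurrency, based on the hunks above. The sample response values are hypothetical.

    # Before 0.3.1, create_completion returned plain strings such as "The answer is 42.".
    # From 0.3.1 it returns message dicts; 'reasoning_content' only appears when the
    # provider actually returns it.
    response = {
        "content": "The answer is 42.",
        "reasoning_content": "First, recall that ...",  # optional field
    }
    print(response["content"])

    # Concurrency adjustment applied in acreate_completion when n > 1:
    max_concurrency, n = 16, 4
    adjusted_concurrency = max(1, max_concurrency // n)  # 4 concurrent requests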
sdg_hub/core/blocks/llm/llm_chat_block.py CHANGED
@@ -42,9 +42,10 @@ class LLMChatBlock(BaseBlock):
          Name of the block.
      input_cols : Union[str, List[str]]
          Input column name(s). Should contain the messages list.
-     output_cols : Union[str, List[str]]
+     output_cols : Union[dict, List[dict]]
          Output column name(s) for the response. When n > 1, the column will contain
-         a list of responses instead of a single string.
+         a list of responses instead of a single response. Responses contain 'content',
+         may contain 'reasoning_content' and other fields if any.
      model : str
          Model identifier in LiteLLM format. Examples:
          - "openai/gpt-4"
@@ -131,7 +132,7 @@ class LLMChatBlock(BaseBlock):
      >>> block = LLMChatBlock(
      ...     block_name="gpt4_multiple",
      ...     input_cols="messages",
-     ...     output_cols="responses", # Will contain lists of strings
+     ...     output_cols="responses", # Will contain lists of responses
      ...     model="openai/gpt-4",
      ...     n=3, # Generate 3 responses per input
      ...     temperature=0.8
@@ -406,7 +407,7 @@ class LLMChatBlock(BaseBlock):
          self,
          messages_list: list[list[dict[str, Any]]],
          **override_kwargs: dict[str, Any],
-     ) -> list[Union[str, list[str]]]:
+     ) -> list[Union[dict, list[dict]]]:
          """Generate responses synchronously.

          Parameters
@@ -418,8 +419,9 @@ class LLMChatBlock(BaseBlock):

          Returns
          -------
-         List[Union[str, List[str]]]
-             List of response strings or lists of response strings (when n > 1).
+         List[Union[dict, List[dict]]]
+             List of responses. Each element is a dict when n=1 or n is None,
+             or a list of dicts when n>1. Response dicts contain 'content', may contain 'reasoning_content' and other fields if any.
          """
          responses = []

@@ -461,7 +463,7 @@ class LLMChatBlock(BaseBlock):
          messages_list: list[list[dict[str, Any]]],
          flow_max_concurrency: Optional[int] = None,
          **override_kwargs: dict[str, Any],
-     ) -> list[Union[str, list[str]]]:
+     ) -> list[Union[dict, list[dict]]]:
          """Generate responses asynchronously.

          Parameters
@@ -475,8 +477,9 @@ class LLMChatBlock(BaseBlock):

          Returns
          -------
-         List[Union[str, List[str]]]
-             List of response strings or lists of response strings (when n > 1).
+         List[Union[dict, List[dict]]]
+             List of responses. Each element is a dict when n=1 or n is None,
+             or a list of dicts when n>1. Response dicts contain 'content', may contain 'reasoning_content' and other fields if any.
          """
          try:
              # Use unified client manager method with optional concurrency control
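
Note (illustrative): the doctest above already shows the n > 1 configuration; the sketch below assumes LLMChatBlock is importable from sdg_hub.core.blocks.llm and shows the new shape of the output column (hypothetical values).

    from sdg_hub.core.blocks.llm import LLMChatBlock  # import path assumed from the file layout

    block = LLMChatBlock(
        block_name="gpt4_multiple",
        input_cols="messages",
        output_cols="responses",
        model="openai/gpt-4",
        n=3,
        temperature=0.8,
    )

    # With n=3, each row's "responses" cell now holds a list of message dicts rather
    # than a list of strings, e.g.:
    # [{"content": "..."}, {"content": "...", "reasoning_content": "..."}, {"content": "..."}]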
sdg_hub/core/blocks/llm/text_parser_block.py CHANGED
@@ -51,6 +51,10 @@ class TextParserBlock(BaseBlock):
      expand_lists : bool
          Whether to expand list inputs into individual rows (True) or preserve lists (False).
          Default is True for backward compatibility.
+     save_reasoning_content : bool
+         Whether to save the reasoning content to the output.
+     reasoning_content_field : Optional[str]
+         The field name of the reasoning content to save to the output.
      """

      start_tags: list[str] = Field(
@@ -69,6 +73,14 @@ class TextParserBlock(BaseBlock):
          default=True,
          description="Whether to expand list inputs into individual rows (True) or preserve lists (False). ",
      )
+     save_reasoning_content: bool = Field(
+         default=False,
+         description="Whether to save the reasoning content to the output.",
+     )
+     reasoning_content_field: Optional[str] = Field(
+         default="reasoning_content",
+         description="The field name of the reasoning content to save to the output.",
+     )

      @field_validator("start_tags", "end_tags", mode="before")
      @classmethod
@@ -234,6 +246,27 @@ class TextParserBlock(BaseBlock):
                  value = value.replace(clean_tag, "")
          return value

+     def _handle_message(self, sample: dict) -> dict[str, list[str]]:
+         if "content" not in sample:
+             logger.warning(f"Content not found in sample: {sample}")
+             return {}
+         parsed_output = self._parse(sample["content"])
+         if self.save_reasoning_content:
+             parsed_output[self.reasoning_content_field] = [
+                 self._get_reasoning_content(sample)
+             ]
+         return parsed_output
+
+     def _get_reasoning_content(self, sample: dict) -> str:
+         if self.save_reasoning_content:
+             if self.reasoning_content_field in sample:
+                 return sample[self.reasoning_content_field]
+             else:
+                 logger.warning(
+                     f"Reasoning content field '{self.reasoning_content_field}' not found in response"
+                 )
+         return ""
+
      def _generate(self, sample: dict) -> list[dict]:
          input_column = self.input_cols[0]
          raw_output = sample[input_column]
@@ -250,21 +283,24 @@ class TextParserBlock(BaseBlock):
              all_parsed_outputs = {col: [] for col in self.output_cols}
              valid_responses = 0

-             for i, response in enumerate(raw_output):
-                 if not response or not isinstance(response, str):
+             for i, message in enumerate(raw_output):
+                 if not message:
                      logger.warning(
-                         f"List item {i} in column '{input_column}' contains invalid data "
-                         f"(empty or non-string): {type(response)}"
+                         f"List item {i} in column '{input_column}' is empty"
                      )
                      continue

-                 parsed_outputs = self._parse(response)
+                 parsed_outputs = self._handle_message(message)
+                 if self.save_reasoning_content:
+                     reasoning_content = parsed_outputs.pop(
+                         self.reasoning_content_field
+                     )

                  if not parsed_outputs or not any(
                      len(value) > 0 for value in parsed_outputs.values()
                  ):
                      logger.warning(
-                         f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
+                         f"Failed to parse content from list item {i}. Raw output length: {len(message)}, "
                          f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
                      )
                      continue
@@ -273,6 +309,17 @@ class TextParserBlock(BaseBlock):
                  # Collect all parsed values for each column as lists
                  for col in self.output_cols:
                      all_parsed_outputs[col].extend(parsed_outputs.get(col, []))
+                 if self.save_reasoning_content:
+                     if (
+                         self.block_name + "_" + self.reasoning_content_field
+                         not in all_parsed_outputs
+                     ):
+                         all_parsed_outputs[
+                             self.block_name + "_" + self.reasoning_content_field
+                         ] = []
+                     all_parsed_outputs[
+                         self.block_name + "_" + self.reasoning_content_field
+                     ].extend(reasoning_content)

              if valid_responses == 0:
                  return []
@@ -283,21 +330,24 @@ class TextParserBlock(BaseBlock):
          else:
              # When expand_lists=True, use existing expanding behavior
              all_results = []
-             for i, response in enumerate(raw_output):
-                 if not response or not isinstance(response, str):
+             for i, message in enumerate(raw_output):
+                 if not message:
                      logger.warning(
-                         f"List item {i} in column '{input_column}' contains invalid data "
-                         f"(empty or non-string): {type(response)}"
+                         f"List item {i} in column '{input_column}' is empty"
                      )
                      continue

-                 parsed_outputs = self._parse(response)
+                 parsed_outputs = self._handle_message(message)
+                 if self.save_reasoning_content:
+                     reasoning_content = parsed_outputs.pop(
+                         self.reasoning_content_field
+                     )

                  if not parsed_outputs or not any(
                      len(value) > 0 for value in parsed_outputs.values()
                  ):
                      logger.warning(
-                         f"Failed to parse content from list item {i}. Raw output length: {len(response)}, "
+                         f"Failed to parse content from list item {i}. Raw output length: {len(message)}, "
                          f"parsing method: {'regex' if self.parsing_pattern else 'tags'}"
                      )
                      continue
@@ -307,19 +357,30 @@ class TextParserBlock(BaseBlock):
                  for values in zip(
                      *(lst[:max_length] for lst in parsed_outputs.values())
                  ):
-                     all_results.append(
-                         {**sample, **dict(zip(parsed_outputs.keys(), values))}
-                     )
+                     result_row = {
+                         **sample,
+                         **dict(zip(parsed_outputs.keys(), values)),
+                     }
+                     if self.save_reasoning_content:
+                         result_row[
+                             self.block_name + "_" + self.reasoning_content_field
+                         ] = reasoning_content[0]
+                     all_results.append(result_row)

              return all_results

-         # Handle string inputs (existing logic)
-         elif isinstance(raw_output, str):
+         # Handle dict inputs (existing logic)
+         elif isinstance(raw_output, dict) or isinstance(raw_output, str):
              if not raw_output:
-                 logger.warning(f"Input column '{input_column}' contains empty string")
+                 logger.warning(f"Input column '{input_column}' contains empty dict")
                  return []

-             parsed_outputs = self._parse(raw_output)
+             if isinstance(raw_output, str):
+                 raw_output = {"content": raw_output}
+
+             parsed_outputs = self._handle_message(raw_output)
+             if self.save_reasoning_content:
+                 reasoning_content = parsed_outputs.pop(self.reasoning_content_field)

              if not parsed_outputs or not any(
                  len(value) > 0 for value in parsed_outputs.values()
@@ -333,13 +394,19 @@ class TextParserBlock(BaseBlock):
              result = []
              max_length = max(len(value) for value in parsed_outputs.values())
              for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())):
-                 result.append({**sample, **dict(zip(parsed_outputs.keys(), values))})
+                 result_row = {**sample, **dict(zip(parsed_outputs.keys(), values))}
+                 if self.save_reasoning_content:
+                     result_row[self.block_name + "_" + self.reasoning_content_field] = (
+                         reasoning_content[0]
+                     )
+                 result.append(result_row)
+
              return result

          else:
              logger.warning(
                  f"Input column '{input_column}' contains invalid data type: {type(raw_output)}. "
-                 f"Expected str or List[str]"
+                 f"Expected dict or List[dict]"
              )
              return []
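
Note (illustrative): a sketch of how the new reasoning-content options are wired together, based on the hunks above. The import path, tag values, and column names are assumptions.

    from sdg_hub.core.blocks.llm import TextParserBlock  # import path assumed

    parser = TextParserBlock(
        block_name="qa_parser",
        input_cols="raw_response",        # column holding the LLM message dict(s)
        output_cols=["answer"],
        start_tags=["<answer>"],
        end_tags=["</answer>"],
        save_reasoning_content=True,      # new in 0.3.1
        reasoning_content_field="reasoning_content",
    )

    # Each parsed row gains an extra column named "<block_name>_<reasoning_content_field>",
    # here "qa_parser_reasoning_content", holding the model's reasoning text (or "" when
    # the field is missing from the response).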
sdg_hub/core/blocks/transform/__init__.py CHANGED
@@ -8,6 +8,7 @@ wide-to-long transformations, value selection, and majority value assignment.
  # Local
  from .duplicate_columns import DuplicateColumnsBlock
  from .index_based_mapper import IndexBasedMapperBlock
+ from .json_structure_block import JSONStructureBlock
  from .melt_columns import MeltColumnsBlock
  from .rename_columns import RenameColumnsBlock
  from .text_concat import TextConcatBlock
@@ -16,6 +17,7 @@ from .uniform_col_val_setter import UniformColumnValueSetter
  __all__ = [
      "TextConcatBlock",
      "DuplicateColumnsBlock",
+     "JSONStructureBlock",
      "MeltColumnsBlock",
      "IndexBasedMapperBlock",
      "RenameColumnsBlock",
sdg_hub/core/blocks/transform/json_structure_block.py ADDED
@@ -0,0 +1,142 @@
+ # SPDX-License-Identifier: Apache-2.0
+ """JSON structure block for combining multiple columns into a structured JSON object.
+
+ This module provides a block for combining multiple columns into a single column
+ containing a structured JSON object with specified field names.
+ """
+
+ # Standard
+ from typing import Any, Dict
+ import json
+
+ # Third Party
+ from datasets import Dataset
+ from pydantic import Field, field_validator
+
+ # Local
+ from ...utils.logger_config import setup_logger
+ from ..base import BaseBlock
+ from ..registry import BlockRegistry
+
+ logger = setup_logger(__name__)
+
+
+ @BlockRegistry.register(
+     "JSONStructureBlock",
+     "transform",
+     "Combines multiple columns into a single column containing a structured JSON object",
+ )
+ class JSONStructureBlock(BaseBlock):
+     """Block for combining multiple columns into a structured JSON object.
+
+     This block takes values from multiple input columns and combines them into a single
+     output column containing a JSON object. The JSON field names match the input column names.
+
+     Attributes
+     ----------
+     block_name : str
+         Name of the block.
+     input_cols : List[str]
+         List of input column names to include in the JSON object.
+         Column names become the JSON field names.
+     output_cols : List[str]
+         List containing the single output column name.
+     ensure_json_serializable : bool
+         Whether to ensure all values are JSON serializable (default True).
+     pretty_print : bool
+         Whether to format JSON with indentation (default False).
+     """
+
+     ensure_json_serializable: bool = Field(
+         default=True, description="Whether to ensure all values are JSON serializable"
+     )
+     pretty_print: bool = Field(
+         default=False, description="Whether to format JSON with indentation"
+     )
+
+     @field_validator("output_cols", mode="after")
+     @classmethod
+     def validate_output_cols(cls, v):
+         """Validate that exactly one output column is specified."""
+         if not v or len(v) != 1:
+             raise ValueError("JSONStructureBlock requires exactly one output column")
+         return v
+
+     def _make_json_serializable(self, value: Any) -> Any:
+         """Convert value to JSON serializable format."""
+         if value is None:
+             return None
+
+         # Handle basic types that are already JSON serializable
+         if isinstance(value, (str, int, float, bool)):
+             return value
+
+         # Handle lists
+         if isinstance(value, (list, tuple)):
+             return [self._make_json_serializable(item) for item in value]
+
+         # Handle dictionaries
+         if isinstance(value, dict):
+             return {k: self._make_json_serializable(v) for k, v in value.items()}
+
+         # Convert other types to string
+         return str(value)
+
+     def _get_field_mapping(self) -> Dict[str, str]:
+         """Get the mapping of JSON field names to input column names."""
+         # Use column names as JSON field names (standard SDG Hub pattern)
+         if isinstance(self.input_cols, list):
+             return {col: col for col in self.input_cols}
+
+         raise ValueError("input_cols must be a list of column names")
+
+     def generate(self, samples: Dataset, **kwargs: Any) -> Dataset:
+         """Generate a dataset with JSON structured output.
+
+         Parameters
+         ----------
+         samples : Dataset
+             Input dataset to process.
+
+         Returns
+         -------
+         Dataset
+             Dataset with JSON structured output in the specified column.
+         """
+         if not self.output_cols:
+             raise ValueError("output_cols must be specified")
+
+         output_col = self.output_cols[0]
+         field_mapping = self._get_field_mapping()
+
+         def _create_json_structure(sample):
+             """Create JSON structure from input columns."""
+             json_obj = {}
+
+             # Build the JSON object using the field mapping
+             for json_field, col_name in field_mapping.items():
+                 if col_name not in sample:
+                     logger.warning(f"Input column '{col_name}' not found in sample")
+                     json_obj[json_field] = None
+                 else:
+                     value = sample[col_name]
+                     if self.ensure_json_serializable:
+                         value = self._make_json_serializable(value)
+                     json_obj[json_field] = value
+
+             # Convert to JSON string
+             try:
+                 if self.pretty_print:
+                     json_string = json.dumps(json_obj, indent=2, ensure_ascii=False)
+                 else:
+                     json_string = json.dumps(json_obj, ensure_ascii=False)
+                 sample[output_col] = json_string
+             except (TypeError, ValueError) as e:
+                 logger.error(f"Failed to serialize JSON object: {e}")
+                 sample[output_col] = "{}"
+
+             return sample
+
+         # Apply the JSON structuring to all samples
+         result = samples.map(_create_json_structure)
+         return result
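
Note (illustrative): a usage sketch for the new JSONStructureBlock, assuming it is imported via sdg_hub.core.blocks.transform (see the __init__.py hunk above) and that block_name/input_cols/output_cols are accepted as BaseBlock fields. Column names and values are hypothetical.

    from datasets import Dataset
    from sdg_hub.core.blocks.transform import JSONStructureBlock

    block = JSONStructureBlock(
        block_name="insights_json",
        input_cols=["summary", "sentiment", "keywords"],
        output_cols=["structured_insights"],
        pretty_print=False,
    )

    ds = Dataset.from_list(
        [{"summary": "Quarterly revenue grew.", "sentiment": "positive", "keywords": ["revenue", "growth"]}]
    )
    result = block.generate(ds)
    # result[0]["structured_insights"] ->
    # '{"summary": "Quarterly revenue grew.", "sentiment": "positive", "keywords": ["revenue", "growth"]}'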