PyPI - sdg-hub - Versions diffs - 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl - Mend

sdg-hub 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

sdg_hub/_version.py CHANGED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '0.7.2'
-__version_tuple__ = version_tuple = (0, 7, 2)
+__version__ = version = '0.8.0'
+__version_tuple__ = version_tuple = (0, 8, 0)
 __commit_id__ = commit_id = None

sdg_hub/core/__init__.py CHANGED Viewed

@@ -2,14 +2,26 @@
 """Core SDG Hub components."""
 # Local
-from .blocks import BaseBlock, BlockRegistry
+from .blocks import AgentBlock, BaseBlock, BlockRegistry
+from .connectors import (
+    BaseConnector,
+    ConnectorConfig,
+    ConnectorError,
+    ConnectorRegistry,
+)
 from .flow import Flow, FlowMetadata, FlowRegistry, FlowValidator
 from .utils import GenerateError, resolve_path
 __all__ = [
     # Block components
+    "AgentBlock",
     "BaseBlock",
     "BlockRegistry",
+    # Connector components
+    "BaseConnector",
+    "ConnectorConfig",
+    "ConnectorError",
+    "ConnectorRegistry",
     # Flow components
     "Flow",
     "FlowRegistry",

sdg_hub/core/blocks/__init__.py CHANGED Viewed

@@ -4,9 +4,16 @@ This package provides various block implementations for data generation, process
 """
 # Local
+from .agent import AgentBlock
 from .base import BaseBlock
 from .filtering import ColumnValueFilterBlock
-from .llm import LLMChatBlock, LLMParserBlock, PromptBuilderBlock, TextParserBlock
+from .llm import (
+    LLMChatBlock,
+    LLMParserBlock,
+    LLMResponseExtractorBlock,
+    PromptBuilderBlock,
+    TextParserBlock,
+)
 from .registry import BlockRegistry
 from .transform import (
     DuplicateColumnsBlock,
@@ -18,6 +25,7 @@ from .transform import (
 )
 __all__ = [
+    "AgentBlock",
     "BaseBlock",
     "BlockRegistry",
     "ColumnValueFilterBlock",
@@ -28,7 +36,8 @@ __all__ = [
     "TextConcatBlock",
     "UniformColumnValueSetter",
     "LLMChatBlock",
-    "LLMParserBlock",
+    "LLMParserBlock",  # Deprecated alias for LLMResponseExtractorBlock
+    "LLMResponseExtractorBlock",
     "TextParserBlock",
     "PromptBuilderBlock",
 ]

sdg_hub/core/blocks/agent/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Agent blocks for external agent framework integration."""
+from .agent_block import AgentBlock
+__all__ = ["AgentBlock"]

sdg_hub/core/blocks/agent/agent_block.py ADDED Viewed

@@ -0,0 +1,397 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Agent block for integrating external agent frameworks."""
+from typing import Any, Optional
+import asyncio
+import uuid
+from pydantic import Field, PrivateAttr
+from tqdm import tqdm
+import pandas as pd
+from ...connectors.agent.base import BaseAgentConnector
+from ...connectors.base import ConnectorConfig
+from ...connectors.exceptions import ConnectorError
+from ...connectors.registry import ConnectorRegistry
+from ...utils.logger_config import setup_logger
+from ..base import BaseBlock
+from ..registry import BlockRegistry
+logger = setup_logger(__name__)
+@BlockRegistry.register(
+    "AgentBlock",
+    category="agent",
+    description="Execute agent frameworks (Langflow, etc.) on DataFrame rows",
+)
+class AgentBlock(BaseBlock):
+    """Block for executing external agent frameworks on DataFrame rows.
+    This block integrates with various agent frameworks through the connector
+    system. Each row in the DataFrame is processed by sending messages to the
+    agent and storing the response.
+    The block supports both sync and async execution modes for optimal
+    performance with large datasets.
+    Parameters
+    ----------
+    agent_framework : str
+        Name of the connector to use (e.g., 'langflow').
+    agent_url : str
+        API endpoint URL for the agent.
+    agent_api_key : str, optional
+        API key for authentication.
+    timeout : float
+        Request timeout in seconds. Default 120.0.
+    max_retries : int
+        Maximum retry attempts. Default 3.
+    session_id_col : str, optional
+        Column containing session IDs. If not provided, generates UUIDs.
+    async_mode : bool
+        Whether to use async execution. Default False.
+    max_concurrency : int
+        Maximum concurrent requests in async mode. Default 10.
+    Example YAML Configuration
+    --------------------------
+    ```yaml
+    - block_type: AgentBlock
+      block_config:
+        block_name: my_agent
+        agent_framework: langflow
+        agent_url: http://localhost:7860/api/v1/run/my-flow
+        agent_api_key: ${LANGFLOW_API_KEY}
+        input_cols:
+          messages: messages_col
+        output_cols:
+          - agent_response
+    ```
+    Example
+    -------
+    >>> block = AgentBlock(
+    ...     block_name="qa_agent",
+    ...     agent_framework="langflow",
+    ...     agent_url="http://localhost:7860/api/v1/run/qa-flow",
+    ...     input_cols={"messages": "question"},
+    ...     output_cols=["response"],
+    ... )
+    >>> result_df = block(df)
+    """
+    # Required configuration
+    agent_framework: str = Field(
+        ...,
+        description="Connector name (e.g., 'langflow')",
+    )
+    agent_url: str = Field(
+        ...,
+        description="Agent API endpoint URL",
+    )
+    # Optional configuration
+    agent_api_key: Optional[str] = Field(
+        None,
+        description="API key for authentication",
+    )
+    timeout: float = Field(
+        120.0,
+        description="Request timeout in seconds",
+        gt=0,
+    )
+    max_retries: int = Field(
+        3,
+        description="Maximum retry attempts",
+        ge=0,
+    )
+    session_id_col: Optional[str] = Field(
+        None,
+        description="Column containing session IDs",
+    )
+    async_mode: bool = Field(
+        False,
+        description="Use async execution for better throughput",
+    )
+    max_concurrency: int = Field(
+        10,
+        description="Maximum concurrent requests in async mode",
+        gt=0,
+    )
+    extract_response: bool = Field(
+        False,
+        description="Extract just the text content from agent response",
+    )
+    # Private attributes
+    _connector: Optional[BaseAgentConnector] = PrivateAttr(default=None)
+    _connector_config_key: Optional[tuple] = PrivateAttr(default=None)
+    def _get_connector(self) -> BaseAgentConnector:
+        """Get or create the connector instance.
+        Invalidates the cached connector if the config has changed (e.g., due
+        to runtime overrides).
+        Returns
+        -------
+        BaseAgentConnector
+            The configured connector instance.
+        """
+        config_key = (
+            self.agent_framework,
+            self.agent_url,
+            self.agent_api_key,
+            self.timeout,
+            self.max_retries,
+            self.extract_response,
+        )
+        if self._connector is None or self._connector_config_key != config_key:
+            connector_class = ConnectorRegistry.get(self.agent_framework)
+            config = ConnectorConfig(
+                url=self.agent_url,
+                api_key=self.agent_api_key,
+                timeout=self.timeout,
+                max_retries=self.max_retries,
+                extract_text=self.extract_response,
+            )
+            self._connector = connector_class(config=config)
+            self._connector_config_key = config_key
+        return self._connector
+    def _get_messages_col(self) -> str:
+        """Get the input column name for messages.
+        Returns
+        -------
+        str
+            Column name containing messages.
+        """
+        if isinstance(self.input_cols, dict):
+            if "messages" in self.input_cols:
+                return self.input_cols["messages"]
+            elif self.input_cols:
+                return list(self.input_cols.keys())[0]
+            else:
+                raise ConnectorError("input_cols must specify the messages column")
+        elif isinstance(self.input_cols, list) and len(self.input_cols) > 0:
+            return self.input_cols[0]
+        else:
+            raise ConnectorError("input_cols must specify the messages column")
+    def _get_output_col(self) -> str:
+        """Get the output column name for responses.
+        Returns
+        -------
+        str
+            Column name for storing responses.
+        """
+        if isinstance(self.output_cols, dict):
+            return list(self.output_cols.keys())[0]
+        elif isinstance(self.output_cols, list) and len(self.output_cols) > 0:
+            return self.output_cols[0]
+        else:
+            return "agent_response"
+    def _build_messages(self, content: Any) -> list[dict[str, Any]]:
+        """Build message list from row content.
+        Parameters
+        ----------
+        content : Any
+            Content from the DataFrame cell.
+        Returns
+        -------
+        list[dict]
+            List of messages in standard format.
+        """
+        if isinstance(content, list):
+            # Already a message list
+            return content
+        elif isinstance(content, dict):
+            # Single message dict
+            return [content]
+        else:
+            # Plain text - wrap as user message
+            return [{"role": "user", "content": str(content)}]
+    def _get_session_id(self, row: pd.Series, idx: int) -> str:
+        """Get session ID for a row.
+        Parameters
+        ----------
+        row : pd.Series
+            DataFrame row.
+        idx : int
+            Row index.
+        Returns
+        -------
+        str
+            Session ID.
+        """
+        if self.session_id_col and self.session_id_col in row:
+            return str(row[self.session_id_col])
+        return str(uuid.uuid4())
+    def _process_row_sync(
+        self,
+        row: pd.Series,
+        idx: int,
+        connector: BaseAgentConnector,
+        messages_col: str,
+    ) -> dict[str, Any]:
+        """Process a single row synchronously.
+        Parameters
+        ----------
+        row : pd.Series
+            DataFrame row.
+        idx : int
+            Row index.
+        connector : BaseAgentConnector
+            Connector instance.
+        messages_col : str
+            Column containing messages.
+        Returns
+        -------
+        dict
+            Response from the agent.
+        """
+        messages = self._build_messages(row[messages_col])
+        session_id = self._get_session_id(row, idx)
+        return connector.send(messages, session_id)
+    async def _process_row_async(
+        self,
+        row: pd.Series,
+        idx: int,
+        connector: BaseAgentConnector,
+        messages_col: str,
+        semaphore: asyncio.Semaphore,
+    ) -> tuple[int, dict[str, Any]]:
+        """Process a single row asynchronously.
+        Parameters
+        ----------
+        row : pd.Series
+            DataFrame row.
+        idx : int
+            Row index.
+        connector : BaseAgentConnector
+            Connector instance.
+        messages_col : str
+            Column containing messages.
+        semaphore : asyncio.Semaphore
+            Semaphore for concurrency control.
+        Returns
+        -------
+        tuple[int, dict]
+            Row index and response.
+        """
+        async with semaphore:
+            messages = self._build_messages(row[messages_col])
+            session_id = self._get_session_id(row, idx)
+            response = await connector.asend(messages, session_id)
+            return idx, response
+    async def _process_batch_async(
+        self,
+        df: pd.DataFrame,
+        connector: BaseAgentConnector,
+        messages_col: str,
+    ) -> dict[int, dict[str, Any]]:
+        """Process all rows asynchronously.
+        Parameters
+        ----------
+        df : pd.DataFrame
+            Input DataFrame.
+        connector : BaseAgentConnector
+            Connector instance.
+        messages_col : str
+            Column containing messages.
+        Returns
+        -------
+        dict[int, dict]
+            Mapping from row index to response.
+        """
+        semaphore = asyncio.Semaphore(self.max_concurrency)
+        tasks = [
+            self._process_row_async(row, idx, connector, messages_col, semaphore)
+            for idx, row in df.iterrows()
+        ]
+        results = {}
+        for coro in tqdm(
+            asyncio.as_completed(tasks),
+            total=len(tasks),
+            desc=f"{self.block_name} (async)",
+        ):
+            idx, response = await coro
+            results[idx] = response
+        return results
+    def generate(self, samples: pd.DataFrame, **kwargs: Any) -> pd.DataFrame:
+        """Process DataFrame rows through the agent.
+        Parameters
+        ----------
+        samples : pd.DataFrame
+            Input DataFrame with messages column.
+        **kwargs : Any
+            Runtime overrides.
+        Returns
+        -------
+        pd.DataFrame
+            DataFrame with agent responses added.
+        """
+        df = samples.copy()
+        connector = self._get_connector()
+        messages_col = self._get_messages_col()
+        output_col = self._get_output_col()
+        if self.async_mode:
+            # Async execution
+            try:
+                asyncio.get_running_loop()
+                # Already in async context - use thread executor
+                import concurrent.futures
+                with concurrent.futures.ThreadPoolExecutor() as executor:
+                    future = executor.submit(
+                        asyncio.run,
+                        self._process_batch_async(df, connector, messages_col),
+                    )
+                    results = future.result()
+            except RuntimeError:
+                # No event loop - create one
+                results = asyncio.run(
+                    self._process_batch_async(df, connector, messages_col)
+                )
+            # Apply results
+            df[output_col] = df.index.map(results)
+        else:
+            # Sync execution with progress bar
+            responses = []
+            for idx, row in tqdm(
+                df.iterrows(),
+                total=len(df),
+                desc=self.block_name,
+            ):
+                response = self._process_row_sync(row, idx, connector, messages_col)
+                responses.append(response)
+            df[output_col] = responses
+        logger.info(f"Processed {len(df)} rows with {self.agent_framework} agent")
+        return df

sdg_hub/core/blocks/base.py CHANGED Viewed

@@ -49,6 +49,9 @@ class BaseBlock(BaseModel, ABC):
     block_name: str = Field(
         ..., description="Unique identifier for this block instance"
     )
+    block_type: Optional[str] = Field(
+        None, description="Block type (e.g., 'llm', 'transform', 'parser', 'filtering')"
+    )
     input_cols: Union[str, list[str], dict[str, Any], None] = Field(
         None, description="Input columns: str, list, or dict"
     )
@@ -366,5 +369,5 @@ class BaseBlock(BaseModel, ABC):
         Dict[str, Any]
         """
         config = self.get_config()
-        config["block_type"] = self.__class__.__name__
+        config["block_class"] = self.__class__.__name__
         return config

sdg_hub/core/blocks/filtering/column_value_filter.py CHANGED Viewed

@@ -46,6 +46,8 @@ DTYPE_MAP = {
     "Filters datasets based on column values using various comparison operations",
 )
 class ColumnValueFilterBlock(BaseBlock):
+    block_type: str = "filtering"
     """A block for filtering datasets based on column values.
     This block allows filtering of datasets using various operations (e.g., equals, contains)

sdg_hub/core/blocks/llm/__init__.py CHANGED Viewed

@@ -9,7 +9,7 @@ local models (vLLM, Ollama), and more.
 # Local
 from .error_handler import ErrorCategory, LLMErrorHandler
 from .llm_chat_block import LLMChatBlock
-from .llm_parser_block import LLMParserBlock
+from .llm_response_extractor_block import LLMParserBlock, LLMResponseExtractorBlock
 from .prompt_builder_block import PromptBuilderBlock
 from .text_parser_block import TextParserBlock
@@ -17,7 +17,8 @@ __all__ = [
     "LLMErrorHandler",
     "ErrorCategory",
     "LLMChatBlock",
-    "LLMParserBlock",
+    "LLMParserBlock",  # Deprecated alias for LLMResponseExtractorBlock
+    "LLMResponseExtractorBlock",
     "PromptBuilderBlock",
     "TextParserBlock",
 ]

sdg_hub/core/blocks/llm/llm_chat_block.py CHANGED Viewed

@@ -32,6 +32,8 @@ logger = setup_logger(__name__)
 class LLMChatBlock(BaseBlock):
     model_config = ConfigDict(extra="allow")
+    block_type: str = "llm"
     """Unified LLM chat block supporting all providers via LiteLLM.
     This block provides a minimal wrapper around LiteLLM's completion API,

sdg_hub/core/blocks/llm/{llm_parser_block.py → llm_response_extractor_block.py} RENAMED Viewed

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
-"""LLM parser block for extracting fields from LLM response objects.
+"""LLM response extractor block for extracting fields from LLM response objects.
-This module provides the LLMParserBlock for extracting specific fields
+This module provides the LLMResponseExtractorBlock for extracting specific fields
 (content, reasoning_content, tool_calls) from chat completion response objects.
 """
@@ -22,13 +22,15 @@ logger = setup_logger(__name__)
 @BlockRegistry.register(
-    "LLMParserBlock",
+    "LLMResponseExtractorBlock",
     "llm",
     "Extracts specified fields from LLM response objects",
 )
-class LLMParserBlock(BaseBlock):
+class LLMResponseExtractorBlock(BaseBlock):
     _flow_requires_jsonl_tmp: bool = True
+    block_type: str = "llm_util"
     """Block for extracting fields from LLM response objects.
     This block extracts specified fields from chat completion response objects.
@@ -88,7 +90,7 @@ class LLMParserBlock(BaseBlock):
             ]
         ):
             raise ValueError(
-                "LLMParserBlock requires at least one extraction field to be enabled: "
+                "LLMResponseExtractorBlock requires at least one extraction field to be enabled: "
                 "extract_content, extract_reasoning_content, or extract_tool_calls"
             )
@@ -106,7 +108,7 @@ class LLMParserBlock(BaseBlock):
         return self
     def _validate_custom(self, dataset: pd.DataFrame) -> None:
-        """Validate LLMParserBlock specific requirements.
+        """Validate LLMResponseExtractorBlock specific requirements.
         Parameters
         ----------
@@ -116,14 +118,16 @@ class LLMParserBlock(BaseBlock):
         Raises
         ------
         ValueError
-            If LLMParserBlock requirements are not met.
+            If LLMResponseExtractorBlock requirements are not met.
         """
         # Validate that we have exactly one input column
         if len(self.input_cols) == 0:
-            raise ValueError("LLMParserBlock expects at least one input column")
+            raise ValueError(
+                "LLMResponseExtractorBlock expects at least one input column"
+            )
         if len(self.input_cols) > 1:
             logger.warning(
-                f"LLMParserBlock expects exactly one input column, but got {len(self.input_cols)}. "
+                f"LLMResponseExtractorBlock expects exactly one input column, but got {len(self.input_cols)}. "
                 f"Using the first column: {self.input_cols[0]}"
             )
@@ -324,3 +328,22 @@ class LLMParserBlock(BaseBlock):
             new_data.extend(self._generate(sample))
         return pd.DataFrame(new_data)
+# Backwards compatibility alias (deprecated)
+# Register deprecated alias in BlockRegistry so old YAML flows still work
+@BlockRegistry.register(
+    "LLMParserBlock",
+    "llm",
+    "Deprecated: Use LLMResponseExtractorBlock instead",
+    deprecated=True,
+    replacement="LLMResponseExtractorBlock",
+)
+class LLMParserBlock(LLMResponseExtractorBlock):
+    """Deprecated alias for LLMResponseExtractorBlock.
+    This class exists for backwards compatibility with existing code and YAML flows.
+    Use LLMResponseExtractorBlock instead.
+    """
+    pass

sdg_hub/core/blocks/llm/prompt_builder_block.py CHANGED Viewed

@@ -222,6 +222,8 @@ class PromptRenderer:
     "Formats prompts into structured chat messages or plain text using Jinja templates",
 )
 class PromptBuilderBlock(BaseBlock):
+    block_type: str = "llm_util"
     """Block for formatting prompts into structured chat messages or plain text.
     This block takes input from dataset columns, applies Jinja templates from a YAML config

sdg_hub/core/blocks/llm/text_parser_block.py CHANGED Viewed

@@ -30,6 +30,8 @@ logger = setup_logger(__name__)
 class TextParserBlock(BaseBlock):
     _flow_requires_jsonl_tmp: bool = True
+    block_type: str = "parser"
     """Block for parsing and post-processing text content.
     This block handles text parsing using start/end tags, custom regex patterns,

sdg_hub/core/blocks/transform/duplicate_columns.py CHANGED Viewed

@@ -27,6 +27,8 @@ logger = setup_logger(__name__)
     "Duplicates existing columns with new names according to a mapping specification",
 )
 class DuplicateColumnsBlock(BaseBlock):
+    block_type: str = "transform"
     """Block for duplicating existing columns with new names.
     This block creates copies of existing columns with new names according to a mapping specification.

sdg_hub/core/blocks/transform/index_based_mapper.py CHANGED Viewed

@@ -28,6 +28,8 @@ logger = setup_logger(__name__)
     "Maps values from source columns to output columns based on choice columns using shared mapping",
 )
 class IndexBasedMapperBlock(BaseBlock):
+    block_type: str = "transform"
     """Block for mapping values from source columns to output columns based on choice columns.
     This block uses a shared mapping dictionary to select values from source columns and

sdg_hub/core/blocks/transform/json_structure_block.py CHANGED Viewed

@@ -28,6 +28,8 @@ logger = setup_logger(__name__)
     "Combines multiple columns into a single column containing a structured JSON object",
 )
 class JSONStructureBlock(BaseBlock):
+    block_type: str = "transform"
     """Block for combining multiple columns into a structured JSON object.
     This block takes values from multiple input columns and combines them into a single

sdg_hub/core/blocks/transform/melt_columns.py CHANGED Viewed

@@ -28,6 +28,8 @@ logger = setup_logger(__name__)
     "Transforms wide dataset format into long format by melting columns into rows",
 )
 class MeltColumnsBlock(BaseBlock):
+    block_type: str = "transform"
     """Block for flattening multiple columns into a long format.
     This block transforms a wide dataset format into a long format by melting

sdg-hub 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

sdg-hub 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl