PyPI - sdg-hub - Versions diffs - 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl - Mend

sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (145) hide show

sdg_hub/__init__.py +28 -1
sdg_hub/_version.py +2 -2
sdg_hub/core/__init__.py +22 -0
sdg_hub/core/blocks/__init__.py +58 -0
sdg_hub/core/blocks/base.py +313 -0
sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
sdg_hub/core/blocks/evaluation/__init__.py +9 -0
sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
sdg_hub/core/blocks/filtering/__init__.py +12 -0
sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
sdg_hub/core/blocks/llm/__init__.py +27 -0
sdg_hub/core/blocks/llm/client_manager.py +398 -0
sdg_hub/core/blocks/llm/config.py +336 -0
sdg_hub/core/blocks/llm/error_handler.py +368 -0
sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
sdg_hub/core/blocks/registry.py +331 -0
sdg_hub/core/blocks/transform/__init__.py +23 -0
sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
sdg_hub/core/blocks/transform/melt_columns.py +126 -0
sdg_hub/core/blocks/transform/rename_columns.py +69 -0
sdg_hub/core/blocks/transform/text_concat.py +102 -0
sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
sdg_hub/core/flow/__init__.py +20 -0
sdg_hub/core/flow/base.py +1209 -0
sdg_hub/core/flow/checkpointer.py +333 -0
sdg_hub/core/flow/metadata.py +389 -0
sdg_hub/core/flow/migration.py +198 -0
sdg_hub/core/flow/registry.py +393 -0
sdg_hub/core/flow/validation.py +277 -0
sdg_hub/{utils → core/utils}/__init__.py +7 -4
sdg_hub/core/utils/datautils.py +63 -0
sdg_hub/core/utils/error_handling.py +208 -0
sdg_hub/core/utils/flow_id_words.yaml +231 -0
sdg_hub/core/utils/flow_identifier.py +94 -0
sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
sdg_hub/core/utils/yaml_utils.py +59 -0
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
sdg_hub-0.2.1.dist-info/METADATA +221 -0
sdg_hub-0.2.1.dist-info/RECORD +68 -0
sdg_hub/blocks/__init__.py +0 -42
sdg_hub/blocks/block.py +0 -96
sdg_hub/blocks/llmblock.py +0 -375
sdg_hub/blocks/openaichatblock.py +0 -556
sdg_hub/blocks/utilblocks.py +0 -597
sdg_hub/checkpointer.py +0 -139
sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
sdg_hub/configs/annotations/detailed_description.yaml +0 -10
sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
sdg_hub/configs/knowledge/__init__.py +0 -0
sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
sdg_hub/configs/knowledge/router.yaml +0 -12
sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
sdg_hub/configs/reasoning/__init__.py +0 -0
sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
sdg_hub/configs/skills/__init__.py +0 -0
sdg_hub/configs/skills/analyzer.yaml +0 -48
sdg_hub/configs/skills/annotation.yaml +0 -36
sdg_hub/configs/skills/contexts.yaml +0 -28
sdg_hub/configs/skills/critic.yaml +0 -60
sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
sdg_hub/configs/skills/freeform_questions.yaml +0 -34
sdg_hub/configs/skills/freeform_responses.yaml +0 -39
sdg_hub/configs/skills/grounded_questions.yaml +0 -38
sdg_hub/configs/skills/grounded_responses.yaml +0 -59
sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
sdg_hub/configs/skills/judge.yaml +0 -53
sdg_hub/configs/skills/planner.yaml +0 -67
sdg_hub/configs/skills/respond.yaml +0 -8
sdg_hub/configs/skills/revised_responder.yaml +0 -78
sdg_hub/configs/skills/router.yaml +0 -59
sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
sdg_hub/flow.py +0 -477
sdg_hub/flow_runner.py +0 -450
sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
sdg_hub/pipeline.py +0 -121
sdg_hub/prompts.py +0 -80
sdg_hub/registry.py +0 -122
sdg_hub/sdg.py +0 -206
sdg_hub/utils/config_validation.py +0 -91
sdg_hub/utils/datautils.py +0 -14
sdg_hub/utils/error_handling.py +0 -94
sdg_hub/utils/validation_result.py +0 -10
sdg_hub-0.1.4.dist-info/METADATA +0 -190
sdg_hub-0.1.4.dist-info/RECORD +0 -89
sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
/sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
/sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
{sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
{sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
{sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0

sdg_hub/core/flow/registry.py ADDED Viewed

@@ -0,0 +1,393 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Flow registry for managing contributed flows."""
+# Standard
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional
+import os
+# Third Party
+from rich.console import Console
+from rich.table import Table
+import yaml
+# Local
+from ..utils.logger_config import setup_logger
+from ..utils.yaml_utils import save_flow_yaml
+from .metadata import FlowMetadata
+logger = setup_logger(__name__)
+@dataclass
+class FlowRegistryEntry:
+    """Entry in the flow registry.
+    Pairs the location of a flow YAML file with the metadata parsed from it.
+    Parameters
+    ----------
+    path : str
+        Path to the flow YAML file.
+    metadata : FlowMetadata
+        Flow metadata extracted from the file.
+    """
+    path: str
+    metadata: FlowMetadata
+class FlowRegistry:
+    """Registry for managing contributed flows."""
+    _entries: dict[str, FlowRegistryEntry] = {}
+    _search_paths: list[str] = []
+    _initialized: bool = False
+    @classmethod
+    def _ensure_initialized(cls) -> None:
+        """Ensure the registry is initialized with built-in flows."""
+        if cls._initialized:
+            return
+        try:
+            # Find the sdg_hub package directory
+            # First Party
+            import sdg_hub
+            package_path = Path(sdg_hub.__file__).parent
+            flows_dir = package_path / "flows"
+            # Register built-in flows directory if it exists
+            if flows_dir.exists():
+                flows_dir_str = str(flows_dir)
+                if flows_dir_str not in cls._search_paths:
+                    cls._search_paths.append(flows_dir_str)
+                    logger.debug(
+                        f"Auto-registered built-in flows directory: {flows_dir}"
+                    )
+            else:
+                logger.debug(f"Built-in flows directory not found: {flows_dir}")
+        except Exception as exc:
+            logger.warning(f"Failed to auto-register built-in flows: {exc}")
+        cls._initialized = True
+    @classmethod
+    def register_search_path(cls, path: str) -> None:
+        """Add a directory to search for flows.
+        Parameters
+        ----------
+        path : str
+            Path to directory containing flow YAML files.
+        """
+        if path not in cls._search_paths:
+            cls._search_paths.append(path)
+            logger.debug(f"Added flow search path: {path}")
+    @classmethod
+    def _discover_flows(cls, force_refresh: bool = False) -> None:
+        """Discover and register flows from search paths (private method).
+        Parameters
+        ----------
+        force_refresh : bool, optional
+            Whether to force refresh the registry.
+        """
+        # Ensure built-in flows are registered
+        cls._ensure_initialized()
+        if cls._entries and not force_refresh:
+            return
+        cls._entries.clear()
+        for search_path in cls._search_paths:
+            if not os.path.exists(search_path):
+                logger.warning(f"Flow search path does not exist: {search_path}")
+                continue
+            cls._discover_flows_in_directory(search_path)
+        logger.info(f"Discovered {len(cls._entries)} flows")
+    @classmethod
+    def _discover_flows_in_directory(cls, directory: str) -> None:
+        """Discover flows in a specific directory."""
+        path = Path(directory)
+        for yaml_file in path.rglob("*.yaml"):
+            try:
+                with open(yaml_file, encoding="utf-8") as f:
+                    flow_config = yaml.safe_load(f)
+                # Check if this is a flow file
+                if "metadata" in flow_config and "blocks" in flow_config:
+                    metadata_dict = flow_config["metadata"]
+                    metadata = FlowMetadata(**metadata_dict)
+                    # If id was generated, update the YAML
+                    if metadata.id and "id" not in metadata_dict:
+                        flow_config["metadata"]["id"] = metadata.id
+                        save_flow_yaml(
+                            yaml_file,
+                            flow_config,
+                            f"updated with generated id: {metadata.id}",
+                        )
+                    entry = FlowRegistryEntry(path=str(yaml_file), metadata=metadata)
+                    cls._entries[metadata.name] = entry
+                    logger.debug(
+                        f"Registered flow: {metadata.name} (id: {metadata.id}) from {yaml_file}"
+                    )
+            except Exception as exc:
+                logger.debug(f"Skipped {yaml_file}: {exc}")
+    @classmethod
+    def get_flow_path(cls, flow_name_or_id: str) -> Optional[str]:
+        """Get the path to a registered flow.
+        For backward compatibility, this function accepts either a flow id or flow_name.
+        Flow ID is preferred and should be used in new code.
+        Parameters
+        ----------
+        flow_name_or_id : str
+            Either the flow id or flow_name to find.
+        Returns
+        -------
+        Optional[str]
+            Path to the flow file, or None if not found.
+        """
+        cls._ensure_initialized()
+        cls._discover_flows()
+        # First try to find by id (preferred)
+        for entry in cls._entries.values():
+            if entry.metadata.id == flow_name_or_id:
+                return entry.path
+        # If not found, try by name (backward compatibility)
+        for entry in cls._entries.values():
+            if entry.metadata.name == flow_name_or_id:
+                logger.debug(
+                    f"Found flow by name (deprecated): {flow_name_or_id}, use id: {entry.metadata.id} instead"
+                )
+                return entry.path
+        return None
+    @classmethod
+    def get_flow_path_safe(cls, flow_name_or_id: str) -> str:
+        """Get the path to a registered flow with better error handling.
+        For backward compatibility, this function accepts either a flow id or flow_name.
+        Flow ID is preferred and should be used in new code.
+        Parameters
+        ----------
+        flow_name_or_id : str
+            Either the flow id or flow_name to find.
+        Returns
+        -------
+        str
+            Path to the flow file.
+        Raises
+        ------
+        ValueError
+            If the flow is not found, with helpful suggestions.
+        """
+        cls._ensure_initialized()
+        cls._discover_flows()
+        path = cls.get_flow_path(flow_name_or_id)
+        if path is None:
+            # Get available flows for better error message
+            available_flows = cls.list_flows()
+            error_msg = f"Flow '{flow_name_or_id}' not found.\n"
+            if available_flows:
+                error_msg += "Available flows:\n"
+                for flow in available_flows:
+                    error_msg += f"  - ID: '{flow['id']}', Name: '{flow['name']}'\n"
+            else:
+                error_msg += "No flows are currently registered. Try running FlowRegistry.discover_flows() first."
+            raise ValueError(error_msg.strip())
+        return path
+    @classmethod
+    def get_flow_metadata(cls, flow_name: str) -> Optional[FlowMetadata]:
+        """Get metadata for a registered flow.
+        Parameters
+        ----------
+        flow_name : str
+            Name of the flow.
+        Returns
+        -------
+        Optional[FlowMetadata]
+            Flow metadata, or None if not found.
+        """
+        cls._ensure_initialized()
+        cls._discover_flows()
+        if flow_name in cls._entries:
+            return cls._entries[flow_name].metadata
+        return None
+    @classmethod
+    def list_flows(cls) -> List[Dict[str, str]]:
+        """List all registered flows with their IDs.
+        Returns
+        -------
+        List[Dict[str, str]]
+            List of dictionaries containing flow IDs and names.
+            Each dictionary has 'id' and 'name' keys.
+        """
+        cls._ensure_initialized()
+        cls._discover_flows()
+        return [
+            {"id": entry.metadata.id, "name": entry.metadata.name}
+            for entry in cls._entries.values()
+        ]
+    @classmethod
+    def search_flows(
+        cls, tag: Optional[str] = None, author: Optional[str] = None
+    ) -> List[Dict[str, str]]:
+        """Search flows by criteria.
+        Parameters
+        ----------
+        tag : Optional[str]
+            Tag to filter by.
+        author : Optional[str]
+            Author to filter by.
+        Returns
+        -------
+        List[Dict[str, str]]
+            List of matching flows. Each dictionary contains:
+            - id: Flow ID
+            - name: Flow name
+        """
+        cls._ensure_initialized()
+        cls._discover_flows()
+        matching_flows = []
+        for entry in cls._entries.values():
+            metadata = entry.metadata
+            # Filter by tag
+            if tag and tag not in metadata.tags:
+                continue
+            # Filter by author
+            if author and author.lower() not in metadata.author.lower():
+                continue
+            matching_flows.append({"id": metadata.id, "name": metadata.name})
+        return matching_flows
+    @classmethod
+    def get_flows_by_category(cls) -> Dict[str, List[Dict[str, str]]]:
+        """Get flows organized by their primary tag.
+        Returns
+        -------
+        Dict[str, List[Dict[str, str]]]
+            Dictionary mapping tags to flow information. Each flow is represented by:
+            - id: Flow ID
+            - name: Flow name
+        """
+        cls._ensure_initialized()
+        cls._discover_flows()
+        categories = {}
+        for entry in cls._entries.values():
+            metadata = entry.metadata
+            # Use first tag as primary category, or "uncategorized"
+            category = metadata.tags[0] if metadata.tags else "uncategorized"
+            if category not in categories:
+                categories[category] = []
+            categories[category].append({"id": metadata.id, "name": metadata.name})
+        return categories
+    @classmethod
+    def discover_flows(cls) -> None:
+        """Discover and display all flows in a formatted table.
+        This is the main public API for flow discovery. It finds all flows
+        in registered search paths and displays them in a beautiful Rich table.
+        """
+        cls._ensure_initialized()
+        cls._discover_flows()
+        if not cls._entries:
+            print(
+                "No flows discovered. Try adding search paths with register_search_path()"
+            )
+            print("Note: Only flows with 'metadata' section are discoverable.")
+            return
+        # Prepare data with fallbacks
+        flow_data = []
+        for _, entry in cls._entries.items():
+            metadata = entry.metadata
+            flow_data.append(
+                {
+                    "name": metadata.name,
+                    "id": metadata.id,
+                    "author": metadata.author or "Unknown",
+                    "tags": ", ".join(metadata.tags) if metadata.tags else "-",
+                    "description": metadata.description or "No description",
+                    "version": metadata.version,
+                    "cost": metadata.estimated_cost,
+                }
+            )
+        # Sort by name for consistency
+        flow_data.sort(key=lambda x: x["id"])
+        # Display Rich table
+        # Third Party
+        console = Console()
+        table = Table(show_header=True, header_style="bold bright_magenta")
+        # Add columns with better visibility colors
+        table.add_column("ID", style="bold bright_magenta", no_wrap=True)
+        table.add_column("Name", style="bold bright_cyan")
+        table.add_column("Author", style="bright_green")
+        table.add_column("Tags", style="yellow")
+        table.add_column("Description", style="white")
+        # Add rows
+        for flow in flow_data:
+            table.add_row(
+                flow["id"],
+                flow["name"],
+                flow["author"],
+                flow["tags"],
+                flow["description"],
+            )
+        console.print(table)

sdg_hub/core/flow/validation.py ADDED Viewed

@@ -0,0 +1,277 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Flow validation utilities."""
+# Standard
+from typing import TYPE_CHECKING, Any
+# Third Party
+from datasets import Dataset
+if TYPE_CHECKING:
+    # Local
+    from .base import Flow
+class FlowValidator:
+    """Validator for flow configurations and execution readiness."""
+    def validate_yaml_structure(self, flow_config: dict[str, Any]) -> list[str]:
+        """Validate the structure of a flow YAML configuration.
+        Parameters
+        ----------
+        flow_config : Dict[str, Any]
+            The loaded YAML configuration.
+        Returns
+        -------
+        List[str]
+            List of validation error messages. Empty if valid.
+        """
+        errors = []
+        # Check required top-level keys
+        if "blocks" not in flow_config:
+            errors.append("Flow configuration must contain 'blocks' section")
+            return errors  # Can't continue without blocks
+        blocks = flow_config["blocks"]
+        if not isinstance(blocks, list):
+            errors.append("'blocks' must be a list")
+            return errors
+        if not blocks:
+            errors.append("Flow must contain at least one block")
+            return errors
+        # Validate each block configuration
+        for i, block_config in enumerate(blocks):
+            block_errors = self._validate_block_config(block_config, i)
+            errors.extend(block_errors)
+        # Validate metadata if present
+        if "metadata" in flow_config:
+            metadata_errors = self._validate_metadata_config(flow_config["metadata"])
+            errors.extend(metadata_errors)
+        # Validate parameters if present
+        if "parameters" in flow_config:
+            param_errors = self._validate_parameters_config(flow_config["parameters"])
+            errors.extend(param_errors)
+        return errors
+    def _validate_block_config(
+        self, block_config: dict[str, Any], index: int
+    ) -> list[str]:
+        """Validate a single block configuration."""
+        errors = []
+        prefix = f"Block {index}"
+        if not isinstance(block_config, dict):
+            errors.append(f"{prefix}: Block configuration must be a dictionary")
+            return errors
+        # Check required fields
+        if "block_type" not in block_config:
+            errors.append(f"{prefix}: Missing required field 'block_type'")
+        if "block_config" not in block_config:
+            errors.append(f"{prefix}: Missing required field 'block_config'")
+        else:
+            # Validate block_config structure
+            inner_config = block_config["block_config"]
+            if not isinstance(inner_config, dict):
+                errors.append(f"{prefix}: 'block_config' must be a dictionary")
+            elif "block_name" not in inner_config:
+                errors.append(f"{prefix}: 'block_config' must contain 'block_name'")
+        # Validate optional fields
+        if "runtime_overrides" in block_config:
+            overrides = block_config["runtime_overrides"]
+            if not isinstance(overrides, list):
+                errors.append(f"{prefix}: 'runtime_overrides' must be a list")
+            elif not all(isinstance(item, str) for item in overrides):
+                errors.append(
+                    f"{prefix}: All 'runtime_overrides' items must be strings"
+                )
+        return errors
+    def _validate_metadata_config(self, metadata: dict[str, Any]) -> list[str]:
+        """Validate metadata configuration."""
+        errors = []
+        if not isinstance(metadata, dict):
+            errors.append("'metadata' must be a dictionary")
+            return errors
+        # Check required name field
+        if "name" not in metadata:
+            errors.append("Metadata must contain 'name' field")
+        elif not isinstance(metadata["name"], str) or not metadata["name"].strip():
+            errors.append("Metadata 'name' must be a non-empty string")
+        # Validate id if present
+        if "id" in metadata:
+            flow_id = metadata["id"]
+            if not isinstance(flow_id, str):
+                errors.append("Metadata: 'id' must be a string")
+            elif flow_id and not flow_id.islower():
+                errors.append("Metadata: 'id' must be lowercase")
+            elif flow_id and not flow_id.replace("-", "").isalnum():
+                errors.append(
+                    "Metadata: 'id' must contain only alphanumeric characters and hyphens"
+                )
+        # Validate optional fields
+        string_fields = [
+            "description",
+            "version",
+            "author",
+            "recommended_model",
+            "license",
+        ]
+        for field in string_fields:
+            if field in metadata and not isinstance(metadata[field], str):
+                errors.append(f"Metadata '{field}' must be a string")
+        if "tags" in metadata:
+            tags = metadata["tags"]
+            if not isinstance(tags, list):
+                errors.append("Metadata 'tags' must be a list")
+            elif not all(isinstance(tag, str) for tag in tags):
+                errors.append("All metadata 'tags' must be strings")
+        return errors
+    def _validate_parameters_config(self, parameters: dict[str, Any]) -> list[str]:
+        """Validate parameters configuration."""
+        errors = []
+        if not isinstance(parameters, dict):
+            errors.append("'parameters' must be a dictionary")
+            return errors
+        for param_name, param_config in parameters.items():
+            if not isinstance(param_name, str):
+                errors.append("Parameter names must be strings")
+                continue
+            if isinstance(param_config, dict):
+                # Full parameter specification
+                if "default" not in param_config:
+                    errors.append(f"Parameter '{param_name}' must have 'default' value")
+                # Validate optional fields
+                if "description" in param_config and not isinstance(
+                    param_config["description"], str
+                ):
+                    errors.append(
+                        f"Parameter '{param_name}' description must be a string"
+                    )
+                if "required" in param_config and not isinstance(
+                    param_config["required"], bool
+                ):
+                    errors.append(
+                        f"Parameter '{param_name}' required field must be boolean"
+                    )
+        return errors
+    def validate_flow_execution(self, flow: "Flow", dataset: Dataset) -> list[str]:
+        """Validate that a flow can be executed with the given dataset.
+        Parameters
+        ----------
+        flow : Flow
+            The flow to validate.
+        dataset : Dataset
+            Dataset to validate against.
+        Returns
+        -------
+        List[str]
+            List of validation error messages. Empty if validation passes.
+        """
+        errors = []
+        if not flow.blocks:
+            errors.append("Flow contains no blocks")
+            return errors
+        if len(dataset) == 0:
+            errors.append("Dataset is empty")
+            return errors
+        # Track available columns as we progress through blocks
+        current_columns = set(dataset.column_names)
+        for _i, block in enumerate(flow.blocks):
+            block_name = block.block_name
+            # Check input columns
+            if hasattr(block, "input_cols") and block.input_cols:
+                missing_cols = self._check_missing_columns(
+                    block.input_cols, current_columns
+                )
+                if missing_cols:
+                    errors.append(
+                        f"Block '{block_name}' missing input columns: {missing_cols}"
+                    )
+            # Update available columns for next block
+            if hasattr(block, "output_cols") and block.output_cols:
+                new_columns = self._extract_column_names(block.output_cols)
+                current_columns.update(new_columns)
+        return errors
+    def _check_missing_columns(
+        self, required_cols: Any, available_cols: set[str]
+    ) -> list[str]:
+        """Check which required columns are missing."""
+        if isinstance(required_cols, (list, dict)):
+            return [col for col in required_cols if col not in available_cols]
+        return []
+    def _extract_column_names(self, output_cols: Any) -> list[str]:
+        """Extract column names from output specification."""
+        if isinstance(output_cols, list):
+            return output_cols
+        elif isinstance(output_cols, dict):
+            return list(output_cols.keys())
+        return []
+    def validate_block_chain(self, blocks: list[Any]) -> list[str]:
+        """Validate that blocks can be chained together.
+        Parameters
+        ----------
+        blocks : List[Any]
+            List of block instances to validate.
+        Returns
+        -------
+        List[str]
+            List of validation error messages.
+        """
+        errors = []
+        if not blocks:
+            errors.append("Block chain is empty")
+            return errors
+        # Check that all blocks have unique names
+        block_names = []
+        for i, block in enumerate(blocks):
+            if hasattr(block, "block_name"):
+                name = block.block_name
+                if name in block_names:
+                    errors.append(f"Duplicate block name '{name}' at index {i}")
+                block_names.append(name)
+            else:
+                errors.append(f"Block at index {i} missing 'block_name' attribute")
+        return errors

sdg_hub/{utils → core/utils}/__init__.py RENAMED Viewed

@@ -1,10 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
+# Local
+from .flow_identifier import get_flow_identifier
+from .path_resolution import resolve_path
 # This is part of the public API, and used by instructlab
-class GenerateException(Exception):
+class GenerateError(Exception):
     """An exception raised during generate step."""
-from .path_resolution import resolve_path
-__all__ = ["GenerateException", "resolve_path"]
+__all__ = ["GenerateError", "resolve_path", "get_flow_identifier"]

sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl

sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl