sdg-hub 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. sdg_hub/_version.py +16 -3
  2. sdg_hub/core/blocks/deprecated_blocks/selector.py +1 -1
  3. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +175 -416
  4. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +174 -415
  5. sdg_hub/core/blocks/evaluation/verify_question_block.py +180 -415
  6. sdg_hub/core/blocks/llm/client_manager.py +92 -43
  7. sdg_hub/core/blocks/llm/config.py +1 -0
  8. sdg_hub/core/blocks/llm/llm_chat_block.py +74 -16
  9. sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +277 -115
  10. sdg_hub/core/blocks/llm/text_parser_block.py +88 -23
  11. sdg_hub/core/blocks/registry.py +48 -34
  12. sdg_hub/core/blocks/transform/__init__.py +2 -0
  13. sdg_hub/core/blocks/transform/index_based_mapper.py +1 -1
  14. sdg_hub/core/blocks/transform/json_structure_block.py +142 -0
  15. sdg_hub/core/flow/base.py +326 -62
  16. sdg_hub/core/utils/datautils.py +54 -0
  17. sdg_hub/core/utils/flow_metrics.py +261 -0
  18. sdg_hub/core/utils/logger_config.py +50 -9
  19. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
  20. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
  21. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +11 -0
  22. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +159 -0
  23. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
  24. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +65 -0
  25. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +161 -0
  26. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +15 -0
  27. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +21 -0
  28. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +44 -0
  29. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
  30. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +104 -0
  31. sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +61 -0
  32. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +0 -7
  33. sdg_hub/flows/text_analysis/__init__.py +2 -0
  34. sdg_hub/flows/text_analysis/structured_insights/__init__.py +6 -0
  35. sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +27 -0
  36. sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +38 -0
  37. sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +21 -0
  38. sdg_hub/flows/text_analysis/structured_insights/flow.yaml +153 -0
  39. sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +21 -0
  40. {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/METADATA +42 -15
  41. {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/RECORD +44 -22
  42. {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/WHEEL +0 -0
  43. {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/licenses/LICENSE +0 -0
  44. {sdg_hub-0.2.1.dist-info → sdg_hub-0.3.0.dist-info}/top_level.txt +0 -0
sdg_hub/core/flow/base.py CHANGED
@@ -2,29 +2,40 @@
  """Pydantic-based Flow class for managing data generation pipelines."""

  # Standard
+ from datetime import datetime
  from pathlib import Path
  from typing import Any, Optional, Union
  import time
+ import uuid

  # Third Party
  from datasets import Dataset
- from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+ from pydantic import (
+ BaseModel,
+ ConfigDict,
+ Field,
+ PrivateAttr,
+ field_validator,
+ model_validator,
+ )
  from rich.console import Console
  from rich.panel import Panel
  from rich.table import Table
  from rich.tree import Tree
+ import datasets
  import yaml

  # Local
  from ..blocks.base import BaseBlock
  from ..blocks.registry import BlockRegistry
- from ..utils.datautils import safe_concatenate_with_validation
+ from ..utils.datautils import safe_concatenate_with_validation, validate_no_duplicates
  from ..utils.error_handling import EmptyDatasetError, FlowValidationError
+ from ..utils.flow_metrics import display_metrics_summary, save_metrics_to_json
  from ..utils.logger_config import setup_logger
  from ..utils.path_resolution import resolve_path
  from ..utils.yaml_utils import save_flow_yaml
  from .checkpointer import FlowCheckpointer
- from .metadata import FlowMetadata, FlowParameter
+ from .metadata import DatasetRequirements, FlowMetadata, FlowParameter
  from .migration import FlowMigration
  from .validation import FlowValidator

@@ -66,6 +77,9 @@ class Flow(BaseModel):
  _migrated_runtime_params: dict[str, dict[str, Any]] = {}
  _llm_client: Any = None # Only used for backward compatibility with old YAMLs
  _model_config_set: bool = False # Track if model configuration has been set
+ _block_metrics: list[dict[str, Any]] = PrivateAttr(
+ default_factory=list
+ ) # Track block execution metrics

  @field_validator("blocks")
  @classmethod
@@ -306,13 +320,11 @@ class Flow(BaseModel):

  # Get block class from registry
  try:
- block_class = BlockRegistry.get(block_type_name)
+ block_class = BlockRegistry._get(block_type_name)
  except KeyError as exc:
  # Get all available blocks from all categories
- all_blocks = BlockRegistry.all()
- available_blocks = ", ".join(
- [block for blocks in all_blocks.values() for block in blocks]
- )
+ all_blocks = BlockRegistry.list_blocks()
+ available_blocks = ", ".join(all_blocks)
  raise FlowValidationError(
  f"Block type '{block_type_name}' not found in registry. "
  f"Available blocks: {available_blocks}"
@@ -357,6 +369,8 @@ class Flow(BaseModel):
  runtime_params: Optional[dict[str, dict[str, Any]]] = None,
  checkpoint_dir: Optional[str] = None,
  save_freq: Optional[int] = None,
+ log_dir: Optional[str] = None,
+ max_concurrency: Optional[int] = None,
  ) -> Dataset:
  """Execute the flow blocks in sequence to generate data.

@@ -378,6 +392,13 @@
  save_freq : Optional[int], optional
  Number of completed samples after which to save a checkpoint.
  If None, only saves final results when checkpointing is enabled.
+ log_dir : Optional[str], optional
+ Directory to save execution logs. If provided, logs will be written to both
+ console and a log file in this directory. Maintains backward compatibility
+ when None.
+ max_concurrency : Optional[int], optional
+ Maximum number of concurrent requests across all blocks.
+ Controls async request concurrency to prevent overwhelming servers.

  Returns
  -------
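
Reviewer note: a minimal sketch of calling generate() with the two new keywords. The import path, flow path, and dataset columns below are illustrative assumptions, not taken from this diff; only the generate() keywords themselves come from the change above.

# Hypothetical usage of the new generate() keywords in 0.3.0
from datasets import Dataset
from sdg_hub.core.flow import Flow  # import path assumed from the module layout

flow = Flow.from_yaml("path/to/flow.yaml")          # as in the docstring examples
dataset = Dataset.from_dict({"document": ["..."]})  # required columns depend on the flow
# (flows with LLM blocks also need their model configuration set before generate())

result = flow.generate(
    dataset,
    log_dir="logs",        # per-run log file plus a metrics JSON are written here
    max_concurrency=8,     # cap concurrent LLM requests across all blocks
)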
@@ -397,6 +418,37 @@
  f"save_freq must be greater than 0, got {save_freq}"
  )

+ # Set up file logging if log_dir is provided
+ flow_logger = logger # Use global logger by default
+ if log_dir is not None:
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ flow_name = self.metadata.name.replace(" ", "_").lower()
+ log_filename = f"{flow_name}_{timestamp}.log"
+
+ # Create a flow-specific logger for this execution
+ unique_id = str(uuid.uuid4())[:8] # Short unique ID
+ flow_logger_name = f"{__name__}.flow_{flow_name}_{timestamp}_{unique_id}"
+ flow_logger = setup_logger(
+ flow_logger_name, log_dir=log_dir, log_filename=log_filename
+ )
+ flow_logger.propagate = False
+ flow_logger.info(
+ f"Flow logging enabled - logs will be saved to: {log_dir}/{log_filename}"
+ )
+ # Validate max_concurrency parameter
+ if max_concurrency is not None:
+ # Explicitly reject boolean values (bool is a subclass of int in Python)
+ if isinstance(max_concurrency, bool) or not isinstance(
+ max_concurrency, int
+ ):
+ raise FlowValidationError(
+ f"max_concurrency must be an int, got {type(max_concurrency).__name__}"
+ )
+ if max_concurrency <= 0:
+ raise FlowValidationError(
+ f"max_concurrency must be greater than 0, got {max_concurrency}"
+ )
+
  # Validate preconditions
  if not self.blocks:
  raise FlowValidationError("Cannot generate with empty flow")
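
The explicit isinstance check against bool above is needed because bool is a subclass of int in Python, so a plain int check alone would accept True/False. A quick illustration (not package code):

# bool passes an int check, which is why the validation rejects it first
print(isinstance(True, int))    # True
print(isinstance(True, bool))   # True
# With this validation, generate(..., max_concurrency=True) raises FlowValidationError.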
@@ -404,6 +456,8 @@
  if len(dataset) == 0:
  raise EmptyDatasetError("Input dataset is empty")

+ validate_no_duplicates(dataset)
+
  # Check if model configuration has been set for flows with LLM blocks
  llm_blocks = self._detect_llm_blocks()
  if llm_blocks and not self._model_config_set:
@@ -420,6 +474,10 @@
  "Dataset validation failed:\n" + "\n".join(dataset_errors)
  )

+ # Log concurrency control if specified
+ if max_concurrency is not None:
+ logger.info(f"Using max_concurrency={max_concurrency} for LLM requests")
+
  # Initialize checkpointer if enabled
  checkpointer = None
  completed_dataset = None
@@ -436,86 +494,154 @@
  )

  if len(remaining_dataset) == 0:
- logger.info("All samples already completed, returning existing results")
+ flow_logger.info(
+ "All samples already completed, returning existing results"
+ )
+ if log_dir is not None and flow_logger is not logger:
+ for h in list(getattr(flow_logger, "handlers", [])):
+ try:
+ h.flush()
+ h.close()
+ except Exception:
+ pass
+ finally:
+ flow_logger.removeHandler(h)
+
  return completed_dataset

  dataset = remaining_dataset
- logger.info(f"Resuming with {len(dataset)} remaining samples")
+ flow_logger.info(f"Resuming with {len(dataset)} remaining samples")

- logger.info(
+ flow_logger.info(
  f"Starting flow '{self.metadata.name}' v{self.metadata.version} "
  f"with {len(dataset)} samples across {len(self.blocks)} blocks"
+ + (f" (max_concurrency={max_concurrency})" if max_concurrency else "")
  )

+ # Reset metrics for this execution
+ self._block_metrics = []
+ run_start = time.perf_counter()
+
  # Merge migrated runtime params with provided ones (provided ones take precedence)
  merged_runtime_params = self._migrated_runtime_params.copy()
  if runtime_params:
  merged_runtime_params.update(runtime_params)
  runtime_params = merged_runtime_params

- # Process dataset in chunks if checkpointing with save_freq
- if checkpointer and save_freq:
- all_processed = []
+ # Execute flow with metrics capture, ensuring metrics are always displayed/saved
+ final_dataset = None
+ execution_successful = False

- # Process in chunks of save_freq
- for i in range(0, len(dataset), save_freq):
- chunk_end = min(i + save_freq, len(dataset))
- chunk_dataset = dataset.select(range(i, chunk_end))
+ try:
+ # Process dataset in chunks if checkpointing with save_freq
+ if checkpointer and save_freq:
+ all_processed = []

- logger.info(
- f"Processing chunk {i // save_freq + 1}: samples {i} to {chunk_end - 1}"
- )
+ # Process in chunks of save_freq
+ for i in range(0, len(dataset), save_freq):
+ chunk_end = min(i + save_freq, len(dataset))
+ chunk_dataset = dataset.select(range(i, chunk_end))

- # Execute all blocks on this chunk
- processed_chunk = self._execute_blocks_on_dataset(
- chunk_dataset, runtime_params
- )
- all_processed.append(processed_chunk)
+ flow_logger.info(
+ f"Processing chunk {i // save_freq + 1}: samples {i} to {chunk_end - 1}"
+ )

- # Save checkpoint after chunk completion
- checkpointer.add_completed_samples(processed_chunk)
+ # Execute all blocks on this chunk
+ processed_chunk = self._execute_blocks_on_dataset(
+ chunk_dataset, runtime_params, flow_logger, max_concurrency
+ )
+ all_processed.append(processed_chunk)

- # Save final checkpoint for any remaining samples
- checkpointer.save_final_checkpoint()
+ # Save checkpoint after chunk completion
+ checkpointer.add_completed_samples(processed_chunk)

- # Combine all processed chunks
- final_dataset = safe_concatenate_with_validation(
- all_processed, "processed chunks from flow execution"
- )
+ # Save final checkpoint for any remaining samples
+ checkpointer.save_final_checkpoint()

- # Combine with previously completed samples if any
- if checkpointer and completed_dataset:
+ # Combine all processed chunks
  final_dataset = safe_concatenate_with_validation(
- [completed_dataset, final_dataset],
- "completed checkpoint data with newly processed data",
+ all_processed, "processed chunks from flow execution"
  )

- else:
- # Process entire dataset at once
- final_dataset = self._execute_blocks_on_dataset(dataset, runtime_params)
-
- # Save final checkpoint if checkpointing enabled
- if checkpointer:
- checkpointer.add_completed_samples(final_dataset)
- checkpointer.save_final_checkpoint()
-
  # Combine with previously completed samples if any
- if completed_dataset:
+ if checkpointer and completed_dataset:
  final_dataset = safe_concatenate_with_validation(
  [completed_dataset, final_dataset],
  "completed checkpoint data with newly processed data",
  )

- logger.info(
- f"Flow '{self.metadata.name}' completed successfully: "
- f"{len(final_dataset)} final samples, "
- f"{len(final_dataset.column_names)} final columns"
- )
+ else:
+ # Process entire dataset at once
+ final_dataset = self._execute_blocks_on_dataset(
+ dataset, runtime_params, flow_logger, max_concurrency
+ )
+
+ # Save final checkpoint if checkpointing enabled
+ if checkpointer:
+ checkpointer.add_completed_samples(final_dataset)
+ checkpointer.save_final_checkpoint()
+
+ # Combine with previously completed samples if any
+ if completed_dataset:
+ final_dataset = safe_concatenate_with_validation(
+ [completed_dataset, final_dataset],
+ "completed checkpoint data with newly processed data",
+ )
+
+ execution_successful = True
+
+ finally:
+ # Always display metrics and save JSON, even if execution failed
+ display_metrics_summary(
+ self._block_metrics, self.metadata.name, final_dataset
+ )
+
+ # Save metrics to JSON if log_dir is provided
+ if log_dir is not None:
+ # Ensure necessary variables exist
+ if "timestamp" not in locals() or "flow_name" not in locals():
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ flow_name = self.metadata.name.replace(" ", "_").lower()
+
+ save_metrics_to_json(
+ self._block_metrics,
+ self.metadata.name,
+ self.metadata.version,
+ execution_successful,
+ run_start,
+ log_dir,
+ timestamp,
+ flow_name,
+ flow_logger,
+ )
+
+ # Keep a basic log entry for file logs (only if execution was successful)
+ if execution_successful and final_dataset is not None:
+ flow_logger.info(
+ f"Flow '{self.metadata.name}' completed successfully: "
+ f"{len(final_dataset)} final samples, "
+ f"{len(final_dataset.column_names)} final columns"
+ )
+
+ # Close file handlers if we opened a flow-specific logger
+ if log_dir is not None and flow_logger is not logger:
+ for h in list(getattr(flow_logger, "handlers", [])):
+ try:
+ h.flush()
+ h.close()
+ except Exception:
+ pass
+ finally:
+ flow_logger.removeHandler(h)

  return final_dataset

  def _execute_blocks_on_dataset(
- self, dataset: Dataset, runtime_params: dict[str, dict[str, Any]]
+ self,
+ dataset: Dataset,
+ runtime_params: dict[str, dict[str, Any]],
+ flow_logger=None,
+ max_concurrency: Optional[int] = None,
  ) -> Dataset:
  """Execute all blocks in sequence on the given dataset.

@@ -525,17 +651,23 @@ class Flow(BaseModel):
  Dataset to process through all blocks.
  runtime_params : Dict[str, Dict[str, Any]]
  Runtime parameters for block execution.
+ flow_logger : logging.Logger, optional
+ Logger to use for this execution. Falls back to global logger if None.
+ max_concurrency : Optional[int], optional
+ Maximum concurrency for LLM requests across blocks.

  Returns
  -------
  Dataset
  Dataset after processing through all blocks.
  """
+ # Use provided logger or fall back to global logger
+ exec_logger = flow_logger if flow_logger is not None else logger
  current_dataset = dataset

  # Execute blocks in sequence
  for i, block in enumerate(self.blocks):
- logger.info(
+ exec_logger.info(
  f"Executing block {i + 1}/{len(self.blocks)}: "
  f"{block.block_name} ({block.__class__.__name__})"
  )
@@ -543,6 +675,15 @@
  # Prepare block execution parameters
  block_kwargs = self._prepare_block_kwargs(block, runtime_params)

+ # Add max_concurrency to block kwargs if provided
+ if max_concurrency is not None:
+ block_kwargs["_flow_max_concurrency"] = max_concurrency
+
+ # Capture metrics before execution
+ start_time = time.perf_counter()
+ input_rows = len(current_dataset)
+ input_cols = set(current_dataset.column_names)
+
  try:
  # Check if this is a deprecated block and skip validations
  is_deprecated_block = (
@@ -552,7 +693,7 @@
  )

  if is_deprecated_block:
- logger.debug(
+ exec_logger.debug(
  f"Skipping validations for deprecated block: {block.block_name}"
  )
  # Call generate() directly to skip validations, but keep the runtime params
@@ -567,14 +708,51 @@
  f"Block '{block.block_name}' produced empty dataset"
  )

- logger.info(
+ # Capture metrics after successful execution
+ execution_time = time.perf_counter() - start_time
+ output_rows = len(current_dataset)
+ output_cols = set(current_dataset.column_names)
+ added_cols = output_cols - input_cols
+ removed_cols = input_cols - output_cols
+
+ # Store block metrics
+ self._block_metrics.append(
+ {
+ "block_name": block.block_name,
+ "block_type": block.__class__.__name__,
+ "execution_time": execution_time,
+ "input_rows": input_rows,
+ "output_rows": output_rows,
+ "added_cols": list(added_cols),
+ "removed_cols": list(removed_cols),
+ "status": "success",
+ }
+ )
+
+ exec_logger.info(
  f"Block '{block.block_name}' completed successfully: "
  f"{len(current_dataset)} samples, "
  f"{len(current_dataset.column_names)} columns"
  )

  except Exception as exc:
- logger.error(
+ # Capture metrics for failed execution
+ execution_time = time.perf_counter() - start_time
+ self._block_metrics.append(
+ {
+ "block_name": block.block_name,
+ "block_type": block.__class__.__name__,
+ "execution_time": execution_time,
+ "input_rows": input_rows,
+ "output_rows": 0,
+ "added_cols": [],
+ "removed_cols": [],
+ "status": "failed",
+ "error": str(exc),
+ }
+ )
+
+ exec_logger.error(
  f"Block '{block.block_name}' failed during execution: {exc}"
  )
  raise FlowValidationError(
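
For reference, each entry appended to _block_metrics is a plain dict. An illustrative successful record follows; the field names come from the diff above, while the block name, type, and values are invented for the example:

# Illustrative _block_metrics entry (values are made up)
{
    "block_name": "detailed_summary",     # hypothetical block name
    "block_type": "LLMChatBlock",          # hypothetical block class
    "execution_time": 12.7,                # seconds, measured with time.perf_counter()
    "input_rows": 100,
    "output_rows": 100,
    "added_cols": ["summary"],
    "removed_cols": [],
    "status": "success",                   # failed entries carry "failed" plus an "error" string
}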
@@ -899,6 +1077,8 @@
  if len(dataset) == 0:
  raise EmptyDatasetError("Input dataset is empty")

+ validate_no_duplicates(dataset)
+
  # Use smaller sample size if dataset is smaller
  actual_sample_size = min(sample_size, len(dataset))

@@ -923,7 +1103,7 @@
  "execution_time_seconds": 0,
  }

- start_time = time.time()
+ start_time = time.perf_counter()

  try:
  # Execute the flow with sample data
@@ -931,7 +1111,7 @@
  runtime_params = runtime_params or {}

  for i, block in enumerate(self.blocks):
- block_start_time = time.time()
+ block_start_time = time.perf_counter()
  input_rows = len(current_dataset)

  logger.info(
@@ -990,7 +1170,7 @@
  else {},
  }

- execution_time = time.time() - start_time
+ execution_time = time.perf_counter() - start_time
  dry_run_results["execution_time_seconds"] = execution_time

  logger.info(
@@ -1001,7 +1181,7 @@
  return dry_run_results

  except Exception as exc:
- execution_time = time.time() - start_time
+ execution_time = time.perf_counter() - start_time
  dry_run_results["execution_successful"] = False
  dry_run_results["execution_time_seconds"] = execution_time
  dry_run_results["error"] = str(exc)
@@ -1066,6 +1246,90 @@
  "block_names": [block.block_name for block in self.blocks],
  }

+ def get_dataset_requirements(self) -> Optional[DatasetRequirements]:
+ """Get the dataset requirements for this flow.
+
+ Returns
+ -------
+ Optional[DatasetRequirements]
+ Dataset requirements object or None if not defined.
+
+ Examples
+ --------
+ >>> flow = Flow.from_yaml("path/to/flow.yaml")
+ >>> requirements = flow.get_dataset_requirements()
+ >>> if requirements:
+ ... print(f"Required columns: {requirements.required_columns}")
+ """
+ return self.metadata.dataset_requirements
+
+ def get_dataset_schema(self) -> Dataset:
+ """Get an empty dataset with the correct schema for this flow.
+
+ Returns
+ -------
+ Dataset
+ Empty HuggingFace Dataset with the correct schema/features for this flow.
+ Users can add data to this dataset or use it to validate their own dataset schema.
+
+ Examples
+ --------
+ >>> flow = Flow.from_yaml("path/to/flow.yaml")
+ >>> schema_dataset = flow.get_dataset_schema()
+ >>>
+ >>> # Add your data
+ >>> schema_dataset = schema_dataset.add_item({
+ ... "document": "Your document text",
+ ... "domain": "Computer Science",
+ ... "icl_document": "Example document"
+ ... })
+ >>>
+ >>> # Or validate your existing dataset schema
+ >>> my_dataset = Dataset.from_dict(my_data)
+ >>> if my_dataset.features == schema_dataset.features:
+ ... print("Schema matches!")
+ """
+
+ requirements = self.get_dataset_requirements()
+
+ if requirements is None:
+ # Return empty dataset with no schema requirements
+ return Dataset.from_dict({})
+
+ # Build schema features
+ schema_features = {}
+
+ # Process required columns
+ for col_name in requirements.required_columns:
+ col_type = requirements.column_types.get(col_name, "string")
+ schema_features[col_name] = self._map_column_type_to_feature(col_type)
+
+ # Process optional columns
+ for col_name in requirements.optional_columns:
+ col_type = requirements.column_types.get(col_name, "string")
+ schema_features[col_name] = self._map_column_type_to_feature(col_type)
+
+ # Create empty dataset with the correct features
+ features = datasets.Features(schema_features)
+ empty_data = {col_name: [] for col_name in schema_features.keys()}
+
+ return Dataset.from_dict(empty_data, features=features)
+
+ def _map_column_type_to_feature(self, col_type: str):
+ """Map column type string to HuggingFace feature type."""
+ # Map common type names to HuggingFace types
+ if col_type in ["str", "string", "text"]:
+ return datasets.Value("string")
+ elif col_type in ["int", "integer"]:
+ return datasets.Value("int64")
+ elif col_type in ["float", "number"]:
+ return datasets.Value("float64")
+ elif col_type in ["bool", "boolean"]:
+ return datasets.Value("bool")
+ else:
+ # Default to string for unknown types
+ return datasets.Value("string")
+
  def print_info(self) -> None:
  """
  Print an interactive summary of the Flow in the console.
sdg_hub/core/utils/datautils.py CHANGED
@@ -15,6 +15,60 @@ def safe_concatenate_datasets(datasets: list):
  return concatenate_datasets(filtered_datasets)


+ def validate_no_duplicates(dataset: Dataset) -> None:
+ """
+ Validate that the input dataset contains only unique rows.
+
+ Uses pandas `.duplicated()` for efficient duplicate detection, with preprocessing
+ to handle numpy arrays that cause TypeError in pandas duplicate detection.
+ Raises FlowValidationError if duplicates are found, including a count
+ of the duplicate rows detected.
+
+ Parameters
+ ----------
+ dataset : Dataset
+ Input dataset to validate.
+
+ Raises
+ ------
+ FlowValidationError
+ If duplicate rows are detected in the dataset.
+ """
+ if len(dataset) == 0:
+ return
+
+ df = dataset.to_pandas()
+
+ # Try pandas duplicated() first - only convert types if we hit unhashable error
+ try:
+ duplicate_count = int(df.duplicated(keep="first").sum())
+ except TypeError as e:
+ if "unhashable type" in str(e):
+ # Convert unhashable types to tuples so pandas can hash them
+ for col in df.columns:
+ if df[col].dtype == "object": # Only check object columns
+ df[col] = df[col].apply(
+ lambda x: (
+ tuple(sorted(x.items()))
+ if isinstance(x, dict)
+ else tuple(x)
+ if hasattr(x, "__iter__")
+ and not isinstance(x, (str, bytes))
+ else x
+ )
+ )
+ duplicate_count = int(df.duplicated(keep="first").sum())
+ else:
+ raise # Re-raise if it's a different TypeError
+
+ if duplicate_count > 0:
+ raise FlowValidationError(
+ f"Input dataset contains {duplicate_count} duplicate rows. "
+ f"SDG Hub operations require unique input rows. "
+ f"Please deduplicate your dataset before processing."
+ )
+
+
  def safe_concatenate_with_validation(
  datasets: list, context: str = "datasets"
  ) -> Dataset:
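
A minimal sketch of the new duplicate check used in isolation. Flow.generate() and the sampled dry run call it automatically; the import paths are assumed from the module layout shown in this diff, and the column values are invented:

# Hypothetical standalone use of validate_no_duplicates
from datasets import Dataset
from sdg_hub.core.utils.datautils import validate_no_duplicates
from sdg_hub.core.utils.error_handling import FlowValidationError

ds = Dataset.from_dict({"document": ["a", "a", "b"]})  # "a" appears twice

try:
    validate_no_duplicates(ds)
except FlowValidationError as err:
    print(err)  # reports 1 duplicate row and asks for deduplication before processing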